# HG changeset patch
# User aoqi
# Date 1464588098 14400
# Node ID bc227c49eaae7b4e9c392073b6f22a782e7aa36a
# Parent  62f1a7e7d6e5cb6a04a2509a288b7de389eaf237
[C2] Rewrite generate_disjoint_short_copy: eliminate unaligned accesses and optimize the copy algorithm.
xml.transform improved by 50%; total GEO improved by 13%.

Copy Algorithm:

Generate stub for disjoint short copy.  If "aligned" is true, the
"from" and "to" addresses are assumed to be heapword aligned.

Arguments for generated stub:
  from:        A0
  to:          A1
  elm.count:   A2 treated as signed
  one element: 2 bytes

Strategy for aligned==true:

  If length <= 9:
    1. copy 1 element at a time (l_5)

  If length > 9:
    1. copy 4 elements at a time until less than 4 elements are left (l_7)
    2. copy 2 elements at a time until less than 2 elements are left (l_6)
    3. copy the last element if one was left in step 2 (l_1)

Strategy for aligned==false:

  If length <= 9: same as the aligned==true case

  If length > 9:
    1. continue with step 7 if the alignment of from and to differs mod 4
    2. align from and to to 4 bytes by copying 1 element if necessary
    3. at l_2 from and to are 4 byte aligned; continue with step 6 if
       they cannot be aligned to 8 bytes because they differ mod 8
    4. at this point we know that from and to have the same alignment
       mod 8; now copy one element if necessary to get 8 byte alignment
       of from and to
    5. copy 4 elements at a time until less than 4 elements are left;
       thanks to steps 3 and 4, all loads and stores are aligned
    6. copy 2 elements at a time until less than 2 elements are left (l_6)
    7. copy 1 element at a time (l_5)
    8. copy the last element if one was left in step 6 (l_1)
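
For reference, the aligned==false strategy corresponds roughly to the
portable C sketch below. It is illustrative only, not the generated
code: the function name and C types are hypothetical, memcpy stands in
for the word-sized loads/stores (lw/sw, ld/sd), and the stub's
16-element entry threshold for the 8-byte loop is omitted.

  #include <stdint.h>
  #include <string.h>

  /* Hypothetical model of the stub; "from" and "to" point to disjoint
     arrays of 2-byte elements, count is the element count (>= 0). */
  static void disjoint_short_copy_model(const uint16_t *from,
                                        uint16_t *to, long count) {
    if (count > 9 && ((((uintptr_t)from ^ (uintptr_t)to) & 3) == 0)) {
      /* step 2: copy 1 element if needed to reach 4 byte alignment */
      if ((uintptr_t)from & 3) { *to++ = *from++; count--; }
      /* step 3: 8 byte copies only pay off if from == to mod 8 */
      if ((((uintptr_t)from ^ (uintptr_t)to) & 7) == 0) {
        /* step 4: copy a 2-element word to reach 8 byte alignment */
        if ((uintptr_t)from & 7) {
          memcpy(to, from, 4); from += 2; to += 2; count -= 2;
        }
        /* step 5: 4 elements at a time, aligned ld/sd (l_8) */
        while (count >= 4) {
          memcpy(to, from, 8); from += 4; to += 4; count -= 4;
        }
      }
      /* step 6: 2 elements at a time, lw/sw (l_3) */
      while (count >= 2) {
        memcpy(to, from, 4); from += 2; to += 2; count -= 2;
      }
    }
    /* steps 7/8 and the length <= 9 case: 1 element at a time (l_5) */
    while (count-- > 0) *to++ = *from++;
  }
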
TODO:

  1. use Loongson 128-bit loads/stores
  2. use loop unrolling when len is big enough, for example if len > 0x2000:

       __ bind(l_x);
       __ ld(AT, tmp1, 0);
       __ ld(tmp, tmp1, 8);
       __ sd(AT, tmp2, 0);
       __ sd(tmp, tmp2, 8);
       __ ld(AT, tmp1, 16);
       __ ld(tmp, tmp1, 24);
       __ sd(AT, tmp2, 16);
       __ sd(tmp, tmp2, 24);
       __ daddi(tmp1, tmp1, 32);
       __ daddi(tmp2, tmp2, 32);
       __ daddi(tmp3, tmp3, -16);
       __ daddi(AT, tmp3, -16);
       __ bgez(AT, l_x);
       __ delayed()->nop();

diff -r 62f1a7e7d6e5 -r bc227c49eaae src/cpu/mips/vm/stubGenerator_mips_64.cpp
--- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp Mon May 30 01:30:23 2016 -0400
+++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp Mon May 30 02:01:38 2016 -0400
@@ -849,85 +849,208 @@
     return start;
   }
 
-  // Arguments:
-  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
-  //             ignored
-  //   name    - stub name string
+  // Generate stub for disjoint short copy.  If "aligned" is true, the
+  // "from" and "to" addresses are assumed to be heapword aligned.
   //
-  // Inputs:
-  //   c_rarg0   - source array address
-  //   c_rarg1   - destination array address
-  //   c_rarg2   - element count, treated as ssize_t, can be zero
+  // Arguments for generated stub:
+  //   from:        A0
+  //   to:          A1
+  //   elm.count:   A2 treated as signed
+  //   one element: 2 bytes
   //
-  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
-  // let the hardware handle it.  The two or four words within dwords
-  // or qwords that span cache line boundaries will still be loaded
-  // and stored atomically.
+  // Strategy for aligned==true:
   //
-  // Side Effects:
-  //   disjoint_short_copy_entry is set to the no-overlap entry point
-  //   used by generate_conjoint_short_copy().
+  //  If length <= 9:
+  //   1. copy 1 element at a time (l_5)
   //
-  address generate_disjoint_short_copy(bool aligned, const char *name) {
-    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
-    StubCodeMark mark(this, "StubRoutines", name);
-    __ align(CodeEntryAlignment);
-    address start = __ pc();
+  //  If length > 9:
+  //   1. copy 4 elements at a time until less than 4 elements are left (l_7)
+  //   2. copy 2 elements at a time until less than 2 elements are left (l_6)
+  //   3. copy the last element if one was left in step 2 (l_1)
+  //
+  //
+  // Strategy for aligned==false:
+  //
+  //  If length <= 9: same as the aligned==true case
+  //
+  //  If length > 9:
+  //   1. continue with step 7 if the alignment of from and to
+  //      differs mod 4
+  //   2. align from and to to 4 bytes by copying 1 element if necessary
+  //   3. at l_2 from and to are 4 byte aligned; continue with
+  //      step 6 if they cannot be aligned to 8 bytes because they
+  //      differ mod 8
+  //   4. at this point we know that from and to have the same
+  //      alignment mod 8; now copy one element if necessary to get
+  //      8 byte alignment of from and to
+  //   5. copy 4 elements at a time until less than 4 elements are
+  //      left; thanks to steps 3 and 4, all loads and stores are aligned
+  //   6. copy 2 elements at a time until less than 2 elements are
+  //      left (l_6)
+  //   7. copy 1 element at a time (l_5)
+  //   8. copy the last element if one was left in step 6 (l_1)
+  //
+  // TODO:
+  //
+  //  1. use Loongson 128-bit loads/stores
+  //  2. use loop unrolling when len is big enough, for example if len > 0x2000:
+  //       __ bind(l_x);
+  //       __ ld(AT, tmp1, 0);
+  //       __ ld(tmp, tmp1, 8);
+  //       __ sd(AT, tmp2, 0);
+  //       __ sd(tmp, tmp2, 8);
+  //       __ ld(AT, tmp1, 16);
+  //       __ ld(tmp, tmp1, 24);
+  //       __ sd(AT, tmp2, 16);
+  //       __ sd(tmp, tmp2, 24);
+  //       __ daddi(tmp1, tmp1, 32);
+  //       __ daddi(tmp2, tmp2, 32);
+  //       __ daddi(tmp3, tmp3, -16);
+  //       __ daddi(AT, tmp3, -16);
+  //       __ bgez(AT, l_x);
+  //       __ delayed()->nop();
+  //
+  address generate_disjoint_short_copy(bool aligned, const char * name) {
+    StubCodeMark mark(this, "StubRoutines", name);
+    __ align(CodeEntryAlignment);
 
-    __ push(T3);
-    __ push(T0);
-    __ push(T1);
-    __ push(T8);
-    __ move(T1, A2);
-    __ move(T3, A0);
-    __ move(T0, A1);
+    Register tmp1 = T0;
+    Register tmp2 = T1;
+    Register tmp3 = T3;
 
-    if (!aligned) {
-      __ beq(T1, R0, l_5);
-      __ delayed()->nop();
-      // align source address at dword address boundary
-      __ move(T8, T3); // original from
-      __ andi(T8, T8, 3); // either 0 or 2
-      __ beq(T8, R0, l_1); // no prefix
-      __ delayed()->nop();
-      // copy prefix
-      __ lh(AT, T3, 0);
-      __ sh(AT, T0, 0);
-      __ add(T3, T3, T8);
-      __ add(T0, T0, T8);
-      __ addi(T1, T1, -1);
-      __ bind(l_1);
-    }
-    __ move(T8, T1); // word count less prefix
-    __ sra(T1, T1, 1);
-    __ beq(T1, R0, l_4);
-    __ delayed()->nop();
-    // copy aligned dwords
-    __ bind(l_2);
-    __ align(16);
-    __ bind(l_3);
-    __ lw(AT, T3, 0);
-    __ sw(AT, T0, 0 );
-    __ addi(T3, T3, 4);
-    __ addi(T0, T0, 4);
-    __ addi(T1, T1, -1);
-    __ bne(T1, R0, l_3);
-    __ delayed()->nop();
-    __ bind(l_4);
-    __ andi(T8, T8, 1);
-    __ beq(T8, R0, l_5);
-    __ delayed()->nop();
-    // copy suffix
-    __ lh(AT, T3, 0);
-    __ sh(AT, T0, 0);
-    __ bind(l_5);
-    __ pop(T8);
-    __ pop(T1);
-    __ pop(T0);
-    __ pop(T3);
-    __ jr(RA);
-    __ delayed()->nop();
-    return start;
+    address start = __ pc();
+
+    __ push(tmp1);
+    __ push(tmp2);
+    __ push(tmp3);
+    __ move(tmp1, A0);
+    __ move(tmp2, A1);
+    __ move(tmp3, A2);
+
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
+    Label l_debug;
+    // don't try anything fancy if arrays don't have many elements
+    __ daddi(AT, tmp3, -9);
+    __ blez(AT, l_1);
+    __ delayed()->nop();
+
+    if (!aligned) {
+      __ xorr(AT, A0, A1);
+      __ andi(AT, AT, 1);
+      __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
+      __ delayed()->nop();
+
+      __ xorr(AT, A0, A1);
+      __ andi(AT, AT, 3);
+      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
+      __ delayed()->nop();
+
+      // At this point it is guaranteed that both from and to have the same alignment mod 4.
+
+      // Copy 1 element if necessary to align to 4 bytes.
+      __ andi(AT, A0, 3);
+      __ beq(AT, R0, l_2);
+      __ delayed()->nop();
+
+      __ lhu(AT, tmp1, 0);
+      __ daddi(tmp1, tmp1, 2);
+      __ sh(AT, tmp2, 0);
+      __ daddi(tmp2, tmp2, 2);
+      __ daddi(tmp3, tmp3, -1);
+      __ bind(l_2);
+
+      // At this point both from and to are at least 4 byte aligned.
+
+      // Copy 4 elements at a time.
+      // Align to 8 bytes, but only if both from and to have the same alignment mod 8.
+      __ xorr(AT, tmp1, tmp2);
+      __ andi(AT, AT, 7);
+      __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2 at a time; either from or to would be unaligned
+      __ delayed()->nop();
+
+      // Copy a 2-element word if necessary to align to 8 bytes.
+      __ andi(AT, tmp1, 7);
+      __ beq(AT, R0, l_7);
+      __ delayed()->nop();
+
+      __ lw(AT, tmp1, 0);
+      __ daddi(tmp3, tmp3, -2);
+      __ sw(AT, tmp2, 0);
+      { // FasterArrayCopy
+        __ daddi(tmp1, tmp1, 4);
+        __ daddi(tmp2, tmp2, 4);
+      }
+    }
+
+    __ bind(l_7);
+
+    // Copy 4 elements at a time; from and to are 8 byte aligned here
+    // in both the aligned and the !aligned path, so the ld/sd
+    // accesses are aligned.
+
+    { // FasterArrayCopy
+      __ daddi(AT, tmp3, -15);
+      __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain
+      __ delayed()->nop();
+
+      __ bind(l_8);
+      // For Loongson, there is 128-bit memory access. TODO
+      __ ld(AT, tmp1, 0);
+      __ sd(AT, tmp2, 0);
+      __ daddi(tmp1, tmp1, 8);
+      __ daddi(tmp2, tmp2, 8);
+      __ daddi(tmp3, tmp3, -4);
+      __ daddi(AT, tmp3, -4);
+      __ bgez(AT, l_8);
+      __ delayed()->nop();
+    }
+    __ bind(l_6);
+
+    // Copy 2 elements at a time.
+    { // FasterArrayCopy
+      __ daddi(AT, tmp3, -1);
+      __ blez(AT, l_1);
+      __ delayed()->nop();
+
+      __ bind(l_3);
+      __ lw(AT, tmp1, 0);
+      __ sw(AT, tmp2, 0);
+      __ daddi(tmp1, tmp1, 4);
+      __ daddi(tmp2, tmp2, 4);
+      __ daddi(tmp3, tmp3, -2);
+      __ daddi(AT, tmp3, -2);
+      __ bgez(AT, l_3);
+      __ delayed()->nop();
+    }
+
+    // Do single element copy (16 bit).
+    __ bind(l_1);
+    __ beq(R0, tmp3, l_4);
+    __ delayed()->nop();
+
+    { // FasterArrayCopy
+      __ bind(l_5);
+      __ lhu(AT, tmp1, 0);
+      __ daddi(tmp3, tmp3, -1);
+      __ sh(AT, tmp2, 0);
+      __ daddi(tmp1, tmp1, 2);
+      __ daddi(tmp2, tmp2, 2);
+      __ daddi(AT, tmp3, -1);
+      __ bgez(AT, l_5);
+      __ delayed()->nop();
+    }
+    __ bind(l_4);
+    __ pop(tmp3);
+    __ pop(tmp2);
+    __ pop(tmp1);
+
+    __ jr(RA);
+    __ delayed()->nop();
+
+    __ bind(l_debug);
+    __ stop("generate_disjoint_short_copy should not reach here");
+    return start;
   }
 
   // Arguments: