#4537 Rewrite generate_disjoint_byte_copy

Tue, 20 Sep 2016 11:48:21 +0800

author
jiangshaofeng
date
Tue, 20 Sep 2016 11:48:21 +0800
changeset 117
89e1dfe996be
parent 107
68d7c979cca6
child 118
bf4b1d1988a6

#4537 Rewrite generate_disjoint_byte_copy
Eliminated unaligned accesses and optimized the copy algorithm, following the same approach as changeset 114.
The number of unaligned accesses does not increase, and the change passes the SPECjvm2008 test.
The test program below runs about 20% faster.
The test program:

/**
 * Times a single System.arraycopy of {@code count} bytes between two distinct arrays
 * and prints the elapsed time in nanoseconds.
 */
public class ByteCopyTest {
    public static void main(String[] args) {
        final int count = 100000;
        // byte[] rather than char[], so that the copy moves 1-byte elements.
        // NOTE(review): a char[] copy moves 2-byte elements and is presumably served by the
        // short/char arraycopy stub instead of the byte-copy stub - confirm against the VM.
        byte[] src = new byte[count];
        byte[] dst = new byte[count];
        for (int i = 0; i < count; i++) {
            src[i] = (byte) (i % 26 + 97); // cycle through the values of 'a'..'z'
        }
        long startTime = System.nanoTime();
        System.arraycopy(src, 0, dst, 0, count);
        long endTime = System.nanoTime();
        System.out.println(endTime - startTime);
    }
}

src/cpu/mips/vm/stubGenerator_mips_64.cpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp	Sun Sep 18 13:43:10 2016 +0800
     1.2 +++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp	Tue Sep 20 11:48:21 2016 +0800
     1.3 @@ -629,85 +629,176 @@
     1.4    //   disjoint_byte_copy_entry is set to the no-overlap entry point
     1.5    //   used by generate_conjoint_byte_copy().
     1.6    //
     1.7 -	address generate_disjoint_byte_copy(bool aligned, const char *name) {
     1.8 -	  StubCodeMark mark(this, "StubRoutines", name);
     1.9 -	  __ align(CodeEntryAlignment);
    1.10 -	  address start = __ pc();
    1.11 -	  Label l_0, l_1, l_2, l_3, l_4, l_5, l_6;
    1.12 -
    1.13 -	  __ push(T3);
    1.14 -	  __ push(T0);
    1.15 -	  __ push(T1);
    1.16 -	  __ push(T8);
    1.17 -	  __ move(T3, A0); 
    1.18 -	  __ move(T0, A1);
    1.19 -	  __ move(T1, A2);  
    1.20 -	  __ move(T8, T1);             // original count in T1
    1.21 -	  __ daddi(AT, T1, -3); 
    1.22 -	  __ blez(AT, l_4);  
    1.23 -	  __ delayed()->nop();	
    1.24 -	  if (!aligned) {
    1.25 -          //TODO: copy 8 bytes at one time
    1.26 -	    // 2016/5/8 Jin: only when src and dest has the same alignment can we do lw/sw */
    1.27 -	    __ andi(AT, T3, 3); 
    1.28 -	    __ andi(T9, T0, 3); 
    1.29 -	    __ bne(AT, T9, l_5); 
    1.30 -	    __ delayed()->nop();	
    1.31 -	  
    1.32 -	    // align source address at dword address boundary
    1.33 -	    __ move(T1, 4); 
    1.34 -	    __ sub(T1, T1, T3); 
    1.35 -	    __ andi(T1, T1, 3); 
    1.36 -	    __ beq(T1, R0, l_1); 
    1.37 -	    __ delayed()->nop();	
    1.38 -	    __ sub(T8,T8,T1); 
    1.39 -	    __ bind(l_0);
    1.40 -	    __ lb(AT, T3, 0); 
    1.41 -	    __ sb(AT, T0, 0); 
    1.42 -	    __ addi(T3, T3, 1); 
    1.43 -	    __ addi(T0, T0, 1); 
    1.44 -	    __ addi(T1 ,T1, -1);  
    1.45 -	    __ bne(T1, R0, l_0); 
    1.46 -	    __ delayed()->nop(); 
    1.47 -	    __ bind(l_1);
    1.48 -	    __ move(T1, T8); 
    1.49 -	  }
    1.50 -	  __ shr(T1, 2); 
    1.51 -	  __ beq(T1, R0, l_4);     // no dwords to move
    1.52 -	  __ delayed()->nop(); 
    1.53 -	  // copy aligned dwords
    1.54 -	  __ bind(l_2);
    1.55 -	  __ align(16);
    1.56 -	  __ bind(l_3);
    1.57 -	  __ lw(AT, T3, 0);   
    1.58 -	  __ sw(AT, T0, 0 ); 
    1.59 -	  __ addi(T3, T3, 4); 
    1.60 -	  __ addi(T0, T0, 4); 
    1.61 -	  __ addi(T1, T1, -1); 
    1.62 -	  __ bne(T1, R0, l_3); 
    1.63 -	  __ delayed()->nop(); 
    1.64 -	  __ bind(l_4);
    1.65 -	  __ move(T1, T8); 
    1.66 -	  __ andi(T1, T1, 3); 
    1.67 -	  __ beq(T1, R0, l_6);  
    1.68 -	  __ delayed()->nop(); 
    1.69 -	  // copy suffix
    1.70 -	  __ bind(l_5);
    1.71 -	  __ lb(AT, T3, 0); 
    1.72 -	  __ sb(AT, T0, 0); 
    1.73 -	  __ addi(T3, T3, 1);  
    1.74 -	  __ addi(T0, T0, 1);  
    1.75 -	  __ addi(T1, T1, -1); 
    1.76 -	  __ bne(T1, R0, l_5 ); 
    1.77 -	  __ delayed()->nop(); 
    1.78 -	  __ bind(l_6);
    1.79 -	  __ pop(T8); 
    1.80 -	  __ pop(T1); 
    1.81 -	  __ pop(T0); 
    1.82 -	  __ pop(T3); 
    1.83 -	  __ jr(RA); 
    1.84 -	  __ delayed()->nop(); 
    1.85 -	  return start;
    1.86 +  address generate_disjoint_byte_copy(bool aligned, const char * name) {
    1.87 +    StubCodeMark mark(this, "StubRoutines", name);
    1.88 +    __ align(CodeEntryAlignment);
    1.89 +    // Emits a stub that copies A2 bytes from the address in A0 to the address in A1.
    1.90 +    // When !aligned, the access width grows 1 -> 2 -> 4 -> 8 bytes only as far as both pointers share alignment.
    1.91 +    Register tmp1 = T0; // source pointer (loaded from A0)
    1.92 +    Register tmp2 = T1; // destination pointer (loaded from A1)
    1.93 +    Register tmp3 = T3; // remaining byte count (loaded from A2)
    1.94 +
    1.95 +    address start = __ pc(); // stub entry point; this is the value returned
    1.96 +
    1.97 +    __ push(tmp1); // save the three scratch registers; they are popped again at l_4
    1.98 +    __ push(tmp2);
    1.99 +    __ push(tmp3);
   1.100 +    __ move(tmp1, A0);
   1.101 +    __ move(tmp2, A1);
   1.102 +    __ move(tmp3, A2);
   1.103 +
   1.104 +
   1.105 +    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11;
   1.106 +    Label l_debug; // NOTE(review): not bound or branched to anywhere in this function
   1.107 +
   1.108 +    __ daddi(AT, tmp3, -9); // AT = count - 9
   1.109 +    __ blez(AT, l_9); // count <= 9: skip the alignment prologue and copy byte by byte
   1.110 +    __ delayed()->nop(); // fill the branch delay slot
   1.111 +    // NOTE(review): why 9? The !aligned prologue below consumes at most 1 + 2 + 4 = 7 bytes; the exact cutoff looks like a tuning choice - confirm.
   1.112 +    if (!aligned) {
   1.113 +      __ xorr(AT, tmp1, tmp2); // set bits mark where the two addresses disagree
   1.114 +      __ andi(AT, AT, 1);
   1.115 +      __ bne(AT, R0, l_9); // if arrays don't have the same alignment mod 2, copy byte by byte
   1.116 +      __ delayed()->nop();
   1.117 +
   1.118 +      __ andi(AT, tmp1, 1);
   1.119 +      __ beq(AT, R0, l_10); // copy 1 byte if necessary to align to 2 bytes (skipped when already aligned)
   1.120 +      __ delayed()->nop();
   1.121 +
   1.122 +      __ lb(AT, tmp1, 0);
   1.123 +      __ daddi(tmp1, tmp1, 1);
   1.124 +      __ sb(AT, tmp2, 0);
   1.125 +      __ daddi(tmp2, tmp2, 1);
   1.126 +      __ daddi(tmp3, tmp3, -1);
   1.127 +      __ bind(l_10);
   1.128 +
   1.129 +      __ xorr(AT, tmp1, tmp2);
   1.130 +      __ andi(AT, AT, 3);
   1.131 +      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, use the 2-byte loop
   1.132 +      __ delayed()->nop();
   1.133 +
   1.134 +      // At this point it is guaranteed that both from and to have the same alignment mod 4.
   1.135 +
   1.136 +      // Copy 2 bytes if necessary to align to 4 bytes.
   1.137 +      __ andi(AT, tmp1, 3);
   1.138 +      __ beq(AT, R0, l_2);
   1.139 +      __ delayed()->nop();
   1.140 +
   1.141 +      __ lhu(AT, tmp1, 0);
   1.142 +      __ daddi(tmp1, tmp1, 2);
   1.143 +      __ sh(AT, tmp2, 0);
   1.144 +      __ daddi(tmp2, tmp2, 2);
   1.145 +      __ daddi(tmp3, tmp3, -2);
   1.146 +      __ bind(l_2);
   1.147 +
   1.148 +      // At this point the positions of both from and to are at least 4-byte aligned.
   1.149 +
   1.150 +      // Prepare for 8-byte copies.
   1.151 +      // Align to 8 bytes, but only if both from and to have the same alignment mod 8.
   1.152 +      __ xorr(AT, tmp1, tmp2);
   1.153 +      __ andi(AT, AT, 7);
   1.154 +      __ bne(AT, R0, l_6); // not same alignment mod 8 -> use the 4-byte loop (both pointers are 4-byte aligned)
   1.155 +      __ delayed()->nop();
   1.156 +
   1.157 +      // Copy 4 bytes if necessary to align to 8 bytes.
   1.158 +      __ andi(AT, tmp1, 7);
   1.159 +      __ beq(AT, R0, l_7);
   1.160 +      __ delayed()->nop();
   1.161 +
   1.162 +      __ lw(AT, tmp1, 0);
   1.163 +      __ daddi(tmp3, tmp3, -4);
   1.164 +      __ sw(AT, tmp2, 0);
   1.165 +      { // FasterArrayCopy
   1.166 +        __ daddi(tmp1, tmp1, 4);
   1.167 +        __ daddi(tmp2, tmp2, 4);
   1.168 +      }
   1.169 +    }
   1.170 +
   1.171 +    __ bind(l_7);
   1.172 +
   1.173 +    // Copy 8 bytes at a time. On the !aligned path both pointers are 8-byte aligned here;
   1.174 +    // when aligned == true no alignment check is emitted (NOTE(review): presumably the caller guarantees it - confirm).
   1.175 +
   1.176 +    { // FasterArrayCopy
   1.177 +      __ daddi(AT, tmp3, -7);
   1.178 +      __ blez(AT, l_6); // fewer than 8 bytes remain: fall back to the 4-byte loop
   1.179 +      __ delayed()->nop();
   1.180 +
   1.181 +      __ bind(l_8);
   1.182 +      // For Loongson, there is 128-bit memory access. TODO
   1.183 +      __ ld(AT, tmp1, 0);
   1.184 +      __ sd(AT, tmp2, 0);
   1.185 +      __ daddi(tmp1, tmp1, 8);
   1.186 +      __ daddi(tmp2, tmp2, 8);
   1.187 +      __ daddi(tmp3, tmp3, -8);
   1.188 +      __ daddi(AT, tmp3, -8); // AT = remaining - 8
   1.189 +      __ bgez(AT, l_8); // loop while at least 8 bytes remain
   1.190 +      __ delayed()->nop();
   1.191 +    }
   1.192 +    __ bind(l_6);
   1.193 +
   1.194 +    // copy 4 bytes at a time
   1.195 +    { // FasterArrayCopy
   1.196 +      __ daddi(AT, tmp3, -3);
   1.197 +      __ blez(AT, l_1); // fewer than 4 bytes remain: go to the 2-byte loop
   1.198 +      __ delayed()->nop();
   1.199 +
   1.200 +      __ bind(l_3);
   1.201 +      __ lw(AT, tmp1, 0);
   1.202 +      __ sw(AT, tmp2, 0);
   1.203 +      __ daddi(tmp1, tmp1, 4);
   1.204 +      __ daddi(tmp2, tmp2, 4);
   1.205 +      __ daddi(tmp3, tmp3, -4);
   1.206 +      __ daddi(AT, tmp3, -4); // AT = remaining - 4
   1.207 +      __ bgez(AT, l_3); // loop while at least 4 bytes remain
   1.208 +      __ delayed()->nop();
   1.209 +
   1.210 +    }
   1.211 +
   1.212 +    // copy 2 bytes at a time
   1.213 +    __ bind(l_1);
   1.214 +    { 
   1.215 +      __ daddi(AT, tmp3, -1);
   1.216 +      __ blez(AT, l_9); // fewer than 2 bytes remain: go to the byte loop
   1.217 +      __ delayed()->nop();
   1.218 +
   1.219 +      __ bind(l_5);
   1.220 +      __ lhu(AT, tmp1, 0);
   1.221 +      __ daddi(tmp3, tmp3, -2);
   1.222 +      __ sh(AT, tmp2, 0);
   1.223 +      __ daddi(tmp1, tmp1, 2);
   1.224 +      __ daddi(tmp2, tmp2, 2);
   1.225 +      __ daddi(AT, tmp3, -2); // AT = remaining - 2
   1.226 +      __ bgez(AT, l_5); // loop while at least 2 bytes remain
   1.227 +      __ delayed()->nop();
   1.228 +    }
   1.229 +
   1.230 +    // copy the remaining bytes one at a time
   1.231 +    __ bind(l_9);
   1.232 +    __ beq(R0, tmp3, l_4); // nothing left to copy: go straight to the exit
   1.233 +    __ delayed()->nop();
   1.234 +
   1.235 +    {
   1.236 +      __ bind(l_11);
   1.237 +      __ lb(AT, tmp1, 0);
   1.238 +      __ daddi(tmp3, tmp3, -1);
   1.239 +      __ sb(AT, tmp2, 0);
   1.240 +      __ daddi(tmp1, tmp1, 1);
   1.241 +      __ daddi(tmp2, tmp2, 1);
   1.242 +      __ daddi(AT, tmp3, -1); // AT = remaining - 1
   1.243 +      __ bgez(AT, l_11); // loop while at least 1 byte remains
   1.244 +      __ delayed()->nop();
   1.245 +    }
   1.246 +
   1.247 +    __ bind(l_4); // common exit: restore the scratch registers in reverse push order
   1.248 +    __ pop(tmp3);
   1.249 +    __ pop(tmp2);
   1.250 +    __ pop(tmp1);
   1.251 +
   1.252 +    __ jr(RA);
   1.253 +    __ delayed()->nop();
   1.254 +
   1.255 +    return start;
   1.256    }
   1.257  
   1.258    // Arguments:

mercurial