Tue, 20 Sep 2016 11:48:21 +0800
#4537 Rewrite generate_disjoint_byte_copy
Eliminated unaligned accesses and optimized the copy algorithm. The same as changeset 114.
The unaligned-access count does not increase; the change has passed the SPECjvm2008 test.
20% speedup on the test program.
The test program:
/**
 * Micro-benchmark for {@link System#arraycopy}: fills a char array with a
 * repeating 'a'..'z' pattern, copies it once, and prints the elapsed time
 * in nanoseconds.
 */
public class ByteCopyTest {
    public static void main(String[] args) {
        final int count = 100000;
        char[] src = new char[count];
        char[] dst = new char[count];

        // Seed the source with a repeating lowercase-alphabet pattern
        // ('a' == 97), so the copy has non-trivial data to move.
        for (int i = 0; i < count; i++) {
            src[i] = (char) ('a' + i % 26);
        }

        long start = System.nanoTime();
        System.arraycopy(src, 0, dst, 0, count);
        long elapsed = System.nanoTime() - start;

        System.out.println(elapsed);
    }
}
src/cpu/mips/vm/stubGenerator_mips_64.cpp | file | annotate | diff | comparison | revisions |
1.1 --- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp Sun Sep 18 13:43:10 2016 +0800 1.2 +++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp Tue Sep 20 11:48:21 2016 +0800 1.3 @@ -629,85 +629,176 @@ 1.4 // disjoint_byte_copy_entry is set to the no-overlap entry point 1.5 // used by generate_conjoint_byte_copy(). 1.6 // 1.7 - address generate_disjoint_byte_copy(bool aligned, const char *name) { 1.8 - StubCodeMark mark(this, "StubRoutines", name); 1.9 - __ align(CodeEntryAlignment); 1.10 - address start = __ pc(); 1.11 - Label l_0, l_1, l_2, l_3, l_4, l_5, l_6; 1.12 - 1.13 - __ push(T3); 1.14 - __ push(T0); 1.15 - __ push(T1); 1.16 - __ push(T8); 1.17 - __ move(T3, A0); 1.18 - __ move(T0, A1); 1.19 - __ move(T1, A2); 1.20 - __ move(T8, T1); // original count in T1 1.21 - __ daddi(AT, T1, -3); 1.22 - __ blez(AT, l_4); 1.23 - __ delayed()->nop(); 1.24 - if (!aligned) { 1.25 - //TODO: copy 8 bytes at one time 1.26 - // 2016/5/8 Jin: only when src and dest has the same alignment can we do lw/sw */ 1.27 - __ andi(AT, T3, 3); 1.28 - __ andi(T9, T0, 3); 1.29 - __ bne(AT, T9, l_5); 1.30 - __ delayed()->nop(); 1.31 - 1.32 - // align source address at dword address boundary 1.33 - __ move(T1, 4); 1.34 - __ sub(T1, T1, T3); 1.35 - __ andi(T1, T1, 3); 1.36 - __ beq(T1, R0, l_1); 1.37 - __ delayed()->nop(); 1.38 - __ sub(T8,T8,T1); 1.39 - __ bind(l_0); 1.40 - __ lb(AT, T3, 0); 1.41 - __ sb(AT, T0, 0); 1.42 - __ addi(T3, T3, 1); 1.43 - __ addi(T0, T0, 1); 1.44 - __ addi(T1 ,T1, -1); 1.45 - __ bne(T1, R0, l_0); 1.46 - __ delayed()->nop(); 1.47 - __ bind(l_1); 1.48 - __ move(T1, T8); 1.49 - } 1.50 - __ shr(T1, 2); 1.51 - __ beq(T1, R0, l_4); // no dwords to move 1.52 - __ delayed()->nop(); 1.53 - // copy aligned dwords 1.54 - __ bind(l_2); 1.55 - __ align(16); 1.56 - __ bind(l_3); 1.57 - __ lw(AT, T3, 0); 1.58 - __ sw(AT, T0, 0 ); 1.59 - __ addi(T3, T3, 4); 1.60 - __ addi(T0, T0, 4); 1.61 - __ addi(T1, T1, -1); 1.62 - __ bne(T1, R0, l_3); 1.63 - __ delayed()->nop(); 1.64 - __ bind(l_4); 
1.65 - __ move(T1, T8); 1.66 - __ andi(T1, T1, 3); 1.67 - __ beq(T1, R0, l_6); 1.68 - __ delayed()->nop(); 1.69 - // copy suffix 1.70 - __ bind(l_5); 1.71 - __ lb(AT, T3, 0); 1.72 - __ sb(AT, T0, 0); 1.73 - __ addi(T3, T3, 1); 1.74 - __ addi(T0, T0, 1); 1.75 - __ addi(T1, T1, -1); 1.76 - __ bne(T1, R0, l_5 ); 1.77 - __ delayed()->nop(); 1.78 - __ bind(l_6); 1.79 - __ pop(T8); 1.80 - __ pop(T1); 1.81 - __ pop(T0); 1.82 - __ pop(T3); 1.83 - __ jr(RA); 1.84 - __ delayed()->nop(); 1.85 - return start; 1.86 + address generate_disjoint_byte_copy(bool aligned, const char * name) { 1.87 + StubCodeMark mark(this, "StubRoutines", name); 1.88 + __ align(CodeEntryAlignment); 1.89 + 1.90 + 1.91 + Register tmp1 = T0; 1.92 + Register tmp2 = T1; 1.93 + Register tmp3 = T3; 1.94 + 1.95 + address start = __ pc(); 1.96 + 1.97 + __ push(tmp1); 1.98 + __ push(tmp2); 1.99 + __ push(tmp3); 1.100 + __ move(tmp1, A0); 1.101 + __ move(tmp2, A1); 1.102 + __ move(tmp3, A2); 1.103 + 1.104 + 1.105 + Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11; 1.106 + Label l_debug; 1.107 + 1.108 + __ daddi(AT, tmp3, -9); //why the number is 9 ? 
1.109 + __ blez(AT, l_9); 1.110 + __ delayed()->nop(); 1.111 + 1.112 + if (!aligned) { 1.113 + __ xorr(AT, tmp1, tmp2); 1.114 + __ andi(AT, AT, 1); 1.115 + __ bne(AT, R0, l_9); // if arrays don't have the same alignment mod 2, do 1 element copy 1.116 + __ delayed()->nop(); 1.117 + 1.118 + __ andi(AT, tmp1, 1); 1.119 + __ beq(AT, R0, l_10); //copy 1 enlement if necessary to aligh to 2 bytes 1.120 + __ delayed()->nop(); 1.121 + 1.122 + __ lb(AT, tmp1, 0); 1.123 + __ daddi(tmp1, tmp1, 1); 1.124 + __ sb(AT, tmp2, 0); 1.125 + __ daddi(tmp2, tmp2, 1); 1.126 + __ daddi(tmp3, tmp3, -1); 1.127 + __ bind(l_10); 1.128 + 1.129 + __ xorr(AT, tmp1, tmp2); 1.130 + __ andi(AT, AT, 3); 1.131 + __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 2 elements copy 1.132 + __ delayed()->nop(); 1.133 + 1.134 + // At this point it is guaranteed that both, from and to have the same alignment mod 4. 1.135 + 1.136 + // Copy 2 elements if necessary to align to 4 bytes. 1.137 + __ andi(AT, tmp1, 3); 1.138 + __ beq(AT, R0, l_2); 1.139 + __ delayed()->nop(); 1.140 + 1.141 + __ lhu(AT, tmp1, 0); 1.142 + __ daddi(tmp1, tmp1, 2); 1.143 + __ sh(AT, tmp2, 0); 1.144 + __ daddi(tmp2, tmp2, 2); 1.145 + __ daddi(tmp3, tmp3, -2); 1.146 + __ bind(l_2); 1.147 + 1.148 + // At this point the positions of both, from and to, are at least 4 byte aligned. 1.149 + 1.150 + // Copy 4 elements at a time. 1.151 + // Align to 8 bytes, but only if both, from and to, have same alignment mod 8. 1.152 + __ xorr(AT, tmp1, tmp2); 1.153 + __ andi(AT, AT, 7); 1.154 + __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned 1.155 + __ delayed()->nop(); 1.156 + 1.157 + // Copy a 4 elements if necessary to align to 8 bytes. 
1.158 + __ andi(AT, tmp1, 7); 1.159 + __ beq(AT, R0, l_7); 1.160 + __ delayed()->nop(); 1.161 + 1.162 + __ lw(AT, tmp1, 0); 1.163 + __ daddi(tmp3, tmp3, -4); 1.164 + __ sw(AT, tmp2, 0); 1.165 + { // FasterArrayCopy 1.166 + __ daddi(tmp1, tmp1, 4); 1.167 + __ daddi(tmp2, tmp2, 4); 1.168 + } 1.169 + } 1.170 + 1.171 + __ bind(l_7); 1.172 + 1.173 + // Copy 4 elements at a time; either the loads or the stores can 1.174 + // be unaligned if aligned == false. 1.175 + 1.176 + { // FasterArrayCopy 1.177 + __ daddi(AT, tmp3, -7); 1.178 + __ blez(AT, l_6); // copy 4 at a time if less than 4 elements remain 1.179 + __ delayed()->nop(); 1.180 + 1.181 + __ bind(l_8); 1.182 + // For Loongson, there is 128-bit memory access. TODO 1.183 + __ ld(AT, tmp1, 0); 1.184 + __ sd(AT, tmp2, 0); 1.185 + __ daddi(tmp1, tmp1, 8); 1.186 + __ daddi(tmp2, tmp2, 8); 1.187 + __ daddi(tmp3, tmp3, -8); 1.188 + __ daddi(AT, tmp3, -8); 1.189 + __ bgez(AT, l_8); 1.190 + __ delayed()->nop(); 1.191 + } 1.192 + __ bind(l_6); 1.193 + 1.194 + // copy 4 bytes at a time 1.195 + { // FasterArrayCopy 1.196 + __ daddi(AT, tmp3, -3); 1.197 + __ blez(AT, l_1); 1.198 + __ delayed()->nop(); 1.199 + 1.200 + __ bind(l_3); 1.201 + __ lw(AT, tmp1, 0); 1.202 + __ sw(AT, tmp2, 0); 1.203 + __ daddi(tmp1, tmp1, 4); 1.204 + __ daddi(tmp2, tmp2, 4); 1.205 + __ daddi(tmp3, tmp3, -4); 1.206 + __ daddi(AT, tmp3, -4); 1.207 + __ bgez(AT, l_3); 1.208 + __ delayed()->nop(); 1.209 + 1.210 + } 1.211 + 1.212 + // do 2 bytes copy 1.213 + __ bind(l_1); 1.214 + { 1.215 + __ daddi(AT, tmp3, -1); 1.216 + __ blez(AT, l_9); 1.217 + __ delayed()->nop(); 1.218 + 1.219 + __ bind(l_5); 1.220 + __ lhu(AT, tmp1, 0); 1.221 + __ daddi(tmp3, tmp3, -2); 1.222 + __ sh(AT, tmp2, 0); 1.223 + __ daddi(tmp1, tmp1, 2); 1.224 + __ daddi(tmp2, tmp2, 2); 1.225 + __ daddi(AT, tmp3, -2); 1.226 + __ bgez(AT, l_5); 1.227 + __ delayed()->nop(); 1.228 + } 1.229 + 1.230 + //do 1 element copy--byte 1.231 + __ bind(l_9); 1.232 + __ beq(R0, tmp3, l_4); 1.233 + __ 
delayed()->nop(); 1.234 + 1.235 + { 1.236 + __ bind(l_11); 1.237 + __ lb(AT, tmp1, 0); 1.238 + __ daddi(tmp3, tmp3, -1); 1.239 + __ sb(AT, tmp2, 0); 1.240 + __ daddi(tmp1, tmp1, 1); 1.241 + __ daddi(tmp2, tmp2, 1); 1.242 + __ daddi(AT, tmp3, -1); 1.243 + __ bgez(AT, l_11); 1.244 + __ delayed()->nop(); 1.245 + } 1.246 + 1.247 + __ bind(l_4); 1.248 + __ pop(tmp3); 1.249 + __ pop(tmp2); 1.250 + __ pop(tmp1); 1.251 + 1.252 + __ jr(RA); 1.253 + __ delayed()->nop(); 1.254 + 1.255 + return start; 1.256 } 1.257 1.258 // Arguments: