1.1 --- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp Sun May 08 22:17:54 2016 -0400 1.2 +++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp Tue May 10 15:08:51 2016 -0400 1.3 @@ -550,6 +550,14 @@ 1.4 __ bgtz(AT, no_overlap_target); 1.5 __ delayed()->nop(); 1.6 1.7 + // 2016/05/10 aoqi: If A0 = 0xf... and A1 = 0x0..., then goto no_overlap_target 1.8 + Label L; 1.9 + __ bgez(A0, L); 1.10 + __ delayed()->nop(); 1.11 + __ bgtz(A1, no_overlap_target); 1.12 + __ delayed()->nop(); 1.13 + __ bind(L); 1.14 + 1.15 } 1.16 1.17 // 1.18 @@ -638,6 +646,7 @@ 1.19 __ blez(AT, l_4); 1.20 __ delayed()->nop(); 1.21 if (!aligned) { 1.22 + //TODO: copy 8 bytes at one time 1.23 // 2016/5/8 Jin: only when src and dest has the same alignment can we do lw/sw */ 1.24 __ andi(AT, T3, 3); 1.25 __ andi(T9, T0, 3); 1.26 @@ -706,9 +715,9 @@ 1.27 // name - stub name string 1.28 // 1.29 // Inputs: 1.30 - // c_rarg0 - source array address 1.31 - // c_rarg1 - destination array address 1.32 - // c_rarg2 - element count, treated as ssize_t, can be zero 1.33 + // A0 - source array address 1.34 + // A1 - destination array address 1.35 + // A2 - element count, treated as ssize_t, can be zero 1.36 // 1.37 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1.38 // we let the hardware handle it. The one to eight bytes within words, 1.39 @@ -716,95 +725,128 @@ 1.40 // and stored atomically. 1.41 // 1.42 address generate_conjoint_byte_copy(bool aligned, const char *name) { 1.43 - Label l_1, l_2, l_3, l_4, l_5; 1.44 - Label l_unaligned, l_aligned; 1.45 - StubCodeMark mark(this, "StubRoutines", name); 1.46 - __ align(CodeEntryAlignment); 1.47 - address start = __ pc(); 1.48 - address nooverlap_target = aligned ? 
1.49 - StubRoutines::arrayof_jbyte_disjoint_arraycopy() : 1.50 - StubRoutines::jbyte_disjoint_arraycopy(); 1.51 + __ align(CodeEntryAlignment); 1.52 + StubCodeMark mark(this, "StubRoutines", name); 1.53 + address start = __ pc(); 1.54 1.55 - array_overlap_test(nooverlap_target, 0); 1.56 + Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit; 1.57 + Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned; 1.58 1.59 - __ push(T3); 1.60 - __ push(T0); 1.61 - __ push(T1); 1.62 - __ push(T8); 1.63 + address nooverlap_target = aligned ? 1.64 + StubRoutines::arrayof_jbyte_disjoint_arraycopy() : 1.65 + StubRoutines::jbyte_disjoint_arraycopy(); 1.66 1.67 - // copy from high to low 1.68 - __ move(T3, A0); 1.69 - __ move(T0, A1); 1.70 - __ move(T1, A2); 1.71 - __ dadd(T3, T3, T1); 1.72 - __ dadd(T0, T0, T1); 1.73 + array_overlap_test(nooverlap_target, 0); 1.74 1.75 - // 2016/5/8 Jin: copy starting unalinged bytes 1.76 - __ bind(l_unaligned); 1.77 - __ beq(T1, R0, l_5); 1.78 - __ delayed()->nop(); 1.79 + const Register from = A0; // source array address 1.80 + const Register to = A1; // destination array address 1.81 + const Register count = A2; // elements count 1.82 + const Register end_from = T3; // source array end address 1.83 + const Register end_to = T0; // destination array end address 1.84 + const Register end_count = T1; // remaining elements count 1.85 1.86 - __ andi(AT, T3, 3); 1.87 - __ beq(AT, R0, l_aligned); 1.88 - __ delayed()->nop(); 1.89 - __ lb(AT, T3, -1); 1.90 - __ sb(AT, T0, -1); 1.91 - __ daddi(AT, T1, -1); 1.92 - __ daddi(AT, T3, -1); 1.93 - __ daddi(AT, T0, -1); 1.94 - __ b(l_unaligned); 1.95 - __ delayed()->nop(); 1.96 - 1.97 - // now T0, T3 point to 4-byte aligned high-ends 1.98 - // T1 contains byte count that is not copied. 
1.99 - __ bind(l_aligned); 1.100 + __ push(end_from); 1.101 + __ push(end_to); 1.102 + __ push(end_count); 1.103 + __ push(T8); 1.104 1.105 - __ move(T8, T1); 1.106 - __ daddi(AT, T1, -3); 1.107 - __ blez(AT, l_3); 1.108 - __ delayed()->nop(); 1.109 + // copy from high to low 1.110 + __ move(end_count, count); 1.111 + __ dadd(end_from, from, end_count); 1.112 + __ dadd(end_to, to, end_count); 1.113 1.114 - __ andi(T8, T8, 3); 1.115 - __ lea(T3, Address(T3, -4)); 1.116 - __ lea(T0, Address(T0, -4)); 1.117 + // 2016/05/08 aoqi: If end_from and end_to have different alignment, unaligned copy is performed. 1.118 + __ andi(AT, end_from, 3); 1.119 + __ andi(T8, end_to, 3); 1.120 + __ bne(AT, T8, l_copy_byte); 1.121 + __ delayed()->nop(); 1.122 1.123 - __ dsrl(T1, T1, 2); 1.124 - __ align(16); 1.125 - __ bind(l_1); 1.126 - __ lw(AT, T3, 0); 1.127 - __ sw(AT, T0, 0); 1.128 - __ addi(T3, T3, -4); 1.129 - __ addi(T0, T0, -4); 1.130 - __ addi(T1, T1, -1); 1.131 - __ bne(T1, R0, l_1); 1.132 - __ delayed()->nop(); 1.133 - __ b(l_3); 1.134 - __ delayed()->nop(); 1.135 - // copy dwords aligned or not with repeat move 1.136 - __ bind(l_2); 1.137 - __ bind(l_3); 1.138 - // copy suffix (0-3 bytes) 1.139 - __ andi(T8, T8, 3); 1.140 - __ beq(T8, R0, l_5); 1.141 - __ delayed()->nop(); 1.142 - __ addi(T3, T3, 3); 1.143 - __ addi(T0, T0, 3); 1.144 - __ bind(l_4); 1.145 - __ lb(AT, T3, 0); 1.146 - __ sb(AT, T0, 0); 1.147 - __ addi(T3, T3, -1); 1.148 - __ addi(T0, T0, -1); 1.149 - __ addi(T8, T8, -1); 1.150 - __ bne(T8, R0, l_4); 1.151 - __ delayed()->nop(); 1.152 - __ bind(l_5); 1.153 - __ pop(T8); 1.154 - __ pop(T1); 1.155 - __ pop(T0); 1.156 - __ pop(T3); 1.157 - __ jr(RA); 1.158 - __ delayed()->nop(); 1.159 - return start; 1.160 + // First deal with the unaligned data at the top. 
1.161 + __ bind(l_unaligned); 1.162 + __ beq(end_count, R0, l_exit); 1.163 + __ delayed()->nop(); 1.164 + 1.165 + __ andi(AT, end_from, 3); 1.166 + __ bne(AT, R0, l_from_unaligned); 1.167 + __ delayed()->nop(); 1.168 + 1.169 + __ andi(AT, end_to, 3); 1.170 + __ beq(AT, R0, l_4_bytes_aligned); 1.171 + __ delayed()->nop(); 1.172 + 1.173 + __ bind(l_from_unaligned); 1.174 + __ lb(AT, end_from, -1); 1.175 + __ sb(AT, end_to, -1); 1.176 + __ daddi(end_from, end_from, -1); 1.177 + __ daddi(end_to, end_to, -1); 1.178 + __ daddi(end_count, end_count, -1); 1.179 + __ b(l_unaligned); 1.180 + __ delayed()->nop(); 1.181 + 1.182 + // now end_to, end_from point to 4-byte aligned high-ends 1.183 + // end_count contains byte count that is not copied. 1.184 + // copy 4 bytes at a time 1.185 + __ bind(l_4_bytes_aligned); 1.186 + 1.187 + __ move(T8, end_count); 1.188 + __ daddi(AT, end_count, -3); 1.189 + __ blez(AT, l_copy_suffix); 1.190 + __ delayed()->nop(); 1.191 + 1.192 + //__ andi(T8, T8, 3); 1.193 + __ lea(end_from, Address(end_from, -4)); 1.194 + __ lea(end_to, Address(end_to, -4)); 1.195 + 1.196 + __ dsrl(end_count, end_count, 2); 1.197 + __ align(16); 1.198 + __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes 1.199 + __ lw(AT, end_from, 0); 1.200 + __ sw(AT, end_to, 0); 1.201 + __ addi(end_from, end_from, -4); 1.202 + __ addi(end_to, end_to, -4); 1.203 + __ addi(end_count, end_count, -1); 1.204 + __ bne(end_count, R0, l_copy_4_bytes_loop); 1.205 + __ delayed()->nop(); 1.206 + 1.207 + __ b(l_copy_suffix); 1.208 + __ delayed()->nop(); 1.209 + // copy dwords aligned or not with repeat move 1.210 + // l_copy_suffix 1.211 + // copy suffix (0-3 bytes) 1.212 + __ bind(l_copy_suffix); 1.213 + __ andi(T8, T8, 3); 1.214 + __ beq(T8, R0, l_exit); 1.215 + __ delayed()->nop(); 1.216 + __ addi(end_from, end_from, 3); 1.217 + __ addi(end_to, end_to, 3); 1.218 + __ bind(l_copy_suffix_loop); 1.219 + __ lb(AT, end_from, 0); 1.220 + __ sb(AT, end_to, 0); 1.221 + __ addi(end_from, end_from, -1); 
1.222 + __ addi(end_to, end_to, -1); 1.223 + __ addi(T8, T8, -1); 1.224 + __ bne(T8, R0, l_copy_suffix_loop); 1.225 + __ delayed()->nop(); 1.226 + 1.227 + __ bind(l_copy_byte); 1.228 + __ beq(end_count, R0, l_exit); 1.229 + __ delayed()->nop(); 1.230 + __ lb(AT, end_from, -1); 1.231 + __ sb(AT, end_to, -1); 1.232 + __ daddi(end_from, end_from, -1); 1.233 + __ daddi(end_to, end_to, -1); 1.234 + __ daddi(end_count, end_count, -1); 1.235 + __ b(l_copy_byte); 1.236 + __ delayed()->nop(); 1.237 + 1.238 + __ bind(l_exit); 1.239 + __ pop(T8); 1.240 + __ pop(end_count); 1.241 + __ pop(end_to); 1.242 + __ pop(end_from); 1.243 + __ jr(RA); 1.244 + __ delayed()->nop(); 1.245 + return start; 1.246 } 1.247 1.248 // Arguments: