  // Generate stub for disjoint short (jshort, 2-byte element) copy.
  // If "aligned" is true, the "from" and "to" addresses are assumed to be
  // heapword (8-byte) aligned.
  //
  // Arguments for generated stub:
  //   from:      A0
  //   to:        A1
  //   elm.count: A2, treated as signed
  //   one element: 2 bytes
  //
  // Strategy for aligned==true:
  //
  //   If length <= 9:
  //     1. copy 1 element at a time (l_5)
  //
  //   If length > 9:
  //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  //     3. copy last element if one was left in step 2. (l_1)
  //
  // Strategy for aligned==false:
  //
  //   If length <= 9: same as aligned==true case
  //
  //   If length > 9:
  //     1. continue with step 7. if the alignment of from and to mod 4
  //        is different.
  //     2. align from and to to 4 bytes by copying 1 element if necessary
  //     3. at l_2 from and to are 4 byte aligned; continue with
  //        6. if they cannot be aligned to 8 bytes because they have
  //        got different alignment mod 8.
  //     4. at this point we know that both, from and to, have the same
  //        alignment mod 8, now copy one element if necessary to get
  //        8 byte alignment of from and to.
  //     5. copy 4 elements at a time until less than 4 elements are
  //        left; depending on step 3. all load/stores are aligned.
  //     6. copy 2 elements at a time until less than 2 elements are
  //        left. (l_6)
  //     7. copy 1 element at a time. (l_5)
  //     8. copy last element if one was left in step 6. (l_1)
  //
  // TODO:
  //
  //  1. use loongson 128-bit load/store
  //  2. use loop unrolling optimization when len is big enough, for example if len > 0x2000:
  //    __ bind(l_x);
  //    __ ld(AT, tmp1, 0);
  //    __ ld(tmp, tmp1, 8);
  //    __ sd(AT, tmp2, 0);
  //    __ sd(tmp, tmp2, 8);
  //    __ ld(AT, tmp1, 16);
  //    __ ld(tmp, tmp1, 24);
  //    __ sd(AT, tmp2, 16);
  //    __ sd(tmp, tmp2, 24);
  //    __ daddi(tmp1, tmp1, 32);
  //    __ daddi(tmp2, tmp2, 32);
  //    __ daddi(tmp3, tmp3, -16);
  //    __ daddi(AT, tmp3, -16);
  //    __ bgez(AT, l_x);
  //    __ delayed()->nop();
  //
  address generate_disjoint_short_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    // Working registers (saved/restored around the stub body):
    //   tmp1 = running source pointer, tmp2 = running destination pointer,
    //   tmp3 = remaining element count. AT is the scratch register.
    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);
    __ move(tmp2, A1);
    __ move(tmp3, A2);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
    Label l_debug;
    // don't try anything fancy if arrays don't have many elements
    // (count <= 9 -> fall through to the 1-element-at-a-time loop at l_1)
    __ daddi(AT, tmp3, -9);
    __ blez(AT, l_1);
    __ delayed()->nop();      // MIPS branch delay slot

    if (!aligned) {
      // Compare the low bits of the ORIGINAL from/to (A0/A1); tmp1/tmp2
      // still hold the same values here, nothing has advanced them yet.
      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
      __ delayed()->nop();

      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
      __ delayed()->nop();

      // At this point it is guaranteed that both, from and to have the same alignment mod 4.

      // Copy 1 element if necessary to align to 4 bytes.
      __ andi(AT, A0, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_2);

      // At this point the positions of both, from and to, are at least 4 byte aligned.

      // Copy 4 elements at a time.
      // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned
      __ delayed()->nop();

      // Copy a 2-element word if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sw(AT, tmp2, 0);
      { // FasterArrayCopy
        __ daddi(tmp1, tmp1, 4);
        __ daddi(tmp2, tmp2, 4);
      }
    }

    __ bind(l_7);

    // Copy 4 elements (one 8-byte dword) at a time; either the loads or the
    // stores can be unaligned if aligned == false.

    { // FasterArrayCopy
      __ daddi(AT, tmp3, -15);
      __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain
      __ delayed()->nop();

      __ bind(l_8);
      // For Loongson, there is 128-bit memory access. TODO
      __ ld(AT, tmp1, 0);
      __ sd(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);   // loop while at least 4 elements remain
      __ bgez(AT, l_8);
      __ delayed()->nop();
    }
    __ bind(l_6);

    // copy 2 elements (one 4-byte word) at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -1);
      __ blez(AT, l_1);        // fewer than 2 elements left -> tail copy
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
      __ daddi(tmp3, tmp3, -2);
      __ daddi(AT, tmp3, -2);   // loop while at least 2 elements remain
      __ bgez(AT, l_3);
      __ delayed()->nop();

    }

    // do single element (16-bit jshort) copy; also the fallback loop for
    // small counts (<= 9) and for mismatched mod-4 alignment
    __ bind(l_1);
    __ beq(R0, tmp3, l_4);     // nothing left -> epilogue
    __ delayed()->nop();

    { // FasterArrayCopy

      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -1);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_5);
      __ delayed()->nop();
    }
    __ bind(l_4);
    // Epilogue: restore saved registers (reverse push order) and return.
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    // Unreachable unless from/to differ in alignment mod 2 (see check above).
    __ bind(l_debug);
    __ stop("generate_disjoint_short_copy should not reach here");
    return start;
  }