// Patch hunk for src/cpu/x86/vm/macroAssembler_x86.cpp
//   --- a/src/cpu/x86/vm/macroAssembler_x86.cpp  Tue Sep 09 19:18:13 2014 +0000
//   +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp  Tue Sep 02 12:48:45 2014 -0700
//   @@ -7293,6 +7293,467 @@
// The two statements directly below are unchanged context: the tail of the
// function that precedes the added code. Everything from "#ifdef _LP64" to
// "#endif" is added by the hunk.
  bind(L_done);
}

#ifdef _LP64
/**
 * Helper for multiply_to_len().
 *
 * Emits dest_hi:dest_lo += src1 + src2, i.e. both sources are added into
 * dest_lo and the carry out of each 64-bit addition is folded into dest_hi.
 */
void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
  addq(dest_lo, src1);
  adcq(dest_hi, 0);    // carry out of the first addition
  addq(dest_lo, src2);
  adcq(dest_hi, 0);    // carry out of the second addition
}

/**
 * Multiply 64 bit by 64 bit first loop.
 *
 * Emits the first loop of multiply_to_len(): one chunk of x is multiplied by
 * all of y, consuming y two 32-bit words at a time from the high index down,
 * and every 64-bit result is stored into z as two 32-bit words.
 *
 *   x, y, z   base registers of the int arrays (indexed with times_4)
 *   xstart    index into x; decremented by one here
 *   x_xstart  receives the multiplier loaded from x
 *   y_idx     receives the current chunk of y
 *   carry     running carry: added to every product, then replaced by rdx
 *   product   low half of the multiply; the inline comments treat it as rax,
 *             the implicit operand of mulq
 *   idx       number of y words still to process (multiply_to_len passes ylen)
 *   kdx       store index into z; lowered by 2 for every multiply
 *
 * rdx is overwritten by mulq. On exit 'carry' holds the carry out of the last
 * multiply; it is not stored to z here.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // Step back one more word so that a 64-bit load covers two words of x.
  // If that goes negative only x[0] is left (handled at L_one_x).
  decrementl(xstart);
  jcc(Assembler::negative, L_one_x);

  movq(x_xstart, Address(x, xstart, Address::times_4,  0));
  rorq(x_xstart, 32); // convert big-endian to little-endian (swap the two 32-bit halves)

  bind(L_first_loop);
  decrementl(idx);
  jcc(Assembler::negative, L_first_loop_exit); // no y words left
  decrementl(idx);
  jcc(Assembler::negative, L_one_y);           // exactly one y word left
  movq(y_idx, Address(y, idx, Address::times_4,  0));
  rorq(y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);
  movq(product, x_xstart);
  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
  addq(product, carry);
  adcq(rdx, 0);
  subl(kdx, 2);
  // Store the low 64 bits as two ints: low word at the higher index.
  movl(Address(z, kdx, Address::times_4,  4), product);
  shrq(product, 32);
  movl(Address(z, kdx, Address::times_4,  0), product);
  movq(carry, rdx); // high 64 bits become the next carry
  jmp(L_first_loop);

  // Out-of-line: a single 32-bit word of y remains, load just y[0].
  bind(L_one_y);
  movl(y_idx, Address(y,  0));
  jmp(L_multiply);

  // Out-of-line: a single 32-bit word of x remains, load just x[0].
  bind(L_one_x);
  movl(x_xstart, Address(x,  0));
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 64 bit by 64 bit and add 128 bit.
 *
 * Emits one step of the inner loop. The 64-bit chunk of y at index 'idx'
 * (plus byte displacement 'offset') is multiplied by x_xstart; the 64-bit
 * chunk of z at the same index/displacement and 'carry' are added to the
 * product; the low 64 bits of the sum are written back to z as two 32-bit
 * words. The high 64 bits are left in rdx for the caller.
 *
 * y and z are addressed with the same index register: multiply_to_len()
 * advances its z register with lea before the inner loop runs.
 *
 * yz_idx is scratch. 'product' receives the low half; the inline comment
 * treats it as rax, the implicit operand of mulq.
 */
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
                                            Register yz_idx, Register idx,
                                            Register carry, Register product, int offset) {
  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //     z[kdx] = (jlong)product;

  movq(yz_idx, Address(y, idx, Address::times_4,  offset));
  rorq(yz_idx, 32); // convert big-endian to little-endian
  movq(product, x_xstart);
  mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
  movq(yz_idx, Address(z, idx, Address::times_4,  offset)); // yz_idx is reused for the z chunk
  rorq(yz_idx, 32); // convert big-endian to little-endian

  add2_with_carry(rdx, product, carry, yz_idx);

  // Store the low 64 bits as two ints: low word at the higher address.
  movl(Address(z, idx, Address::times_4,  offset+4), product);
  shrq(product, 32);
  movl(Address(z, idx, Address::times_4,  offset), product);

}

/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 *
 * Emits the third (inner) loop for the path that does not use BMI2.
 * The main loop runs idx/4 times and processes four 32-bit words of y and z
 * per iteration through two multiply_add_128_x_128() steps. The tail then
 * processes a remaining pair of words (if idx & 3 is 2 or 3) and finally a
 * remaining single word (if idx is odd).
 *
 * On exit 'carry' holds the carry out of the last step executed.
 * idx is consumed; jdx, yz_idx, carry2, product and rdx are used as scratch.
 */
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
                                             Register yz_idx, Register idx, Register jdx,
                                             Register carry, Register product,
                                             Register carry2) {
  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //     z[kdx+idx+1] = (jlong)product;
  //     jlong carry2  = (jlong)(product >>> 64);
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //     z[kdx+idx] = (jlong)product;
  //     carry  = (jlong)(product >>> 64);
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)product;
  //     carry  = (jlong)(product >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: number of unrolled iterations (four 32-bit words each).
  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  // Upper pair of words first (byte displacement 8), its high half feeds the lower pair.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
  movq(carry2, rdx);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
  movq(carry, rdx);
  jmp(L_third_loop);

  bind (L_third_loop_exit);

  // Tail: 0..3 words are left.
  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  Label L_check_1;
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1); // only one word left

  // At least two words left: process one 64-bit pair.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
  movq(carry, rdx);

  bind (L_check_1);
  // Undo the subtraction above and test the low bit: an odd count leaves one word at index 0.
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);

  // Single 32-bit word: 32-bit loads of y and z, 64-bit multiplier.
  movl(yz_idx, Address(y, idx, Address::times_4,  0));
  movq(product, x_xstart);
  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
  movl(yz_idx, Address(z, idx, Address::times_4,  0));

  add2_with_carry(rdx, product, yz_idx, carry);

  // Only the low 32 bits go to z; everything above them becomes the carry.
  movl(Address(z, idx, Address::times_4,  0), product);
  shrq(product, 32);

  shlq(rdx, 32);
  orq(product, rdx);
  movq(carry, product);

  bind(L_post_third_loop_done);
}

/**
 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
 *
 * Same structure as multiply_128_x_128_loop(), but the multiplier is taken
 * from rdx (the implicit source of mulx, see the inline comments) and both
 * 64x64 multiplies of an iteration are issued before the additions.
 * When VM_Version::supports_adx() is true the additions use adcx/adox,
 * otherwise add2_with_carry().
 *
 * On exit 'carry' holds the carry out of the last step executed.
 * idx is consumed; jdx, carry2, yz_idx1, yz_idx2, tmp, tmp3 and tmp4 are
 * used as scratch.
 */
void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
                                                  Register carry, Register carry2,
                                                  Register idx, Register jdx,
                                                  Register yz_idx1, Register yz_idx2,
                                                  Register tmp, Register tmp3, Register tmp4) {
  assert(UseBMI2Instructions, "should be used only when BMI2 is available");

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: number of unrolled iterations (four 32-bit words each).
  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  // Load both 64-bit chunks of y for this iteration.
  movq(yz_idx1, Address(y, idx, Address::times_4,  8));
  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  movq(yz_idx2, Address(y, idx, Address::times_4,  0));
  rorxq(yz_idx2, yz_idx2, 32);

  mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
  mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp

  // The y chunks are consumed; reuse the registers for the matching z chunks.
  movq(yz_idx1, Address(z, idx, Address::times_4,  8));
  rorxq(yz_idx1, yz_idx1, 32);
  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
  rorxq(yz_idx2, yz_idx2, 32);

  if (VM_Version::supports_adx()) {
    // Two interleaved carry chains: adcx propagates through CF, adox through OF.
    // NOTE(review): this appears to rely on CF and OF being clear on entry; the
    // last arithmetic instruction emitted before this point is subl(idx, 4) - confirm.
    adcxq(tmp3, carry);
    adoxq(tmp3, yz_idx1);

    adcxq(tmp4, tmp);
    adoxq(tmp4, yz_idx2);

    movl(carry, 0); // does not affect flags
    // Fold the final CF and OF into the high half.
    adcxq(carry2, carry);
    adoxq(carry2, carry);
  } else {
    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
  }
  movq(carry, carry2);

  // Write back both 64-bit results as four ints (low word at the higher address).
  movl(Address(z, idx, Address::times_4, 12), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4,  8), tmp3);

  movl(Address(z, idx, Address::times_4,  4), tmp4);
  shrq(tmp4, 32);
  movl(Address(z, idx, Address::times_4,  0), tmp4);

  jmp(L_third_loop);

  bind (L_third_loop_exit);

  // Tail: 0..3 words are left.
  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  Label L_check_1;
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1); // only one word left

  // At least two words left: process one 64-bit pair.
  movq(yz_idx1, Address(y, idx, Address::times_4,  0));
  rorxq(yz_idx1, yz_idx1, 32);
  mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
  rorxq(yz_idx2, yz_idx2, 32);

  add2_with_carry(tmp4, tmp3, carry, yz_idx2);

  movl(Address(z, idx, Address::times_4,  4), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4,  0), tmp3);
  movq(carry, tmp4);

  bind (L_check_1);
  // Undo the subtraction above and test the low bit: an odd count leaves one word at index 0.
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);
  // Single 32-bit word: tmp4 is reused first for the y word, then for the z word.
  movl(tmp4, Address(y, idx, Address::times_4,  0));
  mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
  movl(tmp4, Address(z, idx, Address::times_4,  0));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  // Only the low 32 bits go to z; everything above them becomes the carry.
  movl(Address(z, idx, Address::times_4,  0), tmp3);
  shrq(tmp3, 32);

  shlq(carry2, 32);
  orq(tmp3, carry2);
  movq(carry, tmp3);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * rdi: x
 * rax: xlen
 * rsi: y
 * rcx: ylen
 * r8:  z
 * r11: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 * The emitted code saves tmp1..tmp5, xlen and zlen on the stack on entry and
 * restores them at L_done, because they double as working registers
 * (see the aliases below). rdx must differ from every argument register
 * (asserted) and is used by the multiply helpers without being saved here.
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);

  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  push(xlen);
  push(zlen);

  // Working aliases for the argument/temporary registers.
  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  // zlen and xlen are read here, before their registers are reused as
  // x_xstart and product.
  movl(idx, ylen);      // idx = ylen;
  movl(kdx, zlen);      // kdx = xlen+ylen;
  xorq(carry, carry);   // carry = 0;

  Label L_done;

  movl(xstart, xlen);
  decrementl(xstart);
  jcc(Assembler::negative, L_done); // xlen == 0: nothing to multiply

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the carry left over from the first loop into the z slots below kdx:
  // nothing if kdx == 0, one int if kdx == 1, otherwise two ints (low word first).
  Label L_second_loop;
  testl(kdx, kdx);
  jcc(Assembler::zero, L_second_loop);

  Label L_carry;
  subl(kdx, 1);
  jcc(Assembler::zero, L_carry);

  movl(Address(z, kdx, Address::times_4,  0), carry);
  shrq(carry, 32);
  subl(kdx, 1);

  bind(L_carry);
  movl(Address(z, kdx, Address::times_4,  0), carry);

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  const Register jdx = tmp1;

  bind(L_second_loop);
  xorl(carry, carry);    // carry = 0;
  movl(jdx, ylen);       // j = ystart+1

  subl(xstart, 1);       // i = xstart-1;
  jcc(Assembler::negative, L_done);

  push (z);              // z is advanced below; the original base is popped after the inner loop

  Label L_last_x;
  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
  subl(xstart, 1);       // i = xstart-1;
  jcc(Assembler::negative, L_last_x); // only x[0] is left: 32-bit load at L_last_x

  // Load the next 64-bit chunk of x: into rdx for mulx, otherwise into x_xstart.
  if (UseBMI2Instructions) {
    movq(rdx,  Address(x, xstart, Address::times_4,  0));
    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
  } else {
    movq(x_xstart, Address(x, xstart, Address::times_4,  0));
    rorq(x_xstart, 32);  // convert big-endian to little-endian
  }

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // x, tmp3 (xstart) and ylen are handed to the inner loop as scratch
  // registers, so save them around the call.
  push (x);
  push (xstart);
  push (ylen);


  if (UseBMI2Instructions) {
    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
  } else { // !UseBMI2Instructions
    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
  }

  pop(ylen);
  pop(xlen);             // receives the value pushed from xstart
  pop(x);
  pop(z);                // original z base pushed before the lea

  // Store the inner loop's carry: low 32 bits at z[xstart+1] and, if xstart
  // is not negative, high 32 bits at z[xstart].
  movl(tmp3, xlen);      // tmp3 is xstart again
  addl(tmp3, 1);
  movl(Address(z, tmp3, Address::times_4,  0), carry);
  subl(tmp3, 1);
  jccb(Assembler::negative, L_done);

  shrq(carry, 32);
  movl(Address(z, tmp3, Address::times_4,  0), carry);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  if (UseBMI2Instructions) {
    movl(rdx, Address(x,  0));
  } else {
    movl(x_xstart, Address(x,  0));
  }
  jmp(L_third_loop_prologue);

  bind(L_done);

  // Restore in reverse order of the pushes at the top.
  pop(zlen);
  pop(xlen);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
#endif

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *