  pop(tmp2);
  pop(tmp1);
}

//Helper functions for square_to_len()

/**
 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
 * Preserves x and z and modifies rest of the registers.
 */

void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle odd xlen case first, then for even xlen do the following
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //    huge_128 product = x[j:j+1] * x[j:j+1];
  //    z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //    z[i+2:i+3] = (jlong)(product >>> 1);
  //    carry = (jlong)product;
  // }

  xorq(tmp5, tmp5);     // carry
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1);     // index for x
  xorl(tmp4, tmp4);     // index for z

  Label L_first_loop, L_first_loop_exit;

  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); //jump if xlen is even

  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
  imulq(raxReg, raxReg);
  shrq(raxReg, 1);
  adcq(tmp5, 0);        // capture the bit shifted out into the carry accumulator
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);
  addl(tmp4, 2);

  // Square and right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);

  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4, 0));
  rorq(raxReg, 32);    // convert big-endian to little-endian
  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax

  // Right shift by 1 and save carry
  // NOTE: the shrq/rcrq chain rotates the shifted-out bit of each register
  // into the next via CF, so statement order here is load-bearing.
  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);
  rcrq(raxReg, 1);
  adcq(tmp5, 0);       // low bit of the product becomes next iteration's carry

  // Store result in z
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);

  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}


/**
 * Perform the following multiply add operation using BMI2 instructions
 * carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx (mulxq takes its implicit multiplicand from rdx)
  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
  addq(sum, carry);
  adcq(tmp2, 0);          // fold carry-out of the addition into the high half
  addq(sum, op1);
  adcq(tmp2, 0);
  movq(carry, tmp2);      // return the new carry in 'carry'
}

/**
 * Perform the following multiply add operation:
 * carry:sum = sum + op1*op2 + carry
 * Preserves op1, op2 and modifies rest of registers
 */
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
  // rdx:rax = op1 * op2
  movq(raxReg, op2);
  mulq(op1);

  // rdx:rax = sum + carry + rdx:rax
  addq(sum, carry);
  adcq(rdxReg, 0);        // fold carry-out of the addition into the high half
  addq(sum, raxReg);
  adcq(rdxReg, 0);

  // carry:sum = rdx:sum
  movq(carry, rdxReg);
}

/**
 * Add 64 bit long carry into z[] with carry propagation.
 * Preserves z and carry register values and modifies rest of registers.
 *
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);
  subl(zlen, 2);
  addq(Address(z, zlen, Address::times_4, 0), carry);

  bind(L_fourth_loop);
  // CF here comes from the addq immediately before the loop (first pass)
  // or the addq at the bottom of the loop (subsequent passes).
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit); // ran off the front of z[]
  addq(Address(z, zlen, Address::times_4, 0), tmp1); // propagate a single 1
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}

/**
 * Shift z[] left by 1 bit.
 * Preserves x, len, z and zlen registers and modifies rest of the registers.
 *
 */
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  Label L_fifth_loop, L_fifth_loop_exit;

  // Fifth loop
  // Perform primitiveLeftShift(z, zlen, 1)

  const Register prev_carry = tmp1;
  const Register new_carry = tmp4;
  const Register value = tmp2;
  const Register zidx = tmp3;

  // int zidx, carry;
  // long value;
  // carry = 0;
  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
  //    (carry:value) = (z[zidx] << 1) | carry ;
  //    z[zidx] = value;
  // }

  movl(zidx, zlen);
  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register

  bind(L_fifth_loop);
  decl(zidx);  // Use decl to preserve carry flag (rclq below consumes CF)
  decl(zidx);
  jccb(Assembler::negative, L_fifth_loop_exit);

  if (UseBMI2Instructions) {
    movq(value, Address(z, zidx, Address::times_4, 0));
    rclq(value, 1);               // rotate previous carry in, new carry out to CF
    rorxq(value, value, 32);
    movq(Address(z, zidx, Address::times_4, 0), value);  // Store back in big endian form
  }
  else {
    // clear new_carry
    xorl(new_carry, new_carry);

    // Shift z[i] by 1, or in previous carry and save new carry
    movq(value, Address(z, zidx, Address::times_4, 0));
    shlq(value, 1);
    adcl(new_carry, 0);

    orq(value, prev_carry);
    rorq(value, 0x20);
    movq(Address(z, zidx, Address::times_4, 0), value);  // Store back in big endian form

    // Set previous carry = new carry
    movl(prev_carry, new_carry);
  }
  jmp(L_fifth_loop);

  bind(L_fifth_loop_exit);
}


/**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  // NOTE(review): fifth_loop/fifth_loop_exit are declared but never bound here;
  // the fifth loop lives in lshift_by_1() below.
  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  // zlen +=2;
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //      k -= 2;
  //      long op1 = x[j:j+1];
  //      long sum = z[k:k+1];
  //      carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //      z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  // }

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  // Save loop bounds; the loop body consumes len/zlen and restores them
  // from the stack each iteration.
  push(zlen);
  push(len);
  addl(zlen,2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    op2 = rdxReg;   // mulxq requires its implicit operand in rdx
    movq(op2, Address(x, len, Address::times_4, 0));
    rorxq(op2, op2, 32);
  }
  else {
    movq(op2, Address(x, len, Address::times_4, 0));
    rorq(op2, 32);
  }

  bind(L_third_loop);
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);  // odd leftover: only x[0] remains

  movq(op1, Address(x, len, Address::times_4, 0));
  rorq(op1, 32);

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4, 0));

  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses offsetted zlen.
  add_one_64(z, zlen, carry, tmp1);

  pop(len);
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));
  jmp(L_multiply);

  bind(L_second_loop_exit);
  // Drop both the per-iteration and the initial saved copies of len/zlen.
  pop(len);
  pop(zlen);
  pop(len);
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4, -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}

/**
 * Helper function for mul_add()
 * Multiply the in[] by int k and add to out[] starting at offset offs using
 * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only quad int aligned length of in[] is operated on in this function.
 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
 * This function preserves out, in and k registers.
 * len and offset point to the appropriate index in "in" & "out" correspondingly
 * tmp5 has the carry.
 * other registers are temporary and are modified.
 *
 */
void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_first_loop, L_first_loop_exit;

  movl(tmp1, len);
  shrl(tmp1, 2);    // iterate len/4 times, 4 ints (two 64-bit lanes) per pass

  bind(L_first_loop);
  subl(tmp1, 1);
  jccb(Assembler::negative, L_first_loop_exit);

  subl(len, 4);
  subl(offset, 4);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    op2 = rdxReg;   // k was preloaded into rdx by the caller (mul_add)
  }

  // Upper 64-bit lane of this quad.
  movq(op1, Address(in, len, Address::times_4, 8));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4, 8));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4, 8), sum);

  // Lower 64-bit lane of this quad.
  movq(op1, Address(in, len, Address::times_4, 0));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4, 0));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4, 0), sum);

  jmp(L_first_loop);
  bind(L_first_loop_exit);
}

/**
 * Code for BigInteger::mulAdd() intrinsic
 *
 * rdi: out
 * rsi: in
 * r11: offs (out.length - offset)
 * rcx: len
 * r8:  k
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 * Multiply the in[] by word k and add to out[], return the carry in rax
 */
void MacroAssembler::mul_add(Register out, Register in, Register offs,
   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_carry, L_last_in, L_done;

// carry = 0;
// for (int j=len-1; j >= 0; j--) {
//    long product = (in[j] & LONG_MASK) * kLong +
//                   (out[offs] & LONG_MASK) + carry;
//    out[offs--] = (int)product;
//    carry = product >>> 32;
// }
//
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    op2 = rdxReg;   // mulxq requires its implicit operand in rdx
    movl(op2, k);
  }
  else {
    movl(op2, k);
  }

  xorq(carry, carry);

  //First loop

  //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
  //The carry is in tmp5
  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);

  //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
  decrementl(len);
  jccb(Assembler::negative, L_carry);
  decrementl(len);
  jccb(Assembler::negative, L_last_in);

  movq(op1, Address(in, len, Address::times_4, 0));
  rorq(op1, 32);

  subl(offs, 2);
  movq(sum, Address(out, offs, Address::times_4, 0));
  rorq(sum, 32);

  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offs, Address::times_4, 0), sum);

  testl(len, len);
  jccb(Assembler::zero, L_carry);

  //Multiply the last in[] entry, if any
  bind(L_last_in);
  movl(op1, Address(in, 0));
  movl(sum, Address(out, offs, Address::times_4, -4));

  // 32x32 -> 64 multiply-add done with explicit 32-bit ops.
  movl(raxReg, k);
  mull(op1); //tmp4 * eax -> edx:eax
  addl(sum, carry);
  adcl(rdxReg, 0);
  addl(sum, raxReg);
  adcl(rdxReg, 0);
  movl(carry, rdxReg);

  movl(Address(out, offs, Address::times_4, -4), sum);

  bind(L_carry);
  //return tmp5/carry as carry in rax
  movl(rax, carry);

  bind(L_done);
  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
#endif

/**