Wed, 17 Feb 2016 13:40:12 +0300
8081778: Use Intel x64 CPU instructions for RSA acceleration
Summary: Add intrinsics for BigInteger squareToLen and mulAdd methods.
Reviewed-by: kvn, jrose
1.1 --- a/src/cpu/x86/vm/assembler_x86.cpp Tue Feb 16 13:56:12 2016 +0000 1.2 +++ b/src/cpu/x86/vm/assembler_x86.cpp Wed Feb 17 13:40:12 2016 +0300 1.3 @@ -2318,6 +2318,13 @@ 1.4 emit_arith(0x0B, 0xC0, dst, src); 1.5 } 1.6 1.7 +void Assembler::orl(Address dst, Register src) { 1.8 + InstructionMark im(this); 1.9 + prefix(dst, src); 1.10 + emit_int8(0x09); 1.11 + emit_operand(src, dst); 1.12 +} 1.13 + 1.14 void Assembler::packuswb(XMMRegister dst, Address src) { 1.15 NOT_LP64(assert(VM_Version::supports_sse2(), "")); 1.16 assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); 1.17 @@ -5613,6 +5620,19 @@ 1.18 } 1.19 } 1.20 1.21 +void Assembler::rcrq(Register dst, int imm8) { 1.22 + assert(isShiftCount(imm8 >> 1), "illegal shift count"); 1.23 + int encode = prefixq_and_encode(dst->encoding()); 1.24 + if (imm8 == 1) { 1.25 + emit_int8((unsigned char)0xD1); 1.26 + emit_int8((unsigned char)(0xD8 | encode)); 1.27 + } else { 1.28 + emit_int8((unsigned char)0xC1); 1.29 + emit_int8((unsigned char)(0xD8 | encode)); 1.30 + emit_int8(imm8); 1.31 + } 1.32 +} 1.33 + 1.34 void Assembler::rorq(Register dst, int imm8) { 1.35 assert(isShiftCount(imm8 >> 1), "illegal shift count"); 1.36 int encode = prefixq_and_encode(dst->encoding());
2.1 --- a/src/cpu/x86/vm/assembler_x86.hpp Tue Feb 16 13:56:12 2016 +0000 2.2 +++ b/src/cpu/x86/vm/assembler_x86.hpp Wed Feb 17 13:40:12 2016 +0300 2.3 @@ -1455,6 +1455,7 @@ 2.4 void orl(Register dst, int32_t imm32); 2.5 void orl(Register dst, Address src); 2.6 void orl(Register dst, Register src); 2.7 + void orl(Address dst, Register src); 2.8 2.9 void orq(Address dst, int32_t imm32); 2.10 void orq(Register dst, int32_t imm32); 2.11 @@ -1555,6 +1556,8 @@ 2.12 2.13 void rclq(Register dst, int imm8); 2.14 2.15 + void rcrq(Register dst, int imm8); 2.16 + 2.17 void rdtsc(); 2.18 2.19 void ret(int imm16);
3.1 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Feb 16 13:56:12 2016 +0000 3.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp Wed Feb 17 13:40:12 2016 +0300 3.3 @@ -7769,6 +7769,503 @@ 3.4 pop(tmp2); 3.5 pop(tmp1); 3.6 } 3.7 + 3.8 +//Helper functions for square_to_len() 3.9 + 3.10 +/** 3.11 + * Store the squares of x[], right shifted one bit (divided by 2) into z[] 3.12 + * Preserves x and z and modifies rest of the registers. 3.13 + */ 3.14 + 3.15 +void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 3.16 + // Perform square and right shift by 1 3.17 + // Handle odd xlen case first, then for even xlen do the following 3.18 + // jlong carry = 0; 3.19 + // for (int j=0, i=0; j < xlen; j+=2, i+=4) { 3.20 + // huge_128 product = x[j:j+1] * x[j:j+1]; 3.21 + // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); 3.22 + // z[i+2:i+3] = (jlong)(product >>> 1); 3.23 + // carry = (jlong)product; 3.24 + // } 3.25 + 3.26 + xorq(tmp5, tmp5); // carry 3.27 + xorq(rdxReg, rdxReg); 3.28 + xorl(tmp1, tmp1); // index for x 3.29 + xorl(tmp4, tmp4); // index for z 3.30 + 3.31 + Label L_first_loop, L_first_loop_exit; 3.32 + 3.33 + testl(xlen, 1); 3.34 + jccb(Assembler::zero, L_first_loop); //jump if xlen is even 3.35 + 3.36 + // Square and right shift by 1 the odd element using 32 bit multiply 3.37 + movl(raxReg, Address(x, tmp1, Address::times_4, 0)); 3.38 + imulq(raxReg, raxReg); 3.39 + shrq(raxReg, 1); 3.40 + adcq(tmp5, 0); 3.41 + movq(Address(z, tmp4, Address::times_4, 0), raxReg); 3.42 + incrementl(tmp1); 3.43 + addl(tmp4, 2); 3.44 + 3.45 + // Square and right shift by 1 the rest using 64 bit multiply 3.46 + bind(L_first_loop); 3.47 + cmpptr(tmp1, xlen); 3.48 + jccb(Assembler::equal, L_first_loop_exit); 3.49 + 3.50 + // Square 3.51 + movq(raxReg, Address(x, tmp1, Address::times_4, 0)); 3.52 + rorq(raxReg, 32); // convert big-endian to little-endian 3.53 + 
mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax 3.54 + 3.55 + // Right shift by 1 and save carry 3.56 + shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 3.57 + rcrq(rdxReg, 1); 3.58 + rcrq(raxReg, 1); 3.59 + adcq(tmp5, 0); 3.60 + 3.61 + // Store result in z 3.62 + movq(Address(z, tmp4, Address::times_4, 0), rdxReg); 3.63 + movq(Address(z, tmp4, Address::times_4, 8), raxReg); 3.64 + 3.65 + // Update indices for x and z 3.66 + addl(tmp1, 2); 3.67 + addl(tmp4, 4); 3.68 + jmp(L_first_loop); 3.69 + 3.70 + bind(L_first_loop_exit); 3.71 +} 3.72 + 3.73 + 3.74 +/** 3.75 + * Perform the following multiply add operation using BMI2 instructions 3.76 + * carry:sum = sum + op1*op2 + carry 3.77 + * op2 should be in rdx 3.78 + * op2 is preserved, all other registers are modified 3.79 + */ 3.80 +void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) { 3.81 + // assert op2 is rdx 3.82 + mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1 3.83 + addq(sum, carry); 3.84 + adcq(tmp2, 0); 3.85 + addq(sum, op1); 3.86 + adcq(tmp2, 0); 3.87 + movq(carry, tmp2); 3.88 +} 3.89 + 3.90 +/** 3.91 + * Perform the following multiply add operation: 3.92 + * carry:sum = sum + op1*op2 + carry 3.93 + * Preserves op1, op2 and modifies rest of registers 3.94 + */ 3.95 +void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) { 3.96 + // rdx:rax = op1 * op2 3.97 + movq(raxReg, op2); 3.98 + mulq(op1); 3.99 + 3.100 + // rdx:rax = sum + carry + rdx:rax 3.101 + addq(sum, carry); 3.102 + adcq(rdxReg, 0); 3.103 + addq(sum, raxReg); 3.104 + adcq(rdxReg, 0); 3.105 + 3.106 + // carry:sum = rdx:sum 3.107 + movq(carry, rdxReg); 3.108 +} 3.109 + 3.110 +/** 3.111 + * Add 64 bit long carry into z[] with carry propagation. 3.112 + * Preserves z and carry register values and modifies rest of registers. 
3.113 + * 3.114 + */ 3.115 +void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) { 3.116 + Label L_fourth_loop, L_fourth_loop_exit; 3.117 + 3.118 + movl(tmp1, 1); 3.119 + subl(zlen, 2); 3.120 + addq(Address(z, zlen, Address::times_4, 0), carry); 3.121 + 3.122 + bind(L_fourth_loop); 3.123 + jccb(Assembler::carryClear, L_fourth_loop_exit); 3.124 + subl(zlen, 2); 3.125 + jccb(Assembler::negative, L_fourth_loop_exit); 3.126 + addq(Address(z, zlen, Address::times_4, 0), tmp1); 3.127 + jmp(L_fourth_loop); 3.128 + bind(L_fourth_loop_exit); 3.129 +} 3.130 + 3.131 +/** 3.132 + * Shift z[] left by 1 bit. 3.133 + * Preserves x, len, z and zlen registers and modifies rest of the registers. 3.134 + * 3.135 + */ 3.136 +void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { 3.137 + 3.138 + Label L_fifth_loop, L_fifth_loop_exit; 3.139 + 3.140 + // Fifth loop 3.141 + // Perform primitiveLeftShift(z, zlen, 1) 3.142 + 3.143 + const Register prev_carry = tmp1; 3.144 + const Register new_carry = tmp4; 3.145 + const Register value = tmp2; 3.146 + const Register zidx = tmp3; 3.147 + 3.148 + // int zidx, carry; 3.149 + // long value; 3.150 + // carry = 0; 3.151 + // for (zidx = zlen-2; zidx >=0; zidx -= 2) { 3.152 + // (carry:value) = (z[i] << 1) | carry ; 3.153 + // z[i] = value; 3.154 + // } 3.155 + 3.156 + movl(zidx, zlen); 3.157 + xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register 3.158 + 3.159 + bind(L_fifth_loop); 3.160 + decl(zidx); // Use decl to preserve carry flag 3.161 + decl(zidx); 3.162 + jccb(Assembler::negative, L_fifth_loop_exit); 3.163 + 3.164 + if (UseBMI2Instructions) { 3.165 + movq(value, Address(z, zidx, Address::times_4, 0)); 3.166 + rclq(value, 1); 3.167 + rorxq(value, value, 32); 3.168 + movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 3.169 + } 3.170 + else { 3.171 + // clear 
new_carry 3.172 + xorl(new_carry, new_carry); 3.173 + 3.174 + // Shift z[i] by 1, or in previous carry and save new carry 3.175 + movq(value, Address(z, zidx, Address::times_4, 0)); 3.176 + shlq(value, 1); 3.177 + adcl(new_carry, 0); 3.178 + 3.179 + orq(value, prev_carry); 3.180 + rorq(value, 0x20); 3.181 + movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 3.182 + 3.183 + // Set previous carry = new carry 3.184 + movl(prev_carry, new_carry); 3.185 + } 3.186 + jmp(L_fifth_loop); 3.187 + 3.188 + bind(L_fifth_loop_exit); 3.189 +} 3.190 + 3.191 + 3.192 +/** 3.193 + * Code for BigInteger::squareToLen() intrinsic 3.194 + * 3.195 + * rdi: x 3.196 + * rsi: len 3.197 + * r8: z 3.198 + * rcx: zlen 3.199 + * r12: tmp1 3.200 + * r13: tmp2 3.201 + * r14: tmp3 3.202 + * r15: tmp4 3.203 + * rbx: tmp5 3.204 + * 3.205 + */ 3.206 +void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 3.207 + 3.208 + Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply; 3.209 + push(tmp1); 3.210 + push(tmp2); 3.211 + push(tmp3); 3.212 + push(tmp4); 3.213 + push(tmp5); 3.214 + 3.215 + // First loop 3.216 + // Store the squares, right shifted one bit (i.e., divided by 2). 3.217 + square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg); 3.218 + 3.219 + // Add in off-diagonal sums. 3.220 + // 3.221 + // Second, third (nested) and fourth loops. 
3.222 + // zlen +=2; 3.223 + // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) { 3.224 + // carry = 0; 3.225 + // long op2 = x[xidx:xidx+1]; 3.226 + // for (int j=xidx-2,k=zidx; j >= 0; j-=2) { 3.227 + // k -= 2; 3.228 + // long op1 = x[j:j+1]; 3.229 + // long sum = z[k:k+1]; 3.230 + // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs); 3.231 + // z[k:k+1] = sum; 3.232 + // } 3.233 + // add_one_64(z, k, carry, tmp_regs); 3.234 + // } 3.235 + 3.236 + const Register carry = tmp5; 3.237 + const Register sum = tmp3; 3.238 + const Register op1 = tmp4; 3.239 + Register op2 = tmp2; 3.240 + 3.241 + push(zlen); 3.242 + push(len); 3.243 + addl(zlen,2); 3.244 + bind(L_second_loop); 3.245 + xorq(carry, carry); 3.246 + subl(zlen, 4); 3.247 + subl(len, 2); 3.248 + push(zlen); 3.249 + push(len); 3.250 + cmpl(len, 0); 3.251 + jccb(Assembler::lessEqual, L_second_loop_exit); 3.252 + 3.253 + // Multiply an array by one 64 bit long. 3.254 + if (UseBMI2Instructions) { 3.255 + op2 = rdxReg; 3.256 + movq(op2, Address(x, len, Address::times_4, 0)); 3.257 + rorxq(op2, op2, 32); 3.258 + } 3.259 + else { 3.260 + movq(op2, Address(x, len, Address::times_4, 0)); 3.261 + rorq(op2, 32); 3.262 + } 3.263 + 3.264 + bind(L_third_loop); 3.265 + decrementl(len); 3.266 + jccb(Assembler::negative, L_third_loop_exit); 3.267 + decrementl(len); 3.268 + jccb(Assembler::negative, L_last_x); 3.269 + 3.270 + movq(op1, Address(x, len, Address::times_4, 0)); 3.271 + rorq(op1, 32); 3.272 + 3.273 + bind(L_multiply); 3.274 + subl(zlen, 2); 3.275 + movq(sum, Address(z, zlen, Address::times_4, 0)); 3.276 + 3.277 + // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry. 
3.278 + if (UseBMI2Instructions) { 3.279 + multiply_add_64_bmi2(sum, op1, op2, carry, tmp2); 3.280 + } 3.281 + else { 3.282 + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 3.283 + } 3.284 + 3.285 + movq(Address(z, zlen, Address::times_4, 0), sum); 3.286 + 3.287 + jmp(L_third_loop); 3.288 + bind(L_third_loop_exit); 3.289 + 3.290 + // Fourth loop 3.291 + // Add 64 bit long carry into z with carry propagation. 3.292 + // Uses offsetted zlen. 3.293 + add_one_64(z, zlen, carry, tmp1); 3.294 + 3.295 + pop(len); 3.296 + pop(zlen); 3.297 + jmp(L_second_loop); 3.298 + 3.299 + // Next infrequent code is moved outside loops. 3.300 + bind(L_last_x); 3.301 + movl(op1, Address(x, 0)); 3.302 + jmp(L_multiply); 3.303 + 3.304 + bind(L_second_loop_exit); 3.305 + pop(len); 3.306 + pop(zlen); 3.307 + pop(len); 3.308 + pop(zlen); 3.309 + 3.310 + // Fifth loop 3.311 + // Shift z left 1 bit. 3.312 + lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4); 3.313 + 3.314 + // z[zlen-1] |= x[len-1] & 1; 3.315 + movl(tmp3, Address(x, len, Address::times_4, -4)); 3.316 + andl(tmp3, 1); 3.317 + orl(Address(z, zlen, Address::times_4, -4), tmp3); 3.318 + 3.319 + pop(tmp5); 3.320 + pop(tmp4); 3.321 + pop(tmp3); 3.322 + pop(tmp2); 3.323 + pop(tmp1); 3.324 +} 3.325 + 3.326 +/** 3.327 + * Helper function for mul_add() 3.328 + * Multiply the in[] by int k and add to out[] starting at offset offs using 3.329 + * 128 bit by 32 bit multiply and return the carry in tmp5. 3.330 + * Only quad int aligned length of in[] is operated on in this function. 3.331 + * k is in rdxReg for BMI2Instructions, for others it is in tmp2. 3.332 + * This function preserves out, in and k registers. 3.333 + * len and offset point to the appropriate index in "in" & "out" correspondingly 3.334 + * tmp5 has the carry. 3.335 + * other registers are temporary and are modified. 
3.336 + * 3.337 + */ 3.338 +void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, 3.339 + Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, 3.340 + Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 3.341 + 3.342 + Label L_first_loop, L_first_loop_exit; 3.343 + 3.344 + movl(tmp1, len); 3.345 + shrl(tmp1, 2); 3.346 + 3.347 + bind(L_first_loop); 3.348 + subl(tmp1, 1); 3.349 + jccb(Assembler::negative, L_first_loop_exit); 3.350 + 3.351 + subl(len, 4); 3.352 + subl(offset, 4); 3.353 + 3.354 + Register op2 = tmp2; 3.355 + const Register sum = tmp3; 3.356 + const Register op1 = tmp4; 3.357 + const Register carry = tmp5; 3.358 + 3.359 + if (UseBMI2Instructions) { 3.360 + op2 = rdxReg; 3.361 + } 3.362 + 3.363 + movq(op1, Address(in, len, Address::times_4, 8)); 3.364 + rorq(op1, 32); 3.365 + movq(sum, Address(out, offset, Address::times_4, 8)); 3.366 + rorq(sum, 32); 3.367 + if (UseBMI2Instructions) { 3.368 + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 3.369 + } 3.370 + else { 3.371 + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 3.372 + } 3.373 + // Store back in big endian from little endian 3.374 + rorq(sum, 0x20); 3.375 + movq(Address(out, offset, Address::times_4, 8), sum); 3.376 + 3.377 + movq(op1, Address(in, len, Address::times_4, 0)); 3.378 + rorq(op1, 32); 3.379 + movq(sum, Address(out, offset, Address::times_4, 0)); 3.380 + rorq(sum, 32); 3.381 + if (UseBMI2Instructions) { 3.382 + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 3.383 + } 3.384 + else { 3.385 + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 3.386 + } 3.387 + // Store back in big endian from little endian 3.388 + rorq(sum, 0x20); 3.389 + movq(Address(out, offset, Address::times_4, 0), sum); 3.390 + 3.391 + jmp(L_first_loop); 3.392 + bind(L_first_loop_exit); 3.393 +} 3.394 + 3.395 +/** 3.396 + * Code for BigInteger::mulAdd() intrinsic 3.397 + * 3.398 + * rdi: out 3.399 + * rsi: in 3.400 + * r11: offs (out.length - 
offset) 3.401 + * rcx: len 3.402 + * r8: k 3.403 + * r12: tmp1 3.404 + * r13: tmp2 3.405 + * r14: tmp3 3.406 + * r15: tmp4 3.407 + * rbx: tmp5 3.408 + * Multiply the in[] by word k and add to out[], return the carry in rax 3.409 + */ 3.410 +void MacroAssembler::mul_add(Register out, Register in, Register offs, 3.411 + Register len, Register k, Register tmp1, Register tmp2, Register tmp3, 3.412 + Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 3.413 + 3.414 + Label L_carry, L_last_in, L_done; 3.415 + 3.416 +// carry = 0; 3.417 +// for (int j=len-1; j >= 0; j--) { 3.418 +// long product = (in[j] & LONG_MASK) * kLong + 3.419 +// (out[offs] & LONG_MASK) + carry; 3.420 +// out[offs--] = (int)product; 3.421 +// carry = product >>> 32; 3.422 +// } 3.423 +// 3.424 + push(tmp1); 3.425 + push(tmp2); 3.426 + push(tmp3); 3.427 + push(tmp4); 3.428 + push(tmp5); 3.429 + 3.430 + Register op2 = tmp2; 3.431 + const Register sum = tmp3; 3.432 + const Register op1 = tmp4; 3.433 + const Register carry = tmp5; 3.434 + 3.435 + if (UseBMI2Instructions) { 3.436 + op2 = rdxReg; 3.437 + movl(op2, k); 3.438 + } 3.439 + else { 3.440 + movl(op2, k); 3.441 + } 3.442 + 3.443 + xorq(carry, carry); 3.444 + 3.445 + //First loop 3.446 + 3.447 + //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply 3.448 + //The carry is in tmp5 3.449 + mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); 3.450 + 3.451 + //Multiply the trailing in[] entry using 64 bit by 32 bit, if any 3.452 + decrementl(len); 3.453 + jccb(Assembler::negative, L_carry); 3.454 + decrementl(len); 3.455 + jccb(Assembler::negative, L_last_in); 3.456 + 3.457 + movq(op1, Address(in, len, Address::times_4, 0)); 3.458 + rorq(op1, 32); 3.459 + 3.460 + subl(offs, 2); 3.461 + movq(sum, Address(out, offs, Address::times_4, 0)); 3.462 + rorq(sum, 32); 3.463 + 3.464 + if (UseBMI2Instructions) { 3.465 + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 3.466 + } 3.467 
+ else { 3.468 + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 3.469 + } 3.470 + 3.471 + // Store back in big endian from little endian 3.472 + rorq(sum, 0x20); 3.473 + movq(Address(out, offs, Address::times_4, 0), sum); 3.474 + 3.475 + testl(len, len); 3.476 + jccb(Assembler::zero, L_carry); 3.477 + 3.478 + //Multiply the last in[] entry, if any 3.479 + bind(L_last_in); 3.480 + movl(op1, Address(in, 0)); 3.481 + movl(sum, Address(out, offs, Address::times_4, -4)); 3.482 + 3.483 + movl(raxReg, k); 3.484 + mull(op1); //tmp4 * eax -> edx:eax 3.485 + addl(sum, carry); 3.486 + adcl(rdxReg, 0); 3.487 + addl(sum, raxReg); 3.488 + adcl(rdxReg, 0); 3.489 + movl(carry, rdxReg); 3.490 + 3.491 + movl(Address(out, offs, Address::times_4, -4), sum); 3.492 + 3.493 + bind(L_carry); 3.494 + //return tmp5/carry as carry in rax 3.495 + movl(rax, carry); 3.496 + 3.497 + bind(L_done); 3.498 + pop(tmp5); 3.499 + pop(tmp4); 3.500 + pop(tmp3); 3.501 + pop(tmp2); 3.502 + pop(tmp1); 3.503 +} 3.504 #endif 3.505 3.506 /**
4.1 --- a/src/cpu/x86/vm/macroAssembler_x86.hpp Tue Feb 16 13:56:12 2016 +0000 4.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.hpp Wed Feb 17 13:40:12 2016 +0300 4.3 @@ -1241,6 +1241,25 @@ 4.4 Register carry2); 4.5 void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen, 4.6 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5); 4.7 + 4.8 + void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3, 4.9 + Register tmp4, Register tmp5, Register rdxReg, Register raxReg); 4.10 + void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, 4.11 + Register tmp2); 4.12 + void multiply_add_64(Register sum, Register op1, Register op2, Register carry, 4.13 + Register rdxReg, Register raxReg); 4.14 + void add_one_64(Register z, Register zlen, Register carry, Register tmp1); 4.15 + void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, 4.16 + Register tmp3, Register tmp4); 4.17 + void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, 4.18 + Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg); 4.19 + 4.20 + void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1, 4.21 + Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, 4.22 + Register raxReg); 4.23 + void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1, 4.24 + Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, 4.25 + Register raxReg); 4.26 #endif 4.27 4.28 // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
5.1 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Feb 16 13:56:12 2016 +0000 5.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Feb 17 13:40:12 2016 +0300 5.3 @@ -3743,6 +3743,107 @@ 5.4 return start; 5.5 } 5.6 5.7 +/** 5.8 + * Arguments: 5.9 + * 5.10 + // Input: 5.11 + // c_rarg0 - x address 5.12 + // c_rarg1 - x length 5.13 + // c_rarg2 - z address 5.14 + // c_rarg3 - z length 5.15 + * 5.16 + */ 5.17 + address generate_squareToLen() { 5.18 + 5.19 + __ align(CodeEntryAlignment); 5.20 + StubCodeMark mark(this, "StubRoutines", "squareToLen"); 5.21 + 5.22 + address start = __ pc(); 5.23 + // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5.24 + // Unix: rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...) 5.25 + const Register x = rdi; 5.26 + const Register len = rsi; 5.27 + const Register z = r8; 5.28 + const Register zlen = rcx; 5.29 + 5.30 + const Register tmp1 = r12; 5.31 + const Register tmp2 = r13; 5.32 + const Register tmp3 = r14; 5.33 + const Register tmp4 = r15; 5.34 + const Register tmp5 = rbx; 5.35 + 5.36 + BLOCK_COMMENT("Entry:"); 5.37 + __ enter(); // required for proper stackwalking of RuntimeStub frame 5.38 + 5.39 + setup_arg_regs(4); // x => rdi, len => rsi, z => rdx 5.40 + // zlen => rcx 5.41 + // r9 and r10 may be used to save non-volatile registers 5.42 + __ movptr(r8, rdx); 5.43 + __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); 5.44 + 5.45 + restore_arg_regs(); 5.46 + 5.47 + __ leave(); // required for proper stackwalking of RuntimeStub frame 5.48 + __ ret(0); 5.49 + 5.50 + return start; 5.51 + } 5.52 + 5.53 + /** 5.54 + * Arguments: 5.55 + * 5.56 + * Input: 5.57 + * c_rarg0 - out address 5.58 + * c_rarg1 - in address 5.59 + * c_rarg2 - offset 5.60 + * c_rarg3 - len 5.61 + * not Win64 5.62 + * c_rarg4 - k 5.63 + * Win64 5.64 + * rsp+40 - k 5.65 + */ 5.66 + address generate_mulAdd() { 5.67 + __ align(CodeEntryAlignment); 5.68 + StubCodeMark mark(this, "StubRoutines", "mulAdd"); 5.69 + 5.70 + address start = __ pc(); 5.71 + 
// Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5.72 + // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 5.73 + const Register out = rdi; 5.74 + const Register in = rsi; 5.75 + const Register offset = r11; 5.76 + const Register len = rcx; 5.77 + const Register k = r8; 5.78 + 5.79 + // Next registers will be saved on stack in mul_add(). 5.80 + const Register tmp1 = r12; 5.81 + const Register tmp2 = r13; 5.82 + const Register tmp3 = r14; 5.83 + const Register tmp4 = r15; 5.84 + const Register tmp5 = rbx; 5.85 + 5.86 + BLOCK_COMMENT("Entry:"); 5.87 + __ enter(); // required for proper stackwalking of RuntimeStub frame 5.88 + 5.89 + setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx 5.90 + // len => rcx, k => r8 5.91 + // r9 and r10 may be used to save non-volatile registers 5.92 +#ifdef _WIN64 5.93 + // last argument is on stack on Win64 5.94 + __ movl(k, Address(rsp, 6 * wordSize)); 5.95 +#endif 5.96 + __ movptr(r11, rdx); // move offset in rdx to offset(r11) 5.97 + __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); 5.98 + 5.99 + restore_arg_regs(); 5.100 + 5.101 + __ leave(); // required for proper stackwalking of RuntimeStub frame 5.102 + __ ret(0); 5.103 + 5.104 + return start; 5.105 + } 5.106 + 5.107 + 5.108 #undef __ 5.109 #define __ masm-> 5.110 5.111 @@ -3987,6 +4088,12 @@ 5.112 if (UseMultiplyToLenIntrinsic) { 5.113 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5.114 } 5.115 + if (UseSquareToLenIntrinsic) { 5.116 + StubRoutines::_squareToLen = generate_squareToLen(); 5.117 + } 5.118 + if (UseMulAddIntrinsic) { 5.119 + StubRoutines::_mulAdd = generate_mulAdd(); 5.120 + } 5.121 #endif 5.122 } 5.123
6.1 --- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp Tue Feb 16 13:56:12 2016 +0000 6.2 +++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp Wed Feb 17 13:40:12 2016 +0300 6.3 @@ -33,7 +33,7 @@ 6.4 6.5 enum platform_dependent_constants { 6.6 code_size1 = 19000, // simply increase if too small (assembler will crash if too small) 6.7 - code_size2 = 22000 // simply increase if too small (assembler will crash if too small) 6.8 + code_size2 = 23000 // simply increase if too small (assembler will crash if too small) 6.9 }; 6.10 6.11 class x86 {
7.1 --- a/src/cpu/x86/vm/vm_version_x86.cpp Tue Feb 16 13:56:12 2016 +0000 7.2 +++ b/src/cpu/x86/vm/vm_version_x86.cpp Wed Feb 17 13:40:12 2016 +0300 7.3 @@ -703,6 +703,12 @@ 7.4 if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { 7.5 UseMultiplyToLenIntrinsic = true; 7.6 } 7.7 + if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) { 7.8 + UseSquareToLenIntrinsic = true; 7.9 + } 7.10 + if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) { 7.11 + UseMulAddIntrinsic = true; 7.12 + } 7.13 #else 7.14 if (UseMultiplyToLenIntrinsic) { 7.15 if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { 7.16 @@ -710,6 +716,18 @@ 7.17 } 7.18 FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false); 7.19 } 7.20 + if (UseSquareToLenIntrinsic) { 7.21 + if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) { 7.22 + warning("squareToLen intrinsic is not available in 32-bit VM"); 7.23 + } 7.24 + FLAG_SET_DEFAULT(UseSquareToLenIntrinsic, false); 7.25 + } 7.26 + if (UseMulAddIntrinsic) { 7.27 + if (!FLAG_IS_DEFAULT(UseMulAddIntrinsic)) { 7.28 + warning("mulAdd intrinsic is not available in 32-bit VM"); 7.29 + } 7.30 + FLAG_SET_DEFAULT(UseMulAddIntrinsic, false); 7.31 + } 7.32 #endif 7.33 #endif // COMPILER2 7.34
8.1 --- a/src/share/vm/classfile/vmSymbols.hpp Tue Feb 16 13:56:12 2016 +0000 8.2 +++ b/src/share/vm/classfile/vmSymbols.hpp Wed Feb 17 13:40:12 2016 +0300 8.3 @@ -797,6 +797,14 @@ 8.4 do_name( multiplyToLen_name, "multiplyToLen") \ 8.5 do_signature(multiplyToLen_signature, "([II[II[I)[I") \ 8.6 \ 8.7 + do_intrinsic(_squareToLen, java_math_BigInteger, squareToLen_name, squareToLen_signature, F_S) \ 8.8 + do_name( squareToLen_name, "implSquareToLen") \ 8.9 + do_signature(squareToLen_signature, "([II[II)[I") \ 8.10 + \ 8.11 + do_intrinsic(_mulAdd, java_math_BigInteger, mulAdd_name, mulAdd_signature, F_S) \ 8.12 + do_name( mulAdd_name, "implMulAdd") \ 8.13 + do_signature(mulAdd_signature, "([I[IIII)I") \ 8.14 + \ 8.15 /* java/lang/ref/Reference */ \ 8.16 do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \ 8.17 \
9.1 --- a/src/share/vm/opto/c2_globals.hpp Tue Feb 16 13:56:12 2016 +0000 9.2 +++ b/src/share/vm/opto/c2_globals.hpp Wed Feb 17 13:40:12 2016 +0300 9.3 @@ -662,6 +662,12 @@ 9.4 product(bool, UseMultiplyToLenIntrinsic, false, \ 9.5 "Enables intrinsification of BigInteger.multiplyToLen()") \ 9.6 \ 9.7 + product(bool, UseSquareToLenIntrinsic, false, \ 9.8 + "Enables intrinsification of BigInteger.squareToLen()") \ 9.9 + \ 9.10 + product(bool, UseMulAddIntrinsic, false, \ 9.11 + "Enables intrinsification of BigInteger.mulAdd()") \ 9.12 + \ 9.13 product(bool, UseTypeSpeculation, true, \ 9.14 "Speculatively propagate types from profiles") \ 9.15 \
10.1 --- a/src/share/vm/opto/escape.cpp Tue Feb 16 13:56:12 2016 +0000 10.2 +++ b/src/share/vm/opto/escape.cpp Wed Feb 17 13:40:12 2016 +0300 10.3 @@ -958,7 +958,9 @@ 10.4 strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 || 10.5 strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 || 10.6 strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 || 10.7 - strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0) 10.8 + strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0 || 10.9 + strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 || 10.10 + strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0) 10.11 ))) { 10.12 call->dump(); 10.13 fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
11.1 --- a/src/share/vm/opto/library_call.cpp Tue Feb 16 13:56:12 2016 +0000 11.2 +++ b/src/share/vm/opto/library_call.cpp Wed Feb 17 13:40:12 2016 +0300 11.3 @@ -324,6 +324,8 @@ 11.4 bool inline_updateBytesCRC32(); 11.5 bool inline_updateByteBufferCRC32(); 11.6 bool inline_multiplyToLen(); 11.7 + bool inline_squareToLen(); 11.8 + bool inline_mulAdd(); 11.9 11.10 bool inline_profileBoolean(); 11.11 }; 11.12 @@ -527,6 +529,14 @@ 11.13 if (!UseMultiplyToLenIntrinsic) return NULL; 11.14 break; 11.15 11.16 + case vmIntrinsics::_squareToLen: 11.17 + if (!UseSquareToLenIntrinsic) return NULL; 11.18 + break; 11.19 + 11.20 + case vmIntrinsics::_mulAdd: 11.21 + if (!UseMulAddIntrinsic) return NULL; 11.22 + break; 11.23 + 11.24 case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: 11.25 case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: 11.26 if (!UseAESIntrinsics) return NULL; 11.27 @@ -927,6 +937,12 @@ 11.28 case vmIntrinsics::_multiplyToLen: 11.29 return inline_multiplyToLen(); 11.30 11.31 + case vmIntrinsics::_squareToLen: 11.32 + return inline_squareToLen(); 11.33 + 11.34 + case vmIntrinsics::_mulAdd: 11.35 + return inline_mulAdd(); 11.36 + 11.37 case vmIntrinsics::_encodeISOArray: 11.38 return inline_encodeISOArray(); 11.39 11.40 @@ -5856,6 +5872,100 @@ 11.41 return true; 11.42 } 11.43 11.44 +//-------------inline_squareToLen------------------------------------ 11.45 +bool LibraryCallKit::inline_squareToLen() { 11.46 + assert(UseSquareToLenIntrinsic, "not implementated on this platform"); 11.47 + 11.48 + address stubAddr = StubRoutines::squareToLen(); 11.49 + if (stubAddr == NULL) { 11.50 + return false; // Intrinsic's stub is not implemented on this platform 11.51 + } 11.52 + const char* stubName = "squareToLen"; 11.53 + 11.54 + assert(callee()->signature()->size() == 4, "implSquareToLen has 4 parameters"); 11.55 + 11.56 + Node* x = argument(0); 11.57 + Node* len = argument(1); 11.58 + Node* z = argument(2); 11.59 + Node* zlen = argument(3); 11.60 + 11.61 + 
const Type* x_type = x->Value(&_gvn); 11.62 + const Type* z_type = z->Value(&_gvn); 11.63 + const TypeAryPtr* top_x = x_type->isa_aryptr(); 11.64 + const TypeAryPtr* top_z = z_type->isa_aryptr(); 11.65 + if (top_x == NULL || top_x->klass() == NULL || 11.66 + top_z == NULL || top_z->klass() == NULL) { 11.67 + // failed array check 11.68 + return false; 11.69 + } 11.70 + 11.71 + BasicType x_elem = x_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); 11.72 + BasicType z_elem = z_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); 11.73 + if (x_elem != T_INT || z_elem != T_INT) { 11.74 + return false; 11.75 + } 11.76 + 11.77 + 11.78 + Node* x_start = array_element_address(x, intcon(0), x_elem); 11.79 + Node* z_start = array_element_address(z, intcon(0), z_elem); 11.80 + 11.81 + Node* call = make_runtime_call(RC_LEAF|RC_NO_FP, 11.82 + OptoRuntime::squareToLen_Type(), 11.83 + stubAddr, stubName, TypePtr::BOTTOM, 11.84 + x_start, len, z_start, zlen); 11.85 + 11.86 + set_result(z); 11.87 + return true; 11.88 +} 11.89 + 11.90 +//-------------inline_mulAdd------------------------------------------ 11.91 +bool LibraryCallKit::inline_mulAdd() { 11.92 + assert(UseMulAddIntrinsic, "not implemented on this platform"); 11.93 + 11.94 + address stubAddr = StubRoutines::mulAdd(); 11.95 + if (stubAddr == NULL) { 11.96 + return false; // Intrinsic's stub is not implemented on this platform 11.97 + } 11.98 + const char* stubName = "mulAdd"; 11.99 + 11.100 + assert(callee()->signature()->size() == 5, "mulAdd has 5 parameters"); 11.101 + 11.102 + Node* out = argument(0); 11.103 + Node* in = argument(1); 11.104 + Node* offset = argument(2); 11.105 + Node* len = argument(3); 11.106 + Node* k = argument(4); 11.107 + 11.108 + const Type* out_type = out->Value(&_gvn); 11.109 + const Type* in_type = in->Value(&_gvn); 11.110 + const TypeAryPtr* top_out = out_type->isa_aryptr(); 11.111 + const TypeAryPtr* top_in = in_type->isa_aryptr(); 11.112 
+ if (top_out == NULL || top_out->klass() == NULL || 11.113 + top_in == NULL || top_in->klass() == NULL) { 11.114 + // failed array check 11.115 + return false; 11.116 + } 11.117 + 11.118 + BasicType out_elem = out_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); 11.119 + BasicType in_elem = in_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); 11.120 + if (out_elem != T_INT || in_elem != T_INT) { 11.121 + return false; 11.122 + } 11.123 + 11.124 + Node* outlen = load_array_length(out); 11.125 + Node* new_offset = _gvn.transform(new (C) SubINode(outlen, offset)); 11.126 + Node* out_start = array_element_address(out, intcon(0), out_elem); 11.127 + Node* in_start = array_element_address(in, intcon(0), in_elem); 11.128 + 11.129 + Node* call = make_runtime_call(RC_LEAF|RC_NO_FP, 11.130 + OptoRuntime::mulAdd_Type(), 11.131 + stubAddr, stubName, TypePtr::BOTTOM, 11.132 + out_start,in_start, new_offset, len, k); 11.133 + Node* result = _gvn.transform(new (C) ProjNode(call, TypeFunc::Parms)); 11.134 + set_result(result); 11.135 + return true; 11.136 +} 11.137 + 11.138 11.139 /** 11.140 * Calculate CRC32 for byte.
12.1 --- a/src/share/vm/opto/runtime.cpp Tue Feb 16 13:56:12 2016 +0000 12.2 +++ b/src/share/vm/opto/runtime.cpp Wed Feb 17 13:40:12 2016 +0300 12.3 @@ -956,6 +956,48 @@ 12.4 return TypeFunc::make(domain, range); 12.5 } 12.6 12.7 +const TypeFunc* OptoRuntime::squareToLen_Type() { 12.8 + // create input type (domain) 12.9 + int num_args = 4; 12.10 + int argcnt = num_args; 12.11 + const Type** fields = TypeTuple::fields(argcnt); 12.12 + int argp = TypeFunc::Parms; 12.13 + fields[argp++] = TypePtr::NOTNULL; // x 12.14 + fields[argp++] = TypeInt::INT; // len 12.15 + fields[argp++] = TypePtr::NOTNULL; // z 12.16 + fields[argp++] = TypeInt::INT; // zlen 12.17 + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); 12.18 + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); 12.19 + 12.20 + // no result type needed 12.21 + fields = TypeTuple::fields(1); 12.22 + fields[TypeFunc::Parms+0] = NULL; 12.23 + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); 12.24 + return TypeFunc::make(domain, range); 12.25 +} 12.26 + 12.27 +// for mulAdd calls, 2 pointers and 3 ints, returning int 12.28 +const TypeFunc* OptoRuntime::mulAdd_Type() { 12.29 + // create input type (domain) 12.30 + int num_args = 5; 12.31 + int argcnt = num_args; 12.32 + const Type** fields = TypeTuple::fields(argcnt); 12.33 + int argp = TypeFunc::Parms; 12.34 + fields[argp++] = TypePtr::NOTNULL; // out 12.35 + fields[argp++] = TypePtr::NOTNULL; // in 12.36 + fields[argp++] = TypeInt::INT; // offset 12.37 + fields[argp++] = TypeInt::INT; // len 12.38 + fields[argp++] = TypeInt::INT; // k 12.39 + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); 12.40 + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); 12.41 + 12.42 + // returning carry (int) 12.43 + fields = TypeTuple::fields(1); 12.44 + fields[TypeFunc::Parms+0] = TypeInt::INT; 12.45 + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields); 12.46 + return 
TypeFunc::make(domain, range); 12.47 +} 12.48 + 12.49 12.50 12.51 //------------- Interpreter state access for on stack replacement
13.1 --- a/src/share/vm/opto/runtime.hpp Tue Feb 16 13:56:12 2016 +0000 13.2 +++ b/src/share/vm/opto/runtime.hpp Wed Feb 17 13:40:12 2016 +0300 13.3 @@ -305,6 +305,10 @@ 13.4 13.5 static const TypeFunc* multiplyToLen_Type(); 13.6 13.7 + static const TypeFunc* squareToLen_Type(); 13.8 + 13.9 + static const TypeFunc* mulAdd_Type(); 13.10 + 13.11 static const TypeFunc* updateBytesCRC32_Type(); 13.12 13.13 // leaf on stack replacement interpreter accessor types
14.1 --- a/src/share/vm/runtime/stubRoutines.cpp Tue Feb 16 13:56:12 2016 +0000 14.2 +++ b/src/share/vm/runtime/stubRoutines.cpp Wed Feb 17 13:40:12 2016 +0300 14.3 @@ -136,6 +136,8 @@ 14.4 address StubRoutines::_crc_table_adr = NULL; 14.5 14.6 address StubRoutines::_multiplyToLen = NULL; 14.7 +address StubRoutines::_squareToLen = NULL; 14.8 +address StubRoutines::_mulAdd = NULL; 14.9 14.10 double (* StubRoutines::_intrinsic_log )(double) = NULL; 14.11 double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
15.1 --- a/src/share/vm/runtime/stubRoutines.hpp Tue Feb 16 13:56:12 2016 +0000 15.2 +++ b/src/share/vm/runtime/stubRoutines.hpp Wed Feb 17 13:40:12 2016 +0300 15.3 @@ -209,6 +209,8 @@ 15.4 static address _crc_table_adr; 15.5 15.6 static address _multiplyToLen; 15.7 + static address _squareToLen; 15.8 + static address _mulAdd; 15.9 15.10 // These are versions of the java.lang.Math methods which perform 15.11 // the same operations as the intrinsic version. They are used for 15.12 @@ -367,6 +369,8 @@ 15.13 static address crc_table_addr() { return _crc_table_adr; } 15.14 15.15 static address multiplyToLen() {return _multiplyToLen; } 15.16 + static address squareToLen() {return _squareToLen; } 15.17 + static address mulAdd() {return _mulAdd; } 15.18 15.19 static address select_fill_function(BasicType t, bool aligned, const char* &name); 15.20
16.1 --- a/src/share/vm/runtime/vmStructs.cpp Tue Feb 16 13:56:12 2016 +0000 16.2 +++ b/src/share/vm/runtime/vmStructs.cpp Wed Feb 17 13:40:12 2016 +0300 16.3 @@ -813,6 +813,8 @@ 16.4 static_field(StubRoutines, _updateBytesCRC32, address) \ 16.5 static_field(StubRoutines, _crc_table_adr, address) \ 16.6 static_field(StubRoutines, _multiplyToLen, address) \ 16.7 + static_field(StubRoutines, _squareToLen, address) \ 16.8 + static_field(StubRoutines, _mulAdd, address) \ 16.9 \ 16.10 /*****************/ \ 16.11 /* SharedRuntime */ \
17.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 17.2 +++ b/test/compiler/intrinsics/muladd/TestMulAdd.java Wed Feb 17 13:40:12 2016 +0300 17.3 @@ -0,0 +1,117 @@ 17.4 +/* 17.5 + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 17.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 17.7 + * 17.8 + * This code is free software; you can redistribute it and/or modify it 17.9 + * under the terms of the GNU General Public License version 2 only, as 17.10 + * published by the Free Software Foundation. 17.11 + * 17.12 + * This code is distributed in the hope that it will be useful, but WITHOUT 17.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 17.14 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 17.15 + * version 2 for more details (a copy is included in the LICENSE file that 17.16 + * accompanied this code). 17.17 + * 17.18 + * You should have received a copy of the GNU General Public License version 17.19 + * 2 along with this work; if not, write to the Free Software Foundation, 17.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 17.21 + * 17.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 17.23 + * or visit www.oracle.com if you need additional information or have any 17.24 + * questions. 
17.25 + * 17.26 + */ 17.27 + 17.28 +/** 17.29 + * @test 17.30 + * @bug 8081778 17.31 + * @summary Add C2 x86 intrinsic for BigInteger::mulAdd() method 17.32 + * 17.33 + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch 17.34 + * -XX:+IgnoreUnrecognizedVMOptions -XX:-UseSquareToLenIntrinsic -XX:-UseMultiplyToLenIntrinsic 17.35 + * -XX:CompileCommand=dontinline,TestMulAdd::main 17.36 + * -XX:CompileCommand=option,TestMulAdd::base_multiply,ccstr,DisableIntrinsic,_mulAdd 17.37 + * -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_mulAdd 17.38 + * -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_mulAdd 17.39 + * -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_mulAdd 17.40 + * -XX:CompileCommand=option,java.math.BigInteger::mulAdd,ccstr,DisableIntrinsic,_mulAdd 17.41 + * -XX:CompileCommand=inline,java.math.BigInteger::multiply 17.42 + * -XX:CompileCommand=inline,java.math.BigInteger::square 17.43 + * -XX:CompileCommand=inline,java.math.BigInteger::squareToLen 17.44 + * -XX:CompileCommand=inline,java.math.BigInteger::mulAdd TestMulAdd 17.45 + */ 17.46 + 17.47 +import java.util.Random; 17.48 +import java.math.*; 17.49 + 17.50 +public class TestMulAdd { 17.51 + 17.52 + // Avoid intrinsic by preventing inlining multiply() and mulAdd(). 17.53 + public static BigInteger base_multiply(BigInteger op1) { 17.54 + return op1.multiply(op1); 17.55 + } 17.56 + 17.57 + // Generate mulAdd() intrinsic by inlining multiply(). 
17.58 + public static BigInteger new_multiply(BigInteger op1) { 17.59 + return op1.multiply(op1); 17.60 + } 17.61 + 17.62 + public static boolean bytecompare(BigInteger b1, BigInteger b2) { 17.63 + byte[] data1 = b1.toByteArray(); 17.64 + byte[] data2 = b2.toByteArray(); 17.65 + if (data1.length != data2.length) 17.66 + return false; 17.67 + for (int i = 0; i < data1.length; i++) { 17.68 + if (data1[i] != data2[i]) 17.69 + return false; 17.70 + } 17.71 + return true; 17.72 + } 17.73 + 17.74 + public static String stringify(BigInteger b) { 17.75 + String strout= ""; 17.76 + byte [] data = b.toByteArray(); 17.77 + for (int i = 0; i < data.length; i++) { 17.78 + strout += (String.format("%02x",data[i]) + " "); 17.79 + } 17.80 + return strout; 17.81 + } 17.82 + 17.83 + public static void main(String args[]) throws Exception { 17.84 + 17.85 + BigInteger oldsum = new BigInteger("0"); 17.86 + BigInteger newsum = new BigInteger("0"); 17.87 + 17.88 + BigInteger b1, b2, oldres, newres; 17.89 + 17.90 + Random rand = new Random(); 17.91 + long seed = System.nanoTime(); 17.92 + Random rand1 = new Random(); 17.93 + long seed1 = System.nanoTime(); 17.94 + rand.setSeed(seed); 17.95 + rand1.setSeed(seed1); 17.96 + 17.97 + for (int j = 0; j < 100000; j++) { 17.98 + int rand_int = rand1.nextInt(3136)+32; 17.99 + b1 = new BigInteger(rand_int, rand); 17.100 + 17.101 + oldres = base_multiply(b1); 17.102 + newres = new_multiply(b1); 17.103 + 17.104 + oldsum = oldsum.add(oldres); 17.105 + newsum = newsum.add(newres); 17.106 + 17.107 + if (!bytecompare(oldres,newres)) { 17.108 + System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres)); 17.109 + System.out.println(b1); 17.110 + throw new Exception("Failed"); 17.111 + } 17.112 + } 17.113 + if (!bytecompare(oldsum,newsum)) { 17.114 + System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum)); 17.115 + throw new Exception("Failed"); 17.116 + } else 
{ 17.117 + System.out.println("Success"); 17.118 + } 17.119 + } 17.120 +}
18.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 18.2 +++ b/test/compiler/intrinsics/squaretolen/TestSquareToLen.java Wed Feb 17 13:40:12 2016 +0300 18.3 @@ -0,0 +1,114 @@ 18.4 +/* 18.5 + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 18.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 18.7 + * 18.8 + * This code is free software; you can redistribute it and/or modify it 18.9 + * under the terms of the GNU General Public License version 2 only, as 18.10 + * published by the Free Software Foundation. 18.11 + * 18.12 + * This code is distributed in the hope that it will be useful, but WITHOUT 18.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 18.14 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 18.15 + * version 2 for more details (a copy is included in the LICENSE file that 18.16 + * accompanied this code). 18.17 + * 18.18 + * You should have received a copy of the GNU General Public License version 18.19 + * 2 along with this work; if not, write to the Free Software Foundation, 18.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18.21 + * 18.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 18.23 + * or visit www.oracle.com if you need additional information or have any 18.24 + * questions. 
18.25 + * 18.26 + */ 18.27 + 18.28 +/** 18.29 + * @test 18.30 + * @bug 8081778 18.31 + * @summary Add C2 x86 intrinsic for BigInteger::squareToLen() method 18.32 + * 18.33 + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch 18.34 + * -XX:CompileCommand=exclude,TestSquareToLen::main 18.35 + * -XX:CompileCommand=option,TestSquareToLen::base_multiply,ccstr,DisableIntrinsic,_squareToLen 18.36 + * -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_squareToLen 18.37 + * -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_squareToLen 18.38 + * -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_squareToLen 18.39 + * -XX:CompileCommand=inline,java.math.BigInteger::multiply 18.40 + * -XX:CompileCommand=inline,java.math.BigInteger::square 18.41 + * -XX:CompileCommand=inline,java.math.BigInteger::squareToLen TestSquareToLen 18.42 + */ 18.43 + 18.44 +import java.util.Random; 18.45 +import java.math.*; 18.46 + 18.47 +public class TestSquareToLen { 18.48 + 18.49 + // Avoid intrinsic by preventing inlining multiply() and squareToLen(). 18.50 + public static BigInteger base_multiply(BigInteger op1) { 18.51 + return op1.multiply(op1); 18.52 + } 18.53 + 18.54 + // Generate squareToLen() intrinsic by inlining multiply(). 
18.55 + public static BigInteger new_multiply(BigInteger op1) { 18.56 + return op1.multiply(op1); 18.57 + } 18.58 + 18.59 + public static boolean bytecompare(BigInteger b1, BigInteger b2) { 18.60 + byte[] data1 = b1.toByteArray(); 18.61 + byte[] data2 = b2.toByteArray(); 18.62 + if (data1.length != data2.length) 18.63 + return false; 18.64 + for (int i = 0; i < data1.length; i++) { 18.65 + if (data1[i] != data2[i]) 18.66 + return false; 18.67 + } 18.68 + return true; 18.69 + } 18.70 + 18.71 + public static String stringify(BigInteger b) { 18.72 + String strout= ""; 18.73 + byte [] data = b.toByteArray(); 18.74 + for (int i = 0; i < data.length; i++) { 18.75 + strout += (String.format("%02x",data[i]) + " "); 18.76 + } 18.77 + return strout; 18.78 + } 18.79 + 18.80 + public static void main(String args[]) throws Exception { 18.81 + 18.82 + BigInteger oldsum = new BigInteger("0"); 18.83 + BigInteger newsum = new BigInteger("0"); 18.84 + 18.85 + BigInteger b1, b2, oldres, newres; 18.86 + 18.87 + Random rand = new Random(); 18.88 + long seed = System.nanoTime(); 18.89 + Random rand1 = new Random(); 18.90 + long seed1 = System.nanoTime(); 18.91 + rand.setSeed(seed); 18.92 + rand1.setSeed(seed1); 18.93 + 18.94 + for (int j = 0; j < 100000; j++) { 18.95 + int rand_int = rand1.nextInt(3136)+32; 18.96 + b1 = new BigInteger(rand_int, rand); 18.97 + 18.98 + oldres = base_multiply(b1); 18.99 + newres = new_multiply(b1); 18.100 + 18.101 + oldsum = oldsum.add(oldres); 18.102 + newsum = newsum.add(newres); 18.103 + 18.104 + if (!bytecompare(oldres,newres)) { 18.105 + System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres)); 18.106 + System.out.println(b1); 18.107 + throw new Exception("Failed"); 18.108 + } 18.109 + } 18.110 + if (!bytecompare(oldsum,newsum)) { 18.111 + System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum)); 18.112 + throw new Exception("Failed"); 18.113 + } else { 
18.114 + System.out.println("Success"); 18.115 + } 18.116 + } 18.117 +}