8081778: Use Intel x64 CPU instructions for RSA acceleration

Wed, 17 Feb 2016 13:40:12 +0300

author
igerasim
date
Wed, 17 Feb 2016 13:40:12 +0300
changeset 8307
daaf806995b3
parent 8306
81adfb064a4f
child 8309
240ea32410fa

8081778: Use Intel x64 CPU instructions for RSA acceleration
Summary: Add intrinsics for BigInteger squareToLen and mulAdd methods.
Reviewed-by: kvn, jrose

src/cpu/x86/vm/assembler_x86.cpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/assembler_x86.hpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/macroAssembler_x86.cpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/macroAssembler_x86.hpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/stubGenerator_x86_64.cpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/stubRoutines_x86_64.hpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/vm_version_x86.cpp file | annotate | diff | comparison | revisions
src/share/vm/classfile/vmSymbols.hpp file | annotate | diff | comparison | revisions
src/share/vm/opto/c2_globals.hpp file | annotate | diff | comparison | revisions
src/share/vm/opto/escape.cpp file | annotate | diff | comparison | revisions
src/share/vm/opto/library_call.cpp file | annotate | diff | comparison | revisions
src/share/vm/opto/runtime.cpp file | annotate | diff | comparison | revisions
src/share/vm/opto/runtime.hpp file | annotate | diff | comparison | revisions
src/share/vm/runtime/stubRoutines.cpp file | annotate | diff | comparison | revisions
src/share/vm/runtime/stubRoutines.hpp file | annotate | diff | comparison | revisions
src/share/vm/runtime/vmStructs.cpp file | annotate | diff | comparison | revisions
test/compiler/intrinsics/muladd/TestMulAdd.java file | annotate | diff | comparison | revisions
test/compiler/intrinsics/squaretolen/TestSquareToLen.java file | annotate | diff | comparison | revisions
     1.1 --- a/src/cpu/x86/vm/assembler_x86.cpp	Tue Feb 16 13:56:12 2016 +0000
     1.2 +++ b/src/cpu/x86/vm/assembler_x86.cpp	Wed Feb 17 13:40:12 2016 +0300
     1.3 @@ -2318,6 +2318,13 @@
     1.4    emit_arith(0x0B, 0xC0, dst, src);
     1.5  }
     1.6  
     1.7 +void Assembler::orl(Address dst, Register src) {
     1.8 +  InstructionMark im(this);
     1.9 +  prefix(dst, src);
    1.10 +  emit_int8(0x09);
    1.11 +  emit_operand(src, dst);
    1.12 +}
    1.13 +
    1.14  void Assembler::packuswb(XMMRegister dst, Address src) {
    1.15    NOT_LP64(assert(VM_Version::supports_sse2(), ""));
    1.16    assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
    1.17 @@ -5613,6 +5620,19 @@
    1.18    }
    1.19  }
    1.20  
    1.21 +void Assembler::rcrq(Register dst, int imm8) {
    1.22 +  assert(isShiftCount(imm8 >> 1), "illegal shift count");
    1.23 +  int encode = prefixq_and_encode(dst->encoding());
    1.24 +  if (imm8 == 1) {
    1.25 +    emit_int8((unsigned char)0xD1);
    1.26 +    emit_int8((unsigned char)(0xD8 | encode));
    1.27 +  } else {
    1.28 +    emit_int8((unsigned char)0xC1);
    1.29 +    emit_int8((unsigned char)(0xD8 | encode));
    1.30 +    emit_int8(imm8);
    1.31 +  }
    1.32 +}
    1.33 +
    1.34  void Assembler::rorq(Register dst, int imm8) {
    1.35    assert(isShiftCount(imm8 >> 1), "illegal shift count");
    1.36    int encode = prefixq_and_encode(dst->encoding());
     2.1 --- a/src/cpu/x86/vm/assembler_x86.hpp	Tue Feb 16 13:56:12 2016 +0000
     2.2 +++ b/src/cpu/x86/vm/assembler_x86.hpp	Wed Feb 17 13:40:12 2016 +0300
     2.3 @@ -1455,6 +1455,7 @@
     2.4    void orl(Register dst, int32_t imm32);
     2.5    void orl(Register dst, Address src);
     2.6    void orl(Register dst, Register src);
     2.7 +  void orl(Address dst, Register src);
     2.8  
     2.9    void orq(Address dst, int32_t imm32);
    2.10    void orq(Register dst, int32_t imm32);
    2.11 @@ -1555,6 +1556,8 @@
    2.12  
    2.13    void rclq(Register dst, int imm8);
    2.14  
    2.15 +  void rcrq(Register dst, int imm8);
    2.16 +
    2.17    void rdtsc();
    2.18  
    2.19    void ret(int imm16);
     3.1 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Feb 16 13:56:12 2016 +0000
     3.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Wed Feb 17 13:40:12 2016 +0300
     3.3 @@ -7769,6 +7769,503 @@
     3.4    pop(tmp2);
     3.5    pop(tmp1);
     3.6  }
     3.7 +
     3.8 +//Helper functions for square_to_len()
     3.9 +
    3.10 +/**
    3.11 + * Store the squares of x[], right shifted one bit (divided by 2) into z[]
    3.12 + * Preserves x and z and modifies rest of the registers.
    3.13 + */
    3.14 +
    3.15 +void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
    3.16 +  // Perform square and right shift by 1
    3.17 +  // Handle odd xlen case first, then for even xlen do the following
    3.18 +  // jlong carry = 0;
    3.19 +  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
    3.20 +  //     huge_128 product = x[j:j+1] * x[j:j+1];
    3.21 +  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
    3.22 +  //     z[i+2:i+3] = (jlong)(product >>> 1);
    3.23 +  //     carry = (jlong)product;
    3.24 +  // }
    3.25 +
    3.26 +  xorq(tmp5, tmp5);     // carry
    3.27 +  xorq(rdxReg, rdxReg);
    3.28 +  xorl(tmp1, tmp1);     // index for x
    3.29 +  xorl(tmp4, tmp4);     // index for z
    3.30 +
    3.31 +  Label L_first_loop, L_first_loop_exit;
    3.32 +
    3.33 +  testl(xlen, 1);
    3.34 +  jccb(Assembler::zero, L_first_loop); //jump if xlen is even
    3.35 +
    3.36 +  // Square and right shift by 1 the odd element using 32 bit multiply
    3.37 +  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
    3.38 +  imulq(raxReg, raxReg);
    3.39 +  shrq(raxReg, 1);
    3.40 +  adcq(tmp5, 0);
    3.41 +  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
    3.42 +  incrementl(tmp1);
    3.43 +  addl(tmp4, 2);
    3.44 +
    3.45 +  // Square and  right shift by 1 the rest using 64 bit multiply
    3.46 +  bind(L_first_loop);
    3.47 +  cmpptr(tmp1, xlen);
    3.48 +  jccb(Assembler::equal, L_first_loop_exit);
    3.49 +
    3.50 +  // Square
    3.51 +  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
    3.52 +  rorq(raxReg, 32);    // convert big-endian to little-endian
    3.53 +  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
    3.54 +
    3.55 +  // Right shift by 1 and save carry
    3.56 +  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
    3.57 +  rcrq(rdxReg, 1);
    3.58 +  rcrq(raxReg, 1);
    3.59 +  adcq(tmp5, 0);
    3.60 +
    3.61 +  // Store result in z
    3.62 +  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
    3.63 +  movq(Address(z, tmp4, Address::times_4, 8), raxReg);
    3.64 +
    3.65 +  // Update indices for x and z
    3.66 +  addl(tmp1, 2);
    3.67 +  addl(tmp4, 4);
    3.68 +  jmp(L_first_loop);
    3.69 +
    3.70 +  bind(L_first_loop_exit);
    3.71 +}
    3.72 +
    3.73 +
    3.74 +/**
    3.75 + * Perform the following multiply add operation using BMI2 instructions
    3.76 + * carry:sum = sum + op1*op2 + carry
    3.77 + * op2 should be in rdx
    3.78 + * op2 is preserved, all other registers are modified
    3.79 + */
    3.80 +void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
    3.81 +  // assert op2 is rdx
    3.82 +  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
    3.83 +  addq(sum, carry);
    3.84 +  adcq(tmp2, 0);
    3.85 +  addq(sum, op1);
    3.86 +  adcq(tmp2, 0);
    3.87 +  movq(carry, tmp2);
    3.88 +}
    3.89 +
    3.90 +/**
    3.91 + * Perform the following multiply add operation:
    3.92 + * carry:sum = sum + op1*op2 + carry
    3.93 + * Preserves op1, op2 and modifies rest of registers
    3.94 + */
    3.95 +void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
    3.96 +  // rdx:rax = op1 * op2
    3.97 +  movq(raxReg, op2);
    3.98 +  mulq(op1);
    3.99 +
   3.100 +  //  rdx:rax = sum + carry + rdx:rax
   3.101 +  addq(sum, carry);
   3.102 +  adcq(rdxReg, 0);
   3.103 +  addq(sum, raxReg);
   3.104 +  adcq(rdxReg, 0);
   3.105 +
   3.106 +  // carry:sum = rdx:sum
   3.107 +  movq(carry, rdxReg);
   3.108 +}
   3.109 +
   3.110 +/**
    3.111 + * Add 64 bit long carry into z[] with carry propagation.
   3.112 + * Preserves z and carry register values and modifies rest of registers.
   3.113 + *
   3.114 + */
   3.115 +void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
   3.116 +  Label L_fourth_loop, L_fourth_loop_exit;
   3.117 +
   3.118 +  movl(tmp1, 1);
   3.119 +  subl(zlen, 2);
   3.120 +  addq(Address(z, zlen, Address::times_4, 0), carry);
   3.121 +
   3.122 +  bind(L_fourth_loop);
   3.123 +  jccb(Assembler::carryClear, L_fourth_loop_exit);
   3.124 +  subl(zlen, 2);
   3.125 +  jccb(Assembler::negative, L_fourth_loop_exit);
   3.126 +  addq(Address(z, zlen, Address::times_4, 0), tmp1);
   3.127 +  jmp(L_fourth_loop);
   3.128 +  bind(L_fourth_loop_exit);
   3.129 +}
   3.130 +
   3.131 +/**
   3.132 + * Shift z[] left by 1 bit.
   3.133 + * Preserves x, len, z and zlen registers and modifies rest of the registers.
   3.134 + *
   3.135 + */
   3.136 +void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
   3.137 +
   3.138 +  Label L_fifth_loop, L_fifth_loop_exit;
   3.139 +
   3.140 +  // Fifth loop
   3.141 +  // Perform primitiveLeftShift(z, zlen, 1)
   3.142 +
   3.143 +  const Register prev_carry = tmp1;
   3.144 +  const Register new_carry = tmp4;
   3.145 +  const Register value = tmp2;
   3.146 +  const Register zidx = tmp3;
   3.147 +
   3.148 +  // int zidx, carry;
   3.149 +  // long value;
   3.150 +  // carry = 0;
   3.151 +  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
   3.152 +  //    (carry:value)  = (z[i] << 1) | carry ;
   3.153 +  //    z[i] = value;
   3.154 +  // }
   3.155 +
   3.156 +  movl(zidx, zlen);
   3.157 +  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
   3.158 +
   3.159 +  bind(L_fifth_loop);
   3.160 +  decl(zidx);  // Use decl to preserve carry flag
   3.161 +  decl(zidx);
   3.162 +  jccb(Assembler::negative, L_fifth_loop_exit);
   3.163 +
   3.164 +  if (UseBMI2Instructions) {
   3.165 +     movq(value, Address(z, zidx, Address::times_4, 0));
   3.166 +     rclq(value, 1);
   3.167 +     rorxq(value, value, 32);
   3.168 +     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
   3.169 +  }
   3.170 +  else {
   3.171 +    // clear new_carry
   3.172 +    xorl(new_carry, new_carry);
   3.173 +
   3.174 +    // Shift z[i] by 1, or in previous carry and save new carry
   3.175 +    movq(value, Address(z, zidx, Address::times_4, 0));
   3.176 +    shlq(value, 1);
   3.177 +    adcl(new_carry, 0);
   3.178 +
   3.179 +    orq(value, prev_carry);
   3.180 +    rorq(value, 0x20);
   3.181 +    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
   3.182 +
   3.183 +    // Set previous carry = new carry
   3.184 +    movl(prev_carry, new_carry);
   3.185 +  }
   3.186 +  jmp(L_fifth_loop);
   3.187 +
   3.188 +  bind(L_fifth_loop_exit);
   3.189 +}
   3.190 +
   3.191 +
   3.192 +/**
   3.193 + * Code for BigInteger::squareToLen() intrinsic
   3.194 + *
   3.195 + * rdi: x
   3.196 + * rsi: len
   3.197 + * r8:  z
   3.198 + * rcx: zlen
   3.199 + * r12: tmp1
   3.200 + * r13: tmp2
   3.201 + * r14: tmp3
   3.202 + * r15: tmp4
   3.203 + * rbx: tmp5
   3.204 + *
   3.205 + */
   3.206 +void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
   3.207 +
   3.208 +  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
   3.209 +  push(tmp1);
   3.210 +  push(tmp2);
   3.211 +  push(tmp3);
   3.212 +  push(tmp4);
   3.213 +  push(tmp5);
   3.214 +
   3.215 +  // First loop
   3.216 +  // Store the squares, right shifted one bit (i.e., divided by 2).
   3.217 +  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
   3.218 +
   3.219 +  // Add in off-diagonal sums.
   3.220 +  //
   3.221 +  // Second, third (nested) and fourth loops.
   3.222 +  // zlen +=2;
   3.223 +  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
   3.224 +  //    carry = 0;
   3.225 +  //    long op2 = x[xidx:xidx+1];
   3.226 +  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
   3.227 +  //       k -= 2;
   3.228 +  //       long op1 = x[j:j+1];
   3.229 +  //       long sum = z[k:k+1];
   3.230 +  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
   3.231 +  //       z[k:k+1] = sum;
   3.232 +  //    }
   3.233 +  //    add_one_64(z, k, carry, tmp_regs);
   3.234 +  // }
   3.235 +
   3.236 +  const Register carry = tmp5;
   3.237 +  const Register sum = tmp3;
   3.238 +  const Register op1 = tmp4;
   3.239 +  Register op2 = tmp2;
   3.240 +
   3.241 +  push(zlen);
   3.242 +  push(len);
   3.243 +  addl(zlen,2);
   3.244 +  bind(L_second_loop);
   3.245 +  xorq(carry, carry);
   3.246 +  subl(zlen, 4);
   3.247 +  subl(len, 2);
   3.248 +  push(zlen);
   3.249 +  push(len);
   3.250 +  cmpl(len, 0);
   3.251 +  jccb(Assembler::lessEqual, L_second_loop_exit);
   3.252 +
   3.253 +  // Multiply an array by one 64 bit long.
   3.254 +  if (UseBMI2Instructions) {
   3.255 +    op2 = rdxReg;
   3.256 +    movq(op2, Address(x, len, Address::times_4,  0));
   3.257 +    rorxq(op2, op2, 32);
   3.258 +  }
   3.259 +  else {
   3.260 +    movq(op2, Address(x, len, Address::times_4,  0));
   3.261 +    rorq(op2, 32);
   3.262 +  }
   3.263 +
   3.264 +  bind(L_third_loop);
   3.265 +  decrementl(len);
   3.266 +  jccb(Assembler::negative, L_third_loop_exit);
   3.267 +  decrementl(len);
   3.268 +  jccb(Assembler::negative, L_last_x);
   3.269 +
   3.270 +  movq(op1, Address(x, len, Address::times_4,  0));
   3.271 +  rorq(op1, 32);
   3.272 +
   3.273 +  bind(L_multiply);
   3.274 +  subl(zlen, 2);
   3.275 +  movq(sum, Address(z, zlen, Address::times_4,  0));
   3.276 +
   3.277 +  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
   3.278 +  if (UseBMI2Instructions) {
   3.279 +    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
   3.280 +  }
   3.281 +  else {
   3.282 +    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
   3.283 +  }
   3.284 +
   3.285 +  movq(Address(z, zlen, Address::times_4, 0), sum);
   3.286 +
   3.287 +  jmp(L_third_loop);
   3.288 +  bind(L_third_loop_exit);
   3.289 +
   3.290 +  // Fourth loop
    3.291 +  // Add 64 bit long carry into z with carry propagation.
    3.292 +  // Uses offset-adjusted zlen.
   3.293 +  add_one_64(z, zlen, carry, tmp1);
   3.294 +
   3.295 +  pop(len);
   3.296 +  pop(zlen);
   3.297 +  jmp(L_second_loop);
   3.298 +
   3.299 +  // Next infrequent code is moved outside loops.
   3.300 +  bind(L_last_x);
   3.301 +  movl(op1, Address(x, 0));
   3.302 +  jmp(L_multiply);
   3.303 +
   3.304 +  bind(L_second_loop_exit);
   3.305 +  pop(len);
   3.306 +  pop(zlen);
   3.307 +  pop(len);
   3.308 +  pop(zlen);
   3.309 +
   3.310 +  // Fifth loop
   3.311 +  // Shift z left 1 bit.
   3.312 +  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
   3.313 +
   3.314 +  // z[zlen-1] |= x[len-1] & 1;
   3.315 +  movl(tmp3, Address(x, len, Address::times_4, -4));
   3.316 +  andl(tmp3, 1);
   3.317 +  orl(Address(z, zlen, Address::times_4,  -4), tmp3);
   3.318 +
   3.319 +  pop(tmp5);
   3.320 +  pop(tmp4);
   3.321 +  pop(tmp3);
   3.322 +  pop(tmp2);
   3.323 +  pop(tmp1);
   3.324 +}
   3.325 +
   3.326 +/**
   3.327 + * Helper function for mul_add()
   3.328 + * Multiply the in[] by int k and add to out[] starting at offset offs using
   3.329 + * 128 bit by 32 bit multiply and return the carry in tmp5.
   3.330 + * Only quad int aligned length of in[] is operated on in this function.
   3.331 + * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
   3.332 + * This function preserves out, in and k registers.
   3.333 + * len and offset point to the appropriate index in "in" & "out" correspondingly
   3.334 + * tmp5 has the carry.
   3.335 + * other registers are temporary and are modified.
   3.336 + *
   3.337 + */
   3.338 +void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
   3.339 +  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
   3.340 +  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
   3.341 +
   3.342 +  Label L_first_loop, L_first_loop_exit;
   3.343 +
   3.344 +  movl(tmp1, len);
   3.345 +  shrl(tmp1, 2);
   3.346 +
   3.347 +  bind(L_first_loop);
   3.348 +  subl(tmp1, 1);
   3.349 +  jccb(Assembler::negative, L_first_loop_exit);
   3.350 +
   3.351 +  subl(len, 4);
   3.352 +  subl(offset, 4);
   3.353 +
   3.354 +  Register op2 = tmp2;
   3.355 +  const Register sum = tmp3;
   3.356 +  const Register op1 = tmp4;
   3.357 +  const Register carry = tmp5;
   3.358 +
   3.359 +  if (UseBMI2Instructions) {
   3.360 +    op2 = rdxReg;
   3.361 +  }
   3.362 +
   3.363 +  movq(op1, Address(in, len, Address::times_4,  8));
   3.364 +  rorq(op1, 32);
   3.365 +  movq(sum, Address(out, offset, Address::times_4,  8));
   3.366 +  rorq(sum, 32);
   3.367 +  if (UseBMI2Instructions) {
   3.368 +    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
   3.369 +  }
   3.370 +  else {
   3.371 +    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
   3.372 +  }
   3.373 +  // Store back in big endian from little endian
   3.374 +  rorq(sum, 0x20);
   3.375 +  movq(Address(out, offset, Address::times_4,  8), sum);
   3.376 +
   3.377 +  movq(op1, Address(in, len, Address::times_4,  0));
   3.378 +  rorq(op1, 32);
   3.379 +  movq(sum, Address(out, offset, Address::times_4,  0));
   3.380 +  rorq(sum, 32);
   3.381 +  if (UseBMI2Instructions) {
   3.382 +    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
   3.383 +  }
   3.384 +  else {
   3.385 +    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
   3.386 +  }
   3.387 +  // Store back in big endian from little endian
   3.388 +  rorq(sum, 0x20);
   3.389 +  movq(Address(out, offset, Address::times_4,  0), sum);
   3.390 +
   3.391 +  jmp(L_first_loop);
   3.392 +  bind(L_first_loop_exit);
   3.393 +}
   3.394 +
   3.395 +/**
   3.396 + * Code for BigInteger::mulAdd() intrinsic
   3.397 + *
   3.398 + * rdi: out
   3.399 + * rsi: in
   3.400 + * r11: offs (out.length - offset)
   3.401 + * rcx: len
   3.402 + * r8:  k
   3.403 + * r12: tmp1
   3.404 + * r13: tmp2
   3.405 + * r14: tmp3
   3.406 + * r15: tmp4
   3.407 + * rbx: tmp5
   3.408 + * Multiply the in[] by word k and add to out[], return the carry in rax
   3.409 + */
   3.410 +void MacroAssembler::mul_add(Register out, Register in, Register offs,
   3.411 +   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
   3.412 +   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
   3.413 +
   3.414 +  Label L_carry, L_last_in, L_done;
   3.415 +
   3.416 +// carry = 0;
   3.417 +// for (int j=len-1; j >= 0; j--) {
   3.418 +//    long product = (in[j] & LONG_MASK) * kLong +
   3.419 +//                   (out[offs] & LONG_MASK) + carry;
   3.420 +//    out[offs--] = (int)product;
   3.421 +//    carry = product >>> 32;
   3.422 +// }
   3.423 +//
   3.424 +  push(tmp1);
   3.425 +  push(tmp2);
   3.426 +  push(tmp3);
   3.427 +  push(tmp4);
   3.428 +  push(tmp5);
   3.429 +
   3.430 +  Register op2 = tmp2;
   3.431 +  const Register sum = tmp3;
   3.432 +  const Register op1 = tmp4;
   3.433 +  const Register carry =  tmp5;
   3.434 +
   3.435 +  if (UseBMI2Instructions) {
   3.436 +    op2 = rdxReg;
   3.437 +    movl(op2, k);
   3.438 +  }
   3.439 +  else {
   3.440 +    movl(op2, k);
   3.441 +  }
   3.442 +
   3.443 +  xorq(carry, carry);
   3.444 +
   3.445 +  //First loop
   3.446 +
   3.447 +  //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
   3.448 +  //The carry is in tmp5
   3.449 +  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
   3.450 +
   3.451 +  //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
   3.452 +  decrementl(len);
   3.453 +  jccb(Assembler::negative, L_carry);
   3.454 +  decrementl(len);
   3.455 +  jccb(Assembler::negative, L_last_in);
   3.456 +
   3.457 +  movq(op1, Address(in, len, Address::times_4,  0));
   3.458 +  rorq(op1, 32);
   3.459 +
   3.460 +  subl(offs, 2);
   3.461 +  movq(sum, Address(out, offs, Address::times_4,  0));
   3.462 +  rorq(sum, 32);
   3.463 +
   3.464 +  if (UseBMI2Instructions) {
   3.465 +    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
   3.466 +  }
   3.467 +  else {
   3.468 +    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
   3.469 +  }
   3.470 +
   3.471 +  // Store back in big endian from little endian
   3.472 +  rorq(sum, 0x20);
   3.473 +  movq(Address(out, offs, Address::times_4,  0), sum);
   3.474 +
   3.475 +  testl(len, len);
   3.476 +  jccb(Assembler::zero, L_carry);
   3.477 +
   3.478 +  //Multiply the last in[] entry, if any
   3.479 +  bind(L_last_in);
   3.480 +  movl(op1, Address(in, 0));
   3.481 +  movl(sum, Address(out, offs, Address::times_4,  -4));
   3.482 +
   3.483 +  movl(raxReg, k);
   3.484 +  mull(op1); //tmp4 * eax -> edx:eax
   3.485 +  addl(sum, carry);
   3.486 +  adcl(rdxReg, 0);
   3.487 +  addl(sum, raxReg);
   3.488 +  adcl(rdxReg, 0);
   3.489 +  movl(carry, rdxReg);
   3.490 +
   3.491 +  movl(Address(out, offs, Address::times_4,  -4), sum);
   3.492 +
   3.493 +  bind(L_carry);
   3.494 +  //return tmp5/carry as carry in rax
   3.495 +  movl(rax, carry);
   3.496 +
   3.497 +  bind(L_done);
   3.498 +  pop(tmp5);
   3.499 +  pop(tmp4);
   3.500 +  pop(tmp3);
   3.501 +  pop(tmp2);
   3.502 +  pop(tmp1);
   3.503 +}
   3.504  #endif
   3.505  
   3.506  /**
     4.1 --- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Feb 16 13:56:12 2016 +0000
     4.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Wed Feb 17 13:40:12 2016 +0300
     4.3 @@ -1241,6 +1241,25 @@
     4.4                                 Register carry2);
     4.5    void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
     4.6                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
     4.7 +
     4.8 +  void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
     4.9 +                     Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
    4.10 +  void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
    4.11 +                            Register tmp2);
    4.12 +  void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
    4.13 +                       Register rdxReg, Register raxReg);
    4.14 +  void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
    4.15 +  void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
    4.16 +                       Register tmp3, Register tmp4);
    4.17 +  void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
    4.18 +                     Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
    4.19 +
    4.20 +  void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
    4.21 +               Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
    4.22 +               Register raxReg);
    4.23 +  void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
    4.24 +               Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
    4.25 +               Register raxReg);
    4.26  #endif
    4.27  
    4.28    // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
     5.1 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Feb 16 13:56:12 2016 +0000
     5.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Wed Feb 17 13:40:12 2016 +0300
     5.3 @@ -3743,6 +3743,107 @@
     5.4      return start;
     5.5    }
     5.6  
     5.7 +/**
     5.8 +   *  Arguments:
     5.9 +   *
    5.10 +  //  Input:
    5.11 +  //    c_rarg0   - x address
    5.12 +  //    c_rarg1   - x length
    5.13 +  //    c_rarg2   - z address
     5.14 +  //    c_rarg3   - z length
    5.15 +   *
    5.16 +   */
    5.17 +  address generate_squareToLen() {
    5.18 +
    5.19 +    __ align(CodeEntryAlignment);
    5.20 +    StubCodeMark mark(this, "StubRoutines", "squareToLen");
    5.21 +
    5.22 +    address start = __ pc();
    5.23 +    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    5.24 +    // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
    5.25 +    const Register x      = rdi;
    5.26 +    const Register len    = rsi;
    5.27 +    const Register z      = r8;
    5.28 +    const Register zlen   = rcx;
    5.29 +
    5.30 +   const Register tmp1      = r12;
    5.31 +   const Register tmp2      = r13;
    5.32 +   const Register tmp3      = r14;
    5.33 +   const Register tmp4      = r15;
    5.34 +   const Register tmp5      = rbx;
    5.35 +
    5.36 +    BLOCK_COMMENT("Entry:");
    5.37 +    __ enter(); // required for proper stackwalking of RuntimeStub frame
    5.38 +
    5.39 +       setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
    5.40 +                          // zlen => rcx
    5.41 +                          // r9 and r10 may be used to save non-volatile registers
    5.42 +    __ movptr(r8, rdx);
    5.43 +    __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
    5.44 +
    5.45 +    restore_arg_regs();
    5.46 +
    5.47 +    __ leave(); // required for proper stackwalking of RuntimeStub frame
    5.48 +    __ ret(0);
    5.49 +
    5.50 +    return start;
    5.51 +  }
    5.52 +
    5.53 +   /**
    5.54 +   *  Arguments:
    5.55 +   *
    5.56 +   *  Input:
    5.57 +   *    c_rarg0   - out address
    5.58 +   *    c_rarg1   - in address
    5.59 +   *    c_rarg2   - offset
    5.60 +   *    c_rarg3   - len
    5.61 +   * not Win64
    5.62 +   *    c_rarg4   - k
    5.63 +   * Win64
    5.64 +   *    rsp+40    - k
    5.65 +   */
    5.66 +  address generate_mulAdd() {
    5.67 +    __ align(CodeEntryAlignment);
    5.68 +    StubCodeMark mark(this, "StubRoutines", "mulAdd");
    5.69 +
    5.70 +    address start = __ pc();
    5.71 +    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    5.72 +    // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    5.73 +    const Register out     = rdi;
    5.74 +    const Register in      = rsi;
    5.75 +    const Register offset  = r11;
    5.76 +    const Register len     = rcx;
    5.77 +    const Register k       = r8;
    5.78 +
    5.79 +    // Next registers will be saved on stack in mul_add().
    5.80 +    const Register tmp1  = r12;
    5.81 +    const Register tmp2  = r13;
    5.82 +    const Register tmp3  = r14;
    5.83 +    const Register tmp4  = r15;
    5.84 +    const Register tmp5  = rbx;
    5.85 +
    5.86 +    BLOCK_COMMENT("Entry:");
    5.87 +    __ enter(); // required for proper stackwalking of RuntimeStub frame
    5.88 +
    5.89 +    setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
    5.90 +                       // len => rcx, k => r8
    5.91 +                       // r9 and r10 may be used to save non-volatile registers
    5.92 +#ifdef _WIN64
    5.93 +    // last argument is on stack on Win64
    5.94 +    __ movl(k, Address(rsp, 6 * wordSize));
    5.95 +#endif
    5.96 +    __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
    5.97 +    __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
    5.98 +
    5.99 +    restore_arg_regs();
   5.100 +
   5.101 +    __ leave(); // required for proper stackwalking of RuntimeStub frame
   5.102 +    __ ret(0);
   5.103 +
   5.104 +    return start;
   5.105 +  }
   5.106 +
   5.107 +
   5.108  #undef __
   5.109  #define __ masm->
   5.110  
   5.111 @@ -3987,6 +4088,12 @@
   5.112      if (UseMultiplyToLenIntrinsic) {
   5.113        StubRoutines::_multiplyToLen = generate_multiplyToLen();
   5.114      }
   5.115 +    if (UseSquareToLenIntrinsic) {
   5.116 +      StubRoutines::_squareToLen = generate_squareToLen();
   5.117 +    }
   5.118 +    if (UseMulAddIntrinsic) {
   5.119 +      StubRoutines::_mulAdd = generate_mulAdd();
   5.120 +    }
   5.121  #endif
   5.122    }
   5.123  
     6.1 --- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Tue Feb 16 13:56:12 2016 +0000
     6.2 +++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Wed Feb 17 13:40:12 2016 +0300
     6.3 @@ -33,7 +33,7 @@
     6.4  
     6.5  enum platform_dependent_constants {
     6.6    code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
     6.7 -  code_size2 = 22000           // simply increase if too small (assembler will crash if too small)
     6.8 +  code_size2 = 23000           // simply increase if too small (assembler will crash if too small)
     6.9  };
    6.10  
    6.11  class x86 {
     7.1 --- a/src/cpu/x86/vm/vm_version_x86.cpp	Tue Feb 16 13:56:12 2016 +0000
     7.2 +++ b/src/cpu/x86/vm/vm_version_x86.cpp	Wed Feb 17 13:40:12 2016 +0300
     7.3 @@ -703,6 +703,12 @@
     7.4    if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
     7.5      UseMultiplyToLenIntrinsic = true;
     7.6    }
     7.7 +  if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
     7.8 +    UseSquareToLenIntrinsic = true;
     7.9 +  }
    7.10 +  if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
    7.11 +    UseMulAddIntrinsic = true;
    7.12 +  }
    7.13  #else
    7.14    if (UseMultiplyToLenIntrinsic) {
    7.15      if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
    7.16 @@ -710,6 +716,18 @@
    7.17      }
    7.18      FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
    7.19    }
    7.20 +  if (UseSquareToLenIntrinsic) {
    7.21 +    if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
    7.22 +      warning("squareToLen intrinsic is not available in 32-bit VM");
    7.23 +    }
    7.24 +    FLAG_SET_DEFAULT(UseSquareToLenIntrinsic, false);
    7.25 +  }
    7.26 +  if (UseMulAddIntrinsic) {
    7.27 +    if (!FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
    7.28 +      warning("mulAdd intrinsic is not available in 32-bit VM");
    7.29 +    }
    7.30 +    FLAG_SET_DEFAULT(UseMulAddIntrinsic, false);
    7.31 +  }
    7.32  #endif
    7.33  #endif // COMPILER2
    7.34  
     8.1 --- a/src/share/vm/classfile/vmSymbols.hpp	Tue Feb 16 13:56:12 2016 +0000
     8.2 +++ b/src/share/vm/classfile/vmSymbols.hpp	Wed Feb 17 13:40:12 2016 +0300
     8.3 @@ -797,6 +797,14 @@
     8.4     do_name(     multiplyToLen_name,                             "multiplyToLen")                                        \
     8.5     do_signature(multiplyToLen_signature,                        "([II[II[I)[I")                                         \
     8.6                                                                                                                          \
     8.7 +  do_intrinsic(_squareToLen, java_math_BigInteger, squareToLen_name, squareToLen_signature, F_S)                        \
     8.8 +   do_name(     squareToLen_name,                             "implSquareToLen")                                        \
     8.9 +   do_signature(squareToLen_signature,                        "([II[II)[I")                                             \
    8.10 +                                                                                                                        \
    8.11 +  do_intrinsic(_mulAdd, java_math_BigInteger, mulAdd_name, mulAdd_signature, F_S)                                       \
    8.12 +   do_name(     mulAdd_name,                                  "implMulAdd")                                             \
    8.13 +   do_signature(mulAdd_signature,                             "([I[IIII)I")                                             \
    8.14 +                                                                                                                        \
    8.15    /* java/lang/ref/Reference */                                                                                         \
    8.16    do_intrinsic(_Reference_get,            java_lang_ref_Reference, get_name,    void_object_signature, F_R)             \
    8.17                                                                                                                          \
     9.1 --- a/src/share/vm/opto/c2_globals.hpp	Tue Feb 16 13:56:12 2016 +0000
     9.2 +++ b/src/share/vm/opto/c2_globals.hpp	Wed Feb 17 13:40:12 2016 +0300
     9.3 @@ -662,6 +662,12 @@
     9.4    product(bool, UseMultiplyToLenIntrinsic, false,                           \
     9.5            "Enables intrinsification of BigInteger.multiplyToLen()")         \
     9.6                                                                              \
     9.7 +  product(bool, UseSquareToLenIntrinsic, false,                             \
     9.8 +          "Enables intrinsification of BigInteger.squareToLen()")           \
     9.9 +                                                                            \
    9.10 +  product(bool, UseMulAddIntrinsic, false,                                  \
    9.11 +          "Enables intrinsification of BigInteger.mulAdd()")                \
    9.12 +                                                                            \
    9.13    product(bool, UseTypeSpeculation, true,                                   \
    9.14            "Speculatively propagate types from profiles")                    \
    9.15                                                                              \
    10.1 --- a/src/share/vm/opto/escape.cpp	Tue Feb 16 13:56:12 2016 +0000
    10.2 +++ b/src/share/vm/opto/escape.cpp	Wed Feb 17 13:40:12 2016 +0300
    10.3 @@ -958,7 +958,9 @@
    10.4                    strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 ||
    10.5                    strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 ||
    10.6                    strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 ||
    10.7 -                  strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0)
    10.8 +                  strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0 ||
    10.9 +                  strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
   10.10 +                  strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0)
   10.11                    ))) {
   10.12              call->dump();
   10.13              fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
    11.1 --- a/src/share/vm/opto/library_call.cpp	Tue Feb 16 13:56:12 2016 +0000
    11.2 +++ b/src/share/vm/opto/library_call.cpp	Wed Feb 17 13:40:12 2016 +0300
    11.3 @@ -324,6 +324,8 @@
    11.4    bool inline_updateBytesCRC32();
    11.5    bool inline_updateByteBufferCRC32();
    11.6    bool inline_multiplyToLen();
    11.7 +  bool inline_squareToLen();
    11.8 +  bool inline_mulAdd();
    11.9  
   11.10    bool inline_profileBoolean();
   11.11  };
   11.12 @@ -527,6 +529,14 @@
   11.13      if (!UseMultiplyToLenIntrinsic) return NULL;
   11.14      break;
   11.15  
   11.16 +  case vmIntrinsics::_squareToLen:
   11.17 +    if (!UseSquareToLenIntrinsic) return NULL;
   11.18 +    break;
   11.19 +
   11.20 +  case vmIntrinsics::_mulAdd:
   11.21 +    if (!UseMulAddIntrinsic) return NULL;
   11.22 +    break;
   11.23 +
   11.24    case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   11.25    case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
   11.26      if (!UseAESIntrinsics) return NULL;
   11.27 @@ -927,6 +937,12 @@
   11.28    case vmIntrinsics::_multiplyToLen:
   11.29      return inline_multiplyToLen();
   11.30  
   11.31 +  case vmIntrinsics::_squareToLen:
   11.32 +    return inline_squareToLen();
   11.33 +
   11.34 +  case vmIntrinsics::_mulAdd:
   11.35 +    return inline_mulAdd();
   11.36 +
   11.37    case vmIntrinsics::_encodeISOArray:
   11.38      return inline_encodeISOArray();
   11.39  
   11.40 @@ -5856,6 +5872,100 @@
   11.41    return true;
   11.42  }
   11.43  
   11.44 +//-------------inline_squareToLen------------------------------------
   11.45 +bool LibraryCallKit::inline_squareToLen() {
    11.46 +  assert(UseSquareToLenIntrinsic, "not implemented on this platform");
   11.47 +
   11.48 +  address stubAddr = StubRoutines::squareToLen();
   11.49 +  if (stubAddr == NULL) {
   11.50 +    return false; // Intrinsic's stub is not implemented on this platform
   11.51 +  }
   11.52 +  const char* stubName = "squareToLen";
   11.53 +
   11.54 +  assert(callee()->signature()->size() == 4, "implSquareToLen has 4 parameters");
   11.55 +
   11.56 +  Node* x    = argument(0);
   11.57 +  Node* len  = argument(1);
   11.58 +  Node* z    = argument(2);
   11.59 +  Node* zlen = argument(3);
   11.60 +
   11.61 +  const Type* x_type = x->Value(&_gvn);
   11.62 +  const Type* z_type = z->Value(&_gvn);
   11.63 +  const TypeAryPtr* top_x = x_type->isa_aryptr();
   11.64 +  const TypeAryPtr* top_z = z_type->isa_aryptr();
   11.65 +  if (top_x  == NULL || top_x->klass()  == NULL ||
   11.66 +      top_z  == NULL || top_z->klass()  == NULL) {
   11.67 +    // failed array check
   11.68 +    return false;
   11.69 +  }
   11.70 +
   11.71 +  BasicType x_elem = x_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
   11.72 +  BasicType z_elem = z_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
   11.73 +  if (x_elem != T_INT || z_elem != T_INT) {
   11.74 +    return false;
   11.75 +  }
   11.76 +
   11.77 +
   11.78 +  Node* x_start = array_element_address(x, intcon(0), x_elem);
   11.79 +  Node* z_start = array_element_address(z, intcon(0), z_elem);
   11.80 +
   11.81 +  Node*  call = make_runtime_call(RC_LEAF|RC_NO_FP,
   11.82 +                                  OptoRuntime::squareToLen_Type(),
   11.83 +                                  stubAddr, stubName, TypePtr::BOTTOM,
   11.84 +                                  x_start, len, z_start, zlen);
   11.85 +
   11.86 +  set_result(z);
   11.87 +  return true;
   11.88 +}
   11.89 +
   11.90 +//-------------inline_mulAdd------------------------------------------
   11.91 +bool LibraryCallKit::inline_mulAdd() {
    11.92 +  assert(UseMulAddIntrinsic, "not implemented on this platform");
   11.93 +
   11.94 +  address stubAddr = StubRoutines::mulAdd();
   11.95 +  if (stubAddr == NULL) {
   11.96 +    return false; // Intrinsic's stub is not implemented on this platform
   11.97 +  }
   11.98 +  const char* stubName = "mulAdd";
   11.99 +
  11.100 +  assert(callee()->signature()->size() == 5, "mulAdd has 5 parameters");
  11.101 +
  11.102 +  Node* out      = argument(0);
  11.103 +  Node* in       = argument(1);
  11.104 +  Node* offset   = argument(2);
  11.105 +  Node* len      = argument(3);
  11.106 +  Node* k        = argument(4);
  11.107 +
  11.108 +  const Type* out_type = out->Value(&_gvn);
  11.109 +  const Type* in_type = in->Value(&_gvn);
  11.110 +  const TypeAryPtr* top_out = out_type->isa_aryptr();
  11.111 +  const TypeAryPtr* top_in = in_type->isa_aryptr();
  11.112 +  if (top_out  == NULL || top_out->klass()  == NULL ||
  11.113 +      top_in == NULL || top_in->klass() == NULL) {
  11.114 +    // failed array check
  11.115 +    return false;
  11.116 +  }
  11.117 +
  11.118 +  BasicType out_elem = out_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
  11.119 +  BasicType in_elem = in_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
  11.120 +  if (out_elem != T_INT || in_elem != T_INT) {
  11.121 +    return false;
  11.122 +  }
  11.123 +
  11.124 +  Node* outlen = load_array_length(out);
  11.125 +  Node* new_offset = _gvn.transform(new (C) SubINode(outlen, offset));
  11.126 +  Node* out_start = array_element_address(out, intcon(0), out_elem);
  11.127 +  Node* in_start = array_element_address(in, intcon(0), in_elem);
  11.128 +
  11.129 +  Node*  call = make_runtime_call(RC_LEAF|RC_NO_FP,
  11.130 +                                  OptoRuntime::mulAdd_Type(),
  11.131 +                                  stubAddr, stubName, TypePtr::BOTTOM,
  11.132 +                                  out_start,in_start, new_offset, len, k);
  11.133 +  Node* result = _gvn.transform(new (C) ProjNode(call, TypeFunc::Parms));
  11.134 +  set_result(result);
  11.135 +  return true;
  11.136 +}
  11.137 +
  11.138  
  11.139  /**
  11.140   * Calculate CRC32 for byte.
    12.1 --- a/src/share/vm/opto/runtime.cpp	Tue Feb 16 13:56:12 2016 +0000
    12.2 +++ b/src/share/vm/opto/runtime.cpp	Wed Feb 17 13:40:12 2016 +0300
    12.3 @@ -956,6 +956,48 @@
    12.4    return TypeFunc::make(domain, range);
    12.5  }
    12.6  
    12.7 +const TypeFunc* OptoRuntime::squareToLen_Type() {
    12.8 +  // create input type (domain)
    12.9 +  int num_args      = 4;
   12.10 +  int argcnt = num_args;
   12.11 +  const Type** fields = TypeTuple::fields(argcnt);
   12.12 +  int argp = TypeFunc::Parms;
   12.13 +  fields[argp++] = TypePtr::NOTNULL;    // x
   12.14 +  fields[argp++] = TypeInt::INT;        // len
   12.15 +  fields[argp++] = TypePtr::NOTNULL;    // z
   12.16 +  fields[argp++] = TypeInt::INT;        // zlen
   12.17 +  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
   12.18 +  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
   12.19 +
   12.20 +  // no result type needed
   12.21 +  fields = TypeTuple::fields(1);
   12.22 +  fields[TypeFunc::Parms+0] = NULL;
   12.23 +  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
   12.24 +  return TypeFunc::make(domain, range);
   12.25 +}
   12.26 +
   12.27 +// for mulAdd calls, 2 pointers and 3 ints, returning int
   12.28 +const TypeFunc* OptoRuntime::mulAdd_Type() {
   12.29 +  // create input type (domain)
   12.30 +  int num_args      = 5;
   12.31 +  int argcnt = num_args;
   12.32 +  const Type** fields = TypeTuple::fields(argcnt);
   12.33 +  int argp = TypeFunc::Parms;
   12.34 +  fields[argp++] = TypePtr::NOTNULL;    // out
   12.35 +  fields[argp++] = TypePtr::NOTNULL;    // in
   12.36 +  fields[argp++] = TypeInt::INT;        // offset
   12.37 +  fields[argp++] = TypeInt::INT;        // len
   12.38 +  fields[argp++] = TypeInt::INT;        // k
   12.39 +  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
   12.40 +  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
   12.41 +
   12.42 +  // returning carry (int)
   12.43 +  fields = TypeTuple::fields(1);
   12.44 +  fields[TypeFunc::Parms+0] = TypeInt::INT;
   12.45 +  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields);
   12.46 +  return TypeFunc::make(domain, range);
   12.47 +}
   12.48 +
   12.49  
   12.50  
   12.51  //------------- Interpreter state access for on stack replacement
    13.1 --- a/src/share/vm/opto/runtime.hpp	Tue Feb 16 13:56:12 2016 +0000
    13.2 +++ b/src/share/vm/opto/runtime.hpp	Wed Feb 17 13:40:12 2016 +0300
    13.3 @@ -305,6 +305,10 @@
    13.4  
    13.5    static const TypeFunc* multiplyToLen_Type();
    13.6  
    13.7 +  static const TypeFunc* squareToLen_Type();
    13.8 +
    13.9 +  static const TypeFunc* mulAdd_Type();
   13.10 +
   13.11    static const TypeFunc* updateBytesCRC32_Type();
   13.12  
   13.13    // leaf on stack replacement interpreter accessor types
    14.1 --- a/src/share/vm/runtime/stubRoutines.cpp	Tue Feb 16 13:56:12 2016 +0000
    14.2 +++ b/src/share/vm/runtime/stubRoutines.cpp	Wed Feb 17 13:40:12 2016 +0300
    14.3 @@ -136,6 +136,8 @@
    14.4  address StubRoutines::_crc_table_adr = NULL;
    14.5  
    14.6  address StubRoutines::_multiplyToLen = NULL;
    14.7 +address StubRoutines::_squareToLen = NULL;
    14.8 +address StubRoutines::_mulAdd = NULL;
    14.9  
   14.10  double (* StubRoutines::_intrinsic_log   )(double) = NULL;
   14.11  double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
    15.1 --- a/src/share/vm/runtime/stubRoutines.hpp	Tue Feb 16 13:56:12 2016 +0000
    15.2 +++ b/src/share/vm/runtime/stubRoutines.hpp	Wed Feb 17 13:40:12 2016 +0300
    15.3 @@ -209,6 +209,8 @@
    15.4    static address _crc_table_adr;
    15.5  
    15.6    static address _multiplyToLen;
    15.7 +  static address _squareToLen;
    15.8 +  static address _mulAdd;
    15.9  
   15.10    // These are versions of the java.lang.Math methods which perform
   15.11    // the same operations as the intrinsic version.  They are used for
   15.12 @@ -367,6 +369,8 @@
   15.13    static address crc_table_addr()      { return _crc_table_adr; }
   15.14  
   15.15    static address multiplyToLen()       {return _multiplyToLen; }
   15.16 +  static address squareToLen()         {return _squareToLen; }
   15.17 +  static address mulAdd()              {return _mulAdd; }
   15.18  
   15.19    static address select_fill_function(BasicType t, bool aligned, const char* &name);
   15.20  
    16.1 --- a/src/share/vm/runtime/vmStructs.cpp	Tue Feb 16 13:56:12 2016 +0000
    16.2 +++ b/src/share/vm/runtime/vmStructs.cpp	Wed Feb 17 13:40:12 2016 +0300
    16.3 @@ -813,6 +813,8 @@
    16.4       static_field(StubRoutines,                _updateBytesCRC32,                             address)                               \
    16.5       static_field(StubRoutines,                _crc_table_adr,                                address)                               \
    16.6       static_field(StubRoutines,                _multiplyToLen,                                address)                               \
    16.7 +     static_field(StubRoutines,                _squareToLen,                                  address)                               \
    16.8 +     static_field(StubRoutines,                _mulAdd,                                       address)                               \
    16.9                                                                                                                                       \
   16.10    /*****************/                                                                                                                \
   16.11    /* SharedRuntime */                                                                                                                \
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/test/compiler/intrinsics/muladd/TestMulAdd.java	Wed Feb 17 13:40:12 2016 +0300
    17.3 @@ -0,0 +1,117 @@
    17.4 +/*
    17.5 + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
    17.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    17.7 + *
    17.8 + * This code is free software; you can redistribute it and/or modify it
    17.9 + * under the terms of the GNU General Public License version 2 only, as
   17.10 + * published by the Free Software Foundation.
   17.11 + *
   17.12 + * This code is distributed in the hope that it will be useful, but WITHOUT
   17.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   17.14 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   17.15 + * version 2 for more details (a copy is included in the LICENSE file that
   17.16 + * accompanied this code).
   17.17 + *
   17.18 + * You should have received a copy of the GNU General Public License version
   17.19 + * 2 along with this work; if not, write to the Free Software Foundation,
   17.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   17.21 + *
   17.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   17.23 + * or visit www.oracle.com if you need additional information or have any
   17.24 + * questions.
   17.25 + *
   17.26 + */
   17.27 +
   17.28 +/**
   17.29 + * @test
   17.30 + * @bug 8081778
   17.31 + * @summary Add C2 x86 intrinsic for BigInteger::mulAdd() method
   17.32 + *
   17.33 + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
   17.34 + *      -XX:+IgnoreUnrecognizedVMOptions -XX:-UseSquareToLenIntrinsic -XX:-UseMultiplyToLenIntrinsic
   17.35 + *      -XX:CompileCommand=dontinline,TestMulAdd::main
   17.36 + *      -XX:CompileCommand=option,TestMulAdd::base_multiply,ccstr,DisableIntrinsic,_mulAdd
   17.37 + *      -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_mulAdd
   17.38 + *      -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_mulAdd
   17.39 + *      -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_mulAdd
   17.40 + *      -XX:CompileCommand=option,java.math.BigInteger::mulAdd,ccstr,DisableIntrinsic,_mulAdd
   17.41 + *      -XX:CompileCommand=inline,java.math.BigInteger::multiply
   17.42 + *      -XX:CompileCommand=inline,java.math.BigInteger::square
   17.43 + *      -XX:CompileCommand=inline,java.math.BigInteger::squareToLen
   17.44 + *      -XX:CompileCommand=inline,java.math.BigInteger::mulAdd TestMulAdd
   17.45 + */
   17.46 +
   17.47 +import java.util.Random;
   17.48 +import java.math.*;
   17.49 +
   17.50 +public class TestMulAdd {
   17.51 +
   17.52 +    // Avoid intrinsic by preventing inlining multiply() and mulAdd().
   17.53 +    public static BigInteger base_multiply(BigInteger op1) {
   17.54 +      return op1.multiply(op1);
   17.55 +    }
   17.56 +
   17.57 +    // Generate mulAdd() intrinsic by inlining multiply().
   17.58 +    public static BigInteger new_multiply(BigInteger op1) {
   17.59 +      return op1.multiply(op1);
   17.60 +    }
   17.61 +
   17.62 +    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
   17.63 +      byte[] data1 = b1.toByteArray();
   17.64 +      byte[] data2 = b2.toByteArray();
   17.65 +      if (data1.length != data2.length)
   17.66 +        return false;
   17.67 +      for (int i = 0; i < data1.length; i++) {
   17.68 +        if (data1[i] != data2[i])
   17.69 +          return false;
   17.70 +      }
   17.71 +      return true;
   17.72 +    }
   17.73 +
   17.74 +    public static String stringify(BigInteger b) {
   17.75 +      String strout= "";
   17.76 +      byte [] data = b.toByteArray();
   17.77 +      for (int i = 0; i < data.length; i++) {
   17.78 +        strout += (String.format("%02x",data[i]) + " ");
   17.79 +      }
   17.80 +      return strout;
   17.81 +    }
   17.82 +
   17.83 +    public static void main(String args[]) throws Exception {
   17.84 +
   17.85 +      BigInteger oldsum = new BigInteger("0");
   17.86 +      BigInteger newsum = new BigInteger("0");
   17.87 +
   17.88 +      BigInteger b1, b2, oldres, newres;
   17.89 +
   17.90 +      Random rand = new Random();
   17.91 +      long seed = System.nanoTime();
   17.92 +      Random rand1 = new Random();
   17.93 +      long seed1 = System.nanoTime();
   17.94 +      rand.setSeed(seed);
   17.95 +      rand1.setSeed(seed1);
   17.96 +
   17.97 +      for (int j = 0; j < 100000; j++) {
   17.98 +        int rand_int = rand1.nextInt(3136)+32;
   17.99 +        b1 = new BigInteger(rand_int, rand);
  17.100 +
  17.101 +        oldres = base_multiply(b1);
  17.102 +        newres = new_multiply(b1);
  17.103 +
  17.104 +        oldsum = oldsum.add(oldres);
  17.105 +        newsum = newsum.add(newres);
  17.106 +
  17.107 +        if (!bytecompare(oldres,newres)) {
  17.108 +          System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
  17.109 +          System.out.println(b1);
  17.110 +          throw new Exception("Failed");
  17.111 +        }
  17.112 +      }
  17.113 +      if (!bytecompare(oldsum,newsum))  {
  17.114 +        System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
  17.115 +        throw new Exception("Failed");
  17.116 +      } else {
  17.117 +        System.out.println("Success");
  17.118 +      }
  17.119 +   }
  17.120 +}
    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/test/compiler/intrinsics/squaretolen/TestSquareToLen.java	Wed Feb 17 13:40:12 2016 +0300
    18.3 @@ -0,0 +1,114 @@
    18.4 +/*
    18.5 + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
    18.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    18.7 + *
    18.8 + * This code is free software; you can redistribute it and/or modify it
    18.9 + * under the terms of the GNU General Public License version 2 only, as
   18.10 + * published by the Free Software Foundation.
   18.11 + *
   18.12 + * This code is distributed in the hope that it will be useful, but WITHOUT
   18.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   18.14 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   18.15 + * version 2 for more details (a copy is included in the LICENSE file that
   18.16 + * accompanied this code).
   18.17 + *
   18.18 + * You should have received a copy of the GNU General Public License version
   18.19 + * 2 along with this work; if not, write to the Free Software Foundation,
   18.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   18.21 + *
   18.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   18.23 + * or visit www.oracle.com if you need additional information or have any
   18.24 + * questions.
   18.25 + *
   18.26 + */
   18.27 +
   18.28 +/**
   18.29 + * @test
   18.30 + * @bug 8081778
   18.31 + * @summary Add C2 x86 intrinsic for BigInteger::squareToLen() method
   18.32 + *
   18.33 + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
   18.34 + *      -XX:CompileCommand=exclude,TestSquareToLen::main
   18.35 + *      -XX:CompileCommand=option,TestSquareToLen::base_multiply,ccstr,DisableIntrinsic,_squareToLen
   18.36 + *      -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_squareToLen
   18.37 + *      -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_squareToLen
   18.38 + *      -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_squareToLen
   18.39 + *      -XX:CompileCommand=inline,java.math.BigInteger::multiply
   18.40 + *      -XX:CompileCommand=inline,java.math.BigInteger::square
   18.41 + *      -XX:CompileCommand=inline,java.math.BigInteger::squareToLen TestSquareToLen
   18.42 + */
   18.43 +
   18.44 +import java.util.Random;
   18.45 +import java.math.*;
   18.46 +
   18.47 +public class TestSquareToLen {
   18.48 +
   18.49 +    // Avoid intrinsic by preventing inlining multiply() and squareToLen().
   18.50 +    public static BigInteger base_multiply(BigInteger op1) {
   18.51 +      return op1.multiply(op1);
   18.52 +    }
   18.53 +
   18.54 +    // Generate squareToLen() intrinsic by inlining multiply().
   18.55 +    public static BigInteger new_multiply(BigInteger op1) {
   18.56 +      return op1.multiply(op1);
   18.57 +    }
   18.58 +
   18.59 +    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
   18.60 +      byte[] data1 = b1.toByteArray();
   18.61 +      byte[] data2 = b2.toByteArray();
   18.62 +      if (data1.length != data2.length)
   18.63 +        return false;
   18.64 +      for (int i = 0; i < data1.length; i++) {
   18.65 +        if (data1[i] != data2[i])
   18.66 +          return false;
   18.67 +      }
   18.68 +      return true;
   18.69 +    }
   18.70 +
   18.71 +    public static String stringify(BigInteger b) {
   18.72 +      String strout= "";
   18.73 +      byte [] data = b.toByteArray();
   18.74 +      for (int i = 0; i < data.length; i++) {
   18.75 +        strout += (String.format("%02x",data[i]) + " ");
   18.76 +      }
   18.77 +      return strout;
   18.78 +    }
   18.79 +
   18.80 +    public static void main(String args[]) throws Exception {
   18.81 +
   18.82 +      BigInteger oldsum = new BigInteger("0");
   18.83 +      BigInteger newsum = new BigInteger("0");
   18.84 +
   18.85 +      BigInteger b1, b2, oldres, newres;
   18.86 +
   18.87 +      Random rand = new Random();
   18.88 +      long seed = System.nanoTime();
   18.89 +      Random rand1 = new Random();
   18.90 +      long seed1 = System.nanoTime();
   18.91 +      rand.setSeed(seed);
   18.92 +      rand1.setSeed(seed1);
   18.93 +
   18.94 +      for (int j = 0; j < 100000; j++) {
   18.95 +        int rand_int = rand1.nextInt(3136)+32;
   18.96 +        b1 = new BigInteger(rand_int, rand);
   18.97 +
   18.98 +        oldres = base_multiply(b1);
   18.99 +        newres = new_multiply(b1);
  18.100 +
  18.101 +        oldsum = oldsum.add(oldres);
  18.102 +        newsum = newsum.add(newres);
  18.103 +
  18.104 +        if (!bytecompare(oldres,newres)) {
  18.105 +          System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
  18.106 +          System.out.println(b1);
  18.107 +          throw new Exception("Failed");
  18.108 +        }
  18.109 +      }
  18.110 +      if (!bytecompare(oldsum,newsum))  {
  18.111 +        System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
  18.112 +        throw new Exception("Failed");
  18.113 +      } else {
  18.114 +        System.out.println("Success");
  18.115 +      }
  18.116 +   }
  18.117 +}

mercurial