src/cpu/x86/vm/macroAssembler_x86.cpp

changeset 8494:445941ba41c0
parent    8173:faef2a237329
parent    8489:51c505229e71
child     8504:a96cf90239c6
child     8548:f958bebdee26
     1.1 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Mar 31 14:04:14 2016 -0700
     1.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Mar 31 14:23:12 2016 -0700
     1.3 @@ -7769,6 +7769,503 @@
     1.4    pop(tmp2);
     1.5    pop(tmp1);
     1.6  }
     1.7 +
     1.8 +// Helper functions for square_to_len()
     1.9 +
    1.10 +/**
    1.11 + * Store the squares of x[], right shifted one bit (i.e., divided by 2), into z[].
    1.12 + * Preserves x and z and modifies the rest of the registers.
    1.13 + */
    1.14 +
    1.15 +void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
    1.16 +  // Perform square and right shift by 1
    1.17 +  // Handle odd xlen case first, then for even xlen do the following
    1.18 +  // jlong carry = 0;
    1.19 +  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
    1.20 +  //     huge_128 product = x[j:j+1] * x[j:j+1];
    1.21 +  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
    1.22 +  //     z[i+2:i+3] = (jlong)(product >>> 1);
    1.23 +  //     carry = (jlong)product;
    1.24 +  // }
    1.25 +
    1.26 +  xorq(tmp5, tmp5);     // carry
    1.27 +  xorq(rdxReg, rdxReg);
    1.28 +  xorl(tmp1, tmp1);     // index for x
    1.29 +  xorl(tmp4, tmp4);     // index for z
    1.30 +
    1.31 +  Label L_first_loop, L_first_loop_exit;
    1.32 +
    1.33 +  testl(xlen, 1);
    1.34 +  jccb(Assembler::zero, L_first_loop); // jump if xlen is even
    1.35 +
    1.36 +  // Square and right shift by 1 the odd element using 32 bit multiply
    1.37 +  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
    1.38 +  imulq(raxReg, raxReg);
    1.39 +  shrq(raxReg, 1);
    1.40 +  adcq(tmp5, 0);
    1.41 +  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
    1.42 +  incrementl(tmp1);
    1.43 +  addl(tmp4, 2);
    1.44 +
    1.45 +  // Square and right shift by 1 the rest using 64 bit multiply
    1.46 +  bind(L_first_loop);
    1.47 +  cmpptr(tmp1, xlen);
    1.48 +  jccb(Assembler::equal, L_first_loop_exit);
    1.49 +
    1.50 +  // Square
    1.51 +  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
    1.52 +  rorq(raxReg, 32);    // convert big-endian to little-endian
    1.53 +  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
    1.54 +
    1.55 +  // Right shift by 1 and save carry
    1.56 +  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
    1.57 +  rcrq(rdxReg, 1);
    1.58 +  rcrq(raxReg, 1);
    1.59 +  adcq(tmp5, 0);
    1.60 +
    1.61 +  // Store result in z
    1.62 +  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
    1.63 +  movq(Address(z, tmp4, Address::times_4, 8), raxReg);
    1.64 +
    1.65 +  // Update indices for x and z
    1.66 +  addl(tmp1, 2);
    1.67 +  addl(tmp4, 4);
    1.68 +  jmp(L_first_loop);
    1.69 +
    1.70 +  bind(L_first_loop_exit);
    1.71 +}
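
For reference, the main loop above can be modeled in plain C++ with a 128-bit intermediate standing in for the rdx:rax pair, ignoring the 32-bit word swap (rorq by 32) the stub performs for the big-endian jint layout. A sketch assuming the GCC/Clang unsigned __int128 extension; names are illustrative:

    #include <cstdint>

    // Square each 64-bit word of x and store the product, right-shifted
    // one bit, into z; the bit shifted out of each product is carried
    // into the top of the next one (the shrq/rcrq/rcrq/adcq sequence).
    static void square_rshift_ref(const uint64_t* x, int xlen, uint64_t* z) {
      uint64_t carry = 0;                        // low bit of previous product
      for (int j = 0, i = 0; j < xlen; j++, i += 2) {
        unsigned __int128 product = (unsigned __int128)x[j] * x[j];
        uint64_t hi = (uint64_t)(product >> 64);
        uint64_t lo = (uint64_t)product;
        z[i]     = (carry << 63) | (hi >> 1);    // product >>> 65, carry on top
        z[i + 1] = (hi << 63) | (lo >> 1);       // product >>> 1, low half
        carry    = lo & 1;                       // the bit saved by adcq above
      }
    }
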
    1.72 +
    1.73 +
    1.74 +/**
    1.75 + * Perform the following multiply add operation using BMI2 instructions
    1.76 + * carry:sum = sum + op1*op2 + carry
    1.77 + * op2 should be in rdx
    1.78 + * op2 is preserved, all other registers are modified
    1.79 + */
    1.80 +void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
    1.81 +  // assert op2 is rdx
    1.82 +  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
    1.83 +  addq(sum, carry);
    1.84 +  adcq(tmp2, 0);
    1.85 +  addq(sum, op1);
    1.86 +  adcq(tmp2, 0);
    1.87 +  movq(carry, tmp2);
    1.88 +}
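
The identity this helper (and the non-BMI2 variant below) computes can be written directly against a 128-bit intermediate. An illustrative C++ sketch, assuming unsigned __int128; the sum cannot overflow 128 bits, since op1*op2 + sum + carry <= 2^128 - 1:

    #include <cstdint>

    // carry:sum = sum + op1 * op2 + carry, the combined effect of the
    // mulxq and the two addq/adcq pairs above (op2 is untouched, as in rdx).
    static void multiply_add_64_ref(uint64_t& sum, uint64_t op1, uint64_t op2,
                                    uint64_t& carry) {
      unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
      sum   = (uint64_t)t;          // low 64 bits go back into sum
      carry = (uint64_t)(t >> 64);  // high 64 bits become the new carry
    }
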
    1.89 +
    1.90 +/**
    1.91 + * Perform the following multiply add operation:
    1.92 + * carry:sum = sum + op1*op2 + carry
    1.93 + * Preserves op1, op2 and modifies rest of registers
    1.94 + */
    1.95 +void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
    1.96 +  // rdx:rax = op1 * op2
    1.97 +  movq(raxReg, op2);
    1.98 +  mulq(op1);
    1.99 +
   1.100 +  //  rdx:rax = sum + carry + rdx:rax
   1.101 +  addq(sum, carry);
   1.102 +  adcq(rdxReg, 0);
   1.103 +  addq(sum, raxReg);
   1.104 +  adcq(rdxReg, 0);
   1.105 +
   1.106 +  // carry:sum = rdx:sum
   1.107 +  movq(carry, rdxReg);
   1.108 +}
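
Both the BMI2 path above and this mulq path compute the same identity; a quick check with the multiply_add_64_ref sketch (arbitrary values):

    // sum = 2^64 - 1, op1 = op2 = 2^32, carry = 1:
    // product = 2^64, so t = 2^64 + (2^64 - 1) + 1 = 2^65,
    // and the helper leaves sum == 0 and carry == 2.
    uint64_t sum = ~0ULL, carry = 1;
    multiply_add_64_ref(sum, 1ULL << 32, 1ULL << 32, carry);
    // now sum == 0 and carry == 2
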
   1.109 +
   1.110 +/**
   1.111 + * Add a 64-bit (jlong) carry into z[] with carry propagation.
   1.112 + * Preserves the z and carry register values and modifies the rest of the registers.
   1.113 + *
   1.114 + */
   1.115 +void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
   1.116 +  Label L_fourth_loop, L_fourth_loop_exit;
   1.117 +
   1.118 +  movl(tmp1, 1);
   1.119 +  subl(zlen, 2);
   1.120 +  addq(Address(z, zlen, Address::times_4, 0), carry);
   1.121 +
   1.122 +  bind(L_fourth_loop);
   1.123 +  jccb(Assembler::carryClear, L_fourth_loop_exit);
   1.124 +  subl(zlen, 2);
   1.125 +  jccb(Assembler::negative, L_fourth_loop_exit);
   1.126 +  addq(Address(z, zlen, Address::times_4, 0), tmp1);
   1.127 +  jmp(L_fourth_loop);
   1.128 +  bind(L_fourth_loop_exit);
   1.129 +}
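
In C++ terms the loop above behaves like the following sketch, with z[] held as 64-bit words, most significant first, and zlen counted in those words (illustrative only; the stub tracks the carry through the flags instead):

    #include <cstdint>

    // Add a 64-bit carry into z[] and propagate any overflow toward
    // index 0, stopping as soon as an addition does not wrap -- the
    // jccb(carryClear) exit above.
    static void add_one_64_ref(uint64_t* z, int zlen, uint64_t carry) {
      int i = zlen - 1;                  // least significant word
      uint64_t prev = z[i];
      z[i] += carry;
      while (z[i] < prev && --i >= 0) {  // wrapped => carry out of this word
        prev = z[i];
        z[i] += 1;                       // propagate a single carry bit
      }
    }
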
   1.130 +
   1.131 +/**
   1.132 + * Shift z[] left by 1 bit.
   1.133 + * Preserves the x, len, z and zlen registers and modifies the rest of the registers.
   1.134 + *
   1.135 + */
   1.136 +void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
   1.137 +
   1.138 +  Label L_fifth_loop, L_fifth_loop_exit;
   1.139 +
   1.140 +  // Fifth loop
   1.141 +  // Perform primitiveLeftShift(z, zlen, 1)
   1.142 +
   1.143 +  const Register prev_carry = tmp1;
   1.144 +  const Register new_carry = tmp4;
   1.145 +  const Register value = tmp2;
   1.146 +  const Register zidx = tmp3;
   1.147 +
   1.148 +  // int zidx, carry;
   1.149 +  // long value;
   1.150 +  // carry = 0;
   1.151 +  // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
   1.152 +  //    (carry:value) = (z[zidx] << 1) | carry;
   1.153 +  //    z[zidx] = value;
   1.154 +  // }
   1.155 +
   1.156 +  movl(zidx, zlen);
   1.157 +  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
   1.158 +
   1.159 +  bind(L_fifth_loop);
   1.160 +  decl(zidx);  // Use decl to preserve carry flag
   1.161 +  decl(zidx);
   1.162 +  jccb(Assembler::negative, L_fifth_loop_exit);
   1.163 +
   1.164 +  if (UseBMI2Instructions) {
   1.165 +     movq(value, Address(z, zidx, Address::times_4, 0));
   1.166 +     rclq(value, 1);
   1.167 +     rorxq(value, value, 32);
   1.168 +     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
   1.169 +  }
   1.170 +  else {
   1.171 +    // clear new_carry
   1.172 +    xorl(new_carry, new_carry);
   1.173 +
   1.174 +    // Shift z[i] by 1, or in previous carry and save new carry
   1.175 +    movq(value, Address(z, zidx, Address::times_4, 0));
   1.176 +    shlq(value, 1);
   1.177 +    adcl(new_carry, 0);
   1.178 +
   1.179 +    orq(value, prev_carry);
   1.180 +    rorq(value, 0x20);
   1.181 +    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
   1.182 +
   1.183 +    // Set previous carry = new carry
   1.184 +    movl(prev_carry, new_carry);
   1.185 +  }
   1.186 +  jmp(L_fifth_loop);
   1.187 +
   1.188 +  bind(L_fifth_loop_exit);
   1.189 +}
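
Ignoring the 32-bit word swap done for the jint layout, the fifth loop is a plain one-bit left shift across the array. An illustrative C++ sketch:

    #include <cstdint>

    // Shift a most-significant-first array of 64-bit words left by one
    // bit; the evicted top bit of each word is carried into the next
    // more significant word (the rclq path folds this into one rotate).
    static void lshift_by_1_ref(uint64_t* z, int zlen) {
      uint64_t carry = 0;
      for (int i = zlen - 1; i >= 0; i--) {  // least significant word first
        uint64_t next_carry = z[i] >> 63;    // bit about to be shifted out
        z[i] = (z[i] << 1) | carry;
        carry = next_carry;
      }
    }
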
   1.190 +
   1.191 +
   1.192 +/**
   1.193 + * Code for BigInteger::squareToLen() intrinsic
   1.194 + *
   1.195 + * rdi: x
   1.196 + * rsi: len
   1.197 + * r8:  z
   1.198 + * rcx: zlen
   1.199 + * r12: tmp1
   1.200 + * r13: tmp2
   1.201 + * r14: tmp3
   1.202 + * r15: tmp4
   1.203 + * rbx: tmp5
   1.204 + *
   1.205 + */
   1.206 +void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
   1.207 +
   1.208 +  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
   1.209 +  push(tmp1);
   1.210 +  push(tmp2);
   1.211 +  push(tmp3);
   1.212 +  push(tmp4);
   1.213 +  push(tmp5);
   1.214 +
   1.215 +  // First loop
   1.216 +  // Store the squares, right shifted one bit (i.e., divided by 2).
   1.217 +  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
   1.218 +
   1.219 +  // Add in off-diagonal sums.
   1.220 +  //
   1.221 +  // Second, third (nested) and fourth loops.
   1.222 +  // zlen += 2;
   1.223 +  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
   1.224 +  //    carry = 0;
   1.225 +  //    long op2 = x[xidx:xidx+1];
   1.226 +  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
   1.227 +  //       k -= 2;
   1.228 +  //       long op1 = x[j:j+1];
   1.229 +  //       long sum = z[k:k+1];
   1.230 +  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
   1.231 +  //       z[k:k+1] = sum;
   1.232 +  //    }
   1.233 +  //    add_one_64(z, k, carry, tmp_regs);
   1.234 +  // }
   1.235 +
   1.236 +  const Register carry = tmp5;
   1.237 +  const Register sum = tmp3;
   1.238 +  const Register op1 = tmp4;
   1.239 +  Register op2 = tmp2;
   1.240 +
   1.241 +  push(zlen);
   1.242 +  push(len);
   1.243 +  addl(zlen,2);
   1.244 +  bind(L_second_loop);
   1.245 +  xorq(carry, carry);
   1.246 +  subl(zlen, 4);
   1.247 +  subl(len, 2);
   1.248 +  push(zlen);
   1.249 +  push(len);
   1.250 +  cmpl(len, 0);
   1.251 +  jccb(Assembler::lessEqual, L_second_loop_exit);
   1.252 +
   1.253 +  // Multiply an array by one 64 bit long.
   1.254 +  if (UseBMI2Instructions) {
   1.255 +    op2 = rdxReg;
   1.256 +    movq(op2, Address(x, len, Address::times_4,  0));
   1.257 +    rorxq(op2, op2, 32);
   1.258 +  }
   1.259 +  else {
   1.260 +    movq(op2, Address(x, len, Address::times_4,  0));
   1.261 +    rorq(op2, 32);
   1.262 +  }
   1.263 +
   1.264 +  bind(L_third_loop);
   1.265 +  decrementl(len);
   1.266 +  jccb(Assembler::negative, L_third_loop_exit);
   1.267 +  decrementl(len);
   1.268 +  jccb(Assembler::negative, L_last_x);
   1.269 +
   1.270 +  movq(op1, Address(x, len, Address::times_4,  0));
   1.271 +  rorq(op1, 32);
   1.272 +
   1.273 +  bind(L_multiply);
   1.274 +  subl(zlen, 2);
   1.275 +  movq(sum, Address(z, zlen, Address::times_4,  0));
   1.276 +
   1.277 +  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
   1.278 +  if (UseBMI2Instructions) {
   1.279 +    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
   1.280 +  }
   1.281 +  else {
   1.282 +    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
   1.283 +  }
   1.284 +
   1.285 +  movq(Address(z, zlen, Address::times_4, 0), sum);
   1.286 +
   1.287 +  jmp(L_third_loop);
   1.288 +  bind(L_third_loop_exit);
   1.289 +
   1.290 +  // Fourth loop
   1.291 +  // Add the 64-bit carry into z with carry propagation.
   1.292 +  // Uses the already-adjusted zlen.
   1.293 +  add_one_64(z, zlen, carry, tmp1);
   1.294 +
   1.295 +  pop(len);
   1.296 +  pop(zlen);
   1.297 +  jmp(L_second_loop);
   1.298 +
   1.299 +  // Next infrequent code is moved outside loops.
   1.300 +  bind(L_last_x);
   1.301 +  movl(op1, Address(x, 0));
   1.302 +  jmp(L_multiply);
   1.303 +
   1.304 +  bind(L_second_loop_exit);
   1.305 +  pop(len);
   1.306 +  pop(zlen);
   1.307 +  pop(len);
   1.308 +  pop(zlen);
   1.309 +
   1.310 +  // Fifth loop
   1.311 +  // Shift z left 1 bit.
   1.312 +  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
   1.313 +
   1.314 +  // z[zlen-1] |= x[len-1] & 1;
   1.315 +  movl(tmp3, Address(x, len, Address::times_4, -4));
   1.316 +  andl(tmp3, 1);
   1.317 +  orl(Address(z, zlen, Address::times_4,  -4), tmp3);
   1.318 +
   1.319 +  pop(tmp5);
   1.320 +  pop(tmp4);
   1.321 +  pop(tmp3);
   1.322 +  pop(tmp2);
   1.323 +  pop(tmp1);
   1.324 +}
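
The reason the first loop can halve every term and let the fifth loop double the result back is that x*x loses only its lowest bit in the shift, and that bit equals the low bit of x, which the final OR restores. A small self-contained check of this invariant:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint64_t x = 0; x < 100000; x++) {
        uint64_t sq   = x * x;                  // x*x is odd iff x is odd
        uint64_t half = sq >> 1;                // what the loops accumulate
        assert(((half << 1) | (x & 1)) == sq);  // lshift_by_1 + final orl
      }
      return 0;
    }
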
   1.325 +
   1.326 +/**
   1.327 + * Helper function for mul_add()
   1.328 + * Multiply in[] by the int k and add the result to out[] starting at offset
   1.329 + * offs, using 128-bit by 32-bit multiplies; the carry is returned in tmp5.
   1.330 + * Only a quad-int-aligned number of elements of in[] is processed here.
   1.331 + * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
   1.332 + * This function preserves the out, in and k registers.
   1.333 + * len and offset point to the corresponding indices in in[] and out[].
   1.334 + * tmp5 holds the carry.
   1.335 + * The other registers are temporaries and are modified.
   1.336 + *
   1.337 + */
   1.338 +void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
   1.339 +  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
   1.340 +  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
   1.341 +
   1.342 +  Label L_first_loop, L_first_loop_exit;
   1.343 +
   1.344 +  movl(tmp1, len);
   1.345 +  shrl(tmp1, 2);
   1.346 +
   1.347 +  bind(L_first_loop);
   1.348 +  subl(tmp1, 1);
   1.349 +  jccb(Assembler::negative, L_first_loop_exit);
   1.350 +
   1.351 +  subl(len, 4);
   1.352 +  subl(offset, 4);
   1.353 +
   1.354 +  Register op2 = tmp2;
   1.355 +  const Register sum = tmp3;
   1.356 +  const Register op1 = tmp4;
   1.357 +  const Register carry = tmp5;
   1.358 +
   1.359 +  if (UseBMI2Instructions) {
   1.360 +    op2 = rdxReg;
   1.361 +  }
   1.362 +
   1.363 +  movq(op1, Address(in, len, Address::times_4,  8));
   1.364 +  rorq(op1, 32);
   1.365 +  movq(sum, Address(out, offset, Address::times_4,  8));
   1.366 +  rorq(sum, 32);
   1.367 +  if (UseBMI2Instructions) {
   1.368 +    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
   1.369 +  }
   1.370 +  else {
   1.371 +    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
   1.372 +  }
   1.373 +  // Store back in big-endian order
   1.374 +  rorq(sum, 0x20);
   1.375 +  movq(Address(out, offset, Address::times_4,  8), sum);
   1.376 +
   1.377 +  movq(op1, Address(in, len, Address::times_4,  0));
   1.378 +  rorq(op1, 32);
   1.379 +  movq(sum, Address(out, offset, Address::times_4,  0));
   1.380 +  rorq(sum, 32);
   1.381 +  if (UseBMI2Instructions) {
   1.382 +    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
   1.383 +  }
   1.384 +  else {
   1.385 +    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
   1.386 +  }
   1.387 +  // Store back in big-endian order
   1.388 +  rorq(sum, 0x20);
   1.389 +  movq(Address(out, offset, Address::times_4,  0), sum);
   1.390 +
   1.391 +  jmp(L_first_loop);
   1.392 +  bind(L_first_loop_exit);
   1.393 +}
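
Modeled on 64-bit words, and ignoring the rorq endian fix-ups, each iteration of the loop above performs two chained multiply-accumulate steps over one 128-bit chunk. An illustrative C++ sketch assuming unsigned __int128; names are hypothetical:

    #include <cstdint>

    // One iteration of the 4-int unrolled loop: two 64x64 multiply-adds
    // over a 128-bit chunk of in[]/out[], sharing one running carry.
    // idx points at the more significant word of the pair; k64 is the
    // zero-extended 32-bit multiplier k.
    static uint64_t mul_add_128_step(uint64_t* out, const uint64_t* in,
                                     int idx, uint64_t k64, uint64_t carry) {
      for (int w = 1; w >= 0; w--) {  // low 64-bit word first, then high
        unsigned __int128 t =
            (unsigned __int128)in[idx + w] * k64 + out[idx + w] + carry;
        out[idx + w] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
      }
      return carry;
    }
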
   1.394 +
   1.395 +/**
   1.396 + * Code for BigInteger::mulAdd() intrinsic
   1.397 + *
   1.398 + * rdi: out
   1.399 + * rsi: in
   1.400 + * r11: offs (out.length - offset)
   1.401 + * rcx: len
   1.402 + * r8:  k
   1.403 + * r12: tmp1
   1.404 + * r13: tmp2
   1.405 + * r14: tmp3
   1.406 + * r15: tmp4
   1.407 + * rbx: tmp5
   1.408 + * Multiply in[] by the word k and add the result to out[]; the carry is returned in rax.
   1.409 + */
   1.410 +void MacroAssembler::mul_add(Register out, Register in, Register offs,
   1.411 +   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
   1.412 +   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
   1.413 +
   1.414 +  Label L_carry, L_last_in, L_done;
   1.415 +
   1.416 +// carry = 0;
   1.417 +// for (int j=len-1; j >= 0; j--) {
   1.418 +//    long product = (in[j] & LONG_MASK) * kLong +
   1.419 +//                   (out[offs] & LONG_MASK) + carry;
   1.420 +//    out[offs--] = (int)product;
   1.421 +//    carry = product >>> 32;
   1.422 +// }
   1.423 +//
   1.424 +  push(tmp1);
   1.425 +  push(tmp2);
   1.426 +  push(tmp3);
   1.427 +  push(tmp4);
   1.428 +  push(tmp5);
   1.429 +
   1.430 +  Register op2 = tmp2;
   1.431 +  const Register sum = tmp3;
   1.432 +  const Register op1 = tmp4;
   1.433 +  const Register carry =  tmp5;
   1.434 +
   1.435 +  if (UseBMI2Instructions) {
   1.436 +    op2 = rdxReg;  // mulxq takes one source operand implicitly in rdx
   1.437 +  }
   1.438 +  movl(op2, k);
   1.442 +
   1.443 +  xorq(carry, carry);
   1.444 +
   1.445 +  // First loop
   1.446 +
   1.447 +  // Multiply in[] by k in a 4-way unrolled loop using 128-bit by 32-bit multiplies.
   1.448 +  // The carry is in tmp5.
   1.449 +  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
   1.450 +
   1.451 +  // Multiply any remaining pair of in[] ints using a 64-bit by 32-bit multiply, if any
   1.452 +  decrementl(len);
   1.453 +  jccb(Assembler::negative, L_carry);
   1.454 +  decrementl(len);
   1.455 +  jccb(Assembler::negative, L_last_in);
   1.456 +
   1.457 +  movq(op1, Address(in, len, Address::times_4,  0));
   1.458 +  rorq(op1, 32);
   1.459 +
   1.460 +  subl(offs, 2);
   1.461 +  movq(sum, Address(out, offs, Address::times_4,  0));
   1.462 +  rorq(sum, 32);
   1.463 +
   1.464 +  if (UseBMI2Instructions) {
   1.465 +    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
   1.466 +  }
   1.467 +  else {
   1.468 +    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
   1.469 +  }
   1.470 +
   1.471 +  // Store back in big-endian order
   1.472 +  rorq(sum, 0x20);
   1.473 +  movq(Address(out, offs, Address::times_4,  0), sum);
   1.474 +
   1.475 +  testl(len, len);
   1.476 +  jccb(Assembler::zero, L_carry);
   1.477 +
   1.478 +  // Multiply the last in[] entry, if any
   1.479 +  bind(L_last_in);
   1.480 +  movl(op1, Address(in, 0));
   1.481 +  movl(sum, Address(out, offs, Address::times_4,  -4));
   1.482 +
   1.483 +  movl(raxReg, k);
   1.484 +  mull(op1); // op1 * eax -> edx:eax
   1.485 +  addl(sum, carry);
   1.486 +  adcl(rdxReg, 0);
   1.487 +  addl(sum, raxReg);
   1.488 +  adcl(rdxReg, 0);
   1.489 +  movl(carry, rdxReg);
   1.490 +
   1.491 +  movl(Address(out, offs, Address::times_4,  -4), sum);
   1.492 +
   1.493 +  bind(L_carry);
   1.494 +  // Return the carry (tmp5) in rax
   1.495 +  movl(rax, carry);
   1.496 +
   1.497 +  bind(L_done);
   1.498 +  pop(tmp5);
   1.499 +  pop(tmp4);
   1.500 +  pop(tmp3);
   1.501 +  pop(tmp2);
   1.502 +  pop(tmp1);
   1.503 +}
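
A reference model of the whole intrinsic, following the Java loop in the comment above (illustrative C++; offs indexes the least significant destination word, and in[] holds most-significant-first ints):

    #include <cstdint>

    // BigInteger.mulAdd: multiply in[] by k, add the result into out[]
    // ending at index offs, and return the final carry word.
    static uint32_t mul_add_ref(uint32_t* out, const uint32_t* in,
                                int offs, int len, uint32_t k) {
      uint64_t carry = 0;
      for (int j = len - 1; j >= 0; j--) {
        uint64_t product = (uint64_t)in[j] * k + out[offs] + carry;
        out[offs--] = (uint32_t)product;
        carry = product >> 32;
      }
      return (uint32_t)carry;
    }
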
   1.504  #endif
   1.505  
   1.506  /**
