7767 pop(tmp4); |
7767 pop(tmp4); |
7768 pop(tmp3); |
7768 pop(tmp3); |
7769 pop(tmp2); |
7769 pop(tmp2); |
7770 pop(tmp1); |
7770 pop(tmp1); |
7771 } |
7771 } |
|
7772 |
|
//Helper functions for square_to_len()

/**
 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
 * Preserves x and z and modifies rest of the registers.
 *
 * x[] is an array of 32-bit ints; each 64-bit pair is stored in big-endian
 * word order, hence the rorq before each 64-bit multiply.
 * tmp1/tmp4 are used as 32-bit-int indices into x[] and z[] respectively.
 */

void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle odd xlen case first, then for even xlen do the following
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //    huge_128 product = x[j:j+1] * x[j:j+1];
  //    z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //    z[i+2:i+3] = (jlong)(product >>> 1);
  //    carry = (jlong)product;
  // }

  xorq(tmp5, tmp5);     // carry (low bit of the previous product)
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1);     // index for x
  xorl(tmp4, tmp4);     // index for z

  Label L_first_loop, L_first_loop_exit;

  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); //jump if xlen is even

  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));  // movl zero-extends the single int
  imulq(raxReg, raxReg);
  shrq(raxReg, 1);
  adcq(tmp5, 0);        // CF holds the bit shifted out above; save it as the carry
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);
  addl(tmp4, 2);

  // Square and right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);

  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4, 0));
  rorq(raxReg, 32);     // convert big-endian to little-endian
  mulq(raxReg);         // 64-bit multiply rax * rax -> rdx:rax

  // Right shift by 1 and save carry
  // The previous iteration's carry bit (in tmp5 bit 0) is shifted into CF,
  // then rotated into the top of rdx:rax; the new low bit ends up in CF.
  shrq(tmp5, 1);        // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);
  rcrq(raxReg, 1);
  adcq(tmp5, 0);        // capture the bit shifted out as the next carry

  // Store result in z
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);

  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
|
7837 |
|
7838 |
|
/**
 * Perform the following multiply add operation using BMI2 instructions
 * carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx
  // mulxq multiplies its source by the implicit rdx operand and,
  // unlike mulq, leaves the flags untouched.
  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
  addq(sum, carry);       // sum += carry
  adcq(tmp2, 0);          // fold the carry-out into the product's high half
  addq(sum, op1);         // sum += low 64 bits of the product
  adcq(tmp2, 0);          // again fold the carry-out into the high half
  movq(carry, tmp2);      // high half becomes the carry into the next chunk
}
|
7854 |
|
/**
 * Perform the following multiply add operation:
 * carry:sum = sum + op1*op2 + carry
 * Preserves op1, op2 and modifies rest of registers
 *
 * NOTE(review): raxReg/rdxReg are expected to be the real rax/rdx, since
 * mulq implicitly multiplies by rax and writes its result to rdx:rax.
 */
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
  //  rdx:rax = op1 * op2
  movq(raxReg, op2);
  mulq(op1);

  //  rdx:rax = sum + carry + rdx:rax
  addq(sum, carry);
  adcq(rdxReg, 0);      // fold carry-out into the product's high half
  addq(sum, raxReg);
  adcq(rdxReg, 0);

  // carry:sum = rdx:sum
  movq(carry, rdxReg);  // high half becomes the carry into the next chunk
}
|
7874 |
|
/**
 * Add 64 bit long carry into z[] with carry propagation.
 * Preserves z and carry register values and modifies rest of registers.
 *
 * zlen on entry is the 32-bit-int index just past the slot to add into;
 * it is decremented (and thus clobbered) while walking toward z[0].
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);        // the value to propagate while each add overflows
  subl(zlen, 2);
  addq(Address(z, zlen, Address::times_4, 0), carry);

  bind(L_fourth_loop);
  // Test CF produced by the preceding addq (jmp does not modify flags):
  // stop as soon as an add no longer carries.
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit);  // or when the front of z[] is reached
  addq(Address(z, zlen, Address::times_4, 0), tmp1);
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}
|
7895 |
|
7896 /** |
|
7897 * Shift z[] left by 1 bit. |
|
7898 * Preserves x, len, z and zlen registers and modifies rest of the registers. |
|
7899 * |
|
7900 */ |
|
7901 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { |
|
7902 |
|
7903 Label L_fifth_loop, L_fifth_loop_exit; |
|
7904 |
|
7905 // Fifth loop |
|
7906 // Perform primitiveLeftShift(z, zlen, 1) |
|
7907 |
|
7908 const Register prev_carry = tmp1; |
|
7909 const Register new_carry = tmp4; |
|
7910 const Register value = tmp2; |
|
7911 const Register zidx = tmp3; |
|
7912 |
|
7913 // int zidx, carry; |
|
7914 // long value; |
|
7915 // carry = 0; |
|
7916 // for (zidx = zlen-2; zidx >=0; zidx -= 2) { |
|
7917 // (carry:value) = (z[i] << 1) | carry ; |
|
7918 // z[i] = value; |
|
7919 // } |
|
7920 |
|
7921 movl(zidx, zlen); |
|
7922 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register |
|
7923 |
|
7924 bind(L_fifth_loop); |
|
7925 decl(zidx); // Use decl to preserve carry flag |
|
7926 decl(zidx); |
|
7927 jccb(Assembler::negative, L_fifth_loop_exit); |
|
7928 |
|
7929 if (UseBMI2Instructions) { |
|
7930 movq(value, Address(z, zidx, Address::times_4, 0)); |
|
7931 rclq(value, 1); |
|
7932 rorxq(value, value, 32); |
|
7933 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form |
|
7934 } |
|
7935 else { |
|
7936 // clear new_carry |
|
7937 xorl(new_carry, new_carry); |
|
7938 |
|
7939 // Shift z[i] by 1, or in previous carry and save new carry |
|
7940 movq(value, Address(z, zidx, Address::times_4, 0)); |
|
7941 shlq(value, 1); |
|
7942 adcl(new_carry, 0); |
|
7943 |
|
7944 orq(value, prev_carry); |
|
7945 rorq(value, 0x20); |
|
7946 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form |
|
7947 |
|
7948 // Set previous carry = new carry |
|
7949 movl(prev_carry, new_carry); |
|
7950 } |
|
7951 jmp(L_fifth_loop); |
|
7952 |
|
7953 bind(L_fifth_loop_exit); |
|
7954 } |
|
7955 |
|
7956 |
|
/**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  // NOTE(review): fifth_loop/fifth_loop_exit are declared but never used.
  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  // zlen +=2;
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //      k -= 2;
  //      long op1 = x[j:j+1];
  //      long sum = z[k:k+1];
  //      carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //      z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  // }

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  // Save the original zlen/len; both are destructively decremented below.
  push(zlen);
  push(len);
  addl(zlen, 2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  // Save this iteration's zlen/len across the inner loop, which clobbers them.
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    op2 = rdxReg;   // mulxq reads its implicit multiplier from rdx
    movq(op2, Address(x, len, Address::times_4, 0));
    rorxq(op2, op2, 32);
  }
  else {
    movq(op2, Address(x, len, Address::times_4, 0));
    rorq(op2, 32);
  }

  bind(L_third_loop);
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);   // odd count: one trailing 32-bit int left

  movq(op1, Address(x, len, Address::times_4, 0));
  rorq(op1, 32);   // big-endian int pair -> little-endian long

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4, 0));

  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses offsetted zlen.
  add_one_64(z, zlen, carry, tmp1);

  // Restore this iteration's zlen/len and go round the outer loop again.
  pop(len);
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));   // movl zero-extends the single trailing int
  jmp(L_multiply);

  bind(L_second_loop_exit);
  // Pop this iteration's saved values, then restore the entry zlen/len.
  pop(len);
  pop(zlen);
  pop(len);
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  // Restore the lowest bit lost by square_rshift's divide-by-two.
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4, -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
|
8090 |
|
8091 /** |
|
8092 * Helper function for mul_add() |
|
8093 * Multiply the in[] by int k and add to out[] starting at offset offs using |
|
8094 * 128 bit by 32 bit multiply and return the carry in tmp5. |
|
8095 * Only quad int aligned length of in[] is operated on in this function. |
|
8096 * k is in rdxReg for BMI2Instructions, for others it is in tmp2. |
|
8097 * This function preserves out, in and k registers. |
|
8098 * len and offset point to the appropriate index in "in" & "out" correspondingly |
|
8099 * tmp5 has the carry. |
|
8100 * other registers are temporary and are modified. |
|
8101 * |
|
8102 */ |
|
8103 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, |
|
8104 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, |
|
8105 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { |
|
8106 |
|
8107 Label L_first_loop, L_first_loop_exit; |
|
8108 |
|
8109 movl(tmp1, len); |
|
8110 shrl(tmp1, 2); |
|
8111 |
|
8112 bind(L_first_loop); |
|
8113 subl(tmp1, 1); |
|
8114 jccb(Assembler::negative, L_first_loop_exit); |
|
8115 |
|
8116 subl(len, 4); |
|
8117 subl(offset, 4); |
|
8118 |
|
8119 Register op2 = tmp2; |
|
8120 const Register sum = tmp3; |
|
8121 const Register op1 = tmp4; |
|
8122 const Register carry = tmp5; |
|
8123 |
|
8124 if (UseBMI2Instructions) { |
|
8125 op2 = rdxReg; |
|
8126 } |
|
8127 |
|
8128 movq(op1, Address(in, len, Address::times_4, 8)); |
|
8129 rorq(op1, 32); |
|
8130 movq(sum, Address(out, offset, Address::times_4, 8)); |
|
8131 rorq(sum, 32); |
|
8132 if (UseBMI2Instructions) { |
|
8133 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); |
|
8134 } |
|
8135 else { |
|
8136 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); |
|
8137 } |
|
8138 // Store back in big endian from little endian |
|
8139 rorq(sum, 0x20); |
|
8140 movq(Address(out, offset, Address::times_4, 8), sum); |
|
8141 |
|
8142 movq(op1, Address(in, len, Address::times_4, 0)); |
|
8143 rorq(op1, 32); |
|
8144 movq(sum, Address(out, offset, Address::times_4, 0)); |
|
8145 rorq(sum, 32); |
|
8146 if (UseBMI2Instructions) { |
|
8147 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); |
|
8148 } |
|
8149 else { |
|
8150 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); |
|
8151 } |
|
8152 // Store back in big endian from little endian |
|
8153 rorq(sum, 0x20); |
|
8154 movq(Address(out, offset, Address::times_4, 0), sum); |
|
8155 |
|
8156 jmp(L_first_loop); |
|
8157 bind(L_first_loop_exit); |
|
8158 } |
|
8159 |
|
8160 /** |
|
8161 * Code for BigInteger::mulAdd() intrinsic |
|
8162 * |
|
8163 * rdi: out |
|
8164 * rsi: in |
|
8165 * r11: offs (out.length - offset) |
|
8166 * rcx: len |
|
8167 * r8: k |
|
8168 * r12: tmp1 |
|
8169 * r13: tmp2 |
|
8170 * r14: tmp3 |
|
8171 * r15: tmp4 |
|
8172 * rbx: tmp5 |
|
8173 * Multiply the in[] by word k and add to out[], return the carry in rax |
|
8174 */ |
|
8175 void MacroAssembler::mul_add(Register out, Register in, Register offs, |
|
8176 Register len, Register k, Register tmp1, Register tmp2, Register tmp3, |
|
8177 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { |
|
8178 |
|
8179 Label L_carry, L_last_in, L_done; |
|
8180 |
|
8181 // carry = 0; |
|
8182 // for (int j=len-1; j >= 0; j--) { |
|
8183 // long product = (in[j] & LONG_MASK) * kLong + |
|
8184 // (out[offs] & LONG_MASK) + carry; |
|
8185 // out[offs--] = (int)product; |
|
8186 // carry = product >>> 32; |
|
8187 // } |
|
8188 // |
|
8189 push(tmp1); |
|
8190 push(tmp2); |
|
8191 push(tmp3); |
|
8192 push(tmp4); |
|
8193 push(tmp5); |
|
8194 |
|
8195 Register op2 = tmp2; |
|
8196 const Register sum = tmp3; |
|
8197 const Register op1 = tmp4; |
|
8198 const Register carry = tmp5; |
|
8199 |
|
8200 if (UseBMI2Instructions) { |
|
8201 op2 = rdxReg; |
|
8202 movl(op2, k); |
|
8203 } |
|
8204 else { |
|
8205 movl(op2, k); |
|
8206 } |
|
8207 |
|
8208 xorq(carry, carry); |
|
8209 |
|
8210 //First loop |
|
8211 |
|
8212 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply |
|
8213 //The carry is in tmp5 |
|
8214 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); |
|
8215 |
|
8216 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any |
|
8217 decrementl(len); |
|
8218 jccb(Assembler::negative, L_carry); |
|
8219 decrementl(len); |
|
8220 jccb(Assembler::negative, L_last_in); |
|
8221 |
|
8222 movq(op1, Address(in, len, Address::times_4, 0)); |
|
8223 rorq(op1, 32); |
|
8224 |
|
8225 subl(offs, 2); |
|
8226 movq(sum, Address(out, offs, Address::times_4, 0)); |
|
8227 rorq(sum, 32); |
|
8228 |
|
8229 if (UseBMI2Instructions) { |
|
8230 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); |
|
8231 } |
|
8232 else { |
|
8233 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); |
|
8234 } |
|
8235 |
|
8236 // Store back in big endian from little endian |
|
8237 rorq(sum, 0x20); |
|
8238 movq(Address(out, offs, Address::times_4, 0), sum); |
|
8239 |
|
8240 testl(len, len); |
|
8241 jccb(Assembler::zero, L_carry); |
|
8242 |
|
8243 //Multiply the last in[] entry, if any |
|
8244 bind(L_last_in); |
|
8245 movl(op1, Address(in, 0)); |
|
8246 movl(sum, Address(out, offs, Address::times_4, -4)); |
|
8247 |
|
8248 movl(raxReg, k); |
|
8249 mull(op1); //tmp4 * eax -> edx:eax |
|
8250 addl(sum, carry); |
|
8251 adcl(rdxReg, 0); |
|
8252 addl(sum, raxReg); |
|
8253 adcl(rdxReg, 0); |
|
8254 movl(carry, rdxReg); |
|
8255 |
|
8256 movl(Address(out, offs, Address::times_4, -4), sum); |
|
8257 |
|
8258 bind(L_carry); |
|
8259 //return tmp5/carry as carry in rax |
|
8260 movl(rax, carry); |
|
8261 |
|
8262 bind(L_done); |
|
8263 pop(tmp5); |
|
8264 pop(tmp4); |
|
8265 pop(tmp3); |
|
8266 pop(tmp2); |
|
8267 pop(tmp1); |
|
8268 } |
7772 #endif |
8269 #endif |
7773 |
8270 |
7774 /** |
8271 /** |
7775 * Emits code to update CRC-32 with a byte value according to constants in table |
8272 * Emits code to update CRC-32 with a byte value according to constants in table |
7776 * |
8273 * |