src/cpu/x86/vm/macroAssembler_x86.cpp

changeset: 8494:445941ba41c0
parent:    8173:faef2a237329
parent:    8489:51c505229e71
child:     8504:a96cf90239c6
child:     8548:f958bebdee26

diff: 8487:4fc39d24d00e -> 8494:445941ba41c0
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}

// Helper functions for square_to_len()

/**
 * Store the squares of x[], right shifted one bit (divided by 2), into z[].
 * Preserves x and z and modifies the rest of the registers.
 */

void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle odd xlen case first, then for even xlen do the following
  //   jlong carry = 0;
  //   for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //     huge_128 product = x[j:j+1] * x[j:j+1];
  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //     z[i+2:i+3] = (jlong)(product >>> 1);
  //     carry = (jlong)product;
  //   }
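  //
  // A rough scalar sketch of the even-xlen loop body (hypothetical C, not
  // part of the original change; unsigned __int128 stands in for huge_128):
  //   unsigned __int128 p = (unsigned __int128)xj * xj;  // xj = x[j:j+1]
  //   uint64_t hi = (carry << 63) | (uint64_t)(p >> 65); // -> z[i:i+1]
  //   uint64_t lo = (uint64_t)(p >> 1);                  // -> z[i+2:i+3]
  //   carry = (uint64_t)p & 1;   // only the low bit is consumed above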

  xorq(tmp5, tmp5);     // carry
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1);     // index for x
  xorl(tmp4, tmp4);     // index for z

  Label L_first_loop, L_first_loop_exit;

  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); // jump if xlen is even

  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
  imulq(raxReg, raxReg);
  shrq(raxReg, 1);
  adcq(tmp5, 0);
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);
  addl(tmp4, 2);

  // Square and right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);

  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4, 0));
  rorq(raxReg, 32);     // convert big-endian to little-endian
  mulq(raxReg);         // 64-bit multiply rax * rax -> rdx:rax

  // Right shift by 1 and save carry
  shrq(tmp5, 1);        // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);
  rcrq(raxReg, 1);
  adcq(tmp5, 0);
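  // (shrq moves the bit saved from the previous product into CF and
  // clears tmp5; the two rcrq's rotate it in from the top while shifting
  // rdx:rax right by one; the final adcq captures the bit shifted out of
  // rax for the next iteration.)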

  // Store result in z
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);

  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}


/**
 * Perform the following multiply add operation using BMI2 instructions
 *   carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx
  mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
  addq(sum, carry);
  adcq(tmp2, 0);
  addq(sum, op1);
  adcq(tmp2, 0);
  movq(carry, tmp2);
}
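
// A minimal sketch of what the sequence above computes (hypothetical C,
// not part of the original change):
//   unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
//   sum = (uint64_t)t;  carry = (uint64_t)(t >> 64);
// The two adcq(tmp2, 0) steps fold the carry-outs of the two 64-bit adds
// into the product's high half; tmp2 cannot itself overflow, since
// (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1 still fits in 128 bits.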

/**
 * Perform the following multiply add operation:
 *   carry:sum = sum + op1*op2 + carry
 * Preserves op1 and op2 and modifies the rest of the registers.
 */
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
  // rdx:rax = op1 * op2
  movq(raxReg, op2);
  mulq(op1);

  // rdx:rax = sum + carry + rdx:rax
  addq(sum, carry);
  adcq(rdxReg, 0);
  addq(sum, raxReg);
  adcq(rdxReg, 0);

  // carry:sum = rdx:sum
  movq(carry, rdxReg);
}
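
// Same 128-bit multiply-accumulate as the BMI2 variant above, but built
// on the one-operand mulq, which forces its inputs and outputs through
// rdx:rax; mulxq in the BMI2 path takes arbitrary destinations and does
// not touch the flags, which is why it is preferred when available.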

/**
 * Add a 64 bit long carry into z[] with carry propagation.
 * Preserves the z and carry register values and modifies the rest of the registers.
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);
  subl(zlen, 2);
  addq(Address(z, zlen, Address::times_4, 0), carry);

  bind(L_fourth_loop);
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit);
  addq(Address(z, zlen, Address::times_4, 0), tmp1);
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}
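
// In effect (pseudocode; zlen counts 32-bit words, adds are 64-bit):
//   z[zlen-2:zlen-1] += carry;                // may carry out
//   while (carry-out && (zlen -= 2) >= 0)
//     z[zlen:zlen+1] += 1;                    // ripple the carry upward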

/**
 * Shift z[] left by 1 bit.
 * Preserves x, len, z and zlen registers and modifies the rest of the registers.
 */
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  Label L_fifth_loop, L_fifth_loop_exit;

  // Fifth loop
  // Perform primitiveLeftShift(z, zlen, 1)

  const Register prev_carry = tmp1;
  const Register new_carry = tmp4;
  const Register value = tmp2;
  const Register zidx = tmp3;

  // int zidx, carry;
  // long value;
  // carry = 0;
  // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
  //   (carry:value) = (z[zidx] << 1) | carry;
  //   z[zidx] = value;
  // }

  movl(zidx, zlen);
  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register

  bind(L_fifth_loop);
  decl(zidx);  // Use decl to preserve carry flag
  decl(zidx);
  jccb(Assembler::negative, L_fifth_loop_exit);

  if (UseBMI2Instructions) {
    movq(value, Address(z, zidx, Address::times_4, 0));
    rclq(value, 1);
    rorxq(value, value, 32);
    movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
  }
  else {
    // clear new_carry
    xorl(new_carry, new_carry);

    // Shift z[i] by 1, or in previous carry and save new carry
    movq(value, Address(z, zidx, Address::times_4, 0));
    shlq(value, 1);
    adcl(new_carry, 0);

    orq(value, prev_carry);
    rorq(value, 0x20);
    movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form

    // Set previous carry = new carry
    movl(prev_carry, new_carry);
  }
  jmp(L_fifth_loop);

  bind(L_fifth_loop_exit);
}
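
// Note on the BMI2 path above: decl leaves CF untouched and rorxq never
// writes flags, so the carry bit produced by one rclq survives to seed
// the next iteration's rclq; the non-BMI2 path must instead carry it by
// hand in prev_carry/new_carry.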


/**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 */
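/*
 * Algorithm sketch (mirroring BigInteger.squareToLen): with D the sum of
 * the diagonal squares x[i]^2 and O the sum of the off-diagonal products
 * x[i]*x[j] (i < j), the square equals D + 2*O. The code accumulates
 * D/2 + O (the first loop stores each square pre-shifted right by one,
 * the middle loops add the off-diagonal products), then doubles the
 * result with the final left shift and restores the low bit, which is
 * just the low bit of x[len-1].
 */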
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  //   zlen += 2;
  //   for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //     carry = 0;
  //     long op2 = x[xidx:xidx+1];
  //     for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //       k -= 2;
  //       long op1 = x[j:j+1];
  //       long sum = z[k:k+1];
  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //       z[k:k+1] = sum;
  //     }
  //     add_one_64(z, k, carry, tmp_regs);
  //   }
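  //
  // len and zlen double as the running x and z indices of the inner loop,
  // so their outer-loop values are saved and restored with push/pop on
  // every pass.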

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  push(zlen);
  push(len);
  addl(zlen, 2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    op2 = rdxReg;
    movq(op2, Address(x, len, Address::times_4, 0));
    rorxq(op2, op2, 32);
  }
  else {
    movq(op2, Address(x, len, Address::times_4, 0));
    rorq(op2, 32);
  }

  bind(L_third_loop);
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);

  movq(op1, Address(x, len, Address::times_4, 0));
  rorq(op1, 32);

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4, 0));

  // Multiply 64 bit by 64 bit; add the lower 64 bits of the result to sum and keep the upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add the 64 bit long carry into z with carry propagation.
  // Uses the running zlen, which already points at the right z index.
  add_one_64(z, zlen, carry, tmp1);

  pop(len);
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));
  jmp(L_multiply);

  bind(L_second_loop_exit);
  pop(len);
  pop(zlen);
  pop(len);
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4, -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}

/**
 * Helper function for mul_add()
 * Multiply the in[] by int k and add to out[] starting at offset offs using
 * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only a quad-int aligned length of in[] is operated on in this function.
 * k is in rdxReg for BMI2 instructions, otherwise in tmp2.
 * This function preserves the out, in and k registers.
 * len and offset point to the appropriate index in "in" and "out" respectively.
 * tmp5 holds the carry.
 * All other registers are temporary and are modified.
 */
void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
                      Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_first_loop, L_first_loop_exit;

  movl(tmp1, len);
  shrl(tmp1, 2);

  bind(L_first_loop);
  subl(tmp1, 1);
  jccb(Assembler::negative, L_first_loop_exit);

  subl(len, 4);
  subl(offset, 4);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    op2 = rdxReg;
  }

  movq(op1, Address(in, len, Address::times_4, 8));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4, 8));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4, 8), sum);

  movq(op1, Address(in, len, Address::times_4, 0));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4, 0));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4, 0), sum);

  jmp(L_first_loop);
  bind(L_first_loop_exit);
}
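
// Per 128-bit chunk the loop above does two 64-bit multiply-accumulates,
// least significant quad word first (the arrays are big endian), chaining
// the carry through tmp5. Roughly, per 64-bit word w of in[] (hypothetical
// C, with k64 the zero-extended k):
//   unsigned __int128 t = (unsigned __int128)w * k64 + out_w + carry;
//   out_w = (uint64_t)t;  carry = (uint64_t)(t >> 64);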

/**
 * Code for BigInteger::mulAdd() intrinsic
 *
 * rdi: out
 * rsi: in
 * r11: offs (out.length - offset)
 * rcx: len
 * r8:  k
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 * Multiply the in[] by word k and add to out[], return the carry in rax
 */
void MacroAssembler::mul_add(Register out, Register in, Register offs,
                      Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_carry, L_last_in, L_done;

  // carry = 0;
  // for (int j=len-1; j >= 0; j--) {
  //   long product = (in[j] & LONG_MASK) * kLong +
  //                  (out[offs] & LONG_MASK) + carry;
  //   out[offs--] = (int)product;
  //   carry = product >>> 32;
  // }
  //
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    op2 = rdxReg;
    movl(op2, k);
  }
  else {
    movl(op2, k);
  }

  xorq(carry, carry);

  // First loop
  // Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply.
  // The carry is in tmp5.
  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Multiply the trailing in[] entry using 64 bit by 32 bit, if any
  decrementl(len);
  jccb(Assembler::negative, L_carry);
  decrementl(len);
  jccb(Assembler::negative, L_last_in);

  movq(op1, Address(in, len, Address::times_4, 0));
  rorq(op1, 32);

  subl(offs, 2);
  movq(sum, Address(out, offs, Address::times_4, 0));
  rorq(sum, 32);

  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offs, Address::times_4, 0), sum);

  testl(len, len);
  jccb(Assembler::zero, L_carry);

  // Multiply the last in[] entry, if any
  bind(L_last_in);
  movl(op1, Address(in, 0));
  movl(sum, Address(out, offs, Address::times_4, -4));

  movl(raxReg, k);
  mull(op1); // tmp4 * eax -> edx:eax
  addl(sum, carry);
  adcl(rdxReg, 0);
  addl(sum, raxReg);
  adcl(rdxReg, 0);
  movl(carry, rdxReg);

  movl(Address(out, offs, Address::times_4, -4), sum);

  bind(L_carry);
  // Return tmp5/carry as the carry in rax
  movl(rax, carry);

  bind(L_done);
  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
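
// Tail handling above: quad-int chunks go through mul_add_128_x_32_loop,
// a remaining int pair through one 64-bit multiply-add, a final odd int
// through the 32x32 mull path, and the carry accumulated in tmp5 is
// returned to the caller in rax.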
#endif

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
