src/cpu/x86/vm/macroAssembler_x86.cpp

changeset 8489
51c505229e71
parent 7854
e8260b6328fb
child 8494
445941ba41c0
equal deleted inserted replaced
8488:0d5597f44603 8489:51c505229e71
7765 pop(tmp4); 7765 pop(tmp4);
7766 pop(tmp3); 7766 pop(tmp3);
7767 pop(tmp2); 7767 pop(tmp2);
7768 pop(tmp1); 7768 pop(tmp1);
7769 } 7769 }
7770
7771 //Helper functions for square_to_len()
7772
7773 /**
7774 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
7775 * Preserves x and z and modifies rest of the registers.
7776 */
7777
7778 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7779 // Perform square and right shift by 1
7780 // Handle odd xlen case first, then for even xlen do the following
7781 // jlong carry = 0;
7782 // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
7783 // huge_128 product = x[j:j+1] * x[j:j+1];
7784 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
7785 // z[i+2:i+3] = (jlong)(product >>> 1);
7786 // carry = (jlong)product;
7787 // }
7788
7789 xorq(tmp5, tmp5); // carry
7790 xorq(rdxReg, rdxReg);
7791 xorl(tmp1, tmp1); // index for x
7792 xorl(tmp4, tmp4); // index for z
7793
7794 Label L_first_loop, L_first_loop_exit;
7795
7796 testl(xlen, 1);
7797 jccb(Assembler::zero, L_first_loop); //jump if xlen is even
7798
7799 // Square and right shift by 1 the odd element using 32 bit multiply
7800 movl(raxReg, Address(x, tmp1, Address::times_4, 0));
7801 imulq(raxReg, raxReg);
7802 shrq(raxReg, 1); // low bit shifted out into CF
7803 adcq(tmp5, 0); // capture CF as the running carry (no flag-clobbering insn in between)
7804 movq(Address(z, tmp4, Address::times_4, 0), raxReg);
7805 incrementl(tmp1);
7806 addl(tmp4, 2);
7807
7808 // Square and right shift by 1 the rest using 64 bit multiply
7809 bind(L_first_loop);
7810 cmpptr(tmp1, xlen);
7811 jccb(Assembler::equal, L_first_loop_exit);
7812
7813 // Square
7814 movq(raxReg, Address(x, tmp1, Address::times_4, 0));
7815 rorq(raxReg, 32); // convert big-endian to little-endian
7816 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
7817
7818 // Right shift by 1 and save carry
7819 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
7820 rcrq(rdxReg, 1); // rotate-through-carry threads the shifted-out bit downward
7821 rcrq(raxReg, 1);
7822 adcq(tmp5, 0); // bit shifted out of rax becomes next iteration's carry
7823
7824 // Store result in z
7825 movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
7826 movq(Address(z, tmp4, Address::times_4, 8), raxReg);
7827
7828 // Update indices for x and z
7829 addl(tmp1, 2); // x advances by two 32-bit words per iteration
7830 addl(tmp4, 4); // z advances by four 32-bit words per iteration
7831 jmp(L_first_loop);
7832
7833 bind(L_first_loop_exit);
7834 }
7835
7836
7837 /**
7838 * Perform the following multiply add operation using BMI2 instructions
7839 * carry:sum = sum + op1*op2 + carry
7840 * op2 should be in rdx
7841 * op2 is preserved, all other registers are modified
7842 */
7843 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
7844 // assert op2 is rdx -- mulxq multiplies its operand by rdx implicitly
7845 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
7846 addq(sum, carry); // sum += carry; overflow goes into CF
7847 adcq(tmp2, 0); // fold CF into the high half
7848 addq(sum, op1); // sum += low half of the product
7849 adcq(tmp2, 0); // fold CF into the high half again
7850 movq(carry, tmp2); // outgoing carry = high 64 bits
7851 }
7852
7853 /**
7854 * Perform the following multiply add operation:
7855 * carry:sum = sum + op1*op2 + carry
7856 * Preserves op1, op2 and modifies rest of registers
7857 */
7858 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
7859 // rdx:rax = op1 * op2 (unsigned 64x64 -> 128 bit)
7860 movq(raxReg, op2);
7861 mulq(op1);
7862
7863 // rdx:rax = sum + carry + rdx:rax
7864 addq(sum, carry); // overflow from this add goes into CF
7865 adcq(rdxReg, 0); // fold CF into the high half
7866 addq(sum, raxReg); // add low half of the product
7867 adcq(rdxReg, 0); // fold CF into the high half again
7868
7869 // carry:sum = rdx:sum
7870 movq(carry, rdxReg); // outgoing carry = high 64 bits
7871 }
7872
7873 /**
7874 * Add 64 bit long carry into z[] with carry propagation.
7875 * Preserves z and carry register values and modifies rest of registers.
7876 *
7877 */
7878 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
7879 Label L_fourth_loop, L_fourth_loop_exit;
7880
7881 movl(tmp1, 1); // constant 1 used to ripple the carry upward
7882 subl(zlen, 2); // index of the 64-bit word that receives the carry
7883 addq(Address(z, zlen, Address::times_4, 0), carry); // sets CF on overflow
7884
7885 bind(L_fourth_loop);
7886 jccb(Assembler::carryClear, L_fourth_loop_exit); // CF is from the most recent addq (jmp preserves flags)
7887 subl(zlen, 2); // step to the next higher-order 64-bit word
7888 jccb(Assembler::negative, L_fourth_loop_exit); // stop when we run off the front of z[]
7889 addq(Address(z, zlen, Address::times_4, 0), tmp1); // add 1; may set CF and continue the ripple
7890 jmp(L_fourth_loop);
7891 bind(L_fourth_loop_exit);
7892 }
7893
7894 /**
7895 * Shift z[] left by 1 bit.
7896 * Preserves x, len, z and zlen registers and modifies rest of the registers.
7897 *
7898 */
7899 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
7900
7901 Label L_fifth_loop, L_fifth_loop_exit;
7902
7903 // Fifth loop
7904 // Perform primitiveLeftShift(z, zlen, 1)
7905
7906 const Register prev_carry = tmp1;
7907 const Register new_carry = tmp4;
7908 const Register value = tmp2;
7909 const Register zidx = tmp3;
7910
7911 // int zidx, carry;
7912 // long value;
7913 // carry = 0;
7914 // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
7915 // (carry:value) = (z[i] << 1) | carry ;
7916 // z[i] = value;
7917 // }
7918
7919 movl(zidx, zlen);
7920 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
7921
7922 bind(L_fifth_loop);
7923 decl(zidx); // Use decl to preserve carry flag
7924 decl(zidx); // (decl updates SF/ZF but leaves CF intact)
7925 jccb(Assembler::negative, L_fifth_loop_exit);
7926
7927 if (UseBMI2Instructions) {
7928 movq(value, Address(z, zidx, Address::times_4, 0));
7929 rclq(value, 1); // rotate through CF: prior carry enters bit 0, top bit becomes new CF
7930 rorxq(value, value, 32); // BMI2 rotate leaves flags alone, so CF survives to the next rclq
7931 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7932 }
7933 else {
7934 // clear new_carry
7935 xorl(new_carry, new_carry);
7936
7937 // Shift z[i] by 1, or in previous carry and save new carry
7938 movq(value, Address(z, zidx, Address::times_4, 0));
7939 shlq(value, 1); // shifted-out top bit lands in CF
7940 adcl(new_carry, 0); // capture CF as the carry for the next word
7941
7942 orq(value, prev_carry);
7943 rorq(value, 0x20); // swap 32-bit halves back to big endian word order
7944 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7945
7946 // Set previous carry = new carry
7947 movl(prev_carry, new_carry);
7948 }
7949 jmp(L_fifth_loop);
7950
7951 bind(L_fifth_loop_exit);
7952 }
7953
7954
7955 /**
7956 * Code for BigInteger::squareToLen() intrinsic
7957 *
7958 * rdi: x
7959 * rsi: len
7960 * r8: z
7961 * rcx: zlen
7962 * r12: tmp1
7963 * r13: tmp2
7964 * r14: tmp3
7965 * r15: tmp4
7966 * rbx: tmp5
7967 *
7968 */
7969 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7970
7971 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
7972 push(tmp1); // preserve callee-saved registers used as temporaries
7973 push(tmp2);
7974 push(tmp3);
7975 push(tmp4);
7976 push(tmp5);
7977
7978 // First loop
7979 // Store the squares, right shifted one bit (i.e., divided by 2).
7980 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
7981
7982 // Add in off-diagonal sums.
7983 //
7984 // Second, third (nested) and fourth loops.
7985 // zlen +=2;
7986 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
7987 // carry = 0;
7988 // long op2 = x[xidx:xidx+1];
7989 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
7990 // k -= 2;
7991 // long op1 = x[j:j+1];
7992 // long sum = z[k:k+1];
7993 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
7994 // z[k:k+1] = sum;
7995 // }
7996 // add_one_64(z, k, carry, tmp_regs);
7997 // }
7998
7999 const Register carry = tmp5;
8000 const Register sum = tmp3;
8001 const Register op1 = tmp4;
8002 Register op2 = tmp2;
8003
8004 push(zlen); // save original zlen/len; restored at L_second_loop_exit
8005 push(len);
8006 addl(zlen,2);
8007 bind(L_second_loop);
8008 xorq(carry, carry); // carry = 0 at the start of each outer iteration
8009 subl(zlen, 4);
8010 subl(len, 2);
8011 push(zlen); // save this iteration's indices; popped after the inner loop
8012 push(len);
8013 cmpl(len, 0);
8014 jccb(Assembler::lessEqual, L_second_loop_exit);
8015
8016 // Multiply an array by one 64 bit long.
8017 if (UseBMI2Instructions) {
8018 op2 = rdxReg; // mulxq multiplies by rdx implicitly
8019 movq(op2, Address(x, len, Address::times_4, 0));
8020 rorxq(op2, op2, 32); // big endian to little endian, flags untouched
8021 }
8022 else {
8023 movq(op2, Address(x, len, Address::times_4, 0));
8024 rorq(op2, 32); // big endian to little endian
8025 }
8026
8027 bind(L_third_loop);
8028 decrementl(len);
8029 jccb(Assembler::negative, L_third_loop_exit);
8030 decrementl(len);
8031 jccb(Assembler::negative, L_last_x); // odd count: a single 32-bit word remains
8032
8033 movq(op1, Address(x, len, Address::times_4, 0));
8034 rorq(op1, 32);
8035
8036 bind(L_multiply);
8037 subl(zlen, 2);
8038 movq(sum, Address(z, zlen, Address::times_4, 0));
8039
8040 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
8041 if (UseBMI2Instructions) {
8042 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
8043 }
8044 else {
8045 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8046 }
8047
8048 movq(Address(z, zlen, Address::times_4, 0), sum);
8049
8050 jmp(L_third_loop);
8051 bind(L_third_loop_exit);
8052
8053 // Fourth loop
8054 // Add 64 bit long carry into z with carry propagation.
8055 // Uses offsetted zlen.
8056 add_one_64(z, zlen, carry, tmp1);
8057
8058 pop(len); // restore this iteration's starting indices for the next pass
8059 pop(zlen);
8060 jmp(L_second_loop);
8061
8062 // Next infrequent code is moved outside loops.
8063 bind(L_last_x);
8064 movl(op1, Address(x, 0)); // movl zero-extends the single 32-bit word into op1
8065 jmp(L_multiply);
8066
8067 bind(L_second_loop_exit);
8068 pop(len); // discard the inner saves, then restore the original len/zlen
8069 pop(zlen);
8070 pop(len);
8071 pop(zlen);
8072
8073 // Fifth loop
8074 // Shift z left 1 bit.
8075 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
8076
8077 // z[zlen-1] |= x[len-1] & 1;
8078 movl(tmp3, Address(x, len, Address::times_4, -4));
8079 andl(tmp3, 1); // restore the low bit lost to the right shift (bit 0 of x^2 equals bit 0 of x)
8080 orl(Address(z, zlen, Address::times_4, -4), tmp3);
8081
8082 pop(tmp5); // restore saved registers in reverse order
8083 pop(tmp4);
8084 pop(tmp3);
8085 pop(tmp2);
8086 pop(tmp1);
8087 }
8088
8089 /**
8090 * Helper function for mul_add()
8091 * Multiply the in[] by int k and add to out[] starting at offset offs using
8092 * 128 bit by 32 bit multiply and return the carry in tmp5.
8093 * Only quad int aligned length of in[] is operated on in this function.
8094 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
8095 * This function preserves out, in and k registers.
8096 * len and offset point to the appropriate index in "in" & "out" correspondingly
8097 * tmp5 has the carry.
8098 * other registers are temporary and are modified.
8099 *
8100 */
8101 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
8102 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
8103 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8104
8105 Label L_first_loop, L_first_loop_exit;
8106
8107 movl(tmp1, len);
8108 shrl(tmp1, 2); // tmp1 = number of 4-int (quad) groups to process
8109
8110 bind(L_first_loop);
8111 subl(tmp1, 1);
8112 jccb(Assembler::negative, L_first_loop_exit);
8113
8114 subl(len, 4); // consume four 32-bit words of in[]
8115 subl(offset, 4); // and four 32-bit words of out[]
8116
8117 Register op2 = tmp2;
8118 const Register sum = tmp3;
8119 const Register op1 = tmp4;
8120 const Register carry = tmp5;
8121
8122 if (UseBMI2Instructions) {
8123 op2 = rdxReg; // caller loaded k into rdx for mulxq (see mul_add)
8124 }
8125
8126 movq(op1, Address(in, len, Address::times_4, 8)); // upper 64-bit word of the quad
8127 rorq(op1, 32); // big endian to little endian
8128 movq(sum, Address(out, offset, Address::times_4, 8));
8129 rorq(sum, 32);
8130 if (UseBMI2Instructions) {
8131 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8132 }
8133 else {
8134 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8135 }
8136 // Store back in big endian from little endian
8137 rorq(sum, 0x20);
8138 movq(Address(out, offset, Address::times_4, 8), sum);
8139
8140 movq(op1, Address(in, len, Address::times_4, 0)); // lower 64-bit word of the quad
8141 rorq(op1, 32);
8142 movq(sum, Address(out, offset, Address::times_4, 0));
8143 rorq(sum, 32);
8144 if (UseBMI2Instructions) {
8145 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8146 }
8147 else {
8148 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8149 }
8150 // Store back in big endian from little endian
8151 rorq(sum, 0x20);
8152 movq(Address(out, offset, Address::times_4, 0), sum);
8153
8154 jmp(L_first_loop);
8155 bind(L_first_loop_exit);
8156 }
8157
8158 /**
8159 * Code for BigInteger::mulAdd() intrinsic
8160 *
8161 * rdi: out
8162 * rsi: in
8163 * r11: offs (out.length - offset)
8164 * rcx: len
8165 * r8: k
8166 * r12: tmp1
8167 * r13: tmp2
8168 * r14: tmp3
8169 * r15: tmp4
8170 * rbx: tmp5
8171 * Multiply the in[] by word k and add to out[], return the carry in rax
8172 */
8173 void MacroAssembler::mul_add(Register out, Register in, Register offs,
8174 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
8175 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8176
8177 Label L_carry, L_last_in, L_done;
8178
8179 // carry = 0;
8180 // for (int j=len-1; j >= 0; j--) {
8181 // long product = (in[j] & LONG_MASK) * kLong +
8182 // (out[offs] & LONG_MASK) + carry;
8183 // out[offs--] = (int)product;
8184 // carry = product >>> 32;
8185 // }
8186 //
8187 push(tmp1); // preserve callee-saved registers used as temporaries
8188 push(tmp2);
8189 push(tmp3);
8190 push(tmp4);
8191 push(tmp5);
8192
8193 Register op2 = tmp2;
8194 const Register sum = tmp3;
8195 const Register op1 = tmp4;
8196 const Register carry = tmp5;
8197
8198 if (UseBMI2Instructions) {
8199 op2 = rdxReg; // mulxq multiplies by rdx implicitly, so k must live there
8200 movl(op2, k);
8201 }
8202 else {
8203 movl(op2, k);
8204 }
8205
8206 xorq(carry, carry); // carry = 0
8207
8208 //First loop
8209
8210 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
8211 //The carry is in tmp5
8212 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
8213
8214 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
8215 decrementl(len);
8216 jccb(Assembler::negative, L_carry); // len%4 == 0: nothing left after the unrolled loop
8217 decrementl(len);
8218 jccb(Assembler::negative, L_last_in); // len%4 == 1: only a single 32-bit word remains
8219
8220 movq(op1, Address(in, len, Address::times_4, 0));
8221 rorq(op1, 32); // big endian to little endian
8222
8223 subl(offs, 2);
8224 movq(sum, Address(out, offs, Address::times_4, 0));
8225 rorq(sum, 32);
8226
8227 if (UseBMI2Instructions) {
8228 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8229 }
8230 else {
8231 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8232 }
8233
8234 // Store back in big endian from little endian
8235 rorq(sum, 0x20);
8236 movq(Address(out, offs, Address::times_4, 0), sum);
8237
8238 testl(len, len);
8239 jccb(Assembler::zero, L_carry); // len%4 == 2: no trailing odd word
8240
8241 //Multiply the last in[] entry, if any
8242 bind(L_last_in);
8243 movl(op1, Address(in, 0)); // movl zero-extends the 32-bit word
8244 movl(sum, Address(out, offs, Address::times_4, -4));
8245
8246 movl(raxReg, k);
8247 mull(op1); //tmp4 * eax -> edx:eax
8248 addl(sum, carry); // sum += carry; overflow into CF
8249 adcl(rdxReg, 0); // fold CF into high half
8250 addl(sum, raxReg); // sum += low half of product
8251 adcl(rdxReg, 0); // fold CF into high half again
8252 movl(carry, rdxReg); // new 32-bit carry
8253
8254 movl(Address(out, offs, Address::times_4, -4), sum);
8255
8256 bind(L_carry);
8257 //return tmp5/carry as carry in rax
8258 movl(rax, carry);
8259
8260 bind(L_done);
8261 pop(tmp5); // restore saved registers in reverse order
8262 pop(tmp4);
8263 pop(tmp3);
8264 pop(tmp2);
8265 pop(tmp1);
8266 }
7770 #endif 8267 #endif
7771 8268
7772 /** 8269 /**
7773 * Emits code to update CRC-32 with a byte value according to constants in table 8270 * Emits code to update CRC-32 with a byte value according to constants in table
7774 * 8271 *

mercurial