7767 pop(tmp4); |
7767 pop(tmp4); |
7768 pop(tmp3); |
7768 pop(tmp3); |
7769 pop(tmp2); |
7769 pop(tmp2); |
7770 pop(tmp1); |
7770 pop(tmp1); |
7771 } |
7771 } |
|
7772 |
|
//Helper functions for square_to_len()

/**
 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
 * Preserves x and z and modifies rest of the registers.
 *
 * x[] is an array of 32-bit ints; each 64-bit pair is stored in big-endian
 * word order, hence the rorq before each 64-bit multiply.
 * tmp1/tmp4 are used as 32-bit-int indices into x[] and z[] respectively.
 */

void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle odd xlen case first, then for even xlen do the following
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //    huge_128 product = x[j:j+1] * x[j:j+1];
  //    z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //    z[i+2:i+3] = (jlong)(product >>> 1);
  //    carry = (jlong)product;
  // }

  xorq(tmp5, tmp5);     // carry (low bit of the previous product)
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1);     // index for x
  xorl(tmp4, tmp4);     // index for z

  Label L_first_loop, L_first_loop_exit;

  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); //jump if xlen is even

  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));  // movl zero-extends the single int
  imulq(raxReg, raxReg);
  shrq(raxReg, 1);
  adcq(tmp5, 0);        // CF holds the bit shifted out above; save it as the carry
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);
  addl(tmp4, 2);

  // Square and right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);

  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4, 0));
  rorq(raxReg, 32);     // convert big-endian to little-endian
  mulq(raxReg);         // 64-bit multiply rax * rax -> rdx:rax

  // Right shift by 1 and save carry
  // The previous iteration's carry bit (in tmp5 bit 0) is shifted into CF,
  // then rotated into the top of rdx:rax; the new low bit ends up in CF.
  shrq(tmp5, 1);        // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);
  rcrq(raxReg, 1);
  adcq(tmp5, 0);        // capture the bit shifted out as the next carry

  // Store result in z
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);

  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
|
7837 |
|
7838 |
|
/**
 * Perform the following multiply add operation using BMI2 instructions
 * carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx
  // mulxq multiplies its source by the implicit rdx operand and,
  // unlike mulq, leaves the flags untouched.
  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
  addq(sum, carry);       // sum += carry
  adcq(tmp2, 0);          // fold the carry-out into the product's high half
  addq(sum, op1);         // sum += low 64 bits of the product
  adcq(tmp2, 0);          // again fold the carry-out into the high half
  movq(carry, tmp2);      // high half becomes the carry into the next chunk
}
|
7854 |
|
/**
 * Perform the following multiply add operation:
 * carry:sum = sum + op1*op2 + carry
 * Preserves op1, op2 and modifies rest of registers
 *
 * NOTE(review): raxReg/rdxReg are expected to be the real rax/rdx, since
 * mulq implicitly multiplies by rax and writes its result to rdx:rax.
 */
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
  //  rdx:rax = op1 * op2
  movq(raxReg, op2);
  mulq(op1);

  //  rdx:rax = sum + carry + rdx:rax
  addq(sum, carry);
  adcq(rdxReg, 0);      // fold carry-out into the product's high half
  addq(sum, raxReg);
  adcq(rdxReg, 0);

  // carry:sum = rdx:sum
  movq(carry, rdxReg);  // high half becomes the carry into the next chunk
}
|
7874 |
|
/**
 * Add 64 bit long carry into z[] with carry propagation.
 * Preserves z and carry register values and modifies rest of registers.
 *
 * zlen on entry is the 32-bit-int index just past the slot to add into;
 * it is decremented (and thus clobbered) while walking toward z[0].
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);        // the value to propagate while each add overflows
  subl(zlen, 2);
  addq(Address(z, zlen, Address::times_4, 0), carry);

  bind(L_fourth_loop);
  // Test CF produced by the preceding addq (jmp does not modify flags):
  // stop as soon as an add no longer carries.
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit);  // or when the front of z[] is reached
  addq(Address(z, zlen, Address::times_4, 0), tmp1);
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}
|
7895 |
|
7896 /** |
|
7897 * Shift z[] left by 1 bit. |
|
7898 * Preserves x, len, z and zlen registers and modifies rest of the registers. |
|
7899 * |
|
7900 */ |
|
7901 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { |
|
7902 |
|
7903 Label L_fifth_loop, L_fifth_loop_exit; |
|
7904 |
|
7905 // Fifth loop |
|
7906 // Perform primitiveLeftShift(z, zlen, 1) |
|
7907 |
|
7908 const Register prev_carry = tmp1; |
|
7909 const Register new_carry = tmp4; |
|
7910 const Register value = tmp2; |
|
7911 const Register zidx = tmp3; |
|
7912 |
|
7913 // int zidx, carry; |
|
7914 // long value; |
|
7915 // carry = 0; |
|
7916 // for (zidx = zlen-2; zidx >=0; zidx -= 2) { |
|
7917 // (carry:value) = (z[i] << 1) | carry ; |
|
7918 // z[i] = value; |
|
7919 // } |
|
7920 |
|
7921 movl(zidx, zlen); |
|
7922 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register |
|
7923 |
|
7924 bind(L_fifth_loop); |
|
7925 decl(zidx); // Use decl to preserve carry flag |
|
7926 decl(zidx); |
|
7927 jccb(Assembler::negative, L_fifth_loop_exit); |
|
7928 |
|
7929 if (UseBMI2Instructions) { |
|
7930 movq(value, Address(z, zidx, Address::times_4, 0)); |
|
7931 rclq(value, 1); |
|
7932 rorxq(value, value, 32); |
|
7933 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form |
|
7934 } |
|
7935 else { |
|
7936 // clear new_carry |
|
7937 xorl(new_carry, new_carry); |
|
7938 |
|
7939 // Shift z[i] by 1, or in previous carry and save new carry |
|
7940 movq(value, Address(z, zidx, Address::times_4, 0)); |
|
7941 shlq(value, 1); |
|
7942 adcl(new_carry, 0); |
|
7943 |
|
7944 orq(value, prev_carry); |
|
7945 rorq(value, 0x20); |
|
7946 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form |
|
7947 |
|
7948 // Set previous carry = new carry |
|
7949 movl(prev_carry, new_carry); |
|
7950 } |
|
7951 jmp(L_fifth_loop); |
|
7952 |
|
7953 bind(L_fifth_loop_exit); |
|
7954 } |
|
7955 |
|
7956 |
|
/**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  // NOTE(review): fifth_loop/fifth_loop_exit are declared but never used.
  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  // zlen +=2;
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //      k -= 2;
  //      long op1 = x[j:j+1];
  //      long sum = z[k:k+1];
  //      carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //      z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  // }

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  // Save the original zlen/len; both are destructively decremented below.
  push(zlen);
  push(len);
  addl(zlen, 2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  // Save this iteration's zlen/len across the inner loop, which clobbers them.
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    op2 = rdxReg;   // mulxq reads its implicit multiplier from rdx
    movq(op2, Address(x, len, Address::times_4, 0));
    rorxq(op2, op2, 32);
  }
  else {
    movq(op2, Address(x, len, Address::times_4, 0));
    rorq(op2, 32);
  }

  bind(L_third_loop);
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);   // odd count: one trailing 32-bit int left

  movq(op1, Address(x, len, Address::times_4, 0));
  rorq(op1, 32);   // big-endian int pair -> little-endian long

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4, 0));

  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses offsetted zlen.
  add_one_64(z, zlen, carry, tmp1);

  // Restore this iteration's zlen/len and go round the outer loop again.
  pop(len);
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));   // movl zero-extends the single trailing int
  jmp(L_multiply);

  bind(L_second_loop_exit);
  // Pop this iteration's saved values, then restore the entry zlen/len.
  pop(len);
  pop(zlen);
  pop(len);
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  // Restore the lowest bit lost by square_rshift's divide-by-two.
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4, -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
|
8090 |
|
8091 /** |
|
8092 * Helper function for mul_add() |
|
8093 * Multiply the in[] by int k and add to out[] starting at offset offs using |
|
8094 * 128 bit by 32 bit multiply and return the carry in tmp5. |
|
8095 * Only quad int aligned length of in[] is operated on in this function. |
|
8096 * k is in rdxReg for BMI2Instructions, for others it is in tmp2. |
|
8097 * This function preserves out, in and k registers. |
|
8098 * len and offset point to the appropriate index in "in" & "out" correspondingly |
|
8099 * tmp5 has the carry. |
|
8100 * other registers are temporary and are modified. |
|
8101 * |
|
8102 */ |
|
8103 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, |
|
8104 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, |
|
8105 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { |
|
8106 |
|
8107 Label L_first_loop, L_first_loop_exit; |
|
8108 |
|
8109 movl(tmp1, len); |
|
8110 shrl(tmp1, 2); |
|
8111 |
|
8112 bind(L_first_loop); |
|
8113 subl(tmp1, 1); |
|
8114 jccb(Assembler::negative, L_first_loop_exit); |
|
8115 |
|
8116 subl(len, 4); |
|
8117 subl(offset, 4); |
|
8118 |
|
8119 Register op2 = tmp2; |
|
8120 const Register sum = tmp3; |
|
8121 const Register op1 = tmp4; |
|
8122 const Register carry = tmp5; |
|
8123 |
|
8124 if (UseBMI2Instructions) { |
|
8125 op2 = rdxReg; |
|
8126 } |
|
8127 |
|
8128 movq(op1, Address(in, len, Address::times_4, 8)); |
|
8129 rorq(op1, 32); |
|
8130 movq(sum, Address(out, offset, Address::times_4, 8)); |
|
8131 rorq(sum, 32); |
|
8132 if (UseBMI2Instructions) { |
|
8133 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); |
|
8134 } |
|
8135 else { |
|
8136 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); |
|
8137 } |
|
8138 // Store back in big endian from little endian |
|
8139 rorq(sum, 0x20); |
|
8140 movq(Address(out, offset, Address::times_4, 8), sum); |
|
8141 |
|
8142 movq(op1, Address(in, len, Address::times_4, 0)); |
|
8143 rorq(op1, 32); |
|
8144 movq(sum, Address(out, offset, Address::times_4, 0)); |
|
8145 rorq(sum, 32); |
|
8146 if (UseBMI2Instructions) { |
|
8147 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); |
|
8148 } |
|
8149 else { |
|
8150 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); |
|
8151 } |
|
8152 // Store back in big endian from little endian |
|
8153 rorq(sum, 0x20); |
|
8154 movq(Address(out, offset, Address::times_4, 0), sum); |
|
8155 |
|
8156 jmp(L_first_loop); |
|
8157 bind(L_first_loop_exit); |
|
8158 } |
|
8159 |
|
8160 /** |
|
8161 * Code for BigInteger::mulAdd() intrinsic |
|
8162 * |
|
8163 * rdi: out |
|
8164 * rsi: in |
|
8165 * r11: offs (out.length - offset) |
|
8166 * rcx: len |
|
8167 * r8: k |
|
8168 * r12: tmp1 |
|
8169 * r13: tmp2 |
|
8170 * r14: tmp3 |
|
8171 * r15: tmp4 |
|
8172 * rbx: tmp5 |
|
8173 * Multiply the in[] by word k and add to out[], return the carry in rax |
|
8174 */ |
|
8175 void MacroAssembler::mul_add(Register out, Register in, Register offs, |
|
8176 Register len, Register k, Register tmp1, Register tmp2, Register tmp3, |
|
8177 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { |
|
8178 |
|
8179 Label L_carry, L_last_in, L_done; |
|
8180 |
|
8181 // carry = 0; |
|
8182 // for (int j=len-1; j >= 0; j--) { |
|
8183 // long product = (in[j] & LONG_MASK) * kLong + |
|
8184 // (out[offs] & LONG_MASK) + carry; |
|
8185 // out[offs--] = (int)product; |
|
8186 // carry = product >>> 32; |
|
8187 // } |
|
8188 // |
|
8189 push(tmp1); |
|
8190 push(tmp2); |
|
8191 push(tmp3); |
|
8192 push(tmp4); |
|
8193 push(tmp5); |
|
8194 |
|
8195 Register op2 = tmp2; |
|
8196 const Register sum = tmp3; |
|
8197 const Register op1 = tmp4; |
|
8198 const Register carry = tmp5; |
|
8199 |
|
8200 if (UseBMI2Instructions) { |
|
8201 op2 = rdxReg; |
|
8202 movl(op2, k); |
|
8203 } |
|
8204 else { |
|
8205 movl(op2, k); |
|
8206 } |
|
8207 |
|
8208 xorq(carry, carry); |
|
8209 |
|
8210 //First loop |
|
8211 |
|
8212 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply |
|
8213 //The carry is in tmp5 |
|
8214 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); |
|
8215 |
|
8216 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any |
|
8217 decrementl(len); |
|
8218 jccb(Assembler::negative, L_carry); |
|
8219 decrementl(len); |
|
8220 jccb(Assembler::negative, L_last_in); |
|
8221 |
|
8222 movq(op1, Address(in, len, Address::times_4, 0)); |
|
8223 rorq(op1, 32); |
|
8224 |
|
8225 subl(offs, 2); |
|
8226 movq(sum, Address(out, offs, Address::times_4, 0)); |
|
8227 rorq(sum, 32); |
|
8228 |
|
8229 if (UseBMI2Instructions) { |
|
8230 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); |
|
8231 } |
|
8232 else { |
|
8233 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); |
|
8234 } |
|
8235 |
|
8236 // Store back in big endian from little endian |
|
8237 rorq(sum, 0x20); |
|
8238 movq(Address(out, offs, Address::times_4, 0), sum); |
|
8239 |
|
8240 testl(len, len); |
|
8241 jccb(Assembler::zero, L_carry); |
|
8242 |
|
8243 //Multiply the last in[] entry, if any |
|
8244 bind(L_last_in); |
|
8245 movl(op1, Address(in, 0)); |
|
8246 movl(sum, Address(out, offs, Address::times_4, -4)); |
|
8247 |
|
8248 movl(raxReg, k); |
|
8249 mull(op1); //tmp4 * eax -> edx:eax |
|
8250 addl(sum, carry); |
|
8251 adcl(rdxReg, 0); |
|
8252 addl(sum, raxReg); |
|
8253 adcl(rdxReg, 0); |
|
8254 movl(carry, rdxReg); |
|
8255 |
|
8256 movl(Address(out, offs, Address::times_4, -4), sum); |
|
8257 |
|
8258 bind(L_carry); |
|
8259 //return tmp5/carry as carry in rax |
|
8260 movl(rax, carry); |
|
8261 |
|
8262 bind(L_done); |
|
8263 pop(tmp5); |
|
8264 pop(tmp4); |
|
8265 pop(tmp3); |
|
8266 pop(tmp2); |
|
8267 pop(tmp1); |
|
8268 } |
7772 #endif |
8269 #endif |
7773 |
8270 |
7774 /** |
8271 /** |
7775 * Emits code to update CRC-32 with a byte value according to constants in table |
8272 * Emits code to update CRC-32 with a byte value according to constants in table |
7776 * |
8273 * |