7765 pop(tmp4); |
7765 pop(tmp4); |
7766 pop(tmp3); |
7766 pop(tmp3); |
7767 pop(tmp2); |
7767 pop(tmp2); |
7768 pop(tmp1); |
7768 pop(tmp1); |
7769 } |
7769 } |
|
7770 |
|
//Helper functions for square_to_len()

/**
 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
 * Preserves x and z and modifies rest of the registers.
 */

void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle odd xlen case first, then for even xlen do the following
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //    huge_128 product = x[j:j+1] * x[j:j+1];
  //    z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //    z[i+2:i+3] = (jlong)(product >>> 1);
  //    carry = (jlong)product;
  // }

  xorq(tmp5, tmp5);     // carry
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1);     // index for x
  xorl(tmp4, tmp4);     // index for z

  Label L_first_loop, L_first_loop_exit;

  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); //jump if xlen is even

  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
  imulq(raxReg, raxReg);  // 32x32 square fits entirely in the 64-bit raxReg
  shrq(raxReg, 1);        // the bit shifted out lands in CF ...
  adcq(tmp5, 0);          // ... and is captured here as the running carry
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);       // consumed one 32-bit word of x
  addl(tmp4, 2);          // produced one 64-bit (two-int) word of z

  // Square and right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);

  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4, 0));
  rorq(raxReg, 32);    // convert big-endian to little-endian
  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax

  // Right shift by 1 and save carry
  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);     // rotate-through-carry chains the shifted-out bit downward
  rcrq(raxReg, 1);
  adcq(tmp5, 0);       // bit shifted out of rax becomes next iteration's carry

  // Store result in z
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);

  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
|
7835 |
|
7836 |
|
/**
 * Perform the following multiply add operation using BMI2 instructions
 * carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx
  mulxq(tmp2, op1, op1);  // op1 * op2 -> tmp2:op1 (mulx multiplies by rdx == op2 implicitly, flags untouched)
  addq(sum, carry);       // sum += carry
  adcq(tmp2, 0);          // fold the carry-out of the add into the high half
  addq(sum, op1);         // sum += low 64 bits of the product
  adcq(tmp2, 0);
  movq(carry, tmp2);      // new carry = high half plus accumulated carry-outs
}
|
7852 |
|
/**
 * Perform the following multiply add operation:
 * carry:sum = sum + op1*op2 + carry
 * Preserves op1, op2 and modifies rest of registers
 */
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
  // rdx:rax = op1 * op2
  movq(raxReg, op2);
  mulq(op1);           // one-operand mul: rax * op1 -> rdx:rax

  // rdx:rax = sum + carry + rdx:rax
  addq(sum, carry);
  adcq(rdxReg, 0);     // absorb carry-out into the high half
  addq(sum, raxReg);
  adcq(rdxReg, 0);

  // carry:sum = rdx:sum
  movq(carry, rdxReg);
}
|
7872 |
|
/**
 * Add 64 bit long carry into z[] with carry propagation.
 * Preserves z and carry register values and modifies rest of registers.
 *
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);   // constant used to propagate a single carry upward
  subl(zlen, 2);   // step back one 64-bit (two-int) slot
  addq(Address(z, zlen, Address::times_4, 0), carry);

  bind(L_fourth_loop);
  // CF here is the carry-out of the addq just executed (jmp below preserves flags)
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit);      // ran off the front of z
  addq(Address(z, zlen, Address::times_4, 0), tmp1);  // ripple carry into next higher slot
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}
|
7893 |
|
/**
 * Shift z[] left by 1 bit.
 * Preserves x, len, z and zlen registers and modifies rest of the registers.
 *
 */
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  Label L_fifth_loop, L_fifth_loop_exit;

  // Fifth loop
  // Perform primitiveLeftShift(z, zlen, 1)

  const Register prev_carry = tmp1;
  const Register new_carry = tmp4;
  const Register value = tmp2;
  const Register zidx = tmp3;

  // int zidx, carry;
  // long value;
  // carry = 0;
  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
  //    (carry:value) = (z[i] << 1) | carry ;
  //    z[i] = value;
  // }

  movl(zidx, zlen);
  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register

  bind(L_fifth_loop);
  decl(zidx); // Use decl to preserve carry flag
  decl(zidx); // (dec never modifies CF, unlike sub)
  jccb(Assembler::negative, L_fifth_loop_exit);

  if (UseBMI2Instructions) {
    // CF still holds the previous iteration's shifted-out bit:
    // mov/jcc/jmp leave flags alone and rorx (unlike ror) writes no flags.
    movq(value, Address(z, zidx, Address::times_4, 0));
    rclq(value, 1);           // rotate CF in at the bottom, top bit out to CF
    rorxq(value, value, 32);
    movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
  }
  else {
    // clear new_carry
    xorl(new_carry, new_carry);

    // Shift z[i] by 1, or in previous carry and save new carry
    movq(value, Address(z, zidx, Address::times_4, 0));
    shlq(value, 1);           // bit shifted out of the top lands in CF
    adcl(new_carry, 0);       // capture it as this iteration's carry-out

    orq(value, prev_carry);
    rorq(value, 0x20);        // swap halves back to big endian int order
    movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form

    // Set previous carry = new carry
    movl(prev_carry, new_carry);
  }
  jmp(L_fifth_loop);

  bind(L_fifth_loop_exit);
}
|
7953 |
|
7954 |
|
/**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  // zlen +=2;
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //      k -= 2;
  //      long op1 = x[j:j+1];
  //      long sum = z[k:k+1];
  //      carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //      z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  // }

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  push(zlen);   // outer save of the original zlen/len (restored at L_second_loop_exit)
  push(len);
  addl(zlen,2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  push(zlen);   // per-iteration save; the inner loop below consumes zlen/len
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    op2 = rdxReg;   // mulx takes its invariant operand implicitly from rdx
    movq(op2, Address(x, len, Address::times_4, 0));
    rorxq(op2, op2, 32);
  }
  else {
    movq(op2, Address(x, len, Address::times_4, 0));
    rorq(op2, 32);
  }

  bind(L_third_loop);
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);   // odd count: only one 32-bit word remains

  movq(op1, Address(x, len, Address::times_4, 0));
  rorq(op1, 32);

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4, 0));

  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses offsetted zlen.
  add_one_64(z, zlen, carry, tmp1);

  pop(len);    // restore this iteration's counters saved at the top of L_second_loop
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));   // single leftover 32-bit word, zero-extended
  jmp(L_multiply);

  bind(L_second_loop_exit);
  pop(len);    // discard the per-iteration saves ...
  pop(zlen);
  pop(len);    // ... then restore the original values pushed before the loop
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4, -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
|
8088 |
|
/**
 * Helper function for mul_add()
 * Multiply the in[] by int k and add to out[] starting at offset offs using
 * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only quad int aligned length of in[] is operated on in this function.
 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
 * This function preserves out, in and k registers.
 * len and offset point to the appropriate index in "in" & "out" correspondingly
 * tmp5 has the carry.
 * other registers are temporary and are modified.
 *
 */
void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_first_loop, L_first_loop_exit;

  movl(tmp1, len);
  shrl(tmp1, 2);   // loop count = len / 4 (quad-int, i.e. 16-byte, groups)

  bind(L_first_loop);
  subl(tmp1, 1);
  jccb(Assembler::negative, L_first_loop_exit);

  subl(len, 4);
  subl(offset, 4);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    op2 = rdxReg;   // caller already placed k in rdx for mulx
  }

  // Upper 64-bit half of the 128-bit group (bytes +8..+15)
  movq(op1, Address(in, len, Address::times_4, 8));
  rorq(op1, 32);   // big-endian int pair -> little-endian long
  movq(sum, Address(out, offset, Address::times_4, 8));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4, 8), sum);

  // Lower 64-bit half of the group (bytes +0..+7); carry chains from above
  movq(op1, Address(in, len, Address::times_4, 0));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4, 0));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4, 0), sum);

  jmp(L_first_loop);
  bind(L_first_loop_exit);
}
|
8157 |
|
/**
 * Code for BigInteger::mulAdd() intrinsic
 *
 * rdi: out
 * rsi: in
 * r11: offs (out.length - offset)
 * rcx: len
 * r8:  k
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 * Multiply the in[] by word k and add to out[], return the carry in rax
 */
void MacroAssembler::mul_add(Register out, Register in, Register offs,
  Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_carry, L_last_in, L_done;

  // carry = 0;
  // for (int j=len-1; j >= 0; j--) {
  //    long product = (in[j] & LONG_MASK) * kLong +
  //                   (out[offs] & LONG_MASK) + carry;
  //    out[offs--] = (int)product;
  //    carry = product >>> 32;
  // }
  //
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    op2 = rdxReg;   // mulx needs the multiplier k in rdx
    movl(op2, k);   // movl zero-extends k into the full 64-bit register
  }
  else {
    movl(op2, k);
  }

  xorq(carry, carry);

  //First loop

  //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
  //The carry is in tmp5
  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);

  //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
  decrementl(len);
  jccb(Assembler::negative, L_carry);    // len was a multiple of 4: nothing left
  decrementl(len);
  jccb(Assembler::negative, L_last_in);  // exactly one 32-bit word remains

  // Two or three words remain: handle one 64-bit pair here
  movq(op1, Address(in, len, Address::times_4, 0));
  rorq(op1, 32);   // big-endian int pair -> little-endian long

  subl(offs, 2);
  movq(sum, Address(out, offs, Address::times_4, 0));
  rorq(sum, 32);

  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offs, Address::times_4, 0), sum);

  testl(len, len);
  jccb(Assembler::zero, L_carry);   // len was even: done after the pair above

  //Multiply the last in[] entry, if any
  bind(L_last_in);
  movl(op1, Address(in, 0));
  movl(sum, Address(out, offs, Address::times_4, -4));

  // 32x32 -> 64 multiply-accumulate done with 32-bit ops: edx ends up with the carry
  movl(raxReg, k);
  mull(op1);           //tmp4 * eax -> edx:eax
  addl(sum, carry);
  adcl(rdxReg, 0);     // absorb carry-out into the high word
  addl(sum, raxReg);
  adcl(rdxReg, 0);
  movl(carry, rdxReg);

  movl(Address(out, offs, Address::times_4, -4), sum);

  bind(L_carry);
  //return tmp5/carry as carry in rax
  movl(rax, carry);

  bind(L_done);
  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
7770 #endif |
8267 #endif |
7771 |
8268 |
7772 /** |
8269 /** |
7773 * Emits code to update CRC-32 with a byte value according to constants in table |
8270 * Emits code to update CRC-32 with a byte value according to constants in table |
7774 * |
8271 * |