1.1 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Feb 04 17:38:01 2020 +0800
1.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Feb 04 18:13:14 2020 +0800
1.3 @@ -3639,6 +3639,175 @@
1.4      return start;
1.5    }
1.6  
1.7 +
1.8 +  // byte swap x86 long
1.9 +  address generate_ghash_long_swap_mask() {
1.10 +    __ align(CodeEntryAlignment);
1.11 +    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
1.12 +    address start = __ pc();
1.13 +    __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
1.14 +    __ emit_data64(0x0706050403020100, relocInfo::none );
1.15 +    return start;
1.16 +  }
1.17 +
1.18 +  // byte swap x86 byte array
1.19 +  address generate_ghash_byte_swap_mask() {
1.20 +    __ align(CodeEntryAlignment);
1.21 +    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
1.22 +    address start = __ pc();
1.23 +    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
1.24 +    __ emit_data64(0x0001020304050607, relocInfo::none );
1.25 +    return start;
1.26 +  }
1.27 +
1.28 +  /* Single and multi-block ghash operations */
1.29 +  address generate_ghash_processBlocks() {
1.30 +    __ align(CodeEntryAlignment);
1.31 +    Label L_ghash_loop, L_exit;
1.32 +    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
1.33 +    address start = __ pc();
1.34 +
1.35 +    const Register state   = c_rarg0;  // 128-bit GHASH accumulator
1.36 +    const Register subkeyH = c_rarg1;  // hash subkey H
1.37 +    const Register data    = c_rarg2;  // input byte array
1.38 +    const Register blocks  = c_rarg3;  // number of 16-byte blocks
1.39 +
1.40 +#ifdef _WIN64
1.41 +    const int XMM_REG_LAST = 10;
1.42 +#endif
1.43 +
1.44 +    const XMMRegister xmm_temp0  = xmm0;
1.45 +    const XMMRegister xmm_temp1  = xmm1;
1.46 +    const XMMRegister xmm_temp2  = xmm2;
1.47 +    const XMMRegister xmm_temp3  = xmm3;
1.48 +    const XMMRegister xmm_temp4  = xmm4;
1.49 +    const XMMRegister xmm_temp5  = xmm5;
1.50 +    const XMMRegister xmm_temp6  = xmm6;
1.51 +    const XMMRegister xmm_temp7  = xmm7;
1.52 +    const XMMRegister xmm_temp8  = xmm8;
1.53 +    const XMMRegister xmm_temp9  = xmm9;
1.54 +    const XMMRegister xmm_temp10 = xmm10;
1.55 +
1.56 +    __ enter();
1.57 +
1.58 +#ifdef _WIN64
1.59 +    // save the xmm registers which must be preserved 6-10
1.60 +    __ subptr(rsp, -rsp_after_call_off * wordSize);
1.61 +    for (int i = 6; i <= XMM_REG_LAST; i++) {
1.62 +      __ movdqu(xmm_save(i), as_XMMRegister(i));
1.63 +    }
1.64 +#endif
1.65 +
1.66 +    __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));  // mask to byte-swap the two 64-bit halves
1.67 +
1.68 +    __ movdqu(xmm_temp0, Address(state, 0));  // load 128-bit state
1.69 +    __ pshufb(xmm_temp0, xmm_temp10);         // byte-swap the state
1.70 +
1.71 +
1.72 +    __ BIND(L_ghash_loop);
1.73 +    __ movdqu(xmm_temp2, Address(data, 0));   // load next 16-byte data block
1.74 +    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));  // byte-swap the data block
1.75 +
1.76 +    __ movdqu(xmm_temp1, Address(subkeyH, 0));  // load hash subkey H
1.77 +    __ pshufb(xmm_temp1, xmm_temp10);           // byte-swap H
1.78 +
1.79 +    __ pxor(xmm_temp0, xmm_temp2);  // xor the data block into the state
1.80 +
1.81 +    //
1.82 +    // Multiply with the hash key
1.83 +    //
1.84 +    __ movdqu(xmm_temp3, xmm_temp0);
1.85 +    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
1.86 +    __ movdqu(xmm_temp4, xmm_temp0);
1.87 +    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
1.88 +
1.89 +    __ movdqu(xmm_temp5, xmm_temp0);
1.90 +    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
1.91 +    __ movdqu(xmm_temp6, xmm_temp0);
1.92 +    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
1.93 +
1.94 +    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
1.95 +
1.96 +    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
1.97 +    __ psrldq(xmm_temp4, 8);    // shift xmm4 64 bits to the right
1.98 +    __ pslldq(xmm_temp5, 8);    // shift xmm5 64 bits to the left
1.99 +    __ pxor(xmm_temp3, xmm_temp5);
1.100 +    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
1.101 +                                        // of the carry-less multiplication of
1.102 +                                        // xmm0 by xmm1.
1.103 +
1.104 +    // We shift the result of the multiplication by one bit position
1.105 +    // to the left to cope with the fact that the bits are reversed.
1.106 +    __ movdqu(xmm_temp7, xmm_temp3);
1.107 +    __ movdqu(xmm_temp8, xmm_temp6);
1.108 +    __ pslld(xmm_temp3, 1);
1.109 +    __ pslld(xmm_temp6, 1);
1.110 +    __ psrld(xmm_temp7, 31);    // carry bits out of each 32-bit lane
1.111 +    __ psrld(xmm_temp8, 31);
1.112 +    __ movdqu(xmm_temp9, xmm_temp7);
1.113 +    __ pslldq(xmm_temp8, 4);    // align the carries with the lanes they feed
1.114 +    __ pslldq(xmm_temp7, 4);
1.115 +    __ psrldq(xmm_temp9, 12);   // carry crossing from xmm3 into xmm6
1.116 +    __ por(xmm_temp3, xmm_temp7);
1.117 +    __ por(xmm_temp6, xmm_temp8);
1.118 +    __ por(xmm_temp6, xmm_temp9);
1.119 +
1.120 +    //
1.121 +    // First phase of the reduction
1.122 +    //
1.123 +    // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
1.124 +    // independently.
1.125 +    __ movdqu(xmm_temp7, xmm_temp3);
1.126 +    __ movdqu(xmm_temp8, xmm_temp3);
1.127 +    __ movdqu(xmm_temp9, xmm_temp3);
1.128 +    __ pslld(xmm_temp7, 31);    // packed left shift, << 31
1.129 +    __ pslld(xmm_temp8, 30);    // packed left shift, << 30
1.130 +    __ pslld(xmm_temp9, 25);    // packed left shift, << 25
1.131 +    __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
1.132 +    __ pxor(xmm_temp7, xmm_temp9);
1.133 +    __ movdqu(xmm_temp8, xmm_temp7);
1.134 +    __ pslldq(xmm_temp7, 12);
1.135 +    __ psrldq(xmm_temp8, 4);
1.136 +    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
1.137 +
1.138 +    //
1.139 +    // Second phase of the reduction
1.140 +    //
1.141 +    // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
1.142 +    // shift operations.
1.143 +    __ movdqu(xmm_temp2, xmm_temp3);
1.144 +    __ movdqu(xmm_temp4, xmm_temp3);
1.145 +    __ movdqu(xmm_temp5, xmm_temp3);
1.146 +    __ psrld(xmm_temp2, 1);     // packed right shift, >> 1
1.147 +    __ psrld(xmm_temp4, 2);     // packed right shift, >> 2
1.148 +    __ psrld(xmm_temp5, 7);     // packed right shift, >> 7
1.149 +    __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
1.150 +    __ pxor(xmm_temp2, xmm_temp5);
1.151 +    __ pxor(xmm_temp2, xmm_temp8);
1.152 +    __ pxor(xmm_temp3, xmm_temp2);
1.153 +    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
1.154 +
1.155 +    __ decrement(blocks);               // one block consumed
1.156 +    __ jcc(Assembler::zero, L_exit);
1.157 +    __ movdqu(xmm_temp0, xmm_temp6);    // result becomes the state for the next block
1.158 +    __ addptr(data, 16);                // advance to the next 16-byte block
1.159 +    __ jmp(L_ghash_loop);
1.160 +
1.161 +    __ BIND(L_exit);
1.162 +    __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
1.163 +    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
1.164 +
1.165 +#ifdef _WIN64
1.166 +    // restore xmm regs belonging to calling function
1.167 +    for (int i = 6; i <= XMM_REG_LAST; i++) {
1.168 +      __ movdqu(as_XMMRegister(i), xmm_save(i));
1.169 +    }
1.170 +#endif
1.171 +    __ leave();
1.172 +    __ ret(0);
1.173 +    return start;
1.174 +  }
1.175 +
1.176    /**
1.177     * Arguments:
1.178     *
1.179 @@ -4077,6 +4246,13 @@
1.180        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
1.181      }
1.182  
1.183 +    // Generate GHASH intrinsics code
1.184 +    if (UseGHASHIntrinsics) {
1.185 +      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
1.186 +      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
1.187 +      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
1.188 +    }
1.189 +
1.190      // Safefetch stubs.
1.191      generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
1.192                                                     &StubRoutines::_safefetch32_fault_pc,