src/cpu/x86/vm/stubGenerator_x86_64.cpp

changeset 9806:758c07667682
parent    9041:95a08233f46c
parent    9788:44ef77ad417c
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Feb 04 17:38:01 2020 +0800
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Feb 04 18:13:14 2020 +0800
@@ -3639,6 +3639,175 @@
     return start;
   }
 
+
+  // swap mask for the two 64-bit halves of the GHASH state (x86 longs)
+  address generate_ghash_long_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+    address start = __ pc();
+    __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none);
+    __ emit_data64(0x0706050403020100, relocInfo::none);
+    return start;
+  }
+
+  // byte-swap mask for a big-endian 16-byte array
+  address generate_ghash_byte_swap_mask() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+    address start = __ pc();
+    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
+    __ emit_data64(0x0001020304050607, relocInfo::none);
+    return start;
+  }
+
+  /* Single and multi-block ghash operations */
+  address generate_ghash_processBlocks() {
+    __ align(CodeEntryAlignment);
+    Label L_ghash_loop, L_exit;
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+    address start = __ pc();
+
+    const Register state        = c_rarg0;
+    const Register subkeyH      = c_rarg1;
+    const Register data         = c_rarg2;
+    const Register blocks       = c_rarg3;
+
+#ifdef _WIN64
+    const int XMM_REG_LAST  = 10;
+#endif
+
+    const XMMRegister xmm_temp0 = xmm0;
+    const XMMRegister xmm_temp1 = xmm1;
+    const XMMRegister xmm_temp2 = xmm2;
+    const XMMRegister xmm_temp3 = xmm3;
+    const XMMRegister xmm_temp4 = xmm4;
+    const XMMRegister xmm_temp5 = xmm5;
+    const XMMRegister xmm_temp6 = xmm6;
+    const XMMRegister xmm_temp7 = xmm7;
+    const XMMRegister xmm_temp8 = xmm8;
+    const XMMRegister xmm_temp9 = xmm9;
+    const XMMRegister xmm_temp10 = xmm10;
+
+    __ enter();
+
+#ifdef _WIN64
+    // save the XMM registers that must be preserved (xmm6-xmm10)
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#endif
+
+    __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+    __ movdqu(xmm_temp0, Address(state, 0));
+    __ pshufb(xmm_temp0, xmm_temp10);
+
+
+    __ BIND(L_ghash_loop);
+    __ movdqu(xmm_temp2, Address(data, 0));
+    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+    __ movdqu(xmm_temp1, Address(subkeyH, 0));
+    __ pshufb(xmm_temp1, xmm_temp10);
+
+    __ pxor(xmm_temp0, xmm_temp2);
+
+    //
+    // Multiply with the hash key
+    //
+    __ movdqu(xmm_temp3, xmm_temp0);
+    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
+    __ movdqu(xmm_temp4, xmm_temp0);
+    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
+
+    __ movdqu(xmm_temp5, xmm_temp0);
+    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
+    __ movdqu(xmm_temp6, xmm_temp0);
+    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
+
+    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
+
+    __ movdqu(xmm_temp5, xmm_temp4);    // copy the contents of xmm4 into xmm5
+    __ psrldq(xmm_temp4, 8);    // shift xmm4 by 64 bits to the right
+    __ pslldq(xmm_temp5, 8);    // shift xmm5 by 64 bits to the left
+    __ pxor(xmm_temp3, xmm_temp5);
+    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
+                                        // of the carry-less multiplication of
+                                        // xmm0 by xmm1.
+
+    // We shift the result of the multiplication one bit position
+    // to the left to compensate for the fact that the bits are reversed.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp8, xmm_temp6);
+    __ pslld(xmm_temp3, 1);
+    __ pslld(xmm_temp6, 1);
+    __ psrld(xmm_temp7, 31);
+    __ psrld(xmm_temp8, 31);
+    __ movdqu(xmm_temp9, xmm_temp7);
+    __ pslldq(xmm_temp8, 4);
+    __ pslldq(xmm_temp7, 4);
+    __ psrldq(xmm_temp9, 12);
+    __ por(xmm_temp3, xmm_temp7);
+    __ por(xmm_temp6, xmm_temp8);
+    __ por(xmm_temp6, xmm_temp9);
+
+    //
+    // First phase of the reduction
+    //
+    // Copy xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+    // independently.
+    __ movdqu(xmm_temp7, xmm_temp3);
+    __ movdqu(xmm_temp8, xmm_temp3);
+    __ movdqu(xmm_temp9, xmm_temp3);
+    __ pslld(xmm_temp7, 31);    // packed left shift, << 31
+    __ pslld(xmm_temp8, 30);    // packed left shift, << 30
+    __ pslld(xmm_temp9, 25);    // packed left shift, << 25
+    __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
+    __ pxor(xmm_temp7, xmm_temp9);
+    __ movdqu(xmm_temp8, xmm_temp7);
+    __ pslldq(xmm_temp7, 12);
+    __ psrldq(xmm_temp8, 4);
+    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
+
+    //
+    // Second phase of the reduction
+    //
+    // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+    // shift operations.
+    __ movdqu(xmm_temp2, xmm_temp3);
+    __ movdqu(xmm_temp4, xmm_temp3);
+    __ movdqu(xmm_temp5, xmm_temp3);
+    __ psrld(xmm_temp2, 1);     // packed right shift, >> 1
+    __ psrld(xmm_temp4, 2);     // packed right shift, >> 2
+    __ psrld(xmm_temp5, 7);     // packed right shift, >> 7
+    __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
+    __ pxor(xmm_temp2, xmm_temp5);
+    __ pxor(xmm_temp2, xmm_temp8);
+    __ pxor(xmm_temp3, xmm_temp2);
+    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
+
+    __ decrement(blocks);
+    __ jcc(Assembler::zero, L_exit);
+    __ movdqu(xmm_temp0, xmm_temp6);    // the result becomes the new state
+    __ addptr(data, 16);                // advance to the next 16-byte block
+    __ jmp(L_ghash_loop);
+
+    __ BIND(L_exit);
+    __ pshufb(xmm_temp6, xmm_temp10);          // swap the 16-byte result back to long order
+    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
+
+#ifdef _WIN64
+    // restore the caller's XMM registers
+    for (int i = 6; i <= XMM_REG_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+#endif
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
   /**
    *  Arguments:
    *
@@ -4077,6 +4246,13 @@
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
 
+    // Generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
+
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
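
For reference, the two mask constants in this change can be read through a
scalar model of PSHUFB, where destination byte i takes the source byte
selected by the low four bits of mask byte i. This is a minimal sketch; the
helper name is illustrative, not part of the changeset:

    #include <cstdint>

    // Scalar model of PSHUFB (high mask bits never set in these masks):
    // out[i] = src[mask[i] & 0x0f] for each of the 16 byte lanes.
    static void pshufb_model(uint8_t out[16], const uint8_t src[16],
                             const uint8_t mask[16]) {
      for (int i = 0; i < 16; i++) {
        out[i] = src[mask[i] & 0x0f];
      }
    }

Since emit_data64 stores the low quadword first, ghash_long_swap_mask lays
down bytes 08..0f then 00..07, which under this model swaps the two 64-bit
halves of the register: the state and subkeyH arguments arrive from the JVM
as pairs of native longs, so only the halves need exchanging.
ghash_byte_swap_mask lays down 0f..08 then 07..00, reversing all 16 bytes,
which converts the big-endian data blocks to register order.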
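
The four pclmulqdq instructions build the 256-bit carry-less product from
64x64-bit halves, exactly as the a0*b0/a0*b1/a1*b0/a1*b1 comments indicate.
A sketch of that composition, with a bit-by-bit stand-in for the instruction
(helper names are illustrative):

    #include <cstdint>

    // Carry-less (GF(2)[x]) 64x64 -> 128-bit multiply; stands in for one
    // PCLMULQDQ. hi:lo receive the 128-bit product.
    static void clmul64(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo) {
      hi = lo = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          lo ^= a << i;
          if (i != 0) hi ^= a >> (64 - i);
        }
      }
    }

    // 128x128 -> 256-bit carry-less multiply, combined as the stub does
    // with xmm3..xmm6. a[0]/b[0] are the low qwords; r[0] is the lowest
    // qword of the result.
    static void clmul128(const uint64_t a[2], const uint64_t b[2],
                         uint64_t r[4]) {
      uint64_t lo_hi, lo_lo, m1_hi, m1_lo, m2_hi, m2_lo, hi_hi, hi_lo;
      clmul64(a[0], b[0], lo_hi, lo_lo);   // pclmulqdq(xmm3, xmm1, 0)
      clmul64(a[0], b[1], m1_hi, m1_lo);   // pclmulqdq(xmm4, xmm1, 16)
      clmul64(a[1], b[0], m2_hi, m2_lo);   // pclmulqdq(xmm5, xmm1, 1)
      clmul64(a[1], b[1], hi_hi, hi_lo);   // pclmulqdq(xmm6, xmm1, 17)
      uint64_t mid_lo = m1_lo ^ m2_lo;     // pxor(xmm4, xmm5)
      uint64_t mid_hi = m1_hi ^ m2_hi;
      r[0] = lo_lo;                        // <xmm6:xmm3> after the
      r[1] = lo_hi ^ mid_lo;               // pslldq/psrldq/pxor step:
      r[2] = hi_lo ^ mid_hi;               // the middle term straddles
      r[3] = hi_hi;                        // the 128-bit boundary
    }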
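
GHASH's bit order within each block is the reverse of what pclmulqdq
assumes, and the standard fix is to shift the whole 256-bit product left by
one bit. SSE has no 128-bit bit shift, so the stub synthesizes it from
32-bit lane shifts (pslld/psrld) plus byte shifts (pslldq/psrldq) that carry
bits across lane and register boundaries. The net effect is simply the
following (illustrative name):

    #include <cstdint>

    // Shift a 256-bit value left by one bit; r[0] is the least
    // significant qword. Net effect of the pslld/psrld/pslldq/por block.
    static void shl256_by1(uint64_t r[4]) {
      for (int i = 3; i > 0; i--) {
        r[i] = (r[i] << 1) | (r[i - 1] >> 63);   // carry across qwords
      }
      r[0] <<= 1;
    }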
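
The two reduction phases then fold the high 128 bits of the product back in
modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1; the 31/30/25 and
1/2/7 shift counts are the 32-bit-lane decomposition of that identity. End
to end, each L_ghash_loop iteration performs the GF(2^128) multiplication
defined in NIST SP 800-38D, for which the bit-by-bit reference below can
serve as a check (values are big-endian 128-bit quantities held as hi:lo
qword pairs; the helper name is illustrative):

    #include <cstdint>

    // z = x * h in GF(2^128) mod x^128 + x^7 + x^2 + x + 1, using GCM's
    // bit order (the MSB of byte 0 is the coefficient of x^0).
    static void ghash_mul(uint64_t& zhi, uint64_t& zlo,
                          uint64_t xhi, uint64_t xlo,
                          uint64_t hhi, uint64_t hlo) {
      const uint64_t R = 0xe100000000000000ULL;  // x^128 folds to this
      uint64_t vhi = hhi, vlo = hlo;
      zhi = zlo = 0;
      for (int i = 127; i >= 0; i--) {           // MSB of byte 0 first
        if ((i >= 64 ? xhi >> (i - 64) : xlo >> i) & 1) {
          zhi ^= vhi;
          zlo ^= vlo;
        }
        uint64_t carry = vlo & 1;                // coefficient of x^127
        vlo = (vlo >> 1) | (vhi << 63);          // v = v * x
        vhi >>= 1;
        if (carry) vhi ^= R;                     // reduce mod g(x)
      }
    }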
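
Finally, a scalar model of the stub's overall contract with its four
arguments (state, subkeyH, data, blocks), reusing ghash_mul from the
previous sketch. For simplicity it treats state and subkeyH as big-endian
byte buffers; the real stub receives them as pairs of native longs, which is
why it applies the qword-swap mask to them and the full byte reversal only
to data:

    #include <cstdint>

    static uint64_t load_be64(const uint8_t* p) {
      uint64_t v = 0;
      for (int i = 0; i < 8; i++) v = (v << 8) | p[i];
      return v;
    }

    static void store_be64(uint8_t* p, uint64_t v) {
      for (int i = 7; i >= 0; i--) { p[i] = (uint8_t)v; v >>= 8; }
    }

    // Per block: state = (state ^ block) * subkeyH, then advance data by
    // 16 bytes; the state is written back once at the end, as in the stub.
    static void ghash_process_blocks(uint8_t state[16],
                                     const uint8_t subkeyH[16],
                                     const uint8_t* data, long blocks) {
      uint64_t shi = load_be64(state),   slo = load_be64(state + 8);
      uint64_t hhi = load_be64(subkeyH), hlo = load_be64(subkeyH + 8);
      while (blocks-- > 0) {
        shi ^= load_be64(data);          // pxor(xmm_temp0, xmm_temp2)
        slo ^= load_be64(data + 8);
        // safe despite aliasing: xhi/xlo are passed by value
        ghash_mul(shi, slo, shi, slo, hhi, hlo);
        data += 16;                      // addptr(data, 16)
      }
      store_be64(state, shi);
      store_be64(state + 8, slo);
    }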
