src/cpu/x86/vm/stubGenerator_x86_64.cpp

changeset 4205
a3ecd773a7b9
parent 4142
d8ce2825b193
child 4299
f34d701e952e
child 4318
cd3d6a6b95d9
     1.1 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Oct 23 13:06:37 2012 -0700
     1.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Wed Oct 24 14:33:22 2012 -0700
     1.3 @@ -2941,6 +2941,548 @@
     1.4      }
     1.5    }
     1.6  
  // AES intrinsic stubs
  // AES always processes data in 16-byte (128-bit) blocks, regardless of key length.
  enum {AESBlockSize = 16};
     1.9 +
  // Emits the 16-byte pshufb control mask used by load_key.
  // The mask bytes are 3,2,1,0, 7,6,5,4, ... i.e. it reverses the byte order
  // within each 32-bit word, converting the big-endian int ordering of the
  // Java expanded key into the lane order the AES-NI instructions expect.
  address generate_key_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
    address start = __ pc();
    __ emit_data64( 0x0405060700010203, relocInfo::none );
    __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
    return start;
  }
    1.18 +
    1.19 +  // Utility routine for loading a 128-bit key word in little endian format
    1.20 +  // can optionally specify that the shuffle mask is already in an xmmregister
    1.21 +  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    1.22 +    __ movdqu(xmmdst, Address(key, offset));
    1.23 +    if (xmm_shuf_mask != NULL) {
    1.24 +      __ pshufb(xmmdst, xmm_shuf_mask);
    1.25 +    } else {
    1.26 +      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    1.27 +    }
    1.28 +  }
    1.29 +
  // Emits one AES encrypt round: loads the round key at key+offset into xmmtmp
  // (byte-swapped, see load_key) and applies aesenc to xmmdst.
  // can optionally specify that the shuffle mask is already in an xmmregister
  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesenc(xmmdst, xmmtmp);
  }
    1.36 +
  // Emits one AES decrypt round: loads the round key at key+offset into xmmtmp
  // (byte-swapped, see load_key) and applies aesdec to xmmdst.
  // can optionally specify that the shuffle mask is already in an xmmregister
  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesdec(xmmdst, xmmtmp);
  }
    1.43 +
    1.44 +
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Encrypts a single 16-byte block.  Supports 128/192/256-bit keys by
  // branching on the length of the expanded-key int array (44/52/60 ints).
  address generate_aescrypt_encryptBlock() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rax;      // clobbered; also the (zeroed) return register

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    const XMMRegister xmm_key_shuf_mask = xmm2;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Fetch the int[] length of the expanded key (44/52/60 for 128/192/256-bit keys).
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    // keylen = # of 32-bit words, convert to 128-bit words
    __ shrl(keylen, 2);
    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
                           // keylen is now the number of EXTRA key words: 0, 2 or 4

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input

    // For encryption, the java expanded key ordering is just what we need
    // we don't know if the key is aligned, hence not using load-execute form

    // round 0: whiten the input with key word 0x00
    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp);
    // rounds 1..9 are common to all key sizes
    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
    }
    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
    __ cmpl(keylen, 0);
    __ jcc(Assembler::equal, L_doLast);                // 128-bit key: 0xa0 is the last round key
    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
    __ subl(keylen, 2);
    __ jcc(Assembler::equal, L_doLast);                // 192-bit key: 0xc0 is the last round key
    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);  // 256-bit key: 0xe0 is the last round key

    __ BIND(L_doLast);
    // xmm_temp holds the final round key for whichever key size branched here
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, 0), xmm_result);        // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
   1.107 +
   1.108 +
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Decrypts a single 16-byte block.  Mirrors generate_aescrypt_encryptBlock,
  // but the Java expanded-key ordering is rotated one position for decryption,
  // so rounds start at key word 0x10 and the final aesdeclast uses key 0x00.
  address generate_aescrypt_decryptBlock() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rax;      // clobbered; also the (zeroed) return register

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    const XMMRegister xmm_key_shuf_mask = xmm2;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Fetch the int[] length of the expanded key (44/52/60 for 128/192/256-bit keys).
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    // keylen = # of 32-bit words, convert to 128-bit words
    __ shrl(keylen, 2);
    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
                           // keylen is now the number of EXTRA key words: 0, 2 or 4

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));

    // for decryption java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    // we don't know if the key is aligned, hence not using load-execute form
    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
    __ pxor  (xmm_result, xmm_temp);
    // rounds common to all key sizes (key words 0x20..0xa0)
    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
    }
    __ cmpl(keylen, 0);
    __ jcc(Assembler::equal, L_doLast);   // 128-bit key: done with numbered rounds
    // only in 192 and 256 bit keys
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
    __ subl(keylen, 2);
    __ jcc(Assembler::equal, L_doLast);   // 192-bit key: done with numbered rounds
    // only in 256 bit keys
    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    // for decryption the aesdeclast operation is always on key+0x00
    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
    __ aesdeclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, 0), xmm_result);  // store the result

    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
   1.174 +
   1.175 +
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // CBC encrypt: for each block, out[i] = AES_encrypt(in[i] ^ r); r = out[i].
  // The chaining dependency is inherently serial, so blocks are processed one
  // at a time.  Round keys 0x00-0xa0 are preloaded into xmm2-xmm12; the extra
  // 192/256-bit key words are reloaded from memory each iteration.
  // Input length is assumed to be a non-zero multiple of AESBlockSize.
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r10;      // pick the first volatile windows register
#endif
    const Register pos         = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // keys 0-10 preloaded into xmm2-xmm12
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 12;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved 6-12
    // (xmm6-xmm15 are callee-saved in the Windows x64 ABI)
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 2 thru 12 with key 0x00 - 0xa0
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }

    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key10);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    // loop while len != 0; jcc consumes the flags set by subptr above
    __ jcc(Assembler::notEqual, L_loopTop_128);

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0); // return 0 (why?)
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    // note: unlike the 128-bit loop this runs through xmm_key10 (key 0xa0) as a
    // regular round, then does 0xb0 from memory and finishes with 0xc0
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
    load_key(xmm_temp, key, 0xc0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    // extra rounds for 256-bit keys: 0xb0-0xd0 from memory, finish with 0xe0
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
   1.327 +
   1.328 +
   1.329 +
  // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // CBC decrypt: out[i] = AES_decrypt(in[i]) ^ in[i-1] (in[-1] = r vector).
  // Unlike encryption there is no serial chaining dependency on the output,
  // so the 128-bit path decrypts 4 blocks in parallel; 192/256-bit paths are
  // single-block loops.  Input length assumed a multiple of AESBlockSize.

  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256;
    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r10;      // pick the first volatile windows register
#endif
    const Register pos         = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    // round keys preloaded into xmm5-xmm15 (note: NOT xmm2-xmm12 as in the
    // encrypt stub); key word 0x00 ends up in xmm15 for the final aesdeclast
    const int XMM_REG_NUM_KEY_FIRST = 5;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved 6-15
    // (xmm6-xmm15 are callee-saved in the Windows x64 ABI)
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif
    // the java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;  // last register gets key 0x00
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }

    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm0;
    const XMMRegister xmm_result1 = xmm2;
    const XMMRegister xmm_result2 = xmm3;
    const XMMRegister xmm_result3 = xmm4;

    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);


    // 128-bit code follows here, parallelized
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_multiBlock_loopTop_128);
    __ cmpptr(len_reg, 4*AESBlockSize);           // see if at least 4 blocks left
    __ jcc(Assembler::less, L_singleBlock_loopTop_128);

    __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize));   // get next 4 blocks into xmmresult registers
    __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
    __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
    __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));

// applies the given op to all four result registers with the same source
#define DoFour(opc, src_reg)                    \
    __ opc(xmm_result0, src_reg);               \
    __ opc(xmm_result1, src_reg);               \
    __ opc(xmm_result2, src_reg);               \
    __ opc(xmm_result3, src_reg);

    DoFour(pxor, xmm_key_first);
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      DoFour(aesdec, as_XMMRegister(rnum));
    }
    DoFour(aesdeclast, xmm_key_last);
    // for each result, xor with the r vector of previous cipher block
    // (ciphertexts are re-read from 'from' before the results are stored below,
    //  so this also works when from == to)
    __ pxor(xmm_result0, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
    __ pxor(xmm_result1, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
    __ pxor(xmm_result2, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
    __ pxor(xmm_result3, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));   // this will carry over to next set of blocks

    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);

    __ addptr(pos, 4*AESBlockSize);
    __ subptr(len_reg, 4*AESBlockSize);
    __ jmp(L_multiBlock_loopTop_128);

    // registers used in the non-parallelized loops
    const XMMRegister xmm_prev_block_cipher_save = xmm2;
    const XMMRegister xmm_temp   = xmm3;

    // single-block loop handles the 0-3 blocks left over by the 4-way loop
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_128);
    __ cmpptr(len_reg, 0);           // any blocks left??
    __ jcc(Assembler::equal, L_exit);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    __ aesdeclast(xmm_result, xmm_key_last);
    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jmp(L_singleBlock_loopTop_128);


    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
#ifdef _WIN64
    // restore regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0); // return 0 (why?)
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);


    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be optimized to use parallelism)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_192);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 192-bit key goes up to c0
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
    __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    // loop while len != 0; jcc consumes the flags set by subptr above
    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be optimized to use parallelism)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_256);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 256-bit key goes up to e0
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
    __ aesdeclast(xmm_result, xmm_key_last);             // xmm15 came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
   1.546 +
   1.547 +
   1.548 +
   1.549  #undef __
   1.550  #define __ masm->
   1.551  
   1.552 @@ -3135,6 +3677,16 @@
   1.553      generate_arraycopy_stubs();
   1.554  
   1.555      generate_math_stubs();
   1.556 +
   1.557 +    // don't bother generating these AES intrinsic stubs unless global flag is set
   1.558 +    if (UseAESIntrinsics) {
   1.559 +      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
   1.560 +
   1.561 +      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
   1.562 +      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
   1.563 +      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
   1.564 +      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
   1.565 +    }
   1.566    }
   1.567  
   1.568   public:

mercurial