1.1 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Oct 23 13:06:37 2012 -0700 1.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Oct 24 14:33:22 2012 -0700 1.3 @@ -2941,6 +2941,548 @@ 1.4 } 1.5 } 1.6 1.7 + // AES intrinsic stubs 1.8 + enum {AESBlockSize = 16}; 1.9 + 1.10 + address generate_key_shuffle_mask() { 1.11 + __ align(16); 1.12 + StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask"); 1.13 + address start = __ pc(); 1.14 + __ emit_data64( 0x0405060700010203, relocInfo::none ); 1.15 + __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none ); 1.16 + return start; 1.17 + } 1.18 + 1.19 + // Utility routine for loading a 128-bit key word in little endian format 1.20 + // can optionally specify that the shuffle mask is already in an xmmregister 1.21 + void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { 1.22 + __ movdqu(xmmdst, Address(key, offset)); 1.23 + if (xmm_shuf_mask != NULL) { 1.24 + __ pshufb(xmmdst, xmm_shuf_mask); 1.25 + } else { 1.26 + __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 1.27 + } 1.28 + } 1.29 + 1.30 + // aesenc using specified key+offset 1.31 + // can optionally specify that the shuffle mask is already in an xmmregister 1.32 + void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { 1.33 + load_key(xmmtmp, key, offset, xmm_shuf_mask); 1.34 + __ aesenc(xmmdst, xmmtmp); 1.35 + } 1.36 + 1.37 + // aesdec using specified key+offset 1.38 + // can optionally specify that the shuffle mask is already in an xmmregister 1.39 + void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { 1.40 + load_key(xmmtmp, key, offset, xmm_shuf_mask); 1.41 + __ aesdec(xmmdst, xmmtmp); 1.42 + } 1.43 + 1.44 + 1.45 + // Arguments: 1.46 + // 1.47 + // Inputs: 1.48 + // c_rarg0 - source byte array address 1.49 + // c_rarg1 - destination byte array address 1.50 + // c_rarg2 
- K (key) in little endian int array 1.51 + // 1.52 + address generate_aescrypt_encryptBlock() { 1.53 + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 1.54 + __ align(CodeEntryAlignment); 1.55 + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 1.56 + Label L_doLast; 1.57 + address start = __ pc(); 1.58 + 1.59 + const Register from = c_rarg0; // source array address 1.60 + const Register to = c_rarg1; // destination array address 1.61 + const Register key = c_rarg2; // key array address 1.62 + const Register keylen = rax; 1.63 + 1.64 + const XMMRegister xmm_result = xmm0; 1.65 + const XMMRegister xmm_temp = xmm1; 1.66 + const XMMRegister xmm_key_shuf_mask = xmm2; 1.67 + 1.68 + __ enter(); // required for proper stackwalking of RuntimeStub frame 1.69 + 1.70 + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1.71 + // keylen = # of 32-bit words, convert to 128-bit words 1.72 + __ shrl(keylen, 2); 1.73 + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more 1.74 + 1.75 + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 1.76 + __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input 1.77 + 1.78 + // For encryption, the java expanded key ordering is just what we need 1.79 + // we don't know if the key is aligned, hence not using load-execute form 1.80 + 1.81 + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); 1.82 + __ pxor(xmm_result, xmm_temp); 1.83 + for (int offset = 0x10; offset <= 0x90; offset += 0x10) { 1.84 + aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); 1.85 + } 1.86 + load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask); 1.87 + __ cmpl(keylen, 0); 1.88 + __ jcc(Assembler::equal, L_doLast); 1.89 + __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys 1.90 + aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); 1.91 + 
load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask); 1.92 + __ subl(keylen, 2); 1.93 + __ jcc(Assembler::equal, L_doLast); 1.94 + __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys 1.95 + aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); 1.96 + load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask); 1.97 + 1.98 + __ BIND(L_doLast); 1.99 + __ aesenclast(xmm_result, xmm_temp); 1.100 + __ movdqu(Address(to, 0), xmm_result); // store the result 1.101 + __ xorptr(rax, rax); // return 0 1.102 + __ leave(); // required for proper stackwalking of RuntimeStub frame 1.103 + __ ret(0); 1.104 + 1.105 + return start; 1.106 + } 1.107 + 1.108 + 1.109 + // Arguments: 1.110 + // 1.111 + // Inputs: 1.112 + // c_rarg0 - source byte array address 1.113 + // c_rarg1 - destination byte array address 1.114 + // c_rarg2 - K (key) in little endian int array 1.115 + // 1.116 + address generate_aescrypt_decryptBlock() { 1.117 + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 1.118 + __ align(CodeEntryAlignment); 1.119 + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 1.120 + Label L_doLast; 1.121 + address start = __ pc(); 1.122 + 1.123 + const Register from = c_rarg0; // source array address 1.124 + const Register to = c_rarg1; // destination array address 1.125 + const Register key = c_rarg2; // key array address 1.126 + const Register keylen = rax; 1.127 + 1.128 + const XMMRegister xmm_result = xmm0; 1.129 + const XMMRegister xmm_temp = xmm1; 1.130 + const XMMRegister xmm_key_shuf_mask = xmm2; 1.131 + 1.132 + __ enter(); // required for proper stackwalking of RuntimeStub frame 1.133 + 1.134 + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1.135 + // keylen = # of 32-bit words, convert to 128-bit words 1.136 + __ shrl(keylen, 2); 1.137 + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more 1.138 + 1.139 + __ 
movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 1.140 + __ movdqu(xmm_result, Address(from, 0)); 1.141 + 1.142 + // for decryption java expanded key ordering is rotated one position from what we want 1.143 + // so we start from 0x10 here and hit 0x00 last 1.144 + // we don't know if the key is aligned, hence not using load-execute form 1.145 + load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask); 1.146 + __ pxor (xmm_result, xmm_temp); 1.147 + for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { 1.148 + aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); 1.149 + } 1.150 + __ cmpl(keylen, 0); 1.151 + __ jcc(Assembler::equal, L_doLast); 1.152 + // only in 192 and 256 bit keys 1.153 + aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); 1.154 + aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask); 1.155 + __ subl(keylen, 2); 1.156 + __ jcc(Assembler::equal, L_doLast); 1.157 + // only in 256 bit keys 1.158 + aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); 1.159 + aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask); 1.160 + 1.161 + __ BIND(L_doLast); 1.162 + // for decryption the aesdeclast operation is always on key+0x00 1.163 + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); 1.164 + __ aesdeclast(xmm_result, xmm_temp); 1.165 + 1.166 + __ movdqu(Address(to, 0), xmm_result); // store the result 1.167 + 1.168 + __ xorptr(rax, rax); // return 0 1.169 + __ leave(); // required for proper stackwalking of RuntimeStub frame 1.170 + __ ret(0); 1.171 + 1.172 + return start; 1.173 + } 1.174 + 1.175 + 1.176 + // Arguments: 1.177 + // 1.178 + // Inputs: 1.179 + // c_rarg0 - source byte array address 1.180 + // c_rarg1 - destination byte array address 1.181 + // c_rarg2 - K (key) in little endian int array 1.182 + // c_rarg3 - r vector byte array address 1.183 + // c_rarg4 - input length 1.184 + // 1.185 + address generate_cipherBlockChaining_encryptAESCrypt() { 1.186 + 
assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 1.187 + __ align(CodeEntryAlignment); 1.188 + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 1.189 + address start = __ pc(); 1.190 + 1.191 + Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; 1.192 + const Register from = c_rarg0; // source array address 1.193 + const Register to = c_rarg1; // destination array address 1.194 + const Register key = c_rarg2; // key array address 1.195 + const Register rvec = c_rarg3; // r byte array initialized from initvector array address 1.196 + // and left with the results of the last encryption block 1.197 +#ifndef _WIN64 1.198 + const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 1.199 +#else 1.200 + const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 1.201 + const Register len_reg = r10; // pick the first volatile windows register 1.202 +#endif 1.203 + const Register pos = rax; 1.204 + 1.205 + // xmm register assignments for the loops below 1.206 + const XMMRegister xmm_result = xmm0; 1.207 + const XMMRegister xmm_temp = xmm1; 1.208 + // keys 0-10 preloaded into xmm2-xmm12 1.209 + const int XMM_REG_NUM_KEY_FIRST = 2; 1.210 + const int XMM_REG_NUM_KEY_LAST = 12; 1.211 + const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 1.212 + const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST); 1.213 + 1.214 + __ enter(); // required for proper stackwalking of RuntimeStub frame 1.215 + 1.216 +#ifdef _WIN64 1.217 + // on win64, fill len_reg from stack position 1.218 + __ movl(len_reg, len_mem); 1.219 + // save the xmm registers which must be preserved 6-12 1.220 + __ subptr(rsp, -rsp_after_call_off * wordSize); 1.221 + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 1.222 + __ movdqu(xmm_save(i), as_XMMRegister(i)); 1.223 + } 1.224 +#endif 1.225 + 1.226 + const XMMRegister xmm_key_shuf_mask = xmm_temp; // used 
temporarily to swap key bytes up front 1.227 + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 1.228 + // load up xmm regs 2 thru 12 with key 0x00 - 0xa0 1.229 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.230 + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 1.231 + offset += 0x10; 1.232 + } 1.233 + 1.234 + __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec 1.235 + 1.236 + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 1.237 + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1.238 + __ cmpl(rax, 44); 1.239 + __ jcc(Assembler::notEqual, L_key_192_256); 1.240 + 1.241 + // 128 bit code follows here 1.242 + __ movptr(pos, 0); 1.243 + __ align(OptoLoopAlignment); 1.244 + __ BIND(L_loopTop_128); 1.245 + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 1.246 + __ pxor (xmm_result, xmm_temp); // xor with the current r vector 1.247 + 1.248 + __ pxor (xmm_result, xmm_key0); // do the aes rounds 1.249 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 1.250 + __ aesenc(xmm_result, as_XMMRegister(rnum)); 1.251 + } 1.252 + __ aesenclast(xmm_result, xmm_key10); 1.253 + 1.254 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.255 + // no need to store r to memory until we exit 1.256 + __ addptr(pos, AESBlockSize); 1.257 + __ subptr(len_reg, AESBlockSize); 1.258 + __ jcc(Assembler::notEqual, L_loopTop_128); 1.259 + 1.260 + __ BIND(L_exit); 1.261 + __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object 1.262 + 1.263 +#ifdef _WIN64 1.264 + // restore xmm regs belonging to calling function 1.265 + for (int i = 6; i <= 
XMM_REG_NUM_KEY_LAST; i++) { 1.266 + __ movdqu(as_XMMRegister(i), xmm_save(i)); 1.267 + } 1.268 +#endif 1.269 + __ movl(rax, 0); // return 0 (why?) 1.270 + __ leave(); // required for proper stackwalking of RuntimeStub frame 1.271 + __ ret(0); 1.272 + 1.273 + __ BIND(L_key_192_256); 1.274 + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 1.275 + __ cmpl(rax, 52); 1.276 + __ jcc(Assembler::notEqual, L_key_256); 1.277 + 1.278 + // 192-bit code follows here (could be changed to use more xmm registers) 1.279 + __ movptr(pos, 0); 1.280 + __ align(OptoLoopAlignment); 1.281 + __ BIND(L_loopTop_192); 1.282 + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 1.283 + __ pxor (xmm_result, xmm_temp); // xor with the current r vector 1.284 + 1.285 + __ pxor (xmm_result, xmm_key0); // do the aes rounds 1.286 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.287 + __ aesenc(xmm_result, as_XMMRegister(rnum)); 1.288 + } 1.289 + aes_enc_key(xmm_result, xmm_temp, key, 0xb0); 1.290 + load_key(xmm_temp, key, 0xc0); 1.291 + __ aesenclast(xmm_result, xmm_temp); 1.292 + 1.293 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.294 + // no need to store r to memory until we exit 1.295 + __ addptr(pos, AESBlockSize); 1.296 + __ subptr(len_reg, AESBlockSize); 1.297 + __ jcc(Assembler::notEqual, L_loopTop_192); 1.298 + __ jmp(L_exit); 1.299 + 1.300 + __ BIND(L_key_256); 1.301 + // 256-bit code follows here (could be changed to use more xmm registers) 1.302 + __ movptr(pos, 0); 1.303 + __ align(OptoLoopAlignment); 1.304 + __ BIND(L_loopTop_256); 1.305 + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 1.306 + __ pxor (xmm_result, xmm_temp); // xor with the current r vector 1.307 + 1.308 + __ pxor (xmm_result, xmm_key0); // do the aes rounds 1.309 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= 
XMM_REG_NUM_KEY_LAST; rnum++) { 1.310 + __ aesenc(xmm_result, as_XMMRegister(rnum)); 1.311 + } 1.312 + aes_enc_key(xmm_result, xmm_temp, key, 0xb0); 1.313 + aes_enc_key(xmm_result, xmm_temp, key, 0xc0); 1.314 + aes_enc_key(xmm_result, xmm_temp, key, 0xd0); 1.315 + load_key(xmm_temp, key, 0xe0); 1.316 + __ aesenclast(xmm_result, xmm_temp); 1.317 + 1.318 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.319 + // no need to store r to memory until we exit 1.320 + __ addptr(pos, AESBlockSize); 1.321 + __ subptr(len_reg, AESBlockSize); 1.322 + __ jcc(Assembler::notEqual, L_loopTop_256); 1.323 + __ jmp(L_exit); 1.324 + 1.325 + return start; 1.326 + } 1.327 + 1.328 + 1.329 + 1.330 + // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time 1.331 + // to hide instruction latency 1.332 + // 1.333 + // Arguments: 1.334 + // 1.335 + // Inputs: 1.336 + // c_rarg0 - source byte array address 1.337 + // c_rarg1 - destination byte array address 1.338 + // c_rarg2 - K (key) in little endian int array 1.339 + // c_rarg3 - r vector byte array address 1.340 + // c_rarg4 - input length 1.341 + // 1.342 + 1.343 + address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 1.344 + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 1.345 + __ align(CodeEntryAlignment); 1.346 + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 1.347 + address start = __ pc(); 1.348 + 1.349 + Label L_exit, L_key_192_256, L_key_256; 1.350 + Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128; 1.351 + Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256; 1.352 + const Register from = c_rarg0; // source array address 1.353 + const Register to = c_rarg1; // destination array address 1.354 + const Register key = c_rarg2; // key array address 1.355 + const Register rvec = c_rarg3; // r byte array initialized from initvector array address 1.356 + // and 
left with the results of the last encryption block 1.357 +#ifndef _WIN64 1.358 + const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 1.359 +#else 1.360 + const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 1.361 + const Register len_reg = r10; // pick the first volatile windows register 1.362 +#endif 1.363 + const Register pos = rax; 1.364 + 1.365 + // xmm register assignments for the loops below 1.366 + const XMMRegister xmm_result = xmm0; 1.367 + // keys 0-10 preloaded into xmm5-xmm15 1.368 + const int XMM_REG_NUM_KEY_FIRST = 5; 1.369 + const int XMM_REG_NUM_KEY_LAST = 15; 1.370 + const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 1.371 + const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); 1.372 + 1.373 + __ enter(); // required for proper stackwalking of RuntimeStub frame 1.374 + 1.375 +#ifdef _WIN64 1.376 + // on win64, fill len_reg from stack position 1.377 + __ movl(len_reg, len_mem); 1.378 + // save the xmm registers which must be preserved 6-15 1.379 + __ subptr(rsp, -rsp_after_call_off * wordSize); 1.380 + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 1.381 + __ movdqu(xmm_save(i), as_XMMRegister(i)); 1.382 + } 1.383 +#endif 1.384 + // the java expanded key ordering is rotated one position from what we want 1.385 + // so we start from 0x10 here and hit 0x00 last 1.386 + const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front 1.387 + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 1.388 + // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 1.389 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.390 + if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00; 1.391 + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 1.392 + offset += 0x10; 1.393 + } 1.394 + 1.395 + const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of
previous block 1.396 + // registers holding the four results in the parallelized loop 1.397 + const XMMRegister xmm_result0 = xmm0; 1.398 + const XMMRegister xmm_result1 = xmm2; 1.399 + const XMMRegister xmm_result2 = xmm3; 1.400 + const XMMRegister xmm_result3 = xmm4; 1.401 + 1.402 + __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec 1.403 + 1.404 + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 1.405 + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1.406 + __ cmpl(rax, 44); 1.407 + __ jcc(Assembler::notEqual, L_key_192_256); 1.408 + 1.409 + 1.410 + // 128-bit code follows here, parallelized 1.411 + __ movptr(pos, 0); 1.412 + __ align(OptoLoopAlignment); 1.413 + __ BIND(L_multiBlock_loopTop_128); 1.414 + __ cmpptr(len_reg, 4*AESBlockSize); // see if at least 4 blocks left 1.415 + __ jcc(Assembler::less, L_singleBlock_loopTop_128); 1.416 + 1.417 + __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize)); // get next 4 blocks into xmmresult registers 1.418 + __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize)); 1.419 + __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize)); 1.420 + __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize)); 1.421 + 1.422 +#define DoFour(opc, src_reg) \ 1.423 + __ opc(xmm_result0, src_reg); \ 1.424 + __ opc(xmm_result1, src_reg); \ 1.425 + __ opc(xmm_result2, src_reg); \ 1.426 + __ opc(xmm_result3, src_reg); 1.427 + 1.428 + DoFour(pxor, xmm_key_first); 1.429 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 1.430 + DoFour(aesdec, as_XMMRegister(rnum)); 1.431 + } 1.432 + DoFour(aesdeclast, xmm_key_last); 1.433 + // for each result, xor with the r vector of previous cipher block 1.434 + __ pxor(xmm_result0, xmm_prev_block_cipher); 1.435 + __ 
movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize)); 1.436 + __ pxor(xmm_result1, xmm_prev_block_cipher); 1.437 + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize)); 1.438 + __ pxor(xmm_result2, xmm_prev_block_cipher); 1.439 + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize)); 1.440 + __ pxor(xmm_result3, xmm_prev_block_cipher); 1.441 + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize)); // this will carry over to next set of blocks 1.442 + 1.443 + __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output 1.444 + __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1); 1.445 + __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2); 1.446 + __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3); 1.447 + 1.448 + __ addptr(pos, 4*AESBlockSize); 1.449 + __ subptr(len_reg, 4*AESBlockSize); 1.450 + __ jmp(L_multiBlock_loopTop_128); 1.451 + 1.452 + // registers used in the non-parallelized loops 1.453 + const XMMRegister xmm_prev_block_cipher_save = xmm2; 1.454 + const XMMRegister xmm_temp = xmm3; 1.455 + 1.456 + __ align(OptoLoopAlignment); 1.457 + __ BIND(L_singleBlock_loopTop_128); 1.458 + __ cmpptr(len_reg, 0); // any blocks left?? 
1.459 + __ jcc(Assembler::equal, L_exit); 1.460 + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 1.461 + __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 1.462 + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 1.463 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 1.464 + __ aesdec(xmm_result, as_XMMRegister(rnum)); 1.465 + } 1.466 + __ aesdeclast(xmm_result, xmm_key_last); 1.467 + __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 1.468 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.469 + // no need to store r to memory until we exit 1.470 + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 1.471 + 1.472 + __ addptr(pos, AESBlockSize); 1.473 + __ subptr(len_reg, AESBlockSize); 1.474 + __ jmp(L_singleBlock_loopTop_128); 1.475 + 1.476 + 1.477 + __ BIND(L_exit); 1.478 + __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object 1.479 +#ifdef _WIN64 1.480 + // restore regs belonging to calling function 1.481 + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 1.482 + __ movdqu(as_XMMRegister(i), xmm_save(i)); 1.483 + } 1.484 +#endif 1.485 + __ movl(rax, 0); // return 0 (why?) 
1.486 + __ leave(); // required for proper stackwalking of RuntimeStub frame 1.487 + __ ret(0); 1.488 + 1.489 + 1.490 + __ BIND(L_key_192_256); 1.491 + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 1.492 + __ cmpl(rax, 52); 1.493 + __ jcc(Assembler::notEqual, L_key_256); 1.494 + 1.495 + // 192-bit code follows here (could be optimized to use parallelism) 1.496 + __ movptr(pos, 0); 1.497 + __ align(OptoLoopAlignment); 1.498 + __ BIND(L_singleBlock_loopTop_192); 1.499 + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 1.500 + __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 1.501 + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 1.502 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 1.503 + __ aesdec(xmm_result, as_XMMRegister(rnum)); 1.504 + } 1.505 + aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 192-bit key goes up to c0 1.506 + aes_dec_key(xmm_result, xmm_temp, key, 0xc0); 1.507 + __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 1.508 + __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 1.509 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.510 + // no need to store r to memory until we exit 1.511 + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 1.512 + 1.513 + __ addptr(pos, AESBlockSize); 1.514 + __ subptr(len_reg, AESBlockSize); 1.515 + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); 1.516 + __ jmp(L_exit); 1.517 + 1.518 + __ BIND(L_key_256); 1.519 + // 256-bit code follows here (could be optimized to use parallelism) 1.520 + __ movptr(pos, 0); 1.521 + __ align(OptoLoopAlignment); 1.522 + __ BIND(L_singleBlock_loopTop_256); 1.523 + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of 
cipher input 1.524 + __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 1.525 + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 1.526 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 1.527 + __ aesdec(xmm_result, as_XMMRegister(rnum)); 1.528 + } 1.529 + aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 256-bit key goes up to e0 1.530 + aes_dec_key(xmm_result, xmm_temp, key, 0xc0); 1.531 + aes_dec_key(xmm_result, xmm_temp, key, 0xd0); 1.532 + aes_dec_key(xmm_result, xmm_temp, key, 0xe0); 1.533 + __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 1.534 + __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 1.535 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.536 + // no need to store r to memory until we exit 1.537 + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 1.538 + 1.539 + __ addptr(pos, AESBlockSize); 1.540 + __ subptr(len_reg, AESBlockSize); 1.541 + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); 1.542 + __ jmp(L_exit); 1.543 + 1.544 + return start; 1.545 + } 1.546 + 1.547 + 1.548 + 1.549 #undef __ 1.550 #define __ masm-> 1.551 1.552 @@ -3135,6 +3677,16 @@ 1.553 generate_arraycopy_stubs(); 1.554 1.555 generate_math_stubs(); 1.556 + 1.557 + // don't bother generating these AES intrinsic stubs unless global flag is set 1.558 + if (UseAESIntrinsics) { 1.559 + StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others 1.560 + 1.561 + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 1.562 + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 1.563 + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 1.564 + StubRoutines::_cipherBlockChaining_decryptAESCrypt = 
generate_cipherBlockChaining_decryptAESCrypt_Parallel(); 1.565 + } 1.566 } 1.567 1.568 public: