1.1 --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Oct 23 13:06:37 2012 -0700 1.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Oct 24 14:33:22 2012 -0700 1.3 @@ -2137,6 +2137,529 @@ 1.4 } 1.5 } 1.6 1.7 + // AES intrinsic stubs 1.8 + enum {AESBlockSize = 16}; 1.9 + 1.10 + address generate_key_shuffle_mask() { 1.11 + __ align(16); 1.12 + StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask"); 1.13 + address start = __ pc(); 1.14 + __ emit_data(0x00010203, relocInfo::none, 0 ); 1.15 + __ emit_data(0x04050607, relocInfo::none, 0 ); 1.16 + __ emit_data(0x08090a0b, relocInfo::none, 0 ); 1.17 + __ emit_data(0x0c0d0e0f, relocInfo::none, 0 ); 1.18 + return start; 1.19 + } 1.20 + 1.21 + // Utility routine for loading a 128-bit key word in little endian format 1.22 + // can optionally specify that the shuffle mask is already in an xmmregister 1.23 + void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { 1.24 + __ movdqu(xmmdst, Address(key, offset)); 1.25 + if (xmm_shuf_mask != NULL) { 1.26 + __ pshufb(xmmdst, xmm_shuf_mask); 1.27 + } else { 1.28 + __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 1.29 + } 1.30 + } 1.31 + 1.32 + // aesenc using specified key+offset 1.33 + // can optionally specify that the shuffle mask is already in an xmmregister 1.34 + void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { 1.35 + load_key(xmmtmp, key, offset, xmm_shuf_mask); 1.36 + __ aesenc(xmmdst, xmmtmp); 1.37 + } 1.38 + 1.39 + // aesdec using specified key+offset 1.40 + // can optionally specify that the shuffle mask is already in an xmmregister 1.41 + void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { 1.42 + load_key(xmmtmp, key, offset, xmm_shuf_mask); 1.43 + __ aesdec(xmmdst, xmmtmp); 1.44 + } 1.45 + 1.46 + 1.47 + // Arguments: 1.48 + // 1.49 + // Inputs: 1.50 + // c_rarg0 - source byte array address 1.51 + // c_rarg1 - destination byte array address 1.52 + // c_rarg2 - K (key) in little endian int array 1.53 + // 1.54 + address generate_aescrypt_encryptBlock() { 1.55 + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 1.56 + __ align(CodeEntryAlignment); 1.57 + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 1.58 + Label L_doLast; 1.59 + address start = __ pc(); 1.60 + 1.61 + const Register from = rsi; // source array address 1.62 + const Register to = rdx; // destination array address 1.63 + const Register key = rcx; // key array address 1.64 + const Register keylen = rax; 1.65 + const Address from_param(rbp, 8+0); 1.66 + const Address to_param (rbp, 8+4); 1.67 + const Address key_param (rbp, 8+8); 1.68 + 1.69 + const XMMRegister xmm_result = xmm0; 1.70 + const XMMRegister xmm_temp = xmm1; 1.71 + const XMMRegister xmm_key_shuf_mask = xmm2; 1.72 + 1.73 + __ enter(); // required for proper stackwalking of RuntimeStub frame 1.74 + __ push(rsi); 1.75 + __ movptr(from , from_param); 1.76 + __ movptr(to , to_param); 1.77 + __ movptr(key , key_param); 1.78 + 1.79 + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1.80 + // keylen = # of 32-bit words, convert to 128-bit words 1.81 + __ shrl(keylen, 2); 1.82 + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more 1.83 + 1.84 + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 1.85 + __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input 1.86 + 1.87 + // For encryption, the java expanded key ordering is just what we need 1.88 + 1.89 + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); 1.90 + __ pxor(xmm_result, xmm_temp); 1.91 + for (int offset = 0x10; offset <= 0x90; offset += 0x10) { 1.92 + aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); 1.93 + } 1.94 + load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask); 1.95 + __ cmpl(keylen, 0); 1.96 + __ jcc(Assembler::equal, L_doLast); 1.97 + __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys 1.98 + aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); 1.99 + load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask); 1.100 + __ subl(keylen, 2); 1.101 + __ jcc(Assembler::equal, L_doLast); 1.102 + __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys 1.103 + aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); 1.104 + load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask); 1.105 + 1.106 + __ BIND(L_doLast); 1.107 + __ aesenclast(xmm_result, xmm_temp); 1.108 + __ movdqu(Address(to, 0), xmm_result); // store the result 1.109 + __ xorptr(rax, rax); // return 0 1.110 + __ pop(rsi); 1.111 + __ leave(); // required for proper stackwalking of RuntimeStub frame 1.112 + __ ret(0); 1.113 + 1.114 + return start; 1.115 + } 1.116 + 1.117 + 1.118 + // Arguments: 1.119 + // 1.120 + // Inputs: 1.121 + // c_rarg0 - source byte array address 1.122 + // c_rarg1 - destination byte array address 1.123 + // c_rarg2 - K (key) in little endian int array 1.124 + // 1.125 + address generate_aescrypt_decryptBlock() { 1.126 + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 1.127 + __ align(CodeEntryAlignment); 1.128 + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 1.129 + Label L_doLast; 1.130 + address start = __ pc(); 1.131 + 1.132 + const Register from = rsi; // source array address 1.133 + const Register to = rdx; // destination array address 1.134 + const Register key = rcx; // key array address 1.135 + const Register keylen = rax; 1.136 + const Address from_param(rbp, 8+0); 1.137 + const Address to_param (rbp, 8+4); 1.138 + const Address key_param (rbp, 8+8); 1.139 + 1.140 + const XMMRegister xmm_result = xmm0; 1.141 + const XMMRegister xmm_temp = xmm1; 1.142 + const XMMRegister xmm_key_shuf_mask = xmm2; 1.143 + 1.144 + __ enter(); // required for proper stackwalking of RuntimeStub frame 1.145 + __ push(rsi); 1.146 + __ movptr(from , from_param); 1.147 + __ movptr(to , to_param); 1.148 + __ movptr(key , key_param); 1.149 + 1.150 + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1.151 + // keylen = # of 32-bit words, convert to 128-bit words 1.152 + __ shrl(keylen, 2); 1.153 + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more 1.154 + 1.155 + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 1.156 + __ movdqu(xmm_result, Address(from, 0)); 1.157 + 1.158 + // for decryption java expanded key ordering is rotated one position from what we want 1.159 + // so we start from 0x10 here and hit 0x00 last 1.160 + // we don't know if the key is aligned, hence not using load-execute form 1.161 + load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask); 1.162 + __ pxor (xmm_result, xmm_temp); 1.163 + for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { 1.164 + aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); 1.165 + } 1.166 + __ cmpl(keylen, 0); 1.167 + __ jcc(Assembler::equal, L_doLast); 1.168 + // only in 192 and 256 bit keys 1.169 + aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); 1.170 + aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask); 1.171 + __ subl(keylen, 2); 1.172 + __ jcc(Assembler::equal, L_doLast); 1.173 + // only in 256 bit keys 1.174 + aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); 1.175 + aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask); 1.176 + 1.177 + __ BIND(L_doLast); 1.178 + // for decryption the aesdeclast operation is always on key+0x00 1.179 + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); 1.180 + __ aesdeclast(xmm_result, xmm_temp); 1.181 + 1.182 + __ movdqu(Address(to, 0), xmm_result); // store the result 1.183 + 1.184 + __ xorptr(rax, rax); // return 0 1.185 + __ pop(rsi); 1.186 + __ leave(); // required for proper stackwalking of RuntimeStub frame 1.187 + __ ret(0); 1.188 + 1.189 + return start; 1.190 + } 1.191 + 1.192 + void handleSOERegisters(bool saving) { 1.193 + const int saveFrameSizeInBytes = 4 * wordSize; 1.194 + const Address saved_rbx (rbp, -3 * wordSize); 1.195 + const Address saved_rsi (rbp, -2 * wordSize); 1.196 + const Address saved_rdi (rbp, -1 * wordSize); 1.197 + 1.198 + if (saving) { 1.199 + __ subptr(rsp, saveFrameSizeInBytes); 1.200 + __ movptr(saved_rsi, rsi); 1.201 + __ movptr(saved_rdi, rdi); 1.202 + __ movptr(saved_rbx, rbx); 1.203 + } else { 1.204 + // restoring 1.205 + __ movptr(rsi, saved_rsi); 1.206 + __ movptr(rdi, saved_rdi); 1.207 + __ movptr(rbx, saved_rbx); 1.208 + } 1.209 + } 1.210 + 1.211 + // Arguments: 1.212 + // 1.213 + // Inputs: 1.214 + // c_rarg0 - source byte array address 1.215 + // c_rarg1 - destination byte array address 1.216 + // c_rarg2 - K (key) in little endian int array 1.217 + // c_rarg3 - r vector byte array address 1.218 + // c_rarg4 - input length 1.219 + // 1.220 + address generate_cipherBlockChaining_encryptAESCrypt() { 1.221 + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 1.222 + __ align(CodeEntryAlignment); 1.223 + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 1.224 + address start = __ pc(); 1.225 + 1.226 + Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; 1.227 + const Register from = rsi; // source array address 1.228 + const Register to = rdx; // destination array address 1.229 + const Register key = rcx; // key array address 1.230 + const Register rvec = rdi; // r byte array initialized from initvector array address 1.231 + // and left with the results of the last encryption block 1.232 + const Register len_reg = rbx; // src len (must be multiple of blocksize 16) 1.233 + const Register pos = rax; 1.234 + 1.235 + // xmm register assignments for the loops below 1.236 + const XMMRegister xmm_result = xmm0; 1.237 + const XMMRegister xmm_temp = xmm1; 1.238 + // first 6 keys preloaded into xmm2-xmm7 1.239 + const int XMM_REG_NUM_KEY_FIRST = 2; 1.240 + const int XMM_REG_NUM_KEY_LAST = 7; 1.241 + const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 1.242 + 1.243 + __ enter(); // required for proper stackwalking of RuntimeStub frame 1.244 + handleSOERegisters(true /*saving*/); 1.245 + 1.246 + // load registers from incoming parameters 1.247 + const Address from_param(rbp, 8+0); 1.248 + const Address to_param (rbp, 8+4); 1.249 + const Address key_param (rbp, 8+8); 1.250 + const Address rvec_param (rbp, 8+12); 1.251 + const Address len_param (rbp, 8+16); 1.252 + __ movptr(from , from_param); 1.253 + __ movptr(to , to_param); 1.254 + __ movptr(key , key_param); 1.255 + __ movptr(rvec , rvec_param); 1.256 + __ movptr(len_reg , len_param); 1.257 + 1.258 + const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front 1.259 + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 1.260 + // load up xmm regs 2 thru 7 with keys 0-5 1.261 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.262 + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 1.263 + offset += 0x10; 1.264 + } 1.265 + 1.266 + __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec 1.267 + 1.268 + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 1.269 + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1.270 + __ cmpl(rax, 44); 1.271 + __ jcc(Assembler::notEqual, L_key_192_256); 1.272 + 1.273 + // 128 bit code follows here 1.274 + __ movptr(pos, 0); 1.275 + __ align(OptoLoopAlignment); 1.276 + __ BIND(L_loopTop_128); 1.277 + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 1.278 + __ pxor (xmm_result, xmm_temp); // xor with the current r vector 1.279 + 1.280 + __ pxor (xmm_result, xmm_key0); // do the aes rounds 1.281 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.282 + __ aesenc(xmm_result, as_XMMRegister(rnum)); 1.283 + } 1.284 + for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) { 1.285 + aes_enc_key(xmm_result, xmm_temp, key, key_offset); 1.286 + } 1.287 + load_key(xmm_temp, key, 0xa0); 1.288 + __ aesenclast(xmm_result, xmm_temp); 1.289 + 1.290 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.291 + // no need to store r to memory until we exit 1.292 + __ addptr(pos, AESBlockSize); 1.293 + __ subptr(len_reg, AESBlockSize); 1.294 + __ jcc(Assembler::notEqual, L_loopTop_128); 1.295 + 1.296 + __ BIND(L_exit); 1.297 + __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object 1.298 + 1.299 + handleSOERegisters(false /*restoring*/); 1.300 + __ movl(rax, 0); // return 0 (why?) 1.301 + __ leave(); // required for proper stackwalking of RuntimeStub frame 1.302 + __ ret(0); 1.303 + 1.304 + __ BIND(L_key_192_256); 1.305 + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 1.306 + __ cmpl(rax, 52); 1.307 + __ jcc(Assembler::notEqual, L_key_256); 1.308 + 1.309 + // 192-bit code follows here (could be changed to use more xmm registers) 1.310 + __ movptr(pos, 0); 1.311 + __ align(OptoLoopAlignment); 1.312 + __ BIND(L_loopTop_192); 1.313 + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 1.314 + __ pxor (xmm_result, xmm_temp); // xor with the current r vector 1.315 + 1.316 + __ pxor (xmm_result, xmm_key0); // do the aes rounds 1.317 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.318 + __ aesenc(xmm_result, as_XMMRegister(rnum)); 1.319 + } 1.320 + for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) { 1.321 + aes_enc_key(xmm_result, xmm_temp, key, key_offset); 1.322 + } 1.323 + load_key(xmm_temp, key, 0xc0); 1.324 + __ aesenclast(xmm_result, xmm_temp); 1.325 + 1.326 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.327 + // no need to store r to memory until we exit 1.328 + __ addptr(pos, AESBlockSize); 1.329 + __ subptr(len_reg, AESBlockSize); 1.330 + __ jcc(Assembler::notEqual, L_loopTop_192); 1.331 + __ jmp(L_exit); 1.332 + 1.333 + __ BIND(L_key_256); 1.334 + // 256-bit code follows here (could be changed to use more xmm registers) 1.335 + __ movptr(pos, 0); 1.336 + __ align(OptoLoopAlignment); 1.337 + __ BIND(L_loopTop_256); 1.338 + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 1.339 + __ pxor (xmm_result, xmm_temp); // xor with the current r vector 1.340 + 1.341 + __ pxor (xmm_result, xmm_key0); // do the aes rounds 1.342 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.343 + __ aesenc(xmm_result, as_XMMRegister(rnum)); 1.344 + } 1.345 + for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) { 1.346 + aes_enc_key(xmm_result, xmm_temp, key, key_offset); 1.347 + } 1.348 + load_key(xmm_temp, key, 0xe0); 1.349 + __ aesenclast(xmm_result, xmm_temp); 1.350 + 1.351 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.352 + // no need to store r to memory until we exit 1.353 + __ addptr(pos, AESBlockSize); 1.354 + __ subptr(len_reg, AESBlockSize); 1.355 + __ jcc(Assembler::notEqual, L_loopTop_256); 1.356 + __ jmp(L_exit); 1.357 + 1.358 + return start; 1.359 + } 1.360 + 1.361 + 1.362 + // CBC AES Decryption. 1.363 + // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time. 1.364 + // 1.365 + // Arguments: 1.366 + // 1.367 + // Inputs: 1.368 + // c_rarg0 - source byte array address 1.369 + // c_rarg1 - destination byte array address 1.370 + // c_rarg2 - K (key) in little endian int array 1.371 + // c_rarg3 - r vector byte array address 1.372 + // c_rarg4 - input length 1.373 + // 1.374 + 1.375 + address generate_cipherBlockChaining_decryptAESCrypt() { 1.376 + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 1.377 + __ align(CodeEntryAlignment); 1.378 + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 1.379 + address start = __ pc(); 1.380 + 1.381 + Label L_exit, L_key_192_256, L_key_256; 1.382 + Label L_singleBlock_loopTop_128; 1.383 + Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256; 1.384 + const Register from = rsi; // source array address 1.385 + const Register to = rdx; // destination array address 1.386 + const Register key = rcx; // key array address 1.387 + const Register rvec = rdi; // r byte array initialized from initvector array address 1.388 + // and left with the results of the last encryption block 1.389 + const Register len_reg = rbx; // src len (must be multiple of blocksize 16) 1.390 + const Register pos = rax; 1.391 + 1.392 + // xmm register assignments for the loops below 1.393 + const XMMRegister xmm_result = xmm0; 1.394 + const XMMRegister xmm_temp = xmm1; 1.395 + // first 6 keys preloaded into xmm2-xmm7 1.396 + const int XMM_REG_NUM_KEY_FIRST = 2; 1.397 + const int XMM_REG_NUM_KEY_LAST = 7; 1.398 + const int FIRST_NON_REG_KEY_offset = 0x70; 1.399 + const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 1.400 + 1.401 + __ enter(); // required for proper stackwalking of RuntimeStub frame 1.402 + handleSOERegisters(true /*saving*/); 1.403 + 1.404 + // load registers from incoming parameters 1.405 + const Address from_param(rbp, 8+0); 1.406 + const Address to_param (rbp, 8+4); 1.407 + const Address key_param (rbp, 8+8); 1.408 + const Address rvec_param (rbp, 8+12); 1.409 + const Address len_param (rbp, 8+16); 1.410 + __ movptr(from , from_param); 1.411 + __ movptr(to , to_param); 1.412 + __ movptr(key , key_param); 1.413 + __ movptr(rvec , rvec_param); 1.414 + __ movptr(len_reg , len_param); 1.415 + 1.416 + // the java expanded key ordering is rotated one position from what we want 1.417 + // so we start from 0x10 here and hit 0x00 last 1.418 + const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front 1.419 + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 1.420 + // load up xmm regs 2 thru 6 with first 5 keys 1.421 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.422 + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 1.423 + offset += 0x10; 1.424 + } 1.425 + 1.426 + // inside here, use the rvec register to point to previous block cipher 1.427 + // with which we xor at the end of each newly decrypted block 1.428 + const Register prev_block_cipher_ptr = rvec; 1.429 + 1.430 + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 1.431 + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1.432 + __ cmpl(rax, 44); 1.433 + __ jcc(Assembler::notEqual, L_key_192_256); 1.434 + 1.435 + 1.436 + // 128-bit code follows here, parallelized 1.437 + __ movptr(pos, 0); 1.438 + __ align(OptoLoopAlignment); 1.439 + __ BIND(L_singleBlock_loopTop_128); 1.440 + __ cmpptr(len_reg, 0); // any blocks left?? 1.441 + __ jcc(Assembler::equal, L_exit); 1.442 + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 1.443 + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 1.444 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.445 + __ aesdec(xmm_result, as_XMMRegister(rnum)); 1.446 + } 1.447 + for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) { // 128-bit runs up to key offset a0 1.448 + aes_dec_key(xmm_result, xmm_temp, key, key_offset); 1.449 + } 1.450 + load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 1.451 + __ aesdeclast(xmm_result, xmm_temp); 1.452 + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); 1.453 + __ pxor (xmm_result, xmm_temp); // xor with the current r vector 1.454 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.455 + // no need to store r to memory until we exit 1.456 + __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr 1.457 + __ addptr(pos, AESBlockSize); 1.458 + __ subptr(len_reg, AESBlockSize); 1.459 + __ jmp(L_singleBlock_loopTop_128); 1.460 + 1.461 + 1.462 + __ BIND(L_exit); 1.463 + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); 1.464 + __ movptr(rvec , rvec_param); // restore this since used in loop 1.465 + __ movdqu(Address(rvec, 0), xmm_temp); // final value of r stored in rvec of CipherBlockChaining object 1.466 + handleSOERegisters(false /*restoring*/); 1.467 + __ movl(rax, 0); // return 0 (why?) 1.468 + __ leave(); // required for proper stackwalking of RuntimeStub frame 1.469 + __ ret(0); 1.470 + 1.471 + 1.472 + __ BIND(L_key_192_256); 1.473 + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 1.474 + __ cmpl(rax, 52); 1.475 + __ jcc(Assembler::notEqual, L_key_256); 1.476 + 1.477 + // 192-bit code follows here (could be optimized to use parallelism) 1.478 + __ movptr(pos, 0); 1.479 + __ align(OptoLoopAlignment); 1.480 + __ BIND(L_singleBlock_loopTop_192); 1.481 + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 1.482 + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 1.483 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.484 + __ aesdec(xmm_result, as_XMMRegister(rnum)); 1.485 + } 1.486 + for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) { // 192-bit runs up to key offset c0 1.487 + aes_dec_key(xmm_result, xmm_temp, key, key_offset); 1.488 + } 1.489 + load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 1.490 + __ aesdeclast(xmm_result, xmm_temp); 1.491 + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); 1.492 + __ pxor (xmm_result, xmm_temp); // xor with the current r vector 1.493 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.494 + // no need to store r to memory until we exit 1.495 + __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr 1.496 + __ addptr(pos, AESBlockSize); 1.497 + __ subptr(len_reg, AESBlockSize); 1.498 + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); 1.499 + __ jmp(L_exit); 1.500 + 1.501 + __ BIND(L_key_256); 1.502 + // 256-bit code follows here (could be optimized to use parallelism) 1.503 + __ movptr(pos, 0); 1.504 + __ align(OptoLoopAlignment); 1.505 + __ BIND(L_singleBlock_loopTop_256); 1.506 + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 1.507 + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 1.508 + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 1.509 + __ aesdec(xmm_result, as_XMMRegister(rnum)); 1.510 + } 1.511 + for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) { // 256-bit runs up to key offset e0 1.512 + aes_dec_key(xmm_result, xmm_temp, key, key_offset); 1.513 + } 1.514 + load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 1.515 + __ aesdeclast(xmm_result, xmm_temp); 1.516 + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); 1.517 + __ pxor (xmm_result, xmm_temp); // xor with the current r vector 1.518 + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 1.519 + // no need to store r to memory until we exit 1.520 + __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr 1.521 + __ addptr(pos, AESBlockSize); 1.522 + __ subptr(len_reg, AESBlockSize); 1.523 + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); 1.524 + __ jmp(L_exit); 1.525 + 1.526 + return start; 1.527 + } 1.528 + 1.529 + 1.530 public: 1.531 // Information about frame layout at time of blocking runtime call. 1.532 // Note that we only have to preserve callee-saved registers since 1.533 @@ -2332,6 +2855,16 @@ 1.534 generate_arraycopy_stubs(); 1.535 1.536 generate_math_stubs(); 1.537 + 1.538 + // don't bother generating these AES intrinsic stubs unless global flag is set 1.539 + if (UseAESIntrinsics) { 1.540 + StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // might be needed by the others 1.541 + 1.542 + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 1.543 + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 1.544 + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 1.545 + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 1.546 + } 1.547 } 1.548 1.549