--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Feb 14 16:17:22 2014 +0100
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Tue Jan 14 17:46:48 2014 -0800
@@ -3304,6 +3304,845 @@
     }
   }
 
+  // Generates the stub that encrypts a single 16-byte AES block using the
+  // SPARC aes_eround* instructions.
+  //
+  // Arguments of the generated code (leaf routine, caller's register window):
+  //   O0 - source byte array
+  //   O1 - destination byte array
+  //   O2 - expanded key array; its length (44, 52 or otherwise) selects the
+  //        128-, 192- or 256-bit round sequence
+  //
+  // Input and output are accessed as 32-bit words and the key as 64-bit
+  // words, so the corresponding addresses must be suitably aligned.
+  // Returns the entry address of the generated code.
+  address generate_aescrypt_encryptBlock() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aesencryptBlock");
+    Label L_doLast128bit, L_storeOutput;
+    address start = __ pc();
+    Register from = O0; // source byte array
+    Register to = O1; // destination byte array
+    Register key = O2; // expanded key array
+    const Register keylen = O4; //reg for storing expanded key array length
+
+    // read expanded key length
+    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
+
+    // load input into F54-F56; F30-F31 used as temp
+    __ ldf(FloatRegisterImpl::S, from, 0, F30);
+    __ ldf(FloatRegisterImpl::S, from, 4, F31);
+    __ fmov(FloatRegisterImpl::D, F30, F54);
+    __ ldf(FloatRegisterImpl::S, from, 8, F30);
+    __ ldf(FloatRegisterImpl::S, from, 12, F31);
+    __ fmov(FloatRegisterImpl::D, F30, F56);
+
+    // load expanded key
+    for ( int i = 0; i <= 38; i += 2 ) {
+      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
+    }
+
+    // perform cipher transformation
+    __ fxor(FloatRegisterImpl::D, F0, F54, F54);
+    __ fxor(FloatRegisterImpl::D, F2, F56, F56);
+    // rounds 1 through 8
+    for ( int i = 4; i <= 28; i += 8 ) {
+      __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
+      __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
+      __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
+      __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
+    }
+    __ aes_eround01(F36, F54, F56, F58); //round 9
+    __ aes_eround23(F38, F54, F56, F60);
+
+    // 128-bit original key size
+    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
+
+    for ( int i = 40; i <= 50; i += 2 ) {
+      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
+    }
+    __ aes_eround01(F40, F58, F60, F54); //round 10
+    __ aes_eround23(F42, F58, F60, F56);
+    __ aes_eround01(F44, F54, F56, F58); //round 11
+    __ aes_eround23(F46, F54, F56, F60);
+
+    // 192-bit original key size
+    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
+
+    __ ldf(FloatRegisterImpl::D, key, 208, F52);
+    __ aes_eround01(F48, F58, F60, F54); //round 12
+    __ aes_eround23(F50, F58, F60, F56);
+    __ ldf(FloatRegisterImpl::D, key, 216, F46);
+    __ ldf(FloatRegisterImpl::D, key, 224, F48);
+    __ ldf(FloatRegisterImpl::D, key, 232, F50);
+    __ aes_eround01(F52, F54, F56, F58); //round 13
+    __ aes_eround23(F46, F54, F56, F60);
+    __ br(Assembler::always, false, Assembler::pt, L_storeOutput);
+    __ delayed()->nop();
+
+    __ BIND(L_doLast128bit);
+    __ ldf(FloatRegisterImpl::D, key, 160, F48);
+    __ ldf(FloatRegisterImpl::D, key, 168, F50);
+
+    __ BIND(L_storeOutput);
+    // perform last round of encryption common for all key sizes
+    __ aes_eround01_l(F48, F58, F60, F54); //last round
+    __ aes_eround23_l(F50, F58, F60, F56);
+
+    // store output into the destination array, F0-F1 used as temp
+    __ fmov(FloatRegisterImpl::D, F54, F0);
+    __ stf(FloatRegisterImpl::S, F0, to, 0);
+    __ stf(FloatRegisterImpl::S, F1, to, 4);
+    __ fmov(FloatRegisterImpl::D, F56, F0);
+    __ stf(FloatRegisterImpl::S, F0, to, 8);
+    __ retl();
+    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12);
+
+    return start;
+  }
+
+  // Generates the stub that decrypts a single 16-byte AES block using the
+  // SPARC aes_dround* instructions.  The round keys are re-derived from the
+  // original key with aes_kexpand* on every call.
+  //
+  // Arguments of the generated code (leaf routine, caller's register window):
+  //   O0 - source byte array
+  //   O1 - destination byte array
+  //   O2 - expanded key array (only its length is read)
+  //   O3 - original key array
+  //
+  // Input and output are accessed as 32-bit words only.
+  // Returns the entry address of the generated code.
+  address generate_aescrypt_decryptBlock() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock");
+    address start = __ pc();
+    Label L_expand192bit, L_expand256bit, L_common_transform;
+    Register from = O0; // source byte array
+    Register to = O1; // destination byte array
+    Register key = O2; // expanded key array
+    Register original_key = O3; // original key array only required during decryption
+    const Register keylen = O4; // reg for storing expanded key array length
+
+    // read expanded key array length
+    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
+
+    // load input into F52-F54; F30,F31 used as temp
+    __ ldf(FloatRegisterImpl::S, from, 0, F30);
+    __ ldf(FloatRegisterImpl::S, from, 4, F31);
+    __ fmov(FloatRegisterImpl::D, F30, F52);
+    __ ldf(FloatRegisterImpl::S, from, 8, F30);
+    __ ldf(FloatRegisterImpl::S, from, 12, F31);
+    __ fmov(FloatRegisterImpl::D, F30, F54);
+
+    // load original key from SunJCE expanded decryption key
+    for ( int i = 0; i <= 3; i++ ) {
+      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
+    }
+
+    // 256-bit original key size
+    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
+
+    // 192-bit original key size
+    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
+
+    // 128-bit original key size
+    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
+    for ( int i = 0; i <= 36; i += 4 ) {
+      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
+      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
+    }
+
+    // perform 128-bit key specific inverse cipher transformation
+    __ fxor(FloatRegisterImpl::D, F42, F54, F54);
+    __ fxor(FloatRegisterImpl::D, F40, F52, F52);
+    __ br(Assembler::always, false, Assembler::pt, L_common_transform);
+    __ delayed()->nop();
+
+    __ BIND(L_expand192bit);
+
+    // start loading rest of the 192-bit key
+    __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
+    __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
+
+    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
+    for ( int i = 0; i <= 36; i += 6 ) {
+      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
+      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
+      __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
+    }
+    __ aes_kexpand1(F42, F46, 7, F48);
+    __ aes_kexpand2(F44, F48, F50);
+
+    // perform 192-bit key specific inverse cipher transformation
+    __ fxor(FloatRegisterImpl::D, F50, F54, F54);
+    __ fxor(FloatRegisterImpl::D, F48, F52, F52);
+    __ aes_dround23(F46, F52, F54, F58);
+    __ aes_dround01(F44, F52, F54, F56);
+    __ aes_dround23(F42, F56, F58, F54);
+    __ aes_dround01(F40, F56, F58, F52);
+    __ br(Assembler::always, false, Assembler::pt, L_common_transform);
+    __ delayed()->nop();
+
+    __ BIND(L_expand256bit);
+
+    // The key expansion below overwrites F52-F54, so park the input block in
+    // F60-F62, which this stub does not use for anything else.
+    __ fmov(FloatRegisterImpl::D, F52, F60);
+    __ fmov(FloatRegisterImpl::D, F54, F62);
+
+    // load rest of the 256-bit key
+    for ( int i = 4; i <= 7; i++ ) {
+      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
+    }
+
+    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
+    for ( int i = 0; i <= 40; i += 8 ) {
+      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
+      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
+      __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
+      __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
+    }
+    __ aes_kexpand1(F48, F54, 6, F56);
+    __ aes_kexpand2(F50, F56, F58);
+
+    for ( int i = 0; i <= 6; i += 2 ) {
+      __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
+    }
+
+    // restore the input block into F52-F54 from the copy parked in F60-F62;
+    // re-reading it with 8-byte loads would require 'from' to be 8-byte aligned
+    __ fmov(FloatRegisterImpl::D, F60, F52);
+    __ fmov(FloatRegisterImpl::D, F62, F54);
+
+    // perform 256-bit key specific inverse cipher transformation
+    __ fxor(FloatRegisterImpl::D, F0, F54, F54);
+    __ fxor(FloatRegisterImpl::D, F2, F52, F52);
+    __ aes_dround23(F4, F52, F54, F58);
+    __ aes_dround01(F6, F52, F54, F56);
+    __ aes_dround23(F50, F56, F58, F54);
+    __ aes_dround01(F48, F56, F58, F52);
+    __ aes_dround23(F46, F52, F54, F58);
+    __ aes_dround01(F44, F52, F54, F56);
+    __ aes_dround23(F42, F56, F58, F54);
+    __ aes_dround01(F40, F56, F58, F52);
+
+    for ( int i = 0; i <= 7; i++ ) {
+      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
+    }
+
+    // perform inverse cipher transformations common for all key sizes
+    __ BIND(L_common_transform);
+    for ( int i = 38; i >= 6; i -= 8 ) {
+      __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
+      __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
+      if ( i != 6) {
+        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
+        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
+      } else {
+        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
+        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
+      }
+    }
+
+    // store output to destination array, F0-F1 used as temp
+    __ fmov(FloatRegisterImpl::D, F52, F0);
+    __ stf(FloatRegisterImpl::S, F0, to, 0);
+    __ stf(FloatRegisterImpl::S, F1, to, 4);
+    __ fmov(FloatRegisterImpl::D, F54, F0);
+    __ stf(FloatRegisterImpl::S, F0, to, 8);
+    __ retl();
+    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12);
+
+    return start;
+  }
+
+  // Generates the stub that performs AES encryption in CBC mode.
+  //
+  // Arguments of the generated code (leaf routine, caller's register window):
+  //   O0 - source byte array
+  //   O1 - destination byte array
+  //   O2 - expanded key array
+  //   O3 - init vector; updated with the last cipher block on exit
+  //   O4 - cipher length in bytes
+  //
+  // The loops test the remaining length only after processing a block, so the
+  // length is assumed to be a non-zero multiple of 16.  All buffers are
+  // accessed with 64-bit loads/stores.
+  // The generated code returns the original cipher length in O0.
+  // Returns the entry address of the generated code.
+  address generate_cipherBlockChaining_encryptAESCrypt() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+    Label L_cbcenc128, L_cbcenc192, L_cbcenc256;
+    address start = __ pc();
+    Register from = O0; // source byte array
+    Register to = O1; // destination byte array
+    Register key = O2; // expanded key array
+    Register rvec = O3; // init vector
+    const Register len_reg = O4; // cipher length
+    const Register keylen = O5; // reg for storing expanded key array length
+
+    // save cipher len to return in the end; no register window is allocated
+    // here, so it must live in a scratch global (G5, unused otherwise) rather
+    // than in a local register that belongs to the caller
+    __ mov(len_reg, G5);
+
+    // read expanded key length
+    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
+
+    // load init vector
+    __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
+    __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
+    __ ldx(key,0,G1);
+    __ ldx(key,8,G2);
+
+    // start loading expanded key
+    for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) {
+      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
+    }
+
+    // 128-bit original key size
+    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128);
+
+    for ( int i = 40, j = 176; i <= 46; i += 2, j += 8 ) {
+      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
+    }
+
+    // 192-bit original key size
+    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192);
+
+    for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) {
+      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
+    }
+
+    // 256-bit original key size
+    __ br(Assembler::always, false, Assembler::pt, L_cbcenc256);
+    __ delayed()->nop();
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_cbcenc128);
+    __ ldx(from,0,G3);
+    __ ldx(from,8,G4);
+    __ xor3(G1,G3,G3);
+    __ xor3(G2,G4,G4);
+    __ movxtod(G3,F56);
+    __ movxtod(G4,F58);
+    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
+    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
+
+    // TEN_EROUNDS
+    for ( int i = 0; i <= 32; i += 8 ) {
+      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
+      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
+      if (i != 32 ) {
+        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
+        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
+      } else {
+        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
+        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
+      }
+    }
+
+    __ stf(FloatRegisterImpl::D, F60, to, 0);
+    __ stf(FloatRegisterImpl::D, F62, to, 8);
+    __ add(from, 16, from);
+    __ add(to, 16, to);
+    __ subcc(len_reg, 16, len_reg);
+    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
+    __ delayed()->nop();
+    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
+    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
+    __ retl();
+    __ delayed()->mov(G5, O0);
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_cbcenc192);
+    __ ldx(from,0,G3);
+    __ ldx(from,8,G4);
+    __ xor3(G1,G3,G3);
+    __ xor3(G2,G4,G4);
+    __ movxtod(G3,F56);
+    __ movxtod(G4,F58);
+    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
+    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
+
+    // TWELVE_EROUNDS
+    for ( int i = 0; i <= 40; i += 8 ) {
+      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
+      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
+      if (i != 40 ) {
+        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
+        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
+      } else {
+        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
+        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
+      }
+    }
+
+    __ stf(FloatRegisterImpl::D, F60, to, 0);
+    __ stf(FloatRegisterImpl::D, F62, to, 8);
+    __ add(from, 16, from);
+    __ subcc(len_reg, 16, len_reg);
+    __ add(to, 16, to);
+    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
+    __ delayed()->nop();
+    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
+    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
+    __ retl();
+    __ delayed()->mov(G5, O0);
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_cbcenc256);
+    __ ldx(from,0,G3);
+    __ ldx(from,8,G4);
+    __ xor3(G1,G3,G3);
+    __ xor3(G2,G4,G4);
+    __ movxtod(G3,F56);
+    __ movxtod(G4,F58);
+    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
+    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
+
+    // FOURTEEN_EROUNDS
+    for ( int i = 0; i <= 48; i += 8 ) {
+      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
+      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
+      if (i != 48 ) {
+        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
+        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
+      } else {
+        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
+        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
+      }
+    }
+
+    __ stf(FloatRegisterImpl::D, F60, to, 0);
+    __ stf(FloatRegisterImpl::D, F62, to, 8);
+    __ add(from, 16, from);
+    __ subcc(len_reg, 16, len_reg);
+    __ add(to, 16, to);
+    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
+    __ delayed()->nop();
+    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
+    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
+    __ retl();
+    __ delayed()->mov(G5, O0);
+
+    return start;
+  }
+
+  // Generates the stub that performs AES decryption in CBC mode, decrypting
+  // two blocks per loop iteration.  The round keys are re-derived from the
+  // original key with aes_kexpand* on every call.
+  //
+  // Arguments of the generated code (read from I* after save_frame):
+  //   I0 - source byte array
+  //   I1 - destination byte array
+  //   I2 - expanded key array (only its length is read)
+  //   I3 - init vector; updated with the last cipher block on exit
+  //   I4 - cipher length in bytes
+  //   I5 - original key array
+  //
+  // If the length is an odd number of 16-byte blocks, one block is decrypted
+  // first; the remainder is processed 32 bytes at a time.  The length is
+  // assumed to be a non-zero multiple of 16.
+  // The generated code returns the original cipher length.
+  // Returns the entry address of the generated code.
+  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+    Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
+    Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
+    address start = __ pc();
+    Register from = I0; // source byte array
+    Register to = I1; // destination byte array
+    Register key = I2; // expanded key array
+    Register rvec = I3; // init vector
+    const Register len_reg = I4; // cipher length
+    const Register original_key = I5; // original key array only required during decryption
+    const Register keylen = L6; // reg for storing expanded key array length
+
+    __ save_frame(0); //args are read from I* registers since we save the frame in the beginning
+
+    // save cipher len to return in the end; this is done after save_frame so
+    // that the value lives in this stub's own register window (L7 is not used
+    // for anything else here) and the caller's locals stay untouched
+    __ mov(len_reg, L7);
+
+    // load original key from SunJCE expanded decryption key
+    for ( int i = 0; i <= 3; i++ ) {
+      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
+    }
+
+    // load initial vector
+    __ ldx(rvec,0,L0);
+    __ ldx(rvec,8,L1);
+
+    // read expanded key array length
+    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
+
+    // 256-bit original key size
+    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
+
+    // 192-bit original key size
+    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
+
+    // 128-bit original key size
+    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
+    for ( int i = 0; i <= 36; i += 4 ) {
+      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
+      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
+    }
+
+    // load expanded key[last-1] and key[last] elements
+    __ movdtox(F40,L2);
+    __ movdtox(F42,L3);
+
+    __ and3(len_reg, 16, L4);
+    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128);
+    __ delayed()->nop();
+
+    __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start);
+    __ delayed()->nop();
+
+    __ BIND(L_expand192bit);
+    // load rest of the 192-bit key
+    __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
+    __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
+
+    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
+    for ( int i = 0; i <= 36; i += 6 ) {
+      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
+      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
+      __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
+    }
+    __ aes_kexpand1(F42, F46, 7, F48);
+    __ aes_kexpand2(F44, F48, F50);
+
+    // load expanded key[last-1] and key[last] elements
+    __ movdtox(F48,L2);
+    __ movdtox(F50,L3);
+
+    __ and3(len_reg, 16, L4);
+    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192);
+    __ delayed()->nop();
+
+    __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start);
+    __ delayed()->nop();
+
+    __ BIND(L_expand256bit);
+    // load rest of the 256-bit key
+    for ( int i = 4; i <= 7; i++ ) {
+      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
+    }
+
+    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
+    for ( int i = 0; i <= 40; i += 8 ) {
+      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
+      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
+      __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
+      __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
+    }
+    __ aes_kexpand1(F48, F54, 6, F56);
+    __ aes_kexpand2(F50, F56, F58);
+
+    // load expanded key[last-1] and key[last] elements
+    __ movdtox(F56,L2);
+    __ movdtox(F58,L3);
+
+    __ and3(len_reg, 16, L4);
+    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256);
+    __ delayed()->nop();
+
+    __ BIND(L_dec_first_block_start);
+    __ ldx(from,0,L4);
+    __ ldx(from,8,L5);
+    __ xor3(L2,L4,G1);
+    __ movxtod(G1,F60);
+    __ xor3(L3,L5,G1);
+    __ movxtod(G1,F62);
+
+    // 128-bit original key size
+    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
+
+    // 192-bit original key size
+    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
+
+    __ aes_dround23(F54, F60, F62, F58);
+    __ aes_dround01(F52, F60, F62, F56);
+    __ aes_dround23(F50, F56, F58, F62);
+    __ aes_dround01(F48, F56, F58, F60);
+
+    __ BIND(L_dec_first_block192);
+    __ aes_dround23(F46, F60, F62, F58);
+    __ aes_dround01(F44, F60, F62, F56);
+    __ aes_dround23(F42, F56, F58, F62);
+    __ aes_dround01(F40, F56, F58, F60);
+
+    __ BIND(L_dec_first_block128);
+    for ( int i = 38; i >= 6; i -= 8 ) {
+      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
+      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
+      if ( i != 6) {
+        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
+        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
+      } else {
+        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
+        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
+      }
+    }
+
+    __ movxtod(L0,F56);
+    __ movxtod(L1,F58);
+    __ mov(L4,L0);
+    __ mov(L5,L1);
+    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
+    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
+
+    __ stf(FloatRegisterImpl::D, F60, to, 0);
+    __ stf(FloatRegisterImpl::D, F62, to, 8);
+
+    __ add(from, 16, from);
+    __ add(to, 16, to);
+    __ subcc(len_reg, 16, len_reg);
+    __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
+    __ delayed()->nop();
+
+    // 256-bit original key size
+    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
+
+    // 192-bit original key size
+    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_dec_next2_blocks128);
+    __ nop();
+
+    // F40:F42 used for first 16-bytes
+    __ ldx(from,0,G4);
+    __ ldx(from,8,G5);
+    __ xor3(L2,G4,G1);
+    __ movxtod(G1,F40);
+    __ xor3(L3,G5,G1);
+    __ movxtod(G1,F42);
+
+    // F60:F62 used for next 16-bytes
+    __ ldx(from,16,L4);
+    __ ldx(from,24,L5);
+    __ xor3(L2,L4,G1);
+    __ movxtod(G1,F60);
+    __ xor3(L3,L5,G1);
+    __ movxtod(G1,F62);
+
+    for ( int i = 38; i >= 6; i -= 8 ) {
+      __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
+      __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
+      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
+      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
+      if (i != 6 ) {
+        __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
+        __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
+        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
+        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
+      } else {
+        __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
+        __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
+        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
+        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
+      }
+    }
+
+    __ movxtod(L0,F46);
+    __ movxtod(L1,F44);
+    __ fxor(FloatRegisterImpl::D, F46, F40, F40);
+    __ fxor(FloatRegisterImpl::D, F44, F42, F42);
+
+    __ stf(FloatRegisterImpl::D, F40, to, 0);
+    __ stf(FloatRegisterImpl::D, F42, to, 8);
+
+    __ movxtod(G4,F56);
+    __ movxtod(G5,F58);
+    __ mov(L4,L0);
+    __ mov(L5,L1);
+    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
+    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
+
+    __ stf(FloatRegisterImpl::D, F60, to, 16);
+    __ stf(FloatRegisterImpl::D, F62, to, 24);
+
+    __ add(from, 32, from);
+    __ add(to, 32, to);
+    __ subcc(len_reg, 32, len_reg);
+    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
+    __ delayed()->nop();
+    __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end);
+    __ delayed()->nop();
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_dec_next2_blocks192);
+    __ nop();
+
+    // F48:F50 used for first 16-bytes
+    __ ldx(from,0,G4);
+    __ ldx(from,8,G5);
+    __ xor3(L2,G4,G1);
+    __ movxtod(G1,F48);
+    __ xor3(L3,G5,G1);
+    __ movxtod(G1,F50);
+
+    // F60:F62 used for next 16-bytes
+    __ ldx(from,16,L4);
+    __ ldx(from,24,L5);
+    __ xor3(L2,L4,G1);
+    __ movxtod(G1,F60);
+    __ xor3(L3,L5,G1);
+    __ movxtod(G1,F62);
+
+    for ( int i = 46; i >= 6; i -= 8 ) {
+      __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
+      __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
+      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
+      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
+      if (i != 6 ) {
+        __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
+        __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
+        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
+        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
+      } else {
+        __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
+        __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
+        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
+        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
+      }
+    }
+
+    __ movxtod(L0,F54);
+    __ movxtod(L1,F52);
+    __ fxor(FloatRegisterImpl::D, F54, F48, F48);
+    __ fxor(FloatRegisterImpl::D, F52, F50, F50);
+
+    __ stf(FloatRegisterImpl::D, F48, to, 0);
+    __ stf(FloatRegisterImpl::D, F50, to, 8);
+
+    __ movxtod(G4,F56);
+    __ movxtod(G5,F58);
+    __ mov(L4,L0);
+    __ mov(L5,L1);
+    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
+    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
+
+    __ stf(FloatRegisterImpl::D, F60, to, 16);
+    __ stf(FloatRegisterImpl::D, F62, to, 24);
+
+    __ add(from, 32, from);
+    __ add(to, 32, to);
+    __ subcc(len_reg, 32, len_reg);
+    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
+    __ delayed()->nop();
+    __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end);
+    __ delayed()->nop();
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_dec_next2_blocks256);
+    __ nop();
+
+    // F0:F2 used for first 16-bytes
+    __ ldx(from,0,G4);
+    __ ldx(from,8,G5);
+    __ xor3(L2,G4,G1);
+    __ movxtod(G1,F0);
+    __ xor3(L3,G5,G1);
+    __ movxtod(G1,F2);
+
+    // F60:F62 used for next 16-bytes
+    __ ldx(from,16,L4);
+    __ ldx(from,24,L5);
+    __ xor3(L2,L4,G1);
+    __ movxtod(G1,F60);
+    __ xor3(L3,L5,G1);
+    __ movxtod(G1,F62);
+
+    __ aes_dround23(F54, F0, F2, F4);
+    __ aes_dround01(F52, F0, F2, F6);
+    __ aes_dround23(F54, F60, F62, F58);
+    __ aes_dround01(F52, F60, F62, F56);
+    __ aes_dround23(F50, F6, F4, F2);
+    __ aes_dround01(F48, F6, F4, F0);
+    __ aes_dround23(F50, F56, F58, F62);
+    __ aes_dround01(F48, F56, F58, F60);
+    // save F48:F54 in temp registers
+    // NOTE(review): G2 and G6 are used as scratch here - confirm that neither
+    // is reserved by the VM on this platform
+    __ movdtox(F54,G2);
+    __ movdtox(F52,G3);
+    __ movdtox(F50,G6);
+    __ movdtox(F48,G1);
+    for ( int i = 46; i >= 14; i -= 8 ) {
+      __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
+      __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
+      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
+      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
+      __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
+      __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
+      __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
+      __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
+    }
+    // init F48:F54 with F0:F6 values (original key)
+    __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
+    __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
+    __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
+    __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
+    __ aes_dround23(F54, F0, F2, F4);
+    __ aes_dround01(F52, F0, F2, F6);
+    __ aes_dround23(F54, F60, F62, F58);
+    __ aes_dround01(F52, F60, F62, F56);
+    __ aes_dround23_l(F50, F6, F4, F2);
+    __ aes_dround01_l(F48, F6, F4, F0);
+    __ aes_dround23_l(F50, F56, F58, F62);
+    __ aes_dround01_l(F48, F56, F58, F60);
+    // re-init F48:F54 with their original values
+    __ movxtod(G2,F54);
+    __ movxtod(G3,F52);
+    __ movxtod(G6,F50);
+    __ movxtod(G1,F48);
+
+    __ movxtod(L0,F6);
+    __ movxtod(L1,F4);
+    __ fxor(FloatRegisterImpl::D, F6, F0, F0);
+    __ fxor(FloatRegisterImpl::D, F4, F2, F2);
+
+    __ stf(FloatRegisterImpl::D, F0, to, 0);
+    __ stf(FloatRegisterImpl::D, F2, to, 8);
+
+    __ movxtod(G4,F56);
+    __ movxtod(G5,F58);
+    __ mov(L4,L0);
+    __ mov(L5,L1);
+    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
+    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
+
+    __ stf(FloatRegisterImpl::D, F60, to, 16);
+    __ stf(FloatRegisterImpl::D, F62, to, 24);
+
+    __ add(from, 32, from);
+    __ add(to, 32, to);
+    __ subcc(len_reg, 32, len_reg);
+    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
+    __ delayed()->nop();
+
+    __ BIND(L_cbcdec_end);
+    __ stx(L0, rvec, 0);
+    __ stx(L1, rvec, 8);
+    // return the saved cipher len: I0 of this window becomes O0 of the caller
+    // once the window is restored
+    __ mov(L7, I0);
+    __ restore();
+    __ retl();
+    __ delayed()->nop();
+
+    return start;
+  }
+
   void generate_initial() {
     // Generates all stubs and initializes the entry points
 
@@ -3368,6 +4207,14 @@
     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                        &StubRoutines::_safefetchN_fault_pc,
                                                        &StubRoutines::_safefetchN_continuation_pc);
+
+    // generate AES intrinsics code
+    if (UseAESIntrinsics) {
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
+    }
   }
 
 