src/cpu/sparc/vm/stubGenerator_sparc.cpp

changeset 6312
04d32e7fad07
parent 6198
55fb97c4c58d
child 6653
03214612e77e
     1.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Feb 14 16:17:22 2014 +0100
     1.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Tue Jan 14 17:46:48 2014 -0800
     1.3 @@ -3304,6 +3304,775 @@
     1.4      }
     1.5    }
     1.6  
     1.7 +  address generate_aescrypt_encryptBlock() { // emits the "aesencryptBlock" stub (AES-encrypts one 16-byte block) and returns its entry address
     1.8 +    __ align(CodeEntryAlignment);
     1.9 +    StubCodeMark mark(this, "StubRoutines", "aesencryptBlock");
    1.10 +    Label L_doLast128bit, L_storeOutput; // L_storeOutput: final round plus output store, shared by every key size
    1.11 +    address start = __ pc();
    1.12 +    Register from = O0; // in: source bytes; 16 bytes are read with 4-byte loads
    1.13 +    Register to = O1;   // in: destination bytes; 16 bytes are written with 4-byte stores
    1.14 +    Register key = O2;  // in: expanded key, addressed from its first int element
    1.15 +    const Register keylen = O4; // scratch: expanded key array length, in ints
    1.16 +
    1.17 +    // read expanded key length: key addresses element 0, so the length field lies at (length_offset - base_offset) from it
    1.18 +    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
    1.19 +
    1.20 +    // load input into F54 (bytes 0-7) and F56 (bytes 8-15); each half is read as two singles into F30/F31 and moved out as one double
    1.21 +    __ ldf(FloatRegisterImpl::S, from, 0, F30);
    1.22 +    __ ldf(FloatRegisterImpl::S, from, 4, F31);
    1.23 +    __ fmov(FloatRegisterImpl::D, F30, F54);
    1.24 +    __ ldf(FloatRegisterImpl::S, from, 8, F30);
    1.25 +    __ ldf(FloatRegisterImpl::S, from, 12, F31);
    1.26 +    __ fmov(FloatRegisterImpl::D, F30, F56);
    1.27 +
    1.28 +    // load expanded key ints 0..39 into F0-F38, two ints per double register (this also overwrites the F30/F31 temps)
    1.29 +    for ( int i = 0;  i <= 38; i += 2 ) {
    1.30 +      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
    1.31 +    }
    1.32 +
    1.33 +    // perform cipher transformation: first xor the state with key ints 0..3 (F0, F2)
    1.34 +    __ fxor(FloatRegisterImpl::D, F0, F54, F54);
    1.35 +    __ fxor(FloatRegisterImpl::D, F2, F56, F56);
    1.36 +    // rounds 1 through 8: two rounds per iteration, state alternating F54/F56 -> F58/F60 -> F54/F56
    1.37 +    for ( int i = 4;  i <= 28; i += 8 ) {
    1.38 +      __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
    1.39 +      __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
    1.40 +      __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
    1.41 +      __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
    1.42 +    }
    1.43 +    __ aes_eround01(F36, F54, F56, F58); //round 9, result left in F58/F60
    1.44 +    __ aes_eround23(F38, F54, F56, F60);
    1.45 +
    1.46 +    // 128-bit original key size (44-int expanded key): only the last round remains
    1.47 +    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
    1.48 +
    1.49 +    for ( int i = 40;  i <= 50; i += 2 ) { // longer keys: load key ints 40..51 into F40-F50
    1.50 +      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
    1.51 +    }
    1.52 +    __ aes_eround01(F40, F58, F60, F54); //round 10
    1.53 +    __ aes_eround23(F42, F58, F60, F56);
    1.54 +    __ aes_eround01(F44, F54, F56, F58); //round 11
    1.55 +    __ aes_eround23(F46, F54, F56, F60);
    1.56 +
    1.57 +    // 192-bit original key size (52-int expanded key): F48/F50 already hold the key ints used by the last round
    1.58 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
    1.59 +
    1.60 +    __ ldf(FloatRegisterImpl::D, key, 208, F52); // fall-through for any other length (expected: the 256-bit key); key ints 52,53
    1.61 +    __ aes_eround01(F48, F58, F60, F54); //round 12
    1.62 +    __ aes_eround23(F50, F58, F60, F56);
    1.63 +    __ ldf(FloatRegisterImpl::D, key, 216, F46); // F46/F48/F50 were consumed by rounds 11 and 12: reload them with key ints 54..59
    1.64 +    __ ldf(FloatRegisterImpl::D, key, 224, F48);
    1.65 +    __ ldf(FloatRegisterImpl::D, key, 232, F50);
    1.66 +    __ aes_eround01(F52, F54, F56, F58); //round 13
    1.67 +    __ aes_eround23(F46, F54, F56, F60);
    1.68 +    __ br(Assembler::always, false, Assembler::pt, L_storeOutput); // skip the 128-bit key reload below
    1.69 +    __ delayed()->nop();
    1.70 +
    1.71 +    __ BIND(L_doLast128bit);
    1.72 +    __ ldf(FloatRegisterImpl::D, key, 160, F48); // key ints 40..43 go to F48/F50, the registers the shared last round reads
    1.73 +    __ ldf(FloatRegisterImpl::D, key, 168, F50);
    1.74 +
    1.75 +    __ BIND(L_storeOutput);
    1.76 +    // perform last round of encryption common for all key sizes; input state is F58/F60, result goes to F54/F56
    1.77 +    __ aes_eround01_l(F48, F58, F60, F54); //last round
    1.78 +    __ aes_eround23_l(F50, F58, F60, F56);
    1.79 +
    1.80 +    // store output into the destination array; each double is moved to F0 so its halves can be stored as the singles F0 and F1
    1.81 +    __ fmov(FloatRegisterImpl::D, F54, F0);
    1.82 +    __ stf(FloatRegisterImpl::S, F0, to, 0);
    1.83 +    __ stf(FloatRegisterImpl::S, F1, to, 4);
    1.84 +    __ fmov(FloatRegisterImpl::D, F56, F0);
    1.85 +    __ stf(FloatRegisterImpl::S, F0, to, 8);
    1.86 +    __ retl();
    1.87 +    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); // the final 4 bytes are stored in the return's delay slot
    1.88 +
    1.89 +    return start;
    1.90 +  }
    1.91 +
    1.92 +  address generate_aescrypt_decryptBlock() { // emits the "aesdecryptBlock" stub (AES-decrypts one 16-byte block) and returns its entry address
    1.93 +    __ align(CodeEntryAlignment);
    1.94 +    StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock");
    1.95 +    address start = __ pc();
    1.96 +    Label L_expand192bit, L_expand256bit, L_common_transform; // L_common_transform: rounds shared by every key size
    1.97 +    Register from = O0; // in: source bytes; 16 bytes are read with 4-byte loads
    1.98 +    Register to = O1;   // in: destination bytes; 16 bytes are written with 4-byte stores
    1.99 +    Register key = O2;  // in: expanded key array; only its length field is read here
   1.100 +    Register original_key = O3;  // in: original key ints, re-expanded below with the SPARC key-expansion instructions
   1.101 +    const Register keylen = O4;  // scratch: expanded key array length, in ints
   1.102 +
   1.103 +    // read expanded key array length: key addresses element 0, so the length field lies at (length_offset - base_offset) from it
   1.104 +    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
   1.105 +
   1.106 +    // load input into F52 (bytes 0-7) and F54 (bytes 8-15); each half is read as two singles into F30/F31 and moved out as one double
   1.107 +    __ ldf(FloatRegisterImpl::S, from, 0, F30);
   1.108 +    __ ldf(FloatRegisterImpl::S, from, 4, F31);
   1.109 +    __ fmov(FloatRegisterImpl::D, F30, F52);
   1.110 +    __ ldf(FloatRegisterImpl::S, from, 8, F30);
   1.111 +    __ ldf(FloatRegisterImpl::S, from, 12, F31);
   1.112 +    __ fmov(FloatRegisterImpl::D, F30, F54);
   1.113 +
   1.114 +    // load original key ints 0..3 into F0-F3 (common to every key size)
   1.115 +    for ( int i = 0;  i <= 3; i++ ) {
   1.116 +      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
   1.117 +    }
   1.118 +
   1.119 +    // 256-bit original key size (60-int expanded key)
   1.120 +    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
   1.121 +
   1.122 +    // 192-bit original key size (52-int expanded key)
   1.123 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
   1.124 +
   1.125 +    // 128-bit original key size (fall-through for any other length)
   1.126 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions; fills F4-F42
   1.127 +    for ( int i = 0;  i <= 36; i += 4 ) {
   1.128 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
   1.129 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
   1.130 +    }
   1.131 +
   1.132 +    // perform 128-bit key specific inverse cipher transformation: xor the input with the last expanded key ints (F40, F42)
   1.133 +    __ fxor(FloatRegisterImpl::D, F42, F54, F54);
   1.134 +    __ fxor(FloatRegisterImpl::D, F40, F52, F52);
   1.135 +    __ br(Assembler::always, false, Assembler::pt, L_common_transform);
   1.136 +    __ delayed()->nop();
   1.137 +
   1.138 +    __ BIND(L_expand192bit);
   1.139 +
   1.140 +    // start loading rest of the 192-bit key (original key ints 4,5)
   1.141 +    __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
   1.142 +    __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
   1.143 +
   1.144 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions; fills F6-F50
   1.145 +    for ( int i = 0;  i <= 36; i += 6 ) {
   1.146 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
   1.147 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
   1.148 +      __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
   1.149 +    }
   1.150 +    __ aes_kexpand1(F42, F46, 7, F48);
   1.151 +    __ aes_kexpand2(F44, F48, F50);
   1.152 +
   1.153 +    // perform 192-bit key specific inverse cipher transformation: xor with F48/F50, then the two rounds keyed by F46..F40
   1.154 +    __ fxor(FloatRegisterImpl::D, F50, F54, F54);
   1.155 +    __ fxor(FloatRegisterImpl::D, F48, F52, F52);
   1.156 +    __ aes_dround23(F46, F52, F54, F58);
   1.157 +    __ aes_dround01(F44, F52, F54, F56);
   1.158 +    __ aes_dround23(F42, F56, F58, F54);
   1.159 +    __ aes_dround01(F40, F56, F58, F52);
   1.160 +    __ br(Assembler::always, false, Assembler::pt, L_common_transform);
   1.161 +    __ delayed()->nop();
   1.162 +
   1.163 +    __ BIND(L_expand256bit);
   1.164 +    // keep the input in F60/F62: the expansion below overwrites F52/F54, and a register copy avoids re-reading 'from' with 8-byte loads
   1.165 +    __ fmov(FloatRegisterImpl::D, F52, F60);
   1.166 +    __ fmov(FloatRegisterImpl::D, F54, F62);
   1.167 +
   1.168 +    // load rest of the 256-bit key (original key ints 4..7)
   1.169 +    for ( int i = 4;  i <= 7; i++ ) {
   1.170 +      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
   1.171 +    }
   1.172 +
   1.173 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions; fills F8-F58
   1.174 +    for ( int i = 0;  i <= 40; i += 8 ) {
   1.175 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
   1.176 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
   1.177 +      __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
   1.178 +      __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
   1.179 +    }
   1.180 +    __ aes_kexpand1(F48, F54, 6, F56);
   1.181 +    __ aes_kexpand2(F50, F56, F58);
   1.182 +
   1.183 +    // move the last expanded key doubles F58,F56,F54,F52 into F0,F2,F4,F6 so F52-F58 can hold the state and round temporaries
   1.184 +    for ( int i = 0;  i <= 6; i += 2 ) {
   1.185 +      __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
   1.186 +    }
   1.187 +
   1.188 +    // perform 256-bit key specific inverse cipher transformation, starting from the input copy kept in F60/F62
   1.189 +    __ fxor(FloatRegisterImpl::D, F0, F62, F54);
   1.190 +    __ fxor(FloatRegisterImpl::D, F2, F60, F52);
   1.191 +    __ aes_dround23(F4, F52, F54, F58);
   1.192 +    __ aes_dround01(F6, F52, F54, F56);
   1.193 +    __ aes_dround23(F50, F56, F58, F54);
   1.194 +    __ aes_dround01(F48, F56, F58, F52);
   1.195 +    __ aes_dround23(F46, F52, F54, F58);
   1.196 +    __ aes_dround01(F44, F52, F54, F56);
   1.197 +    __ aes_dround23(F42, F56, F58, F54);
   1.198 +    __ aes_dround01(F40, F56, F58, F52);
   1.199 +
   1.200 +    for ( int i = 0;  i <= 7; i++ ) { // F0-F6 were reused above: reload original key ints 0..7, which the common rounds read from F0-F6
   1.201 +      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
   1.202 +    }
   1.203 +
   1.204 +    // perform inverse cipher transformations common for all key sizes: keys F38 down to F0, state in F52/F54, temporaries F56/F58
   1.205 +    __ BIND(L_common_transform);
   1.206 +    for ( int i = 38;  i >= 6; i -= 8 ) {
   1.207 +      __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
   1.208 +      __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
   1.209 +      if ( i != 6) {
   1.210 +        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
   1.211 +        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
   1.212 +      } else { // final iteration: the last round uses the _l instruction variants with keys F2/F0
   1.213 +        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
   1.214 +        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
   1.215 +      }
   1.216 +    }
   1.217 +
   1.218 +    // store output to destination array; each double is moved to F0 so its halves can be stored as the singles F0 and F1
   1.219 +    __ fmov(FloatRegisterImpl::D, F52, F0);
   1.220 +    __ stf(FloatRegisterImpl::S, F0, to, 0);
   1.221 +    __ stf(FloatRegisterImpl::S, F1, to, 4);
   1.222 +    __ fmov(FloatRegisterImpl::D, F54, F0);
   1.223 +    __ stf(FloatRegisterImpl::S, F0, to, 8);
   1.224 +    __ retl();
   1.225 +    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); // the final 4 bytes are stored in the return's delay slot
   1.226 +
   1.227 +    return start;
   1.228 +  }
   1.229 +
   1.230 +  address generate_cipherBlockChaining_encryptAESCrypt() { // emits the AES-CBC encrypt stub (stub result: the cipher length) and returns its entry address
   1.231 +    __ align(CodeEntryAlignment);
   1.232 +    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
   1.233 +    Label L_cbcenc128, L_cbcenc192, L_cbcenc256; // one block loop per key size
   1.234 +    address start = __ pc();
   1.235 +    Register from = I0; // source byte array
   1.236 +    Register to = I1;   // destination byte array
   1.237 +    Register key = I2;  // expanded key array
   1.238 +    Register rvec = I3; // init vector; updated with the last cipher block on exit
   1.239 +    const Register len_reg = I4; // cipher length in bytes, consumed 16 at a time
   1.240 +    const Register keylen = I5;  // reg for storing expanded key array length
   1.241 +
   1.242 +    __ save_frame(0); // take a register window so L1 is this stub's own local, not the caller's; args are read from I* registers from here on
   1.243 +    __ mov(len_reg, L1); // save cipher len to return in the end
   1.244 +
   1.245 +    // read expanded key length: key addresses element 0, so the length field lies at (length_offset - base_offset) from it
   1.246 +    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
   1.247 +
   1.248 +    // load init vector into F60/F62 and key ints 0..3 into G1/G5 (G2 is avoided: the SPARC port reserves it as the thread register)
   1.249 +    __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
   1.250 +    __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
   1.251 +    __ ldx(key,0,G1);
   1.252 +    __ ldx(key,8,G5);
   1.253 +
   1.254 +    // start loading expanded key: key bytes 16..175 go to F0-F38
   1.255 +    for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
   1.256 +      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
   1.257 +    }
   1.258 +
   1.259 +    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128); // 128-bit original key size
   1.260 +
   1.261 +    for ( int i = 40, j = 176;  i <= 46; i += 2, j += 8 ) {
   1.262 +      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
   1.263 +    }
   1.264 +
   1.265 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192); // 192-bit original key size
   1.266 +
   1.267 +    for ( int i = 48, j = 208;  i <= 54; i += 2, j += 8 ) {
   1.268 +      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
   1.269 +    }
   1.270 +
   1.271 +    __ br(Assembler::always, false, Assembler::pt, L_cbcenc256); // 256-bit original key size (any remaining length)
   1.272 +    __ delayed()->nop();
   1.273 +
   1.274 +    __ align(OptoLoopAlignment);
   1.275 +    __ BIND(L_cbcenc128);
   1.276 +    __ ldx(from,0,G3); // NOTE(review): 8-byte loads/stores on from/to assume 8-byte alignment - confirm callers guarantee it
   1.277 +    __ ldx(from,8,G4);
   1.278 +    __ xor3(G1,G3,G3); // input xor key ints 0..3 ...
   1.279 +    __ xor3(G5,G4,G4);
   1.280 +    __ movxtod(G3,F56);
   1.281 +    __ movxtod(G4,F58);
   1.282 +    __ fxor(FloatRegisterImpl::D, F60, F56, F60); // ... xor the chaining value (init vector, then the previous cipher block)
   1.283 +    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
   1.284 +
   1.285 +    // TEN_EROUNDS: keys F0-F38, state alternating F60/F62 <-> F56/F58, last round uses the _l variants
   1.286 +    for ( int i = 0;  i <= 32; i += 8 ) {
   1.287 +      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
   1.288 +      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
   1.289 +      if (i != 32 ) {
   1.290 +        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
   1.291 +        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
   1.292 +      } else {
   1.293 +        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
   1.294 +        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
   1.295 +      }
   1.296 +    }
   1.297 +
   1.298 +    __ stf(FloatRegisterImpl::D, F60, to, 0); // cipher block out; F60/F62 stay live as the next chaining value
   1.299 +    __ stf(FloatRegisterImpl::D, F62, to, 8);
   1.300 +    __ add(from, 16, from);
   1.301 +    __ add(to, 16, to);
   1.302 +    __ subcc(len_reg, 16, len_reg);
   1.303 +    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); // loops until len reaches exactly 0; presumably len is a positive multiple of 16 - confirm
   1.304 +    __ delayed()->nop();
   1.305 +    __ stf(FloatRegisterImpl::D, F60, rvec, 0); // hand the last cipher block back through rvec
   1.306 +    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
   1.307 +    __ mov(L1, I0); // stub result: the saved cipher length (I0 becomes the caller's O0 after restore)
   1.308 +    __ ret();
   1.309 +    __ delayed()->restore();
   1.310 +
   1.311 +    __ align(OptoLoopAlignment);
   1.312 +    __ BIND(L_cbcenc192);
   1.313 +    __ ldx(from,0,G3);
   1.314 +    __ ldx(from,8,G4);
   1.315 +    __ xor3(G1,G3,G3); // input xor key ints 0..3 ...
   1.316 +    __ xor3(G5,G4,G4);
   1.317 +    __ movxtod(G3,F56);
   1.318 +    __ movxtod(G4,F58);
   1.319 +    __ fxor(FloatRegisterImpl::D, F60, F56, F60); // ... xor the chaining value
   1.320 +    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
   1.321 +
   1.322 +    // TWELVE_EROUNDS: keys F0-F46
   1.323 +    for ( int i = 0;  i <= 40; i += 8 ) {
   1.324 +      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
   1.325 +      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
   1.326 +      if (i != 40 ) {
   1.327 +        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
   1.328 +        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
   1.329 +      } else {
   1.330 +        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
   1.331 +        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
   1.332 +      }
   1.333 +    }
   1.334 +
   1.335 +    __ stf(FloatRegisterImpl::D, F60, to, 0);
   1.336 +    __ stf(FloatRegisterImpl::D, F62, to, 8);
   1.337 +    __ add(from, 16, from);
   1.338 +    __ subcc(len_reg, 16, len_reg);
   1.339 +    __ add(to, 16, to);
   1.340 +    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
   1.341 +    __ delayed()->nop();
   1.342 +    __ stf(FloatRegisterImpl::D, F60, rvec, 0); // hand the last cipher block back through rvec
   1.343 +    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
   1.344 +    __ mov(L1, I0); // stub result: the saved cipher length
   1.345 +    __ ret();
   1.346 +    __ delayed()->restore();
   1.347 +
   1.348 +    __ align(OptoLoopAlignment);
   1.349 +    __ BIND(L_cbcenc256);
   1.350 +    __ ldx(from,0,G3);
   1.351 +    __ ldx(from,8,G4);
   1.352 +    __ xor3(G1,G3,G3); // input xor key ints 0..3 ...
   1.353 +    __ xor3(G5,G4,G4);
   1.354 +    __ movxtod(G3,F56);
   1.355 +    __ movxtod(G4,F58);
   1.356 +    __ fxor(FloatRegisterImpl::D, F60, F56, F60); // ... xor the chaining value
   1.357 +    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
   1.358 +
   1.359 +    // FOURTEEN_EROUNDS: keys F0-F54
   1.360 +    for ( int i = 0;  i <= 48; i += 8 ) {
   1.361 +      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
   1.362 +      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
   1.363 +      if (i != 48 ) {
   1.364 +        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
   1.365 +        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
   1.366 +      } else {
   1.367 +        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
   1.368 +        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
   1.369 +      }
   1.370 +    }
   1.371 +
   1.372 +    __ stf(FloatRegisterImpl::D, F60, to, 0);
   1.373 +    __ stf(FloatRegisterImpl::D, F62, to, 8);
   1.374 +    __ add(from, 16, from);
   1.375 +    __ subcc(len_reg, 16, len_reg);
   1.376 +    __ add(to, 16, to);
   1.377 +    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
   1.378 +    __ delayed()->nop();
   1.379 +    __ stf(FloatRegisterImpl::D, F60, rvec, 0); // hand the last cipher block back through rvec
   1.380 +    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
   1.381 +    __ mov(L1, I0); // stub result: the saved cipher length
   1.382 +    __ ret();
   1.383 +    __ delayed()->restore();
   1.384 +
   1.385 +    return start;
   1.386 +  }
   1.387 +
   1.388 +  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
   1.389 +    __ align(CodeEntryAlignment);
   1.390 +    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
   1.391 +    Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
   1.392 +    Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
   1.393 +    address start = __ pc();
   1.394 +    Register from = I0; // source byte array
   1.395 +    Register to = I1;   // destination byte array
   1.396 +    Register key = I2;  // expanded key array
   1.397 +    Register rvec = I3; // init vector
   1.398 +    const Register len_reg = I4; // cipher length
   1.399 +    const Register original_key = I5;  // original key array only required during decryption
   1.400 +    const Register keylen = L6;  // reg for storing expanded key array length
   1.401 +
   1.402 +    // save cipher len before save_frame, to return in the end
   1.403 +    __ mov(O4, L0);
   1.404 +    __ save_frame(0); //args are read from I* registers since we save the frame in the beginning
   1.405 +
   1.406 +    // load original key from SunJCE expanded decryption key
   1.407 +    for ( int i = 0;  i <= 3; i++ ) {
   1.408 +      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
   1.409 +    }
   1.410 +
   1.411 +    // load initial vector
   1.412 +    __ ldx(rvec,0,L0);
   1.413 +    __ ldx(rvec,8,L1);
   1.414 +
   1.415 +    // read expanded key array length
   1.416 +    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
   1.417 +
   1.418 +    // 256-bit original key size
   1.419 +    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
   1.420 +
   1.421 +    // 192-bit original key size
   1.422 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
   1.423 +
   1.424 +    // 128-bit original key size
   1.425 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
   1.426 +    for ( int i = 0;  i <= 36; i += 4 ) {
   1.427 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
   1.428 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
   1.429 +    }
   1.430 +
   1.431 +    // load expanded key[last-1] and key[last] elements
   1.432 +    __ movdtox(F40,L2);
   1.433 +    __ movdtox(F42,L3);
   1.434 +
   1.435 +    __ and3(len_reg, 16, L4);
   1.436 +    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128);
   1.437 +    __ delayed()->nop();
   1.438 +
   1.439 +    __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start);
   1.440 +    __ delayed()->nop();
   1.441 +
   1.442 +    __ BIND(L_expand192bit);
   1.443 +    // load rest of the 192-bit key
   1.444 +    __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
   1.445 +    __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
   1.446 +
   1.447 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
   1.448 +    for ( int i = 0;  i <= 36; i += 6 ) {
   1.449 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
   1.450 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
   1.451 +      __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
   1.452 +    }
   1.453 +    __ aes_kexpand1(F42, F46, 7, F48);
   1.454 +    __ aes_kexpand2(F44, F48, F50);
   1.455 +
   1.456 +    // load expanded key[last-1] and key[last] elements
   1.457 +    __ movdtox(F48,L2);
   1.458 +    __ movdtox(F50,L3);
   1.459 +
   1.460 +    __ and3(len_reg, 16, L4);
   1.461 +    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192);
   1.462 +    __ delayed()->nop();
   1.463 +
   1.464 +    __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start);
   1.465 +    __ delayed()->nop();
   1.466 +
   1.467 +    __ BIND(L_expand256bit);
   1.468 +    // load rest of the 256-bit key
   1.469 +    for ( int i = 4;  i <= 7; i++ ) {
   1.470 +      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
   1.471 +    }
   1.472 +
   1.473 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
   1.474 +    for ( int i = 0;  i <= 40; i += 8 ) {
   1.475 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
   1.476 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
   1.477 +      __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
   1.478 +      __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
   1.479 +    }
   1.480 +    __ aes_kexpand1(F48, F54, 6, F56);
   1.481 +    __ aes_kexpand2(F50, F56, F58);
   1.482 +
   1.483 +    // load expanded key[last-1] and key[last] elements
   1.484 +    __ movdtox(F56,L2);
   1.485 +    __ movdtox(F58,L3);
   1.486 +
   1.487 +    __ and3(len_reg, 16, L4);
   1.488 +    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256);
   1.489 +    __ delayed()->nop();
   1.490 +
   1.491 +    __ BIND(L_dec_first_block_start);
   1.492 +    __ ldx(from,0,L4);
   1.493 +    __ ldx(from,8,L5);
   1.494 +    __ xor3(L2,L4,G1);
   1.495 +    __ movxtod(G1,F60);
   1.496 +    __ xor3(L3,L5,G1);
   1.497 +    __ movxtod(G1,F62);
   1.498 +
   1.499 +    // 128-bit original key size
   1.500 +    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
   1.501 +
   1.502 +    // 192-bit original key size
   1.503 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
   1.504 +
   1.505 +    __ aes_dround23(F54, F60, F62, F58);
   1.506 +    __ aes_dround01(F52, F60, F62, F56);
   1.507 +    __ aes_dround23(F50, F56, F58, F62);
   1.508 +    __ aes_dround01(F48, F56, F58, F60);
   1.509 +
   1.510 +    __ BIND(L_dec_first_block192);
   1.511 +    __ aes_dround23(F46, F60, F62, F58);
   1.512 +    __ aes_dround01(F44, F60, F62, F56);
   1.513 +    __ aes_dround23(F42, F56, F58, F62);
   1.514 +    __ aes_dround01(F40, F56, F58, F60);
   1.515 +
   1.516 +    __ BIND(L_dec_first_block128);
   1.517 +    for ( int i = 38;  i >= 6; i -= 8 ) {
   1.518 +      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
   1.519 +      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
   1.520 +      if ( i != 6) {
   1.521 +        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
   1.522 +        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
   1.523 +      } else {
   1.524 +        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
   1.525 +        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
   1.526 +      }
   1.527 +    }
   1.528 +
   1.529 +    __ movxtod(L0,F56);
   1.530 +    __ movxtod(L1,F58);
   1.531 +    __ mov(L4,L0);
   1.532 +    __ mov(L5,L1);
   1.533 +    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
   1.534 +    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
   1.535 +
   1.536 +    __ stf(FloatRegisterImpl::D, F60, to, 0);
   1.537 +    __ stf(FloatRegisterImpl::D, F62, to, 8);
   1.538 +
   1.539 +    __ add(from, 16, from);
   1.540 +    __ add(to, 16, to);
   1.541 +    __ subcc(len_reg, 16, len_reg);
   1.542 +    __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
   1.543 +    __ delayed()->nop();
   1.544 +
   1.545 +    // 256-bit original key size
   1.546 +    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
   1.547 +
   1.548 +    // 192-bit original key size
   1.549 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
   1.550 +
   1.551 +    __ align(OptoLoopAlignment);
   1.552 +    __ BIND(L_dec_next2_blocks128);
   1.553 +    __ nop();
   1.554 +
   1.555 +    // F40:F42 used for first 16-bytes
   1.556 +    __ ldx(from,0,G4);
   1.557 +    __ ldx(from,8,G5);
   1.558 +    __ xor3(L2,G4,G1);
   1.559 +    __ movxtod(G1,F40);
   1.560 +    __ xor3(L3,G5,G1);
   1.561 +    __ movxtod(G1,F42);
   1.562 +
   1.563 +    // F60:F62 used for next 16-bytes
   1.564 +    __ ldx(from,16,L4);
   1.565 +    __ ldx(from,24,L5);
   1.566 +    __ xor3(L2,L4,G1);
   1.567 +    __ movxtod(G1,F60);
   1.568 +    __ xor3(L3,L5,G1);
   1.569 +    __ movxtod(G1,F62);
   1.570 +
   1.571 +    for ( int i = 38;  i >= 6; i -= 8 ) {
   1.572 +      __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
   1.573 +      __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
   1.574 +      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
   1.575 +      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
   1.576 +      if (i != 6 ) {
   1.577 +        __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
   1.578 +        __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
   1.579 +        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
   1.580 +        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
   1.581 +      } else {
   1.582 +        __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
   1.583 +        __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
   1.584 +        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
   1.585 +        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
   1.586 +      }
   1.587 +    }
   1.588 +
   1.589 +    __ movxtod(L0,F46);
   1.590 +    __ movxtod(L1,F44);
   1.591 +    __ fxor(FloatRegisterImpl::D, F46, F40, F40);
   1.592 +    __ fxor(FloatRegisterImpl::D, F44, F42, F42);
   1.593 +
   1.594 +    __ stf(FloatRegisterImpl::D, F40, to, 0);
   1.595 +    __ stf(FloatRegisterImpl::D, F42, to, 8);
   1.596 +
   1.597 +    __ movxtod(G4,F56);
   1.598 +    __ movxtod(G5,F58);
   1.599 +    __ mov(L4,L0);
   1.600 +    __ mov(L5,L1);
   1.601 +    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
   1.602 +    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
   1.603 +
   1.604 +    __ stf(FloatRegisterImpl::D, F60, to, 16);
   1.605 +    __ stf(FloatRegisterImpl::D, F62, to, 24);
   1.606 +
   1.607 +    __ add(from, 32, from);
   1.608 +    __ add(to, 32, to);
   1.609 +    __ subcc(len_reg, 32, len_reg);
   1.610 +    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
   1.611 +    __ delayed()->nop();
   1.612 +    __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end);
   1.613 +    __ delayed()->nop();
   1.614 +
   1.615 +    __ align(OptoLoopAlignment);
   1.616 +    __ BIND(L_dec_next2_blocks192);
   1.617 +    __ nop();
   1.618 +
   1.619 +    // F48:F50 used for first 16-bytes
   1.620 +    __ ldx(from,0,G4);
   1.621 +    __ ldx(from,8,G5);
   1.622 +    __ xor3(L2,G4,G1);
   1.623 +    __ movxtod(G1,F48);
   1.624 +    __ xor3(L3,G5,G1);
   1.625 +    __ movxtod(G1,F50);
   1.626 +
   1.627 +    // F60:F62 used for next 16-bytes
   1.628 +    __ ldx(from,16,L4);
   1.629 +    __ ldx(from,24,L5);
   1.630 +    __ xor3(L2,L4,G1);
   1.631 +    __ movxtod(G1,F60);
   1.632 +    __ xor3(L3,L5,G1);
   1.633 +    __ movxtod(G1,F62);
   1.634 +
   1.635 +    for ( int i = 46;  i >= 6; i -= 8 ) {
   1.636 +      __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
   1.637 +      __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
   1.638 +      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
   1.639 +      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
   1.640 +      if (i != 6 ) {
   1.641 +        __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
   1.642 +        __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
   1.643 +        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
   1.644 +        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
   1.645 +      } else {
   1.646 +        __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
   1.647 +        __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
   1.648 +        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
   1.649 +        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
   1.650 +      }
   1.651 +    }
   1.652 +
   1.653 +    __ movxtod(L0,F54);
   1.654 +    __ movxtod(L1,F52);
   1.655 +    __ fxor(FloatRegisterImpl::D, F54, F48, F48);
   1.656 +    __ fxor(FloatRegisterImpl::D, F52, F50, F50);
   1.657 +
   1.658 +    __ stf(FloatRegisterImpl::D, F48, to, 0);
   1.659 +    __ stf(FloatRegisterImpl::D, F50, to, 8);
   1.660 +
   1.661 +    __ movxtod(G4,F56);
   1.662 +    __ movxtod(G5,F58);
   1.663 +    __ mov(L4,L0);
   1.664 +    __ mov(L5,L1);
   1.665 +    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
   1.666 +    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
   1.667 +
   1.668 +    __ stf(FloatRegisterImpl::D, F60, to, 16);
   1.669 +    __ stf(FloatRegisterImpl::D, F62, to, 24);
   1.670 +
   1.671 +    __ add(from, 32, from);
   1.672 +    __ add(to, 32, to);
   1.673 +    __ subcc(len_reg, 32, len_reg);
   1.674 +    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
   1.675 +    __ delayed()->nop();
   1.676 +    __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end);
   1.677 +    __ delayed()->nop();
   1.678 +
   1.679 +    __ align(OptoLoopAlignment);
   1.680 +    __ BIND(L_dec_next2_blocks256);
   1.681 +    __ nop();
   1.682 +
   1.683 +    // F0:F2 used for first 16-bytes
   1.684 +    __ ldx(from,0,G4);
   1.685 +    __ ldx(from,8,G5);
   1.686 +    __ xor3(L2,G4,G1);
   1.687 +    __ movxtod(G1,F0);
   1.688 +    __ xor3(L3,G5,G1);
   1.689 +    __ movxtod(G1,F2);
   1.690 +
   1.691 +    // F60:F62 used for next 16-bytes
   1.692 +    __ ldx(from,16,L4);
   1.693 +    __ ldx(from,24,L5);
   1.694 +    __ xor3(L2,L4,G1);
   1.695 +    __ movxtod(G1,F60);
   1.696 +    __ xor3(L3,L5,G1);
   1.697 +    __ movxtod(G1,F62);
   1.698 +
   1.699 +    __ aes_dround23(F54, F0, F2, F4);
   1.700 +    __ aes_dround01(F52, F0, F2, F6);
   1.701 +    __ aes_dround23(F54, F60, F62, F58);
   1.702 +    __ aes_dround01(F52, F60, F62, F56);
   1.703 +    __ aes_dround23(F50, F6, F4, F2);
   1.704 +    __ aes_dround01(F48, F6, F4, F0);
   1.705 +    __ aes_dround23(F50, F56, F58, F62);
   1.706 +    __ aes_dround01(F48, F56, F58, F60);
   1.707 +    // save F48:F54 in temp registers
   1.708 +    __ movdtox(F54,G2);
   1.709 +    __ movdtox(F52,G3);
   1.710 +    __ movdtox(F50,G6);
   1.711 +    __ movdtox(F48,G1);
   1.712 +    for ( int i = 46;  i >= 14; i -= 8 ) {
   1.713 +      __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
   1.714 +      __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
   1.715 +      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
   1.716 +      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
   1.717 +      __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
   1.718 +      __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
   1.719 +      __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
   1.720 +      __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
   1.721 +    }
   1.722 +    // init F48:F54 with F0:F6 values (original key)
   1.723 +    __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
   1.724 +    __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
   1.725 +    __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
   1.726 +    __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
   1.727 +    __ aes_dround23(F54, F0, F2, F4);
   1.728 +    __ aes_dround01(F52, F0, F2, F6);
   1.729 +    __ aes_dround23(F54, F60, F62, F58);
   1.730 +    __ aes_dround01(F52, F60, F62, F56);
   1.731 +    __ aes_dround23_l(F50, F6, F4, F2);
   1.732 +    __ aes_dround01_l(F48, F6, F4, F0);
   1.733 +    __ aes_dround23_l(F50, F56, F58, F62);
   1.734 +    __ aes_dround01_l(F48, F56, F58, F60);
   1.735 +    // re-init F48:F54 with their original values
   1.736 +    __ movxtod(G2,F54);
   1.737 +    __ movxtod(G3,F52);
   1.738 +    __ movxtod(G6,F50);
   1.739 +    __ movxtod(G1,F48);
   1.740 +
   1.741 +    __ movxtod(L0,F6);
   1.742 +    __ movxtod(L1,F4);
   1.743 +    __ fxor(FloatRegisterImpl::D, F6, F0, F0);
   1.744 +    __ fxor(FloatRegisterImpl::D, F4, F2, F2);
   1.745 +
   1.746 +    __ stf(FloatRegisterImpl::D, F0, to, 0);
   1.747 +    __ stf(FloatRegisterImpl::D, F2, to, 8);
   1.748 +
   1.749 +    __ movxtod(G4,F56);
   1.750 +    __ movxtod(G5,F58);
   1.751 +    __ mov(L4,L0);
   1.752 +    __ mov(L5,L1);
   1.753 +    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
   1.754 +    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
   1.755 +
   1.756 +    __ stf(FloatRegisterImpl::D, F60, to, 16);
   1.757 +    __ stf(FloatRegisterImpl::D, F62, to, 24);
   1.758 +
   1.759 +    __ add(from, 32, from);
   1.760 +    __ add(to, 32, to);
   1.761 +    __ subcc(len_reg, 32, len_reg);
   1.762 +    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
   1.763 +    __ delayed()->nop();
   1.764 +
   1.765 +    __ BIND(L_cbcdec_end);
   1.766 +    __ stx(L0, rvec, 0);
   1.767 +    __ stx(L1, rvec, 8);
   1.768 +    __ restore();
   1.769 +    __ mov(L0, O0);
   1.770 +    __ retl();
   1.771 +    __ delayed()->nop();
   1.772 +
   1.773 +    return start;
   1.774 +  }
   1.775 +
   1.776    void generate_initial() {
   1.777      // Generates all stubs and initializes the entry points
   1.778  
   1.779 @@ -3368,6 +4137,14 @@
   1.780      generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
   1.781                                                         &StubRoutines::_safefetchN_fault_pc,
   1.782                                                         &StubRoutines::_safefetchN_continuation_pc);
   1.783 +
   1.784 +    // generate AES intrinsics code
   1.785 +    if (UseAESIntrinsics) {
   1.786 +      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
   1.787 +      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
   1.788 +      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
   1.789 +      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
   1.790 +    }
   1.791    }
   1.792  
   1.793  

mercurial