src/cpu/sparc/vm/stubGenerator_sparc.cpp

changeset 6653:03214612e77e
parent 6312:04d32e7fad07
child 6682:0fb5b60ab4a2
     1.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu May 01 15:02:46 2014 -0700
     1.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Wed Apr 30 14:14:01 2014 -0700
     1.3 @@ -1,5 +1,5 @@
     1.4  /*
     1.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     1.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
     1.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.8   *
     1.9   * This code is free software; you can redistribute it and/or modify it
    1.10 @@ -3305,9 +3305,12 @@
    1.11    }
    1.12  
    1.13    address generate_aescrypt_encryptBlock() {
    1.14 +    // required since we read expanded key 'int' array starting first element without alignment considerations
    1.15 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
    1.16 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
    1.17      __ align(CodeEntryAlignment);
    1.18 -    StubCodeMark mark(this, "StubRoutines", "aesencryptBlock");
    1.19 -    Label L_doLast128bit, L_storeOutput;
    1.20 +    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    1.21 +    Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
    1.22      address start = __ pc();
    1.23      Register from = O0; // source byte array
    1.24      Register to = O1;   // destination byte array
    1.25 @@ -3317,15 +3320,33 @@
    1.26      // read expanded key length
    1.27      __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
    1.28  
    1.29 -    // load input into F54-F56; F30-F31 used as temp
    1.30 -    __ ldf(FloatRegisterImpl::S, from, 0, F30);
    1.31 -    __ ldf(FloatRegisterImpl::S, from, 4, F31);
    1.32 -    __ fmov(FloatRegisterImpl::D, F30, F54);
    1.33 -    __ ldf(FloatRegisterImpl::S, from, 8, F30);
    1.34 -    __ ldf(FloatRegisterImpl::S, from, 12, F31);
    1.35 -    __ fmov(FloatRegisterImpl::D, F30, F56);
    1.36 -
    1.37 -    // load expanded key
    1.38 +    // Method to address arbitrary alignment for load instructions:
    1.39 +    // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
    1.40 +    // If zero/aligned then continue with double FP load instructions
    1.41 +    // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
    1.42 +    // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
    1.43 +    // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
    1.44 +    // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
    1.45 +
    1.46 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
    1.47 +    __ andcc(from, 7, G0);
    1.48 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
    1.49 +    __ delayed()->alignaddr(from, G0, from);
    1.50 +
    1.51 +    // aligned case: load input into F54-F56
    1.52 +    __ ldf(FloatRegisterImpl::D, from, 0, F54);
    1.53 +    __ ldf(FloatRegisterImpl::D, from, 8, F56);
    1.54 +    __ ba_short(L_load_expanded_key);
    1.55 +
    1.56 +    __ BIND(L_load_misaligned_input);
    1.57 +    __ ldf(FloatRegisterImpl::D, from, 0, F54);
    1.58 +    __ ldf(FloatRegisterImpl::D, from, 8, F56);
    1.59 +    __ ldf(FloatRegisterImpl::D, from, 16, F58);
    1.60 +    __ faligndata(F54, F56, F54);
    1.61 +    __ faligndata(F56, F58, F56);
    1.62 +
    1.63 +    __ BIND(L_load_expanded_key);
    1.64 +    // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed
    1.65      for ( int i = 0;  i <= 38; i += 2 ) {
    1.66        __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
    1.67      }
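The load path added above can be pictured in portable terms: round the source pointer down to an 8-byte boundary (alignaddr), issue three aligned 8-byte loads, and merge neighbouring pairs by the misalignment amount (faligndata). The following C++ sketch only illustrates that idea under stated assumptions; the helper names (load_be64, load_misaligned_16) are hypothetical and not part of this changeset, and byte numbering follows the big-endian SPARC view.

    #include <cstdint>

    // Aligned 8-byte load returning the value in big-endian byte order
    // (memory byte 0 is the most significant byte, as on SPARC).
    static uint64_t load_be64(const uint8_t* p) {
      uint64_t v = 0;
      for (int i = 0; i < 8; ++i) v = (v << 8) | p[i];
      return v;
    }

    // Reads 16 bytes starting at 'from' (any alignment) into out[0..1].
    void load_misaligned_16(const uint8_t* from, uint64_t out[2]) {
      uintptr_t addr = reinterpret_cast<uintptr_t>(from);
      unsigned  skip = addr & 7;                      // GSR.align analogue
      const uint8_t* base =
          reinterpret_cast<const uint8_t*>(addr & ~uintptr_t(7));

      if (skip == 0) {                                // aligned fast path
        out[0] = load_be64(base);
        out[1] = load_be64(base + 8);
        return;
      }
      // misaligned path: three aligned loads (like ldf D at offsets 0, 8, 16),
      // then faligndata-style merges that keep the last (8-skip) bytes of the
      // first source followed by the first 'skip' bytes of the second.
      // Note: like the stub, this reads past the 16 input bytes, up to base+23.
      uint64_t d0 = load_be64(base);
      uint64_t d1 = load_be64(base + 8);
      uint64_t d2 = load_be64(base + 16);
      out[0] = (d0 << (8 * skip)) | (d1 >> (8 * (8 - skip)));
      out[1] = (d1 << (8 * skip)) | (d2 >> (8 * (8 - skip)));
    }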
    1.68 @@ -3365,8 +3386,7 @@
    1.69      __ ldf(FloatRegisterImpl::D, key, 232, F50);
    1.70      __ aes_eround01(F52, F54, F56, F58); //round 13
    1.71      __ aes_eround23(F46, F54, F56, F60);
    1.72 -    __ br(Assembler::always, false, Assembler::pt, L_storeOutput);
    1.73 -    __ delayed()->nop();
    1.74 +    __ ba_short(L_storeOutput);
    1.75  
    1.76      __ BIND(L_doLast128bit);
    1.77      __ ldf(FloatRegisterImpl::D, key, 160, F48);
    1.78 @@ -3377,23 +3397,62 @@
    1.79      __ aes_eround01_l(F48, F58, F60, F54); //last round
    1.80      __ aes_eround23_l(F50, F58, F60, F56);
    1.81  
    1.82 -    // store output into the destination array, F0-F1 used as temp
    1.83 -    __ fmov(FloatRegisterImpl::D, F54, F0);
    1.84 -    __ stf(FloatRegisterImpl::S, F0, to, 0);
    1.85 -    __ stf(FloatRegisterImpl::S, F1, to, 4);
    1.86 -    __ fmov(FloatRegisterImpl::D, F56, F0);
    1.87 -    __ stf(FloatRegisterImpl::S, F0, to, 8);
    1.88 +    // Method to address arbitrary alignment for store instructions:
    1.89 +    // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
    1.90 +    // If zero/aligned then continue with double FP store instructions
    1.91 +    // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
    1.92 +    // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
    1.93 +    // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
    1.94 +    // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
    1.95 +    // Set GSR.align to (8-n) using alignaddr
    1.96 +    // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
    1.97 +    // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
    1.98 +    // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
     1.99 +    // Negate the edge mask so that the subsequent stpartialf can store the original (8-n+1)th through 8th bytes at appropriate address
   1.100 +    // We need to execute this process for both the 8-byte result values
   1.101 +
   1.102 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   1.103 +    __ andcc(to, 7, O5);
   1.104 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
   1.105 +    __ delayed()->edge8n(to, G0, O3);
   1.106 +
   1.107 +    // aligned case: store output into the destination array
   1.108 +    __ stf(FloatRegisterImpl::D, F54, to, 0);
   1.109      __ retl();
   1.110 -    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12);
   1.111 +    __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
   1.112 +
   1.113 +    __ BIND(L_store_misaligned_output);
   1.114 +    __ add(to, 8, O4);
   1.115 +    __ mov(8, O2);
   1.116 +    __ sub(O2, O5, O2);
   1.117 +    __ alignaddr(O2, G0, O2);
   1.118 +    __ faligndata(F54, F54, F54);
   1.119 +    __ faligndata(F56, F56, F56);
   1.120 +    __ and3(to, -8, to);
   1.121 +    __ and3(O4, -8, O4);
   1.122 +    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
   1.123 +    __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
   1.124 +    __ add(to, 8, to);
   1.125 +    __ add(O4, 8, O4);
   1.126 +    __ orn(G0, O3, O3);
   1.127 +    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
   1.128 +    __ retl();
   1.129 +    __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
   1.130  
   1.131      return start;
   1.132    }
   1.133  
   1.134    address generate_aescrypt_decryptBlock() {
   1.135 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
   1.136 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
   1.137 +    // required since we read original key 'byte' array as well in the decryption stubs
   1.138 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
   1.139 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
   1.140      __ align(CodeEntryAlignment);
   1.141 -    StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock");
   1.142 +    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
   1.143      address start = __ pc();
   1.144 -    Label L_expand192bit, L_expand256bit, L_common_transform;
   1.145 +    Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
   1.146 +    Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
   1.147      Register from = O0; // source byte array
   1.148      Register to = O1;   // destination byte array
   1.149      Register key = O2;  // expanded key array
   1.150 @@ -3403,15 +3462,29 @@
   1.151      // read expanded key array length
   1.152      __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
   1.153  
   1.154 -    // load input into F52-F54; F30,F31 used as temp
   1.155 -    __ ldf(FloatRegisterImpl::S, from, 0, F30);
   1.156 -    __ ldf(FloatRegisterImpl::S, from, 4, F31);
   1.157 -    __ fmov(FloatRegisterImpl::D, F30, F52);
   1.158 -    __ ldf(FloatRegisterImpl::S, from, 8, F30);
   1.159 -    __ ldf(FloatRegisterImpl::S, from, 12, F31);
   1.160 -    __ fmov(FloatRegisterImpl::D, F30, F54);
   1.161 -
   1.162 +    // save 'from' since we may need to recheck alignment in case of 256-bit decryption
   1.163 +    __ mov(from, G1);
   1.164 +
   1.165 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   1.166 +    __ andcc(from, 7, G0);
   1.167 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
   1.168 +    __ delayed()->alignaddr(from, G0, from);
   1.169 +
   1.170 +    // aligned case: load input into F52-F54
   1.171 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
   1.172 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
   1.173 +    __ ba_short(L_load_original_key);
   1.174 +
   1.175 +    __ BIND(L_load_misaligned_input);
   1.176 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
   1.177 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
   1.178 +    __ ldf(FloatRegisterImpl::D, from, 16, F56);
   1.179 +    __ faligndata(F52, F54, F52);
   1.180 +    __ faligndata(F54, F56, F54);
   1.181 +
   1.182 +    __ BIND(L_load_original_key);
   1.183      // load original key from SunJCE expanded decryption key
   1.184 +    // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
   1.185      for ( int i = 0;  i <= 3; i++ ) {
   1.186        __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
   1.187      }
   1.188 @@ -3432,8 +3505,7 @@
   1.189      // perform 128-bit key specific inverse cipher transformation
   1.190      __ fxor(FloatRegisterImpl::D, F42, F54, F54);
   1.191      __ fxor(FloatRegisterImpl::D, F40, F52, F52);
   1.192 -    __ br(Assembler::always, false, Assembler::pt, L_common_transform);
   1.193 -    __ delayed()->nop();
   1.194 +    __ ba_short(L_common_transform);
   1.195  
   1.196      __ BIND(L_expand192bit);
   1.197  
   1.198 @@ -3457,8 +3529,7 @@
   1.199      __ aes_dround01(F44, F52, F54, F56);
   1.200      __ aes_dround23(F42, F56, F58, F54);
   1.201      __ aes_dround01(F40, F56, F58, F52);
   1.202 -    __ br(Assembler::always, false, Assembler::pt, L_common_transform);
   1.203 -    __ delayed()->nop();
   1.204 +    __ ba_short(L_common_transform);
   1.205  
   1.206      __ BIND(L_expand256bit);
   1.207  
   1.208 @@ -3478,14 +3549,31 @@
   1.209      __ aes_kexpand2(F50, F56, F58);
   1.210  
   1.211      for ( int i = 0;  i <= 6; i += 2 ) {
   1.212 -      __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
   1.213 +      __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
   1.214      }
   1.215  
   1.216 -    // load input into F52-F54
   1.217 +    // reload original 'from' address
   1.218 +    __ mov(G1, from);
   1.219 +
   1.220 +    // re-check 8-byte alignment
   1.221 +    __ andcc(from, 7, G0);
   1.222 +    __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
   1.223 +    __ delayed()->alignaddr(from, G0, from);
   1.224 +
   1.225 +    // aligned case: load input into F52-F54
   1.226      __ ldf(FloatRegisterImpl::D, from, 0, F52);
   1.227      __ ldf(FloatRegisterImpl::D, from, 8, F54);
   1.228 +    __ ba_short(L_256bit_transform);
   1.229 +
   1.230 +    __ BIND(L_reload_misaligned_input);
   1.231 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
   1.232 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
   1.233 +    __ ldf(FloatRegisterImpl::D, from, 16, F56);
   1.234 +    __ faligndata(F52, F54, F52);
   1.235 +    __ faligndata(F54, F56, F54);
   1.236  
   1.237      // perform 256-bit key specific inverse cipher transformation
   1.238 +    __ BIND(L_256bit_transform);
   1.239      __ fxor(FloatRegisterImpl::D, F0, F54, F54);
   1.240      __ fxor(FloatRegisterImpl::D, F2, F52, F52);
   1.241      __ aes_dround23(F4, F52, F54, F58);
   1.242 @@ -3515,43 +3603,71 @@
   1.243        }
   1.244      }
   1.245  
   1.246 -    // store output to destination array, F0-F1 used as temp
   1.247 -    __ fmov(FloatRegisterImpl::D, F52, F0);
   1.248 -    __ stf(FloatRegisterImpl::S, F0, to, 0);
   1.249 -    __ stf(FloatRegisterImpl::S, F1, to, 4);
   1.250 -    __ fmov(FloatRegisterImpl::D, F54, F0);
   1.251 -    __ stf(FloatRegisterImpl::S, F0, to, 8);
   1.252 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   1.253 +    __ andcc(to, 7, O5);
   1.254 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
   1.255 +    __ delayed()->edge8n(to, G0, O3);
   1.256 +
   1.257 +    // aligned case: store output into the destination array
   1.258 +    __ stf(FloatRegisterImpl::D, F52, to, 0);
   1.259      __ retl();
   1.260 -    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12);
   1.261 +    __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
   1.262 +
   1.263 +    __ BIND(L_store_misaligned_output);
   1.264 +    __ add(to, 8, O4);
   1.265 +    __ mov(8, O2);
   1.266 +    __ sub(O2, O5, O2);
   1.267 +    __ alignaddr(O2, G0, O2);
   1.268 +    __ faligndata(F52, F52, F52);
   1.269 +    __ faligndata(F54, F54, F54);
   1.270 +    __ and3(to, -8, to);
   1.271 +    __ and3(O4, -8, O4);
   1.272 +    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
   1.273 +    __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
   1.274 +    __ add(to, 8, to);
   1.275 +    __ add(O4, 8, O4);
   1.276 +    __ orn(G0, O3, O3);
   1.277 +    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
   1.278 +    __ retl();
   1.279 +    __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
   1.280  
   1.281      return start;
   1.282    }
   1.283  
   1.284    address generate_cipherBlockChaining_encryptAESCrypt() {
   1.285 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
   1.286 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
   1.287 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
   1.288 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
   1.289      __ align(CodeEntryAlignment);
   1.290      StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
   1.291 -    Label L_cbcenc128, L_cbcenc192, L_cbcenc256;
   1.292 +    Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
   1.293 +    Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
   1.294 +    Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
   1.295 +    Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
   1.296      address start = __ pc();
   1.297 -    Register from = O0; // source byte array
   1.298 -    Register to = O1;   // destination byte array
   1.299 -    Register key = O2;  // expanded key array
   1.300 -    Register rvec = O3; // init vector
   1.301 -    const Register len_reg = O4; // cipher length
   1.302 -    const Register keylen = O5;  // reg for storing expanded key array length
   1.303 -
   1.304 -    // save cipher len to return in the end
   1.305 -    __ mov(len_reg, L1);
   1.306 +    Register from = I0; // source byte array
   1.307 +    Register to = I1;   // destination byte array
   1.308 +    Register key = I2;  // expanded key array
   1.309 +    Register rvec = I3; // init vector
   1.310 +    const Register len_reg = I4; // cipher length
   1.311 +    const Register keylen = I5;  // reg for storing expanded key array length
   1.312 +
   1.313 +    // save cipher len before save_frame, to return in the end
   1.314 +    __ mov(O4, L0);
   1.315 +    __ save_frame(0);
   1.316  
   1.317      // read expanded key length
   1.318      __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
   1.319  
   1.320 -    // load init vector
    1.321 +    // load initial vector, 8-byte alignment is guaranteed
   1.322      __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
   1.323      __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
    1.324 +    // load key, 8-byte alignment is guaranteed
   1.325      __ ldx(key,0,G1);
   1.326 -    __ ldx(key,8,G2);
   1.327 -
   1.328 -    // start loading expanded key
   1.329 +    __ ldx(key,8,G5);
   1.330 +
    1.331 +    // start loading expanded key, 8-byte alignment is guaranteed
   1.332      for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
   1.333        __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
   1.334      }
   1.335 @@ -3571,15 +3687,35 @@
   1.336      }
   1.337  
   1.338      // 256-bit original key size
   1.339 -    __ br(Assembler::always, false, Assembler::pt, L_cbcenc256);
   1.340 -    __ delayed()->nop();
   1.341 +    __ ba_short(L_cbcenc256);
   1.342  
   1.343      __ align(OptoLoopAlignment);
   1.344      __ BIND(L_cbcenc128);
   1.345 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   1.346 +    __ andcc(from, 7, G0);
   1.347 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
   1.348 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
   1.349 +
   1.350 +    // aligned case: load input into G3 and G4
   1.351      __ ldx(from,0,G3);
   1.352      __ ldx(from,8,G4);
   1.353 +    __ ba_short(L_128bit_transform);
   1.354 +
   1.355 +    __ BIND(L_load_misaligned_input_128bit);
   1.356 +    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
   1.357 +    __ alignaddr(from, G0, from);
   1.358 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
   1.359 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
   1.360 +    __ ldf(FloatRegisterImpl::D, from, 16, F52);
   1.361 +    __ faligndata(F48, F50, F48);
   1.362 +    __ faligndata(F50, F52, F50);
   1.363 +    __ movdtox(F48, G3);
   1.364 +    __ movdtox(F50, G4);
   1.365 +    __ mov(L1, from);
   1.366 +
   1.367 +    __ BIND(L_128bit_transform);
   1.368      __ xor3(G1,G3,G3);
   1.369 -    __ xor3(G2,G4,G4);
   1.370 +    __ xor3(G5,G4,G4);
   1.371      __ movxtod(G3,F56);
   1.372      __ movxtod(G4,F58);
   1.373      __ fxor(FloatRegisterImpl::D, F60, F56, F60);
   1.374 @@ -3598,24 +3734,81 @@
   1.375        }
   1.376      }
   1.377  
   1.378 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   1.379 +    __ andcc(to, 7, L1);
   1.380 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
   1.381 +    __ delayed()->edge8n(to, G0, L2);
   1.382 +
   1.383 +    // aligned case: store output into the destination array
   1.384      __ stf(FloatRegisterImpl::D, F60, to, 0);
   1.385      __ stf(FloatRegisterImpl::D, F62, to, 8);
   1.386 +    __ ba_short(L_check_loop_end_128bit);
   1.387 +
   1.388 +    __ BIND(L_store_misaligned_output_128bit);
   1.389 +    __ add(to, 8, L3);
   1.390 +    __ mov(8, L4);
   1.391 +    __ sub(L4, L1, L4);
   1.392 +    __ alignaddr(L4, G0, L4);
   1.393 +    // save cipher text before circular right shift
   1.394 +    // as it needs to be stored as iv for next block (see code before next retl)
   1.395 +    __ movdtox(F60, L6);
   1.396 +    __ movdtox(F62, L7);
   1.397 +    __ faligndata(F60, F60, F60);
   1.398 +    __ faligndata(F62, F62, F62);
   1.399 +    __ mov(to, L5);
   1.400 +    __ and3(to, -8, to);
   1.401 +    __ and3(L3, -8, L3);
   1.402 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   1.403 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   1.404 +    __ add(to, 8, to);
   1.405 +    __ add(L3, 8, L3);
   1.406 +    __ orn(G0, L2, L2);
   1.407 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   1.408 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   1.409 +    __ mov(L5, to);
   1.410 +    __ movxtod(L6, F60);
   1.411 +    __ movxtod(L7, F62);
   1.412 +
   1.413 +    __ BIND(L_check_loop_end_128bit);
   1.414      __ add(from, 16, from);
   1.415      __ add(to, 16, to);
   1.416      __ subcc(len_reg, 16, len_reg);
   1.417      __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
   1.418      __ delayed()->nop();
    1.419 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
   1.420      __ stf(FloatRegisterImpl::D, F60, rvec, 0);
   1.421      __ stf(FloatRegisterImpl::D, F62, rvec, 8);
   1.422 +    __ restore();
   1.423      __ retl();
   1.424 -    __ delayed()->mov(L1, O0);
   1.425 +    __ delayed()->mov(L0, O0);
   1.426  
   1.427      __ align(OptoLoopAlignment);
   1.428      __ BIND(L_cbcenc192);
   1.429 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   1.430 +    __ andcc(from, 7, G0);
   1.431 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
   1.432 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
   1.433 +
   1.434 +    // aligned case: load input into G3 and G4
   1.435      __ ldx(from,0,G3);
   1.436      __ ldx(from,8,G4);
   1.437 +    __ ba_short(L_192bit_transform);
   1.438 +
   1.439 +    __ BIND(L_load_misaligned_input_192bit);
   1.440 +    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
   1.441 +    __ alignaddr(from, G0, from);
   1.442 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
   1.443 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
   1.444 +    __ ldf(FloatRegisterImpl::D, from, 16, F52);
   1.445 +    __ faligndata(F48, F50, F48);
   1.446 +    __ faligndata(F50, F52, F50);
   1.447 +    __ movdtox(F48, G3);
   1.448 +    __ movdtox(F50, G4);
   1.449 +    __ mov(L1, from);
   1.450 +
   1.451 +    __ BIND(L_192bit_transform);
   1.452      __ xor3(G1,G3,G3);
   1.453 -    __ xor3(G2,G4,G4);
   1.454 +    __ xor3(G5,G4,G4);
   1.455      __ movxtod(G3,F56);
   1.456      __ movxtod(G4,F58);
   1.457      __ fxor(FloatRegisterImpl::D, F60, F56, F60);
   1.458 @@ -3634,24 +3827,81 @@
   1.459        }
   1.460      }
   1.461  
   1.462 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   1.463 +    __ andcc(to, 7, L1);
   1.464 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
   1.465 +    __ delayed()->edge8n(to, G0, L2);
   1.466 +
   1.467 +    // aligned case: store output into the destination array
   1.468      __ stf(FloatRegisterImpl::D, F60, to, 0);
   1.469      __ stf(FloatRegisterImpl::D, F62, to, 8);
   1.470 +    __ ba_short(L_check_loop_end_192bit);
   1.471 +
   1.472 +    __ BIND(L_store_misaligned_output_192bit);
   1.473 +    __ add(to, 8, L3);
   1.474 +    __ mov(8, L4);
   1.475 +    __ sub(L4, L1, L4);
   1.476 +    __ alignaddr(L4, G0, L4);
   1.477 +    __ movdtox(F60, L6);
   1.478 +    __ movdtox(F62, L7);
   1.479 +    __ faligndata(F60, F60, F60);
   1.480 +    __ faligndata(F62, F62, F62);
   1.481 +    __ mov(to, L5);
   1.482 +    __ and3(to, -8, to);
   1.483 +    __ and3(L3, -8, L3);
   1.484 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   1.485 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   1.486 +    __ add(to, 8, to);
   1.487 +    __ add(L3, 8, L3);
   1.488 +    __ orn(G0, L2, L2);
   1.489 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   1.490 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   1.491 +    __ mov(L5, to);
   1.492 +    __ movxtod(L6, F60);
   1.493 +    __ movxtod(L7, F62);
   1.494 +
   1.495 +    __ BIND(L_check_loop_end_192bit);
   1.496      __ add(from, 16, from);
   1.497      __ subcc(len_reg, 16, len_reg);
   1.498      __ add(to, 16, to);
   1.499      __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
   1.500      __ delayed()->nop();
    1.501 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
   1.502      __ stf(FloatRegisterImpl::D, F60, rvec, 0);
   1.503      __ stf(FloatRegisterImpl::D, F62, rvec, 8);
   1.504 +    __ restore();
   1.505      __ retl();
   1.506 -    __ delayed()->mov(L1, O0);
   1.507 +    __ delayed()->mov(L0, O0);
   1.508  
   1.509      __ align(OptoLoopAlignment);
   1.510      __ BIND(L_cbcenc256);
   1.511 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   1.512 +    __ andcc(from, 7, G0);
   1.513 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
   1.514 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
   1.515 +
   1.516 +    // aligned case: load input into G3 and G4
   1.517      __ ldx(from,0,G3);
   1.518      __ ldx(from,8,G4);
   1.519 +    __ ba_short(L_256bit_transform);
   1.520 +
   1.521 +    __ BIND(L_load_misaligned_input_256bit);
   1.522 +    // cannot clobber F48, F50 and F52. F56, F58 can be used though
   1.523 +    __ alignaddr(from, G0, from);
   1.524 +    __ movdtox(F60, L2); // save F60 before overwriting
   1.525 +    __ ldf(FloatRegisterImpl::D, from, 0, F56);
   1.526 +    __ ldf(FloatRegisterImpl::D, from, 8, F58);
   1.527 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
   1.528 +    __ faligndata(F56, F58, F56);
   1.529 +    __ faligndata(F58, F60, F58);
   1.530 +    __ movdtox(F56, G3);
   1.531 +    __ movdtox(F58, G4);
   1.532 +    __ mov(L1, from);
   1.533 +    __ movxtod(L2, F60);
   1.534 +
   1.535 +    __ BIND(L_256bit_transform);
   1.536      __ xor3(G1,G3,G3);
   1.537 -    __ xor3(G2,G4,G4);
   1.538 +    __ xor3(G5,G4,G4);
   1.539      __ movxtod(G3,F56);
   1.540      __ movxtod(G4,F58);
   1.541      __ fxor(FloatRegisterImpl::D, F60, F56, F60);
   1.542 @@ -3670,26 +3920,69 @@
   1.543        }
   1.544      }
   1.545  
   1.546 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   1.547 +    __ andcc(to, 7, L1);
   1.548 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
   1.549 +    __ delayed()->edge8n(to, G0, L2);
   1.550 +
   1.551 +    // aligned case: store output into the destination array
   1.552      __ stf(FloatRegisterImpl::D, F60, to, 0);
   1.553      __ stf(FloatRegisterImpl::D, F62, to, 8);
   1.554 +    __ ba_short(L_check_loop_end_256bit);
   1.555 +
   1.556 +    __ BIND(L_store_misaligned_output_256bit);
   1.557 +    __ add(to, 8, L3);
   1.558 +    __ mov(8, L4);
   1.559 +    __ sub(L4, L1, L4);
   1.560 +    __ alignaddr(L4, G0, L4);
   1.561 +    __ movdtox(F60, L6);
   1.562 +    __ movdtox(F62, L7);
   1.563 +    __ faligndata(F60, F60, F60);
   1.564 +    __ faligndata(F62, F62, F62);
   1.565 +    __ mov(to, L5);
   1.566 +    __ and3(to, -8, to);
   1.567 +    __ and3(L3, -8, L3);
   1.568 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   1.569 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   1.570 +    __ add(to, 8, to);
   1.571 +    __ add(L3, 8, L3);
   1.572 +    __ orn(G0, L2, L2);
   1.573 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   1.574 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   1.575 +    __ mov(L5, to);
   1.576 +    __ movxtod(L6, F60);
   1.577 +    __ movxtod(L7, F62);
   1.578 +
   1.579 +    __ BIND(L_check_loop_end_256bit);
   1.580      __ add(from, 16, from);
   1.581      __ subcc(len_reg, 16, len_reg);
   1.582      __ add(to, 16, to);
   1.583      __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
   1.584      __ delayed()->nop();
    1.585 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
   1.586      __ stf(FloatRegisterImpl::D, F60, rvec, 0);
   1.587      __ stf(FloatRegisterImpl::D, F62, rvec, 8);
   1.588 +    __ restore();
   1.589      __ retl();
   1.590 -    __ delayed()->mov(L1, O0);
   1.591 +    __ delayed()->mov(L0, O0);
   1.592  
   1.593      return start;
   1.594    }
   1.595  
   1.596    address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
   1.597 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
   1.598 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
   1.599 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
   1.600 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
   1.601      __ align(CodeEntryAlignment);
   1.602      StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
   1.603      Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
   1.604      Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
   1.605 +    Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
   1.606 +    Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
   1.607 +    Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
   1.608 +    Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
   1.609 +    Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
   1.610      address start = __ pc();
   1.611      Register from = I0; // source byte array
   1.612      Register to = I1;   // destination byte array
   1.613 @@ -3704,11 +3997,12 @@
   1.614      __ save_frame(0); //args are read from I* registers since we save the frame in the beginning
   1.615  
   1.616      // load original key from SunJCE expanded decryption key
   1.617 +    // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
   1.618      for ( int i = 0;  i <= 3; i++ ) {
   1.619        __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
   1.620      }
   1.621  
   1.622 -    // load initial vector
   1.623 +    // load initial vector, 8-byte alignment is guaranteed
   1.624      __ ldx(rvec,0,L0);
   1.625      __ ldx(rvec,8,L1);
   1.626  
   1.627 @@ -3733,11 +4027,10 @@
   1.628      __ movdtox(F42,L3);
   1.629  
   1.630      __ and3(len_reg, 16, L4);
   1.631 -    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128);
   1.632 -    __ delayed()->nop();
   1.633 -
   1.634 -    __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start);
   1.635 -    __ delayed()->nop();
   1.636 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
   1.637 +    __ nop();
   1.638 +
   1.639 +    __ ba_short(L_dec_first_block_start);
   1.640  
   1.641      __ BIND(L_expand192bit);
   1.642      // load rest of the 192-bit key
   1.643 @@ -3758,11 +4051,10 @@
   1.644      __ movdtox(F50,L3);
   1.645  
   1.646      __ and3(len_reg, 16, L4);
   1.647 -    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192);
   1.648 -    __ delayed()->nop();
   1.649 -
   1.650 -    __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start);
   1.651 -    __ delayed()->nop();
   1.652 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
   1.653 +    __ nop();
   1.654 +
   1.655 +    __ ba_short(L_dec_first_block_start);
   1.656  
   1.657      __ BIND(L_expand256bit);
   1.658      // load rest of the 256-bit key
   1.659 @@ -3785,12 +4077,32 @@
   1.660      __ movdtox(F58,L3);
   1.661  
   1.662      __ and3(len_reg, 16, L4);
   1.663 -    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256);
   1.664 -    __ delayed()->nop();
   1.665 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
   1.666  
   1.667      __ BIND(L_dec_first_block_start);
   1.668 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   1.669 +    __ andcc(from, 7, G0);
   1.670 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
   1.671 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
   1.672 +
   1.673 +    // aligned case: load input into L4 and L5
   1.674      __ ldx(from,0,L4);
   1.675      __ ldx(from,8,L5);
   1.676 +    __ ba_short(L_transform_first_block);
   1.677 +
   1.678 +    __ BIND(L_load_misaligned_input_first_block);
   1.679 +    __ alignaddr(from, G0, from);
   1.680 +    // F58, F60, F62 can be clobbered
   1.681 +    __ ldf(FloatRegisterImpl::D, from, 0, F58);
   1.682 +    __ ldf(FloatRegisterImpl::D, from, 8, F60);
   1.683 +    __ ldf(FloatRegisterImpl::D, from, 16, F62);
   1.684 +    __ faligndata(F58, F60, F58);
   1.685 +    __ faligndata(F60, F62, F60);
   1.686 +    __ movdtox(F58, L4);
   1.687 +    __ movdtox(F60, L5);
   1.688 +    __ mov(G1, from);
   1.689 +
   1.690 +    __ BIND(L_transform_first_block);
   1.691      __ xor3(L2,L4,G1);
   1.692      __ movxtod(G1,F60);
   1.693      __ xor3(L3,L5,G1);
   1.694 @@ -3833,9 +4145,36 @@
   1.695      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
   1.696      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
   1.697  
   1.698 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   1.699 +    __ andcc(to, 7, G1);
   1.700 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
   1.701 +    __ delayed()->edge8n(to, G0, G2);
   1.702 +
   1.703 +    // aligned case: store output into the destination array
   1.704      __ stf(FloatRegisterImpl::D, F60, to, 0);
   1.705      __ stf(FloatRegisterImpl::D, F62, to, 8);
   1.706 -
   1.707 +    __ ba_short(L_check_decrypt_end);
   1.708 +
   1.709 +    __ BIND(L_store_misaligned_output_first_block);
   1.710 +    __ add(to, 8, G3);
   1.711 +    __ mov(8, G4);
   1.712 +    __ sub(G4, G1, G4);
   1.713 +    __ alignaddr(G4, G0, G4);
   1.714 +    __ faligndata(F60, F60, F60);
   1.715 +    __ faligndata(F62, F62, F62);
   1.716 +    __ mov(to, G1);
   1.717 +    __ and3(to, -8, to);
   1.718 +    __ and3(G3, -8, G3);
   1.719 +    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
   1.720 +    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
   1.721 +    __ add(to, 8, to);
   1.722 +    __ add(G3, 8, G3);
   1.723 +    __ orn(G0, G2, G2);
   1.724 +    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
   1.725 +    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
   1.726 +    __ mov(G1, to);
   1.727 +
   1.728 +    __ BIND(L_check_decrypt_end);
   1.729      __ add(from, 16, from);
   1.730      __ add(to, 16, to);
   1.731      __ subcc(len_reg, 16, len_reg);
   1.732 @@ -3852,17 +4191,44 @@
   1.733      __ BIND(L_dec_next2_blocks128);
   1.734      __ nop();
   1.735  
   1.736 -    // F40:F42 used for first 16-bytes
   1.737 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   1.738 +    __ andcc(from, 7, G0);
   1.739 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
   1.740 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
   1.741 +
   1.742 +    // aligned case: load input into G4, G5, L4 and L5
   1.743      __ ldx(from,0,G4);
   1.744      __ ldx(from,8,G5);
   1.745 +    __ ldx(from,16,L4);
   1.746 +    __ ldx(from,24,L5);
   1.747 +    __ ba_short(L_transform_next2_blocks128);
   1.748 +
   1.749 +    __ BIND(L_load_misaligned_next2_blocks128);
   1.750 +    __ alignaddr(from, G0, from);
   1.751 +    // F40, F42, F58, F60, F62 can be clobbered
   1.752 +    __ ldf(FloatRegisterImpl::D, from, 0, F40);
   1.753 +    __ ldf(FloatRegisterImpl::D, from, 8, F42);
   1.754 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
   1.755 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
   1.756 +    __ ldf(FloatRegisterImpl::D, from, 32, F58);
   1.757 +    __ faligndata(F40, F42, F40);
   1.758 +    __ faligndata(F42, F60, F42);
   1.759 +    __ faligndata(F60, F62, F60);
   1.760 +    __ faligndata(F62, F58, F62);
   1.761 +    __ movdtox(F40, G4);
   1.762 +    __ movdtox(F42, G5);
   1.763 +    __ movdtox(F60, L4);
   1.764 +    __ movdtox(F62, L5);
   1.765 +    __ mov(G1, from);
   1.766 +
   1.767 +    __ BIND(L_transform_next2_blocks128);
   1.768 +    // F40:F42 used for first 16-bytes
   1.769      __ xor3(L2,G4,G1);
   1.770      __ movxtod(G1,F40);
   1.771      __ xor3(L3,G5,G1);
   1.772      __ movxtod(G1,F42);
   1.773  
   1.774      // F60:F62 used for next 16-bytes
   1.775 -    __ ldx(from,16,L4);
   1.776 -    __ ldx(from,24,L5);
   1.777      __ xor3(L2,L4,G1);
   1.778      __ movxtod(G1,F60);
   1.779      __ xor3(L3,L5,G1);
   1.780 @@ -3891,9 +4257,6 @@
   1.781      __ fxor(FloatRegisterImpl::D, F46, F40, F40);
   1.782      __ fxor(FloatRegisterImpl::D, F44, F42, F42);
   1.783  
   1.784 -    __ stf(FloatRegisterImpl::D, F40, to, 0);
   1.785 -    __ stf(FloatRegisterImpl::D, F42, to, 8);
   1.786 -
   1.787      __ movxtod(G4,F56);
   1.788      __ movxtod(G5,F58);
   1.789      __ mov(L4,L0);
   1.790 @@ -3901,32 +4264,93 @@
   1.791      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
   1.792      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
   1.793  
   1.794 +    // For mis-aligned store of 32 bytes of result we can do:
   1.795 +    // Circular right-shift all 4 FP registers so that 'head' and 'tail'
   1.796 +    // parts that need to be stored starting at mis-aligned address are in a FP reg
   1.797 +    // the other 3 FP regs can thus be stored using regular store
   1.798 +    // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts
   1.799 +
   1.800 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   1.801 +    __ andcc(to, 7, G1);
   1.802 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
   1.803 +    __ delayed()->edge8n(to, G0, G2);
   1.804 +
   1.805 +    // aligned case: store output into the destination array
   1.806 +    __ stf(FloatRegisterImpl::D, F40, to, 0);
   1.807 +    __ stf(FloatRegisterImpl::D, F42, to, 8);
   1.808      __ stf(FloatRegisterImpl::D, F60, to, 16);
   1.809      __ stf(FloatRegisterImpl::D, F62, to, 24);
   1.810 -
   1.811 +    __ ba_short(L_check_decrypt_loop_end128);
   1.812 +
   1.813 +    __ BIND(L_store_misaligned_output_next2_blocks128);
   1.814 +    __ mov(8, G4);
   1.815 +    __ sub(G4, G1, G4);
   1.816 +    __ alignaddr(G4, G0, G4);
   1.817 +    __ faligndata(F40, F42, F56); // F56 can be clobbered
   1.818 +    __ faligndata(F42, F60, F42);
   1.819 +    __ faligndata(F60, F62, F60);
   1.820 +    __ faligndata(F62, F40, F40);
   1.821 +    __ mov(to, G1);
   1.822 +    __ and3(to, -8, to);
   1.823 +    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
   1.824 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
   1.825 +    __ stf(FloatRegisterImpl::D, F42, to, 16);
   1.826 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
   1.827 +    __ add(to, 32, to);
   1.828 +    __ orn(G0, G2, G2);
   1.829 +    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
   1.830 +    __ mov(G1, to);
   1.831 +
   1.832 +    __ BIND(L_check_decrypt_loop_end128);
   1.833      __ add(from, 32, from);
   1.834      __ add(to, 32, to);
   1.835      __ subcc(len_reg, 32, len_reg);
   1.836      __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
   1.837      __ delayed()->nop();
   1.838 -    __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end);
   1.839 -    __ delayed()->nop();
   1.840 +    __ ba_short(L_cbcdec_end);
   1.841  
   1.842      __ align(OptoLoopAlignment);
   1.843      __ BIND(L_dec_next2_blocks192);
   1.844      __ nop();
   1.845  
   1.846 -    // F48:F50 used for first 16-bytes
   1.847 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   1.848 +    __ andcc(from, 7, G0);
   1.849 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
   1.850 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
   1.851 +
   1.852 +    // aligned case: load input into G4, G5, L4 and L5
   1.853      __ ldx(from,0,G4);
   1.854      __ ldx(from,8,G5);
   1.855 +    __ ldx(from,16,L4);
   1.856 +    __ ldx(from,24,L5);
   1.857 +    __ ba_short(L_transform_next2_blocks192);
   1.858 +
   1.859 +    __ BIND(L_load_misaligned_next2_blocks192);
   1.860 +    __ alignaddr(from, G0, from);
   1.861 +    // F48, F50, F52, F60, F62 can be clobbered
   1.862 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
   1.863 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
   1.864 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
   1.865 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
   1.866 +    __ ldf(FloatRegisterImpl::D, from, 32, F52);
   1.867 +    __ faligndata(F48, F50, F48);
   1.868 +    __ faligndata(F50, F60, F50);
   1.869 +    __ faligndata(F60, F62, F60);
   1.870 +    __ faligndata(F62, F52, F62);
   1.871 +    __ movdtox(F48, G4);
   1.872 +    __ movdtox(F50, G5);
   1.873 +    __ movdtox(F60, L4);
   1.874 +    __ movdtox(F62, L5);
   1.875 +    __ mov(G1, from);
   1.876 +
   1.877 +    __ BIND(L_transform_next2_blocks192);
   1.878 +    // F48:F50 used for first 16-bytes
   1.879      __ xor3(L2,G4,G1);
   1.880      __ movxtod(G1,F48);
   1.881      __ xor3(L3,G5,G1);
   1.882      __ movxtod(G1,F50);
   1.883  
   1.884      // F60:F62 used for next 16-bytes
   1.885 -    __ ldx(from,16,L4);
   1.886 -    __ ldx(from,24,L5);
   1.887      __ xor3(L2,L4,G1);
   1.888      __ movxtod(G1,F60);
   1.889      __ xor3(L3,L5,G1);
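The two-blocks-at-a-time decrypt loops extend the same store idea to 32 bytes, as described in the comment above the 128-bit case: one circular shift across the four result doublewords leaves three of them storable with plain 8-byte stores, so only the head and tail need the edge mask and partial stores. A hedged C++ sketch under the same conventions (big-endian byte numbering, illustrative helper names, not part of the changeset) for the misaligned case:

    #include <cstdint>

    static void store_partial(uint8_t* aligned_to, uint64_t v, uint8_t mask) {
      for (int i = 0; i < 8; ++i)
        if (mask & (0x80u >> i))
          aligned_to[i] = uint8_t(v >> (8 * (7 - i)));
    }

    static void store_be64(uint8_t* p, uint64_t v) {
      for (int i = 0; i < 8; ++i) p[i] = uint8_t(v >> (8 * (7 - i)));
    }

    // Writes 32 result bytes to 'to', assumed misaligned here (to & 7 != 0);
    // the stubs branch to four plain 8-byte stores for the aligned case.
    void store_misaligned_32(uint8_t* to, const uint64_t v[4]) {
      unsigned n = reinterpret_cast<uintptr_t>(to) & 7;
      uint8_t* base =
          reinterpret_cast<uint8_t*>(reinterpret_cast<uintptr_t>(to) & ~uintptr_t(7));
      uint8_t  head = uint8_t(0xFFu >> n);               // edge8n analogue

      // faligndata(v0,v1), (v1,v2), (v2,v3), (v3,v0) with GSR.align = 8-n:
      // each merged value is the last n bytes of its first source followed by
      // the first 8-n bytes of its second source. m[3] is the head/tail value
      // the stub keeps in one register (F40/F48/F0) for both partial stores.
      uint64_t m[4];
      for (int i = 0; i < 4; ++i)
        m[i] = (v[i] << (8 * (8 - n))) | (v[(i + 1) & 3] >> (8 * n));

      store_partial(base, m[3], head);                   // head: first 8-n bytes of v[0]
      store_be64(base + 8,  m[0]);                       // tail of v[0] + head of v[1]
      store_be64(base + 16, m[1]);                       // tail of v[1] + head of v[2]
      store_be64(base + 24, m[2]);                       // tail of v[2] + head of v[3]
      store_partial(base + 32, m[3], uint8_t(~head));    // tail: last n bytes of v[3]
    }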
   1.890 @@ -3955,9 +4379,6 @@
   1.891      __ fxor(FloatRegisterImpl::D, F54, F48, F48);
   1.892      __ fxor(FloatRegisterImpl::D, F52, F50, F50);
   1.893  
   1.894 -    __ stf(FloatRegisterImpl::D, F48, to, 0);
   1.895 -    __ stf(FloatRegisterImpl::D, F50, to, 8);
   1.896 -
   1.897      __ movxtod(G4,F56);
   1.898      __ movxtod(G5,F58);
   1.899      __ mov(L4,L0);
   1.900 @@ -3965,32 +4386,87 @@
   1.901      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
   1.902      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
   1.903  
   1.904 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   1.905 +    __ andcc(to, 7, G1);
   1.906 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
   1.907 +    __ delayed()->edge8n(to, G0, G2);
   1.908 +
   1.909 +    // aligned case: store output into the destination array
   1.910 +    __ stf(FloatRegisterImpl::D, F48, to, 0);
   1.911 +    __ stf(FloatRegisterImpl::D, F50, to, 8);
   1.912      __ stf(FloatRegisterImpl::D, F60, to, 16);
   1.913      __ stf(FloatRegisterImpl::D, F62, to, 24);
   1.914 -
   1.915 +    __ ba_short(L_check_decrypt_loop_end192);
   1.916 +
   1.917 +    __ BIND(L_store_misaligned_output_next2_blocks192);
   1.918 +    __ mov(8, G4);
   1.919 +    __ sub(G4, G1, G4);
   1.920 +    __ alignaddr(G4, G0, G4);
   1.921 +    __ faligndata(F48, F50, F56); // F56 can be clobbered
   1.922 +    __ faligndata(F50, F60, F50);
   1.923 +    __ faligndata(F60, F62, F60);
   1.924 +    __ faligndata(F62, F48, F48);
   1.925 +    __ mov(to, G1);
   1.926 +    __ and3(to, -8, to);
   1.927 +    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
   1.928 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
   1.929 +    __ stf(FloatRegisterImpl::D, F50, to, 16);
   1.930 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
   1.931 +    __ add(to, 32, to);
   1.932 +    __ orn(G0, G2, G2);
   1.933 +    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
   1.934 +    __ mov(G1, to);
   1.935 +
   1.936 +    __ BIND(L_check_decrypt_loop_end192);
   1.937      __ add(from, 32, from);
   1.938      __ add(to, 32, to);
   1.939      __ subcc(len_reg, 32, len_reg);
   1.940      __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
   1.941      __ delayed()->nop();
   1.942 -    __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end);
   1.943 -    __ delayed()->nop();
   1.944 +    __ ba_short(L_cbcdec_end);
   1.945  
   1.946      __ align(OptoLoopAlignment);
   1.947      __ BIND(L_dec_next2_blocks256);
   1.948      __ nop();
   1.949  
   1.950 -    // F0:F2 used for first 16-bytes
   1.951 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   1.952 +    __ andcc(from, 7, G0);
   1.953 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
   1.954 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
   1.955 +
   1.956 +    // aligned case: load input into G4, G5, L4 and L5
   1.957      __ ldx(from,0,G4);
   1.958      __ ldx(from,8,G5);
   1.959 +    __ ldx(from,16,L4);
   1.960 +    __ ldx(from,24,L5);
   1.961 +    __ ba_short(L_transform_next2_blocks256);
   1.962 +
   1.963 +    __ BIND(L_load_misaligned_next2_blocks256);
   1.964 +    __ alignaddr(from, G0, from);
   1.965 +    // F0, F2, F4, F60, F62 can be clobbered
   1.966 +    __ ldf(FloatRegisterImpl::D, from, 0, F0);
   1.967 +    __ ldf(FloatRegisterImpl::D, from, 8, F2);
   1.968 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
   1.969 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
   1.970 +    __ ldf(FloatRegisterImpl::D, from, 32, F4);
   1.971 +    __ faligndata(F0, F2, F0);
   1.972 +    __ faligndata(F2, F60, F2);
   1.973 +    __ faligndata(F60, F62, F60);
   1.974 +    __ faligndata(F62, F4, F62);
   1.975 +    __ movdtox(F0, G4);
   1.976 +    __ movdtox(F2, G5);
   1.977 +    __ movdtox(F60, L4);
   1.978 +    __ movdtox(F62, L5);
   1.979 +    __ mov(G1, from);
   1.980 +
   1.981 +    __ BIND(L_transform_next2_blocks256);
   1.982 +    // F0:F2 used for first 16-bytes
   1.983      __ xor3(L2,G4,G1);
   1.984      __ movxtod(G1,F0);
   1.985      __ xor3(L3,G5,G1);
   1.986      __ movxtod(G1,F2);
   1.987  
   1.988      // F60:F62 used for next 16-bytes
   1.989 -    __ ldx(from,16,L4);
   1.990 -    __ ldx(from,24,L5);
   1.991      __ xor3(L2,L4,G1);
   1.992      __ movxtod(G1,F60);
   1.993      __ xor3(L3,L5,G1);
   1.994 @@ -4043,9 +4519,6 @@
   1.995      __ fxor(FloatRegisterImpl::D, F6, F0, F0);
   1.996      __ fxor(FloatRegisterImpl::D, F4, F2, F2);
   1.997  
   1.998 -    __ stf(FloatRegisterImpl::D, F0, to, 0);
   1.999 -    __ stf(FloatRegisterImpl::D, F2, to, 8);
  1.1000 -
  1.1001      __ movxtod(G4,F56);
  1.1002      __ movxtod(G5,F58);
  1.1003      __ mov(L4,L0);
  1.1004 @@ -4053,9 +4526,38 @@
  1.1005      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
  1.1006      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
  1.1007  
  1.1008 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  1.1009 +    __ andcc(to, 7, G1);
  1.1010 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
  1.1011 +    __ delayed()->edge8n(to, G0, G2);
  1.1012 +
  1.1013 +    // aligned case: store output into the destination array
  1.1014 +    __ stf(FloatRegisterImpl::D, F0, to, 0);
  1.1015 +    __ stf(FloatRegisterImpl::D, F2, to, 8);
  1.1016      __ stf(FloatRegisterImpl::D, F60, to, 16);
  1.1017      __ stf(FloatRegisterImpl::D, F62, to, 24);
  1.1018 -
  1.1019 +    __ ba_short(L_check_decrypt_loop_end256);
  1.1020 +
  1.1021 +    __ BIND(L_store_misaligned_output_next2_blocks256);
  1.1022 +    __ mov(8, G4);
  1.1023 +    __ sub(G4, G1, G4);
  1.1024 +    __ alignaddr(G4, G0, G4);
  1.1025 +    __ faligndata(F0, F2, F56); // F56 can be clobbered
  1.1026 +    __ faligndata(F2, F60, F2);
  1.1027 +    __ faligndata(F60, F62, F60);
  1.1028 +    __ faligndata(F62, F0, F0);
  1.1029 +    __ mov(to, G1);
  1.1030 +    __ and3(to, -8, to);
  1.1031 +    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
  1.1032 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
  1.1033 +    __ stf(FloatRegisterImpl::D, F2, to, 16);
  1.1034 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
  1.1035 +    __ add(to, 32, to);
  1.1036 +    __ orn(G0, G2, G2);
  1.1037 +    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
  1.1038 +    __ mov(G1, to);
  1.1039 +
  1.1040 +    __ BIND(L_check_decrypt_loop_end256);
  1.1041      __ add(from, 32, from);
  1.1042      __ add(to, 32, to);
  1.1043      __ subcc(len_reg, 32, len_reg);
  1.1044 @@ -4063,6 +4565,7 @@
  1.1045      __ delayed()->nop();
  1.1046  
  1.1047      __ BIND(L_cbcdec_end);
   1.1048 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
  1.1049      __ stx(L0, rvec, 0);
  1.1050      __ stx(L1, rvec, 8);
  1.1051      __ restore();
