1.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Thu May 01 15:02:46 2014 -0700 1.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Wed Apr 30 14:14:01 2014 -0700 1.3 @@ -1,5 +1,5 @@ 1.4 /* 1.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. 1.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. 1.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 1.8 * 1.9 * This code is free software; you can redistribute it and/or modify it 1.10 @@ -3305,9 +3305,12 @@ 1.11 } 1.12 1.13 address generate_aescrypt_encryptBlock() { 1.14 + // required since we read expanded key 'int' array starting first element without alignment considerations 1.15 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 1.16 + "the following code assumes that first element of an int array is aligned to 8 bytes"); 1.17 __ align(CodeEntryAlignment); 1.18 - StubCodeMark mark(this, "StubRoutines", "aesencryptBlock"); 1.19 - Label L_doLast128bit, L_storeOutput; 1.20 + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 1.21 + Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output; 1.22 address start = __ pc(); 1.23 Register from = O0; // source byte array 1.24 Register to = O1; // destination byte array 1.25 @@ -3317,15 +3320,33 @@ 1.26 // read expanded key length 1.27 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 1.28 1.29 - // load input into F54-F56; F30-F31 used as temp 1.30 - __ ldf(FloatRegisterImpl::S, from, 0, F30); 1.31 - __ ldf(FloatRegisterImpl::S, from, 4, F31); 1.32 - __ fmov(FloatRegisterImpl::D, F30, F54); 1.33 - __ ldf(FloatRegisterImpl::S, from, 8, F30); 1.34 - __ ldf(FloatRegisterImpl::S, from, 12, F31); 1.35 - __ fmov(FloatRegisterImpl::D, F30, F56); 1.36 - 1.37 - // load expanded key 1.38 + // Method to address arbitrary alignment for load instructions: 1.39 + // 
Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary 1.40 + // If zero/aligned then continue with double FP load instructions 1.41 + // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata 1.42 + // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address 1.43 + // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address 1.44 + // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs 1.45 + 1.46 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.47 + __ andcc(from, 7, G0); 1.48 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 1.49 + __ delayed()->alignaddr(from, G0, from); 1.50 + 1.51 + // aligned case: load input into F54-F56 1.52 + __ ldf(FloatRegisterImpl::D, from, 0, F54); 1.53 + __ ldf(FloatRegisterImpl::D, from, 8, F56); 1.54 + __ ba_short(L_load_expanded_key); 1.55 + 1.56 + __ BIND(L_load_misaligned_input); 1.57 + __ ldf(FloatRegisterImpl::D, from, 0, F54); 1.58 + __ ldf(FloatRegisterImpl::D, from, 8, F56); 1.59 + __ ldf(FloatRegisterImpl::D, from, 16, F58); 1.60 + __ faligndata(F54, F56, F54); 1.61 + __ faligndata(F56, F58, F56); 1.62 + 1.63 + __ BIND(L_load_expanded_key); 1.64 + // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed 1.65 for ( int i = 0; i <= 38; i += 2 ) { 1.66 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); 1.67 } 1.68 @@ -3365,8 +3386,7 @@ 1.69 __ ldf(FloatRegisterImpl::D, key, 232, F50); 1.70 __ aes_eround01(F52, F54, F56, F58); //round 13 1.71 __ aes_eround23(F46, F54, F56, F60); 1.72 - __ br(Assembler::always, false, Assembler::pt, L_storeOutput); 1.73 - __ delayed()->nop(); 1.74 + __ ba_short(L_storeOutput); 1.75 1.76 __ BIND(L_doLast128bit); 1.77 __ ldf(FloatRegisterImpl::D, 
key, 160, F48); 1.78 @@ -3377,23 +3397,62 @@ 1.79 __ aes_eround01_l(F48, F58, F60, F54); //last round 1.80 __ aes_eround23_l(F50, F58, F60, F56); 1.81 1.82 - // store output into the destination array, F0-F1 used as temp 1.83 - __ fmov(FloatRegisterImpl::D, F54, F0); 1.84 - __ stf(FloatRegisterImpl::S, F0, to, 0); 1.85 - __ stf(FloatRegisterImpl::S, F1, to, 4); 1.86 - __ fmov(FloatRegisterImpl::D, F56, F0); 1.87 - __ stf(FloatRegisterImpl::S, F0, to, 8); 1.88 + // Method to address arbitrary alignment for store instructions: 1.89 + // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary 1.90 + // If zero/aligned then continue with double FP store instructions 1.91 + // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case) 1.92 + // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001 1.93 + // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case 1.94 + // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case. 
1.95 + // Set GSR.align to (8-n) using alignaddr 1.96 + // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf 1.97 + // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address 1.98 + // Store (partial) the original first (8-n) bytes starting at the original 'dest' address 1.99 + // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address 1.100 + // We need to execute this process for both the 8-byte result values 1.101 + 1.102 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.103 + __ andcc(to, 7, O5); 1.104 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 1.105 + __ delayed()->edge8n(to, G0, O3); 1.106 + 1.107 + // aligned case: store output into the destination array 1.108 + __ stf(FloatRegisterImpl::D, F54, to, 0); 1.109 __ retl(); 1.110 - __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); 1.111 + __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8); 1.112 + 1.113 + __ BIND(L_store_misaligned_output); 1.114 + __ add(to, 8, O4); 1.115 + __ mov(8, O2); 1.116 + __ sub(O2, O5, O2); 1.117 + __ alignaddr(O2, G0, O2); 1.118 + __ faligndata(F54, F54, F54); 1.119 + __ faligndata(F56, F56, F56); 1.120 + __ and3(to, -8, to); 1.121 + __ and3(O4, -8, O4); 1.122 + __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 1.123 + __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 1.124 + __ add(to, 8, to); 1.125 + __ add(O4, 8, O4); 1.126 + __ orn(G0, O3, O3); 1.127 + __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 1.128 + __ retl(); 1.129 + __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 1.130 1.131 return start; 1.132 } 1.133 1.134 address generate_aescrypt_decryptBlock() { 1.135 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 1.136 + "the following code assumes that first element of an 
int array is aligned to 8 bytes"); 1.137 + // required since we read original key 'byte' array as well in the decryption stubs 1.138 + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 1.139 + "the following code assumes that first element of a byte array is aligned to 8 bytes"); 1.140 __ align(CodeEntryAlignment); 1.141 - StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock"); 1.142 + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 1.143 address start = __ pc(); 1.144 - Label L_expand192bit, L_expand256bit, L_common_transform; 1.145 + Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input; 1.146 + Label L_256bit_transform, L_common_transform, L_store_misaligned_output; 1.147 Register from = O0; // source byte array 1.148 Register to = O1; // destination byte array 1.149 Register key = O2; // expanded key array 1.150 @@ -3403,15 +3462,29 @@ 1.151 // read expanded key array length 1.152 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 1.153 1.154 - // load input into F52-F54; F30,F31 used as temp 1.155 - __ ldf(FloatRegisterImpl::S, from, 0, F30); 1.156 - __ ldf(FloatRegisterImpl::S, from, 4, F31); 1.157 - __ fmov(FloatRegisterImpl::D, F30, F52); 1.158 - __ ldf(FloatRegisterImpl::S, from, 8, F30); 1.159 - __ ldf(FloatRegisterImpl::S, from, 12, F31); 1.160 - __ fmov(FloatRegisterImpl::D, F30, F54); 1.161 - 1.162 + // save 'from' since we may need to recheck alignment in case of 256-bit decryption 1.163 + __ mov(from, G1); 1.164 + 1.165 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.166 + __ andcc(from, 7, G0); 1.167 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 1.168 + __ delayed()->alignaddr(from, G0, from); 1.169 + 1.170 + // aligned case: load input into F52-F54 1.171 + __ ldf(FloatRegisterImpl::D, from, 0, 
F52); 1.172 + __ ldf(FloatRegisterImpl::D, from, 8, F54); 1.173 + __ ba_short(L_load_original_key); 1.174 + 1.175 + __ BIND(L_load_misaligned_input); 1.176 + __ ldf(FloatRegisterImpl::D, from, 0, F52); 1.177 + __ ldf(FloatRegisterImpl::D, from, 8, F54); 1.178 + __ ldf(FloatRegisterImpl::D, from, 16, F56); 1.179 + __ faligndata(F52, F54, F52); 1.180 + __ faligndata(F54, F56, F54); 1.181 + 1.182 + __ BIND(L_load_original_key); 1.183 // load original key from SunJCE expanded decryption key 1.184 + // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 1.185 for ( int i = 0; i <= 3; i++ ) { 1.186 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 1.187 } 1.188 @@ -3432,8 +3505,7 @@ 1.189 // perform 128-bit key specific inverse cipher transformation 1.190 __ fxor(FloatRegisterImpl::D, F42, F54, F54); 1.191 __ fxor(FloatRegisterImpl::D, F40, F52, F52); 1.192 - __ br(Assembler::always, false, Assembler::pt, L_common_transform); 1.193 - __ delayed()->nop(); 1.194 + __ ba_short(L_common_transform); 1.195 1.196 __ BIND(L_expand192bit); 1.197 1.198 @@ -3457,8 +3529,7 @@ 1.199 __ aes_dround01(F44, F52, F54, F56); 1.200 __ aes_dround23(F42, F56, F58, F54); 1.201 __ aes_dround01(F40, F56, F58, F52); 1.202 - __ br(Assembler::always, false, Assembler::pt, L_common_transform); 1.203 - __ delayed()->nop(); 1.204 + __ ba_short(L_common_transform); 1.205 1.206 __ BIND(L_expand256bit); 1.207 1.208 @@ -3478,14 +3549,31 @@ 1.209 __ aes_kexpand2(F50, F56, F58); 1.210 1.211 for ( int i = 0; i <= 6; i += 2 ) { 1.212 - __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); 1.213 + __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); 1.214 } 1.215 1.216 - // load input into F52-F54 1.217 + // reload original 'from' address 1.218 + __ mov(G1, from); 1.219 + 1.220 + // re-check 8-byte alignment 1.221 + __ andcc(from, 7, G0); 1.222 + __ br(Assembler::notZero, true, Assembler::pn, 
L_reload_misaligned_input); 1.223 + __ delayed()->alignaddr(from, G0, from); 1.224 + 1.225 + // aligned case: load input into F52-F54 1.226 __ ldf(FloatRegisterImpl::D, from, 0, F52); 1.227 __ ldf(FloatRegisterImpl::D, from, 8, F54); 1.228 + __ ba_short(L_256bit_transform); 1.229 + 1.230 + __ BIND(L_reload_misaligned_input); 1.231 + __ ldf(FloatRegisterImpl::D, from, 0, F52); 1.232 + __ ldf(FloatRegisterImpl::D, from, 8, F54); 1.233 + __ ldf(FloatRegisterImpl::D, from, 16, F56); 1.234 + __ faligndata(F52, F54, F52); 1.235 + __ faligndata(F54, F56, F54); 1.236 1.237 // perform 256-bit key specific inverse cipher transformation 1.238 + __ BIND(L_256bit_transform); 1.239 __ fxor(FloatRegisterImpl::D, F0, F54, F54); 1.240 __ fxor(FloatRegisterImpl::D, F2, F52, F52); 1.241 __ aes_dround23(F4, F52, F54, F58); 1.242 @@ -3515,43 +3603,71 @@ 1.243 } 1.244 } 1.245 1.246 - // store output to destination array, F0-F1 used as temp 1.247 - __ fmov(FloatRegisterImpl::D, F52, F0); 1.248 - __ stf(FloatRegisterImpl::S, F0, to, 0); 1.249 - __ stf(FloatRegisterImpl::S, F1, to, 4); 1.250 - __ fmov(FloatRegisterImpl::D, F54, F0); 1.251 - __ stf(FloatRegisterImpl::S, F0, to, 8); 1.252 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.253 + __ andcc(to, 7, O5); 1.254 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 1.255 + __ delayed()->edge8n(to, G0, O3); 1.256 + 1.257 + // aligned case: store output into the destination array 1.258 + __ stf(FloatRegisterImpl::D, F52, to, 0); 1.259 __ retl(); 1.260 - __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); 1.261 + __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8); 1.262 + 1.263 + __ BIND(L_store_misaligned_output); 1.264 + __ add(to, 8, O4); 1.265 + __ mov(8, O2); 1.266 + __ sub(O2, O5, O2); 1.267 + __ alignaddr(O2, G0, O2); 1.268 + __ faligndata(F52, F52, F52); 1.269 + __ faligndata(F54, F54, F54); 1.270 + __ and3(to, -8, to); 1.271 + __ and3(O4, 
-8, O4); 1.272 + __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); 1.273 + __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); 1.274 + __ add(to, 8, to); 1.275 + __ add(O4, 8, O4); 1.276 + __ orn(G0, O3, O3); 1.277 + __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); 1.278 + __ retl(); 1.279 + __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); 1.280 1.281 return start; 1.282 } 1.283 1.284 address generate_cipherBlockChaining_encryptAESCrypt() { 1.285 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 1.286 + "the following code assumes that first element of an int array is aligned to 8 bytes"); 1.287 + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 1.288 + "the following code assumes that first element of a byte array is aligned to 8 bytes"); 1.289 __ align(CodeEntryAlignment); 1.290 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 1.291 - Label L_cbcenc128, L_cbcenc192, L_cbcenc256; 1.292 + Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit; 1.293 + Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform; 1.294 + Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit; 1.295 + Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit; 1.296 address start = __ pc(); 1.297 - Register from = O0; // source byte array 1.298 - Register to = O1; // destination byte array 1.299 - Register key = O2; // expanded key array 1.300 - Register rvec = O3; // init vector 1.301 - const Register len_reg = O4; // cipher length 1.302 - const Register keylen = O5; // reg for storing expanded key array length 1.303 - 1.304 - // save cipher len to return in the end 1.305 - __ mov(len_reg, L1); 1.306 + Register from = I0; // source byte array 1.307 + Register to = I1; // destination byte array 1.308 + Register key = I2; // 
expanded key array 1.309 + Register rvec = I3; // init vector 1.310 + const Register len_reg = I4; // cipher length 1.311 + const Register keylen = I5; // reg for storing expanded key array length 1.312 + 1.313 + // save cipher len before save_frame, to return in the end 1.314 + __ mov(O4, L0); 1.315 + __ save_frame(0); 1.316 1.317 // read expanded key length 1.318 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 1.319 1.320 - // load init vector 1.321 + // load initial vector, 8-byte alignment is guaranteed 1.322 __ ldf(FloatRegisterImpl::D, rvec, 0, F60); 1.323 __ ldf(FloatRegisterImpl::D, rvec, 8, F62); 1.324 + // load key, 8-byte alignment is guaranteed 1.325 __ ldx(key,0,G1); 1.326 - __ ldx(key,8,G2); 1.327 - 1.328 - // start loading expanded key 1.329 + __ ldx(key,8,G5); 1.330 + 1.331 + // start loading expanded key, 8-byte alignment is guaranteed 1.332 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) { 1.333 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 1.334 } 1.335 @@ -3571,15 +3687,35 @@ 1.336 } 1.337 1.338 // 256-bit original key size 1.339 - __ br(Assembler::always, false, Assembler::pt, L_cbcenc256); 1.340 - __ delayed()->nop(); 1.341 + __ ba_short(L_cbcenc256); 1.342 1.343 __ align(OptoLoopAlignment); 1.344 __ BIND(L_cbcenc128); 1.345 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.346 + __ andcc(from, 7, G0); 1.347 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit); 1.348 + __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 1.349 + 1.350 + // aligned case: load input into G3 and G4 1.351 __ ldx(from,0,G3); 1.352 __ ldx(from,8,G4); 1.353 + __ ba_short(L_128bit_transform); 1.354 + 1.355 + __ BIND(L_load_misaligned_input_128bit); 1.356 + // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 1.357 + __ alignaddr(from, 
G0, from); 1.358 + __ ldf(FloatRegisterImpl::D, from, 0, F48); 1.359 + __ ldf(FloatRegisterImpl::D, from, 8, F50); 1.360 + __ ldf(FloatRegisterImpl::D, from, 16, F52); 1.361 + __ faligndata(F48, F50, F48); 1.362 + __ faligndata(F50, F52, F50); 1.363 + __ movdtox(F48, G3); 1.364 + __ movdtox(F50, G4); 1.365 + __ mov(L1, from); 1.366 + 1.367 + __ BIND(L_128bit_transform); 1.368 __ xor3(G1,G3,G3); 1.369 - __ xor3(G2,G4,G4); 1.370 + __ xor3(G5,G4,G4); 1.371 __ movxtod(G3,F56); 1.372 __ movxtod(G4,F58); 1.373 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 1.374 @@ -3598,24 +3734,81 @@ 1.375 } 1.376 } 1.377 1.378 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.379 + __ andcc(to, 7, L1); 1.380 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit); 1.381 + __ delayed()->edge8n(to, G0, L2); 1.382 + 1.383 + // aligned case: store output into the destination array 1.384 __ stf(FloatRegisterImpl::D, F60, to, 0); 1.385 __ stf(FloatRegisterImpl::D, F62, to, 8); 1.386 + __ ba_short(L_check_loop_end_128bit); 1.387 + 1.388 + __ BIND(L_store_misaligned_output_128bit); 1.389 + __ add(to, 8, L3); 1.390 + __ mov(8, L4); 1.391 + __ sub(L4, L1, L4); 1.392 + __ alignaddr(L4, G0, L4); 1.393 + // save cipher text before circular right shift 1.394 + // as it needs to be stored as iv for next block (see code before next retl) 1.395 + __ movdtox(F60, L6); 1.396 + __ movdtox(F62, L7); 1.397 + __ faligndata(F60, F60, F60); 1.398 + __ faligndata(F62, F62, F62); 1.399 + __ mov(to, L5); 1.400 + __ and3(to, -8, to); 1.401 + __ and3(L3, -8, L3); 1.402 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 1.403 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 1.404 + __ add(to, 8, to); 1.405 + __ add(L3, 8, L3); 1.406 + __ orn(G0, L2, L2); 1.407 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 1.408 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 1.409 + __ mov(L5, to); 
1.410 + __ movxtod(L6, F60); 1.411 + __ movxtod(L7, F62); 1.412 + 1.413 + __ BIND(L_check_loop_end_128bit); 1.414 __ add(from, 16, from); 1.415 __ add(to, 16, to); 1.416 __ subcc(len_reg, 16, len_reg); 1.417 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); 1.418 __ delayed()->nop(); 1.419 + // re-init initial vector for next block, 8-byte alignment is guaranteed 1.420 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 1.421 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 1.422 + __ restore(); 1.423 __ retl(); 1.424 - __ delayed()->mov(L1, O0); 1.425 + __ delayed()->mov(L0, O0); 1.426 1.427 __ align(OptoLoopAlignment); 1.428 __ BIND(L_cbcenc192); 1.429 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.430 + __ andcc(from, 7, G0); 1.431 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit); 1.432 + __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 1.433 + 1.434 + // aligned case: load input into G3 and G4 1.435 __ ldx(from,0,G3); 1.436 __ ldx(from,8,G4); 1.437 + __ ba_short(L_192bit_transform); 1.438 + 1.439 + __ BIND(L_load_misaligned_input_192bit); 1.440 + // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 1.441 + __ alignaddr(from, G0, from); 1.442 + __ ldf(FloatRegisterImpl::D, from, 0, F48); 1.443 + __ ldf(FloatRegisterImpl::D, from, 8, F50); 1.444 + __ ldf(FloatRegisterImpl::D, from, 16, F52); 1.445 + __ faligndata(F48, F50, F48); 1.446 + __ faligndata(F50, F52, F50); 1.447 + __ movdtox(F48, G3); 1.448 + __ movdtox(F50, G4); 1.449 + __ mov(L1, from); 1.450 + 1.451 + __ BIND(L_192bit_transform); 1.452 __ xor3(G1,G3,G3); 1.453 - __ xor3(G2,G4,G4); 1.454 + __ xor3(G5,G4,G4); 1.455 __ movxtod(G3,F56); 1.456 __ movxtod(G4,F58); 1.457 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 1.458 @@ -3634,24 +3827,81 @@ 1.459 } 1.460 } 1.461 1.462 + // check for 8-byte alignment since dest byte array may have arbitrary 
alignment if offset mod 8 is non-zero 1.463 + __ andcc(to, 7, L1); 1.464 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit); 1.465 + __ delayed()->edge8n(to, G0, L2); 1.466 + 1.467 + // aligned case: store output into the destination array 1.468 __ stf(FloatRegisterImpl::D, F60, to, 0); 1.469 __ stf(FloatRegisterImpl::D, F62, to, 8); 1.470 + __ ba_short(L_check_loop_end_192bit); 1.471 + 1.472 + __ BIND(L_store_misaligned_output_192bit); 1.473 + __ add(to, 8, L3); 1.474 + __ mov(8, L4); 1.475 + __ sub(L4, L1, L4); 1.476 + __ alignaddr(L4, G0, L4); 1.477 + __ movdtox(F60, L6); 1.478 + __ movdtox(F62, L7); 1.479 + __ faligndata(F60, F60, F60); 1.480 + __ faligndata(F62, F62, F62); 1.481 + __ mov(to, L5); 1.482 + __ and3(to, -8, to); 1.483 + __ and3(L3, -8, L3); 1.484 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 1.485 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 1.486 + __ add(to, 8, to); 1.487 + __ add(L3, 8, L3); 1.488 + __ orn(G0, L2, L2); 1.489 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 1.490 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 1.491 + __ mov(L5, to); 1.492 + __ movxtod(L6, F60); 1.493 + __ movxtod(L7, F62); 1.494 + 1.495 + __ BIND(L_check_loop_end_192bit); 1.496 __ add(from, 16, from); 1.497 __ subcc(len_reg, 16, len_reg); 1.498 __ add(to, 16, to); 1.499 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192); 1.500 __ delayed()->nop(); 1.501 + // re-init initial vector for next block, 8-byte alignment is guaranteed 1.502 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 1.503 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 1.504 + __ restore(); 1.505 __ retl(); 1.506 - __ delayed()->mov(L1, O0); 1.507 + __ delayed()->mov(L0, O0); 1.508 1.509 __ align(OptoLoopAlignment); 1.510 __ BIND(L_cbcenc256); 1.511 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.512 + __ andcc(from, 7, G0); 1.513 + __ 
br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit); 1.514 + __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 1.515 + 1.516 + // aligned case: load input into G3 and G4 1.517 __ ldx(from,0,G3); 1.518 __ ldx(from,8,G4); 1.519 + __ ba_short(L_256bit_transform); 1.520 + 1.521 + __ BIND(L_load_misaligned_input_256bit); 1.522 + // cannot clobber F48, F50 and F52. F56, F58 can be used though 1.523 + __ alignaddr(from, G0, from); 1.524 + __ movdtox(F60, L2); // save F60 before overwriting 1.525 + __ ldf(FloatRegisterImpl::D, from, 0, F56); 1.526 + __ ldf(FloatRegisterImpl::D, from, 8, F58); 1.527 + __ ldf(FloatRegisterImpl::D, from, 16, F60); 1.528 + __ faligndata(F56, F58, F56); 1.529 + __ faligndata(F58, F60, F58); 1.530 + __ movdtox(F56, G3); 1.531 + __ movdtox(F58, G4); 1.532 + __ mov(L1, from); 1.533 + __ movxtod(L2, F60); 1.534 + 1.535 + __ BIND(L_256bit_transform); 1.536 __ xor3(G1,G3,G3); 1.537 - __ xor3(G2,G4,G4); 1.538 + __ xor3(G5,G4,G4); 1.539 __ movxtod(G3,F56); 1.540 __ movxtod(G4,F58); 1.541 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 1.542 @@ -3670,26 +3920,69 @@ 1.543 } 1.544 } 1.545 1.546 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.547 + __ andcc(to, 7, L1); 1.548 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit); 1.549 + __ delayed()->edge8n(to, G0, L2); 1.550 + 1.551 + // aligned case: store output into the destination array 1.552 __ stf(FloatRegisterImpl::D, F60, to, 0); 1.553 __ stf(FloatRegisterImpl::D, F62, to, 8); 1.554 + __ ba_short(L_check_loop_end_256bit); 1.555 + 1.556 + __ BIND(L_store_misaligned_output_256bit); 1.557 + __ add(to, 8, L3); 1.558 + __ mov(8, L4); 1.559 + __ sub(L4, L1, L4); 1.560 + __ alignaddr(L4, G0, L4); 1.561 + __ movdtox(F60, L6); 1.562 + __ movdtox(F62, L7); 1.563 + __ faligndata(F60, F60, F60); 1.564 + __ faligndata(F62, F62, F62); 1.565 + __ mov(to, L5); 1.566 + __ 
and3(to, -8, to); 1.567 + __ and3(L3, -8, L3); 1.568 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 1.569 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 1.570 + __ add(to, 8, to); 1.571 + __ add(L3, 8, L3); 1.572 + __ orn(G0, L2, L2); 1.573 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 1.574 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 1.575 + __ mov(L5, to); 1.576 + __ movxtod(L6, F60); 1.577 + __ movxtod(L7, F62); 1.578 + 1.579 + __ BIND(L_check_loop_end_256bit); 1.580 __ add(from, 16, from); 1.581 __ subcc(len_reg, 16, len_reg); 1.582 __ add(to, 16, to); 1.583 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256); 1.584 __ delayed()->nop(); 1.585 + // re-init initial vector for next block, 8-byte alignment is guaranteed 1.586 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 1.587 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 1.588 + __ restore(); 1.589 __ retl(); 1.590 - __ delayed()->mov(L1, O0); 1.591 + __ delayed()->mov(L0, O0); 1.592 1.593 return start; 1.594 } 1.595 1.596 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 1.597 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 1.598 + "the following code assumes that first element of an int array is aligned to 8 bytes"); 1.599 + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 1.600 + "the following code assumes that first element of a byte array is aligned to 8 bytes"); 1.601 __ align(CodeEntryAlignment); 1.602 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 1.603 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; 1.604 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256; 1.605 + Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128; 1.606 + Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, 
L_load_misaligned_next2_blocks256, L_transform_next2_blocks256; 1.607 + Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128; 1.608 + Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192; 1.609 + Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256; 1.610 address start = __ pc(); 1.611 Register from = I0; // source byte array 1.612 Register to = I1; // destination byte array 1.613 @@ -3704,11 +3997,12 @@ 1.614 __ save_frame(0); //args are read from I* registers since we save the frame in the beginning 1.615 1.616 // load original key from SunJCE expanded decryption key 1.617 + // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 1.618 for ( int i = 0; i <= 3; i++ ) { 1.619 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 1.620 } 1.621 1.622 - // load initial vector 1.623 + // load initial vector, 8-byte alignment is guaranteed 1.624 __ ldx(rvec,0,L0); 1.625 __ ldx(rvec,8,L1); 1.626 1.627 @@ -3733,11 +4027,10 @@ 1.628 __ movdtox(F42,L3); 1.629 1.630 __ and3(len_reg, 16, L4); 1.631 - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128); 1.632 - __ delayed()->nop(); 1.633 - 1.634 - __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); 1.635 - __ delayed()->nop(); 1.636 + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128); 1.637 + __ nop(); 1.638 + 1.639 + __ ba_short(L_dec_first_block_start); 1.640 1.641 __ BIND(L_expand192bit); 1.642 // load rest of the 192-bit key 1.643 @@ -3758,11 +4051,10 @@ 1.644 __ movdtox(F50,L3); 1.645 1.646 __ and3(len_reg, 16, L4); 1.647 - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192); 1.648 - __ delayed()->nop(); 1.649 - 1.650 - __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); 1.651 - __ delayed()->nop(); 1.652 + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192); 1.653 + 
__ nop(); 1.654 + 1.655 + __ ba_short(L_dec_first_block_start); 1.656 1.657 __ BIND(L_expand256bit); 1.658 // load rest of the 256-bit key 1.659 @@ -3785,12 +4077,32 @@ 1.660 __ movdtox(F58,L3); 1.661 1.662 __ and3(len_reg, 16, L4); 1.663 - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256); 1.664 - __ delayed()->nop(); 1.665 + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256); 1.666 1.667 __ BIND(L_dec_first_block_start); 1.668 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.669 + __ andcc(from, 7, G0); 1.670 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block); 1.671 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 1.672 + 1.673 + // aligned case: load input into L4 and L5 1.674 __ ldx(from,0,L4); 1.675 __ ldx(from,8,L5); 1.676 + __ ba_short(L_transform_first_block); 1.677 + 1.678 + __ BIND(L_load_misaligned_input_first_block); 1.679 + __ alignaddr(from, G0, from); 1.680 + // F58, F60, F62 can be clobbered 1.681 + __ ldf(FloatRegisterImpl::D, from, 0, F58); 1.682 + __ ldf(FloatRegisterImpl::D, from, 8, F60); 1.683 + __ ldf(FloatRegisterImpl::D, from, 16, F62); 1.684 + __ faligndata(F58, F60, F58); 1.685 + __ faligndata(F60, F62, F60); 1.686 + __ movdtox(F58, L4); 1.687 + __ movdtox(F60, L5); 1.688 + __ mov(G1, from); 1.689 + 1.690 + __ BIND(L_transform_first_block); 1.691 __ xor3(L2,L4,G1); 1.692 __ movxtod(G1,F60); 1.693 __ xor3(L3,L5,G1); 1.694 @@ -3833,9 +4145,36 @@ 1.695 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 1.696 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 1.697 1.698 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.699 + __ andcc(to, 7, G1); 1.700 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block); 1.701 + __ delayed()->edge8n(to, G0, G2); 1.702 + 1.703 + // aligned case: store output into the 
destination array 1.704 __ stf(FloatRegisterImpl::D, F60, to, 0); 1.705 __ stf(FloatRegisterImpl::D, F62, to, 8); 1.706 - 1.707 + __ ba_short(L_check_decrypt_end); 1.708 + 1.709 + __ BIND(L_store_misaligned_output_first_block); 1.710 + __ add(to, 8, G3); 1.711 + __ mov(8, G4); 1.712 + __ sub(G4, G1, G4); 1.713 + __ alignaddr(G4, G0, G4); 1.714 + __ faligndata(F60, F60, F60); 1.715 + __ faligndata(F62, F62, F62); 1.716 + __ mov(to, G1); 1.717 + __ and3(to, -8, to); 1.718 + __ and3(G3, -8, G3); 1.719 + __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 1.720 + __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 1.721 + __ add(to, 8, to); 1.722 + __ add(G3, 8, G3); 1.723 + __ orn(G0, G2, G2); 1.724 + __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 1.725 + __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 1.726 + __ mov(G1, to); 1.727 + 1.728 + __ BIND(L_check_decrypt_end); 1.729 __ add(from, 16, from); 1.730 __ add(to, 16, to); 1.731 __ subcc(len_reg, 16, len_reg); 1.732 @@ -3852,17 +4191,44 @@ 1.733 __ BIND(L_dec_next2_blocks128); 1.734 __ nop(); 1.735 1.736 - // F40:F42 used for first 16-bytes 1.737 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.738 + __ andcc(from, 7, G0); 1.739 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128); 1.740 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 1.741 + 1.742 + // aligned case: load input into G4, G5, L4 and L5 1.743 __ ldx(from,0,G4); 1.744 __ ldx(from,8,G5); 1.745 + __ ldx(from,16,L4); 1.746 + __ ldx(from,24,L5); 1.747 + __ ba_short(L_transform_next2_blocks128); 1.748 + 1.749 + __ BIND(L_load_misaligned_next2_blocks128); 1.750 + __ alignaddr(from, G0, from); 1.751 + // F40, F42, F58, F60, F62 can be clobbered 1.752 + __ ldf(FloatRegisterImpl::D, from, 0, F40); 1.753 + __ ldf(FloatRegisterImpl::D, from, 8, F42); 1.754 + __ ldf(FloatRegisterImpl::D, from, 16, F60); 
1.755 + __ ldf(FloatRegisterImpl::D, from, 24, F62); 1.756 + __ ldf(FloatRegisterImpl::D, from, 32, F58); 1.757 + __ faligndata(F40, F42, F40); 1.758 + __ faligndata(F42, F60, F42); 1.759 + __ faligndata(F60, F62, F60); 1.760 + __ faligndata(F62, F58, F62); 1.761 + __ movdtox(F40, G4); 1.762 + __ movdtox(F42, G5); 1.763 + __ movdtox(F60, L4); 1.764 + __ movdtox(F62, L5); 1.765 + __ mov(G1, from); 1.766 + 1.767 + __ BIND(L_transform_next2_blocks128); 1.768 + // F40:F42 used for first 16-bytes 1.769 __ xor3(L2,G4,G1); 1.770 __ movxtod(G1,F40); 1.771 __ xor3(L3,G5,G1); 1.772 __ movxtod(G1,F42); 1.773 1.774 // F60:F62 used for next 16-bytes 1.775 - __ ldx(from,16,L4); 1.776 - __ ldx(from,24,L5); 1.777 __ xor3(L2,L4,G1); 1.778 __ movxtod(G1,F60); 1.779 __ xor3(L3,L5,G1); 1.780 @@ -3891,9 +4257,6 @@ 1.781 __ fxor(FloatRegisterImpl::D, F46, F40, F40); 1.782 __ fxor(FloatRegisterImpl::D, F44, F42, F42); 1.783 1.784 - __ stf(FloatRegisterImpl::D, F40, to, 0); 1.785 - __ stf(FloatRegisterImpl::D, F42, to, 8); 1.786 - 1.787 __ movxtod(G4,F56); 1.788 __ movxtod(G5,F58); 1.789 __ mov(L4,L0); 1.790 @@ -3901,32 +4264,93 @@ 1.791 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 1.792 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 1.793 1.794 + // For mis-aligned store of 32 bytes of result we can do: 1.795 + // Circular right-shift all 4 FP registers so that 'head' and 'tail' 1.796 + // parts that need to be stored starting at mis-aligned address are in a FP reg 1.797 + // the other 3 FP regs can thus be stored using regular store 1.798 + // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts 1.799 + 1.800 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.801 + __ andcc(to, 7, G1); 1.802 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128); 1.803 + __ delayed()->edge8n(to, G0, G2); 1.804 + 1.805 + // aligned case: store output into the destination array 
1.806 + __ stf(FloatRegisterImpl::D, F40, to, 0); 1.807 + __ stf(FloatRegisterImpl::D, F42, to, 8); 1.808 __ stf(FloatRegisterImpl::D, F60, to, 16); 1.809 __ stf(FloatRegisterImpl::D, F62, to, 24); 1.810 - 1.811 + __ ba_short(L_check_decrypt_loop_end128); 1.812 + 1.813 + __ BIND(L_store_misaligned_output_next2_blocks128); 1.814 + __ mov(8, G4); 1.815 + __ sub(G4, G1, G4); 1.816 + __ alignaddr(G4, G0, G4); 1.817 + __ faligndata(F40, F42, F56); // F56 can be clobbered 1.818 + __ faligndata(F42, F60, F42); 1.819 + __ faligndata(F60, F62, F60); 1.820 + __ faligndata(F62, F40, F40); 1.821 + __ mov(to, G1); 1.822 + __ and3(to, -8, to); 1.823 + __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 1.824 + __ stf(FloatRegisterImpl::D, F56, to, 8); 1.825 + __ stf(FloatRegisterImpl::D, F42, to, 16); 1.826 + __ stf(FloatRegisterImpl::D, F60, to, 24); 1.827 + __ add(to, 32, to); 1.828 + __ orn(G0, G2, G2); 1.829 + __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 1.830 + __ mov(G1, to); 1.831 + 1.832 + __ BIND(L_check_decrypt_loop_end128); 1.833 __ add(from, 32, from); 1.834 __ add(to, 32, to); 1.835 __ subcc(len_reg, 32, len_reg); 1.836 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); 1.837 __ delayed()->nop(); 1.838 - __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); 1.839 - __ delayed()->nop(); 1.840 + __ ba_short(L_cbcdec_end); 1.841 1.842 __ align(OptoLoopAlignment); 1.843 __ BIND(L_dec_next2_blocks192); 1.844 __ nop(); 1.845 1.846 - // F48:F50 used for first 16-bytes 1.847 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.848 + __ andcc(from, 7, G0); 1.849 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192); 1.850 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 1.851 + 1.852 + // aligned case: load input into G4, G5, L4 and L5 1.853 __ ldx(from,0,G4); 1.854 __ ldx(from,8,G5); 1.855 + __ 
ldx(from,16,L4); 1.856 + __ ldx(from,24,L5); 1.857 + __ ba_short(L_transform_next2_blocks192); 1.858 + 1.859 + __ BIND(L_load_misaligned_next2_blocks192); 1.860 + __ alignaddr(from, G0, from); 1.861 + // F48, F50, F52, F60, F62 can be clobbered 1.862 + __ ldf(FloatRegisterImpl::D, from, 0, F48); 1.863 + __ ldf(FloatRegisterImpl::D, from, 8, F50); 1.864 + __ ldf(FloatRegisterImpl::D, from, 16, F60); 1.865 + __ ldf(FloatRegisterImpl::D, from, 24, F62); 1.866 + __ ldf(FloatRegisterImpl::D, from, 32, F52); 1.867 + __ faligndata(F48, F50, F48); 1.868 + __ faligndata(F50, F60, F50); 1.869 + __ faligndata(F60, F62, F60); 1.870 + __ faligndata(F62, F52, F62); 1.871 + __ movdtox(F48, G4); 1.872 + __ movdtox(F50, G5); 1.873 + __ movdtox(F60, L4); 1.874 + __ movdtox(F62, L5); 1.875 + __ mov(G1, from); 1.876 + 1.877 + __ BIND(L_transform_next2_blocks192); 1.878 + // F48:F50 used for first 16-bytes 1.879 __ xor3(L2,G4,G1); 1.880 __ movxtod(G1,F48); 1.881 __ xor3(L3,G5,G1); 1.882 __ movxtod(G1,F50); 1.883 1.884 // F60:F62 used for next 16-bytes 1.885 - __ ldx(from,16,L4); 1.886 - __ ldx(from,24,L5); 1.887 __ xor3(L2,L4,G1); 1.888 __ movxtod(G1,F60); 1.889 __ xor3(L3,L5,G1); 1.890 @@ -3955,9 +4379,6 @@ 1.891 __ fxor(FloatRegisterImpl::D, F54, F48, F48); 1.892 __ fxor(FloatRegisterImpl::D, F52, F50, F50); 1.893 1.894 - __ stf(FloatRegisterImpl::D, F48, to, 0); 1.895 - __ stf(FloatRegisterImpl::D, F50, to, 8); 1.896 - 1.897 __ movxtod(G4,F56); 1.898 __ movxtod(G5,F58); 1.899 __ mov(L4,L0); 1.900 @@ -3965,32 +4386,87 @@ 1.901 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 1.902 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 1.903 1.904 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.905 + __ andcc(to, 7, G1); 1.906 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192); 1.907 + __ delayed()->edge8n(to, G0, G2); 1.908 + 1.909 + // aligned case: store output into the destination array 
1.910 + __ stf(FloatRegisterImpl::D, F48, to, 0); 1.911 + __ stf(FloatRegisterImpl::D, F50, to, 8); 1.912 __ stf(FloatRegisterImpl::D, F60, to, 16); 1.913 __ stf(FloatRegisterImpl::D, F62, to, 24); 1.914 - 1.915 + __ ba_short(L_check_decrypt_loop_end192); 1.916 + 1.917 + __ BIND(L_store_misaligned_output_next2_blocks192); 1.918 + __ mov(8, G4); 1.919 + __ sub(G4, G1, G4); 1.920 + __ alignaddr(G4, G0, G4); 1.921 + __ faligndata(F48, F50, F56); // F56 can be clobbered 1.922 + __ faligndata(F50, F60, F50); 1.923 + __ faligndata(F60, F62, F60); 1.924 + __ faligndata(F62, F48, F48); 1.925 + __ mov(to, G1); 1.926 + __ and3(to, -8, to); 1.927 + __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); 1.928 + __ stf(FloatRegisterImpl::D, F56, to, 8); 1.929 + __ stf(FloatRegisterImpl::D, F50, to, 16); 1.930 + __ stf(FloatRegisterImpl::D, F60, to, 24); 1.931 + __ add(to, 32, to); 1.932 + __ orn(G0, G2, G2); 1.933 + __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); 1.934 + __ mov(G1, to); 1.935 + 1.936 + __ BIND(L_check_decrypt_loop_end192); 1.937 __ add(from, 32, from); 1.938 __ add(to, 32, to); 1.939 __ subcc(len_reg, 32, len_reg); 1.940 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); 1.941 __ delayed()->nop(); 1.942 - __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); 1.943 - __ delayed()->nop(); 1.944 + __ ba_short(L_cbcdec_end); 1.945 1.946 __ align(OptoLoopAlignment); 1.947 __ BIND(L_dec_next2_blocks256); 1.948 __ nop(); 1.949 1.950 - // F0:F2 used for first 16-bytes 1.951 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.952 + __ andcc(from, 7, G0); 1.953 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256); 1.954 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 1.955 + 1.956 + // aligned case: load input into G4, G5, L4 and L5 1.957 __ ldx(from,0,G4); 1.958 __ ldx(from,8,G5); 1.959 + __ 
ldx(from,16,L4); 1.960 + __ ldx(from,24,L5); 1.961 + __ ba_short(L_transform_next2_blocks256); 1.962 + 1.963 + __ BIND(L_load_misaligned_next2_blocks256); 1.964 + __ alignaddr(from, G0, from); 1.965 + // F0, F2, F4, F60, F62 can be clobbered 1.966 + __ ldf(FloatRegisterImpl::D, from, 0, F0); 1.967 + __ ldf(FloatRegisterImpl::D, from, 8, F2); 1.968 + __ ldf(FloatRegisterImpl::D, from, 16, F60); 1.969 + __ ldf(FloatRegisterImpl::D, from, 24, F62); 1.970 + __ ldf(FloatRegisterImpl::D, from, 32, F4); 1.971 + __ faligndata(F0, F2, F0); 1.972 + __ faligndata(F2, F60, F2); 1.973 + __ faligndata(F60, F62, F60); 1.974 + __ faligndata(F62, F4, F62); 1.975 + __ movdtox(F0, G4); 1.976 + __ movdtox(F2, G5); 1.977 + __ movdtox(F60, L4); 1.978 + __ movdtox(F62, L5); 1.979 + __ mov(G1, from); 1.980 + 1.981 + __ BIND(L_transform_next2_blocks256); 1.982 + // F0:F2 used for first 16-bytes 1.983 __ xor3(L2,G4,G1); 1.984 __ movxtod(G1,F0); 1.985 __ xor3(L3,G5,G1); 1.986 __ movxtod(G1,F2); 1.987 1.988 // F60:F62 used for next 16-bytes 1.989 - __ ldx(from,16,L4); 1.990 - __ ldx(from,24,L5); 1.991 __ xor3(L2,L4,G1); 1.992 __ movxtod(G1,F60); 1.993 __ xor3(L3,L5,G1); 1.994 @@ -4043,9 +4519,6 @@ 1.995 __ fxor(FloatRegisterImpl::D, F6, F0, F0); 1.996 __ fxor(FloatRegisterImpl::D, F4, F2, F2); 1.997 1.998 - __ stf(FloatRegisterImpl::D, F0, to, 0); 1.999 - __ stf(FloatRegisterImpl::D, F2, to, 8); 1.1000 - 1.1001 __ movxtod(G4,F56); 1.1002 __ movxtod(G5,F58); 1.1003 __ mov(L4,L0); 1.1004 @@ -4053,9 +4526,38 @@ 1.1005 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 1.1006 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 1.1007 1.1008 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.1009 + __ andcc(to, 7, G1); 1.1010 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256); 1.1011 + __ delayed()->edge8n(to, G0, G2); 1.1012 + 1.1013 + // aligned case: store output into the destination array 1.1014 + __ 
stf(FloatRegisterImpl::D, F0, to, 0); 1.1015 + __ stf(FloatRegisterImpl::D, F2, to, 8); 1.1016 __ stf(FloatRegisterImpl::D, F60, to, 16); 1.1017 __ stf(FloatRegisterImpl::D, F62, to, 24); 1.1018 - 1.1019 + __ ba_short(L_check_decrypt_loop_end256); 1.1020 + 1.1021 + __ BIND(L_store_misaligned_output_next2_blocks256); 1.1022 + __ mov(8, G4); 1.1023 + __ sub(G4, G1, G4); 1.1024 + __ alignaddr(G4, G0, G4); 1.1025 + __ faligndata(F0, F2, F56); // F56 can be clobbered 1.1026 + __ faligndata(F2, F60, F2); 1.1027 + __ faligndata(F60, F62, F60); 1.1028 + __ faligndata(F62, F0, F0); 1.1029 + __ mov(to, G1); 1.1030 + __ and3(to, -8, to); 1.1031 + __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); 1.1032 + __ stf(FloatRegisterImpl::D, F56, to, 8); 1.1033 + __ stf(FloatRegisterImpl::D, F2, to, 16); 1.1034 + __ stf(FloatRegisterImpl::D, F60, to, 24); 1.1035 + __ add(to, 32, to); 1.1036 + __ orn(G0, G2, G2); 1.1037 + __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); 1.1038 + __ mov(G1, to); 1.1039 + 1.1040 + __ BIND(L_check_decrypt_loop_end256); 1.1041 __ add(from, 32, from); 1.1042 __ add(to, 32, to); 1.1043 __ subcc(len_reg, 32, len_reg); 1.1044 @@ -4063,6 +4565,7 @@ 1.1045 __ delayed()->nop(); 1.1046 1.1047 __ BIND(L_cbcdec_end); 1.1048 + // re-init intial vector for next block, 8-byte alignment is guaranteed 1.1049 __ stx(L0, rvec, 0); 1.1050 __ stx(L1, rvec, 8); 1.1051 __ restore();