Sun, 11 Aug 2019 19:11:08 -0400
8188868: PPC64: Support AES intrinsics on Big Endian
Reviewed-by: goetz
--- a/src/cpu/ppc/vm/assembler_ppc.hpp	Tue Jan 29 08:28:24 2019 -0500
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Sun Aug 11 19:11:08 2019 -0400
@@ -2100,6 +2100,7 @@
   // Endianess specific concatenation of 2 loaded vectors.
   inline void load_perm(VectorRegister perm, Register addr);
   inline void vec_perm(VectorRegister first_dest, VectorRegister second, VectorRegister perm);
+  inline void vec_perm(VectorRegister dest, VectorRegister first, VectorRegister second, VectorRegister perm);
 
   // RegisterOrConstant versions.
   // These emitters choose between the versions using two registers and
--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Tue Jan 29 08:28:24 2019 -0500
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Sun Aug 11 19:11:08 2019 -0400
@@ -904,6 +904,14 @@
 #endif
 }
 
+inline void Assembler::vec_perm(VectorRegister dest, VectorRegister first, VectorRegister second, VectorRegister perm) {
+#if defined(VM_LITTLE_ENDIAN)
+  vperm(dest, second, first, perm);
+#else
+  vperm(dest, first, second, perm);
+#endif
+}
+
 inline void Assembler::load_const(Register d, void* x, Register tmp) {
   load_const(d, (long)x, tmp);
 }
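Why the little-endian operand swap: per the POWER ISA, vperm picks each result byte from the 32-byte concatenation of its two source registers, with bytes numbered in big-endian order, while lvx on a little-endian machine presents each aligned quadword byte-reversed in the register. Swapping the two sources (paired with the lvsr-based permute vector that load_perm produces on LE) makes the lvx/lvx/vperm unaligned-load idiom come out in native register order on either endianness; the new four-operand overload is simply a non-destructive form of the existing three-operand vec_perm. A minimal host-side model of this, assuming nothing beyond the ISA definitions (vperm_model, lvx_model, lvsl_model and lvsr_model are hypothetical helpers written for this note, not HotSpot or ISA APIs):

#include <array>
#include <cassert>
#include <cstdint>

using V = std::array<uint8_t, 16>;

// vperm: r[i] = concat(a, b)[perm[i] & 0x1f], bytes numbered big-endian.
static V vperm_model(const V& a, const V& b, const V& perm) {
  V r{};
  for (int i = 0; i < 16; i++) {
    int idx = perm[i] & 0x1f;
    r[i] = (idx < 16) ? a[idx] : b[idx - 16];
  }
  return r;
}

// lvx: aligned quadword load; byte-reversed into the register on LE.
static V lvx_model(const uint8_t* base, bool le) {
  V r{};
  for (int i = 0; i < 16; i++) r[i] = le ? base[15 - i] : base[i];
  return r;
}

// lvsl/lvsr permute vectors for an address with misalignment a.
static V lvsl_model(int a) { V r{}; for (int i = 0; i < 16; i++) r[i] = uint8_t(a + i);      return r; }
static V lvsr_model(int a) { V r{}; for (int i = 0; i < 16; i++) r[i] = uint8_t(16 - a + i); return r; }

int main() {
  uint8_t mem[32];
  for (int i = 0; i < 32; i++) mem[i] = uint8_t(i);
  const int a = 5; // load 16 bytes from mem + 5, i.e. across two quadwords

  for (bool le : {false, true}) {
    V v1 = lvx_model(mem, le), v2 = lvx_model(mem + 16, le);
    // load_perm: lvsl on BE, lvsr on LE; vec_perm: sources swapped on LE.
    V r = le ? vperm_model(v2, v1, lvsr_model(a))
             : vperm_model(v1, v2, lvsl_model(a));
    // Either way r holds mem[5..20] in that mode's native register order.
    for (int i = 0; i < 16; i++) assert(r[i] == uint8_t(le ? a + 15 - i : a + i));
  }
  return 0;
}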
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Tue Jan 29 08:28:24 2019 -0500
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Sun Aug 11 19:11:08 2019 -0400
@@ -2224,7 +2224,7 @@
     return start;
   }
 
-  // Arguments for generated stub (little endian only):
+  // Arguments for generated stub:
   //   R3_ARG1   - source byte array address
   //   R4_ARG2   - destination byte array address
   //   R5_ARG3   - round key array
@@ -2243,7 +2243,6 @@
     Register keylen = R8;
     Register temp = R9;
     Register keypos = R10;
-    Register hex = R11;
     Register fifteen = R12;
 
     VectorRegister vRet = VR0;
@@ -2263,164 +2262,170 @@
     VectorRegister vTmp3 = VR11;
     VectorRegister vTmp4 = VR12;
 
-    VectorRegister vLow = VR13;
-    VectorRegister vHigh = VR14;
-
-    __ li (hex, 16);
     __ li (fifteen, 15);
-    __ vspltisb (fSplt, 0x0f);
 
     // load unaligned from[0-15] to vsRet
     __ lvx (vRet, from);
     __ lvx (vTmp1, fifteen, from);
     __ lvsl (fromPerm, from);
+#ifdef VM_LITTLE_ENDIAN
+    __ vspltisb (fSplt, 0x0f);
     __ vxor (fromPerm, fromPerm, fSplt);
+#endif
     __ vperm (vRet, vRet, vTmp1, fromPerm);
 
     // load keylen (44 or 52 or 60)
     __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
 
     // to load keys
-    __ lvsr (keyPerm, key);
-    __ vxor (vTmp2, vTmp2, vTmp2);
+    __ load_perm (keyPerm, key);
+#ifdef VM_LITTLE_ENDIAN
     __ vspltisb (vTmp2, -16);
     __ vrld (keyPerm, keyPerm, vTmp2);
     __ vrld (keyPerm, keyPerm, vTmp2);
     __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
-
-    // load the 1st round key to vKey1
-    __ li (keypos, 0);
+#endif
+
+    // load the 1st round key to vTmp1
+    __ lvx (vTmp1, key);
+    __ li (keypos, 16);
     __ lvx (vKey1, keypos, key);
-    __ addi (keypos, keypos, 16);
+    __ vec_perm (vTmp1, vKey1, keyPerm);
+
+    // 1st round
+    __ vxor (vRet, vRet, vTmp1);
+
+    // load the 2nd round key to vKey1
+    __ li (keypos, 32);
+    __ lvx (vKey2, keypos, key);
+    __ vec_perm (vKey1, vKey2, keyPerm);
+
+    // load the 3rd round key to vKey2
+    __ li (keypos, 48);
+    __ lvx (vKey3, keypos, key);
+    __ vec_perm (vKey2, vKey3, keyPerm);
+
+    // load the 4th round key to vKey3
+    __ li (keypos, 64);
+    __ lvx (vKey4, keypos, key);
+    __ vec_perm (vKey3, vKey4, keyPerm);
+
+    // load the 5th round key to vKey4
+    __ li (keypos, 80);
     __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey1, vTmp1, vKey1, keyPerm);
-
-    // 1st round
-    __ vxor (vRet, vRet, vKey1);
-
-    // load the 2nd round key to vKey1
-    __ addi (keypos, keypos, 16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
-
-    // load the 3rd round key to vKey2
-    __ addi (keypos, keypos, 16);
+    __ vec_perm (vKey4, vTmp1, keyPerm);
+
+    // 2nd - 5th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey3);
+    __ vcipher (vRet, vRet, vKey4);
+
+    // load the 6th round key to vKey1
+    __ li (keypos, 96);
+    __ lvx (vKey2, keypos, key);
+    __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
+
+    // load the 7th round key to vKey2
+    __ li (keypos, 112);
+    __ lvx (vKey3, keypos, key);
+    __ vec_perm (vKey2, vKey3, keyPerm);
+
+    // load the 8th round key to vKey3
+    __ li (keypos, 128);
+    __ lvx (vKey4, keypos, key);
+    __ vec_perm (vKey3, vKey4, keyPerm);
+
+    // load the 9th round key to vKey4
+    __ li (keypos, 144);
    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
-
-    // load the 4th round key to vKey3
-    __ addi (keypos, keypos, 16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
-
-    // load the 5th round key to vKey4
-    __ addi (keypos, keypos, 16);
+    __ vec_perm (vKey4, vTmp1, keyPerm);
+
+    // 6th - 9th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey3);
+    __ vcipher (vRet, vRet, vKey4);
+
+    // load the 10th round key to vKey1
+    __ li (keypos, 160);
+    __ lvx (vKey2, keypos, key);
+    __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
+
+    // load the 11th round key to vKey2
+    __ li (keypos, 176);
    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
-
-    // 2nd - 5th rounds
-    __ vcipher (vRet, vRet, vKey1);
-    __ vcipher (vRet, vRet, vKey2);
-    __ vcipher (vRet, vRet, vKey3);
-    __ vcipher (vRet, vRet, vKey4);
-
-    // load the 6th round key to vKey1
-    __ addi (keypos, keypos, 16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
-
-    // load the 7th round key to vKey2
-    __ addi (keypos, keypos, 16);
-    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
-
-    // load the 8th round key to vKey3
-    __ addi (keypos, keypos, 16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
-
-    // load the 9th round key to vKey4
-    __ addi (keypos, keypos, 16);
-    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
-
-    // 6th - 9th rounds
-    __ vcipher (vRet, vRet, vKey1);
-    __ vcipher (vRet, vRet, vKey2);
-    __ vcipher (vRet, vRet, vKey3);
-    __ vcipher (vRet, vRet, vKey4);
-
-    // load the 10th round key to vKey1
-    __ addi (keypos, keypos, 16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
-
-    // load the 11th round key to vKey2
-    __ addi (keypos, keypos, 16);
-    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
+    __ vec_perm (vKey2, vTmp1, keyPerm);
 
     // if all round keys are loaded, skip next 4 rounds
     __ cmpwi (CCR0, keylen, 44);
     __ beq (CCR0, L_doLast);
 
     // 10th - 11th rounds
-    __ vcipher (vRet, vRet, vKey1);
-    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
 
     // load the 12th round key to vKey1
-    __ addi (keypos, keypos, 16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
+    __ li (keypos, 192);
+    __ lvx (vKey2, keypos, key);
+    __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
 
     // load the 13th round key to vKey2
-    __ addi (keypos, keypos, 16);
+    __ li (keypos, 208);
     __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
+    __ vec_perm (vKey2, vTmp1, keyPerm);
 
     // if all round keys are loaded, skip next 2 rounds
     __ cmpwi (CCR0, keylen, 52);
     __ beq (CCR0, L_doLast);
 
     // 12th - 13th rounds
-    __ vcipher (vRet, vRet, vKey1);
-    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
 
     // load the 14th round key to vKey1
-    __ addi (keypos, keypos, 16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
+    __ li (keypos, 224);
+    __ lvx (vKey2, keypos, key);
+    __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
 
     // load the 15th round key to vKey2
-    __ addi (keypos, keypos, 16);
+    __ li (keypos, 240);
     __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
+    __ vec_perm (vKey2, vTmp1, keyPerm);
 
     __ bind(L_doLast);
 
     // last two rounds
-    __ vcipher (vRet, vRet, vKey1);
-    __ vcipherlast (vRet, vRet, vKey2);
-
-    __ neg (temp, to);
-    __ lvsr (toPerm, temp);
-    __ vspltisb (vTmp2, -1);
-    __ vxor (vTmp1, vTmp1, vTmp1);
-    __ vperm (vTmp2, vTmp2, vTmp1, toPerm);
-    __ vxor (toPerm, toPerm, fSplt);
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipherlast (vRet, vRet, vKey2);
+
+    // store result (unaligned)
+#ifdef VM_LITTLE_ENDIAN
+    __ lvsl (toPerm, to);
+#else
+    __ lvsr (toPerm, to);
+#endif
+    __ vspltisb (vTmp3, -1);
+    __ vspltisb (vTmp4, 0);
     __ lvx (vTmp1, to);
-    __ vperm (vRet, vRet, vRet, toPerm);
-    __ vsel (vTmp1, vTmp1, vRet, vTmp2);
-    __ lvx (vTmp4, fifteen, to);
+    __ lvx (vTmp2, fifteen, to);
+#ifdef VM_LITTLE_ENDIAN
+    __ vperm (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
+    __ vxor (toPerm, toPerm, fSplt); // swap bytes
+#else
+    __ vperm (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
+#endif
+    __ vperm (vTmp4, vRet, vRet, toPerm); // rotate data
+    __ vsel (vTmp2, vTmp4, vTmp2, vTmp3);
+    __ vsel (vTmp1, vTmp1, vTmp4, vTmp3);
+    __ stvx (vTmp2, fifteen, to); // store this one first (may alias)
     __ stvx (vTmp1, to);
-    __ vsel (vRet, vRet, vTmp4, vTmp2);
-    __ stvx (vRet, fifteen, to);
 
     __ blr();
     return start;
   }
 
-  // Arguments for generated stub (little endian only):
+  // Arguments for generated stub:
   //   R3_ARG1   - source byte array address
   //   R4_ARG2   - destination byte array address
   //   R5_ARG3   - K (key) in little endian int array
@@ -2442,7 +2447,6 @@
     Register keylen = R8;
     Register temp = R9;
     Register keypos = R10;
-    Register hex = R11;
     Register fifteen = R12;
 
     VectorRegister vRet = VR0;
@@ -2463,30 +2467,30 @@
     VectorRegister vTmp3 = VR12;
     VectorRegister vTmp4 = VR13;
 
-    VectorRegister vLow = VR14;
-    VectorRegister vHigh = VR15;
-
-    __ li (hex, 16);
     __ li (fifteen, 15);
-    __ vspltisb (fSplt, 0x0f);
 
     // load unaligned from[0-15] to vsRet
     __ lvx (vRet, from);
     __ lvx (vTmp1, fifteen, from);
     __ lvsl (fromPerm, from);
+#ifdef VM_LITTLE_ENDIAN
+    __ vspltisb (fSplt, 0x0f);
     __ vxor (fromPerm, fromPerm, fSplt);
+#endif
     __ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
 
     // load keylen (44 or 52 or 60)
     __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
 
     // to load keys
-    __ lvsr (keyPerm, key);
+    __ load_perm (keyPerm, key);
+#ifdef VM_LITTLE_ENDIAN
     __ vxor (vTmp2, vTmp2, vTmp2);
     __ vspltisb (vTmp2, -16);
     __ vrld (keyPerm, keyPerm, vTmp2);
     __ vrld (keyPerm, keyPerm, vTmp2);
     __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
+#endif
 
     __ cmpwi (CCR0, keylen, 44);
     __ beq (CCR0, L_do44);
@@ -2494,32 +2498,32 @@
     __ cmpwi (CCR0, keylen, 52);
     __ beq (CCR0, L_do52);
 
-    // load the 15th round key to vKey11
+    // load the 15th round key to vKey1
     __ li (keypos, 240);
+    __ lvx (vKey1, keypos, key);
+    __ li (keypos, 224);
+    __ lvx (vKey2, keypos, key);
+    __ vec_perm (vKey1, vKey2, vKey1, keyPerm);
+
+    // load the 14th round key to vKey2
+    __ li (keypos, 208);
+    __ lvx (vKey3, keypos, key);
+    __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
+
+    // load the 13th round key to vKey3
+    __ li (keypos, 192);
+    __ lvx (vKey4, keypos, key);
+    __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
+
+    // load the 12th round key to vKey4
+    __ li (keypos, 176);
+    __ lvx (vKey5, keypos, key);
+    __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
+
+    // load the 11th round key to vKey5
+    __ li (keypos, 160);
     __ lvx (vTmp1, keypos, key);
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
-
-    // load the 14th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey2, vTmp2, vTmp1, keyPerm);
-
-    // load the 13th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey3, vTmp1, vTmp2, keyPerm);
-
-    // load the 12th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey4, vTmp2, vTmp1, keyPerm);
-
-    // load the 11th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey5, vTmp1, vTmp2, keyPerm);
+    __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
 
     // 1st - 5th rounds
     __ vxor (vRet, vRet, vKey1);
@@ -2532,22 +2536,22 @@
 
     __ bind (L_do52);
 
-    // load the 13th round key to vKey11
+    // load the 13th round key to vKey1
     __ li (keypos, 208);
+    __ lvx (vKey1, keypos, key);
+    __ li (keypos, 192);
+    __ lvx (vKey2, keypos, key);
+    __ vec_perm (vKey1, vKey2, vKey1, keyPerm);
+
+    // load the 12th round key to vKey2
+    __ li (keypos, 176);
+    __ lvx (vKey3, keypos, key);
+    __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
+
+    // load the 11th round key to vKey3
+    __ li (keypos, 160);
     __ lvx (vTmp1, keypos, key);
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
-
-    // load the 12th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey2, vTmp2, vTmp1, keyPerm);
-
-    // load the 11th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey3, vTmp1, vTmp2, keyPerm);
+    __ vec_perm (vKey3, vTmp1, vKey3, keyPerm);
 
     // 1st - 3rd rounds
     __ vxor (vRet, vRet, vKey1);
@@ -2558,42 +2562,42 @@
 
     __ bind (L_do44);
 
-    // load the 11th round key to vKey11
+    // load the 11th round key to vKey1
    __ li (keypos, 176);
+    __ lvx (vKey1, keypos, key);
+    __ li (keypos, 160);
     __ lvx (vTmp1, keypos, key);
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
+    __ vec_perm (vKey1, vTmp1, vKey1, keyPerm);
 
     // 1st round
     __ vxor (vRet, vRet, vKey1);
 
     __ bind (L_doLast);
 
-    // load the 10th round key to vKey10
-    __ addi (keypos, keypos, -16);
+    // load the 10th round key to vKey1
+    __ li (keypos, 144);
+    __ lvx (vKey2, keypos, key);
+    __ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
+
+    // load the 9th round key to vKey2
+    __ li (keypos, 128);
+    __ lvx (vKey3, keypos, key);
+    __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
+
+    // load the 8th round key to vKey3
+    __ li (keypos, 112);
+    __ lvx (vKey4, keypos, key);
+    __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
+
+    // load the 7th round key to vKey4
+    __ li (keypos, 96);
+    __ lvx (vKey5, keypos, key);
+    __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
+
+    // load the 6th round key to vKey5
+    __ li (keypos, 80);
     __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
-
-    // load the 9th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
-
-    // load the 8th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
-
-    // load the 7th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
-
-    // load the 6th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey5, vTmp2, vTmp1, keyPerm);
+    __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
 
     // last 10th - 6th rounds
     __ vncipher (vRet, vRet, vKey1);
@@ -2602,30 +2606,29 @@
     __ vncipher (vRet, vRet, vKey4);
     __ vncipher (vRet, vRet, vKey5);
 
-    // load the 5th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
-
-    // load the 4th round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey2, vTmp2, vTmp1, keyPerm);
-
-    // load the 3rd round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey3, vTmp1, vTmp2, keyPerm);
-
-    // load the 2nd round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp1, keypos, key);
-    __ vperm (vKey4, vTmp2, vTmp1, keyPerm);
-
-    // load the 1st round key to vKey10
-    __ addi (keypos, keypos, -16);
-    __ lvx (vTmp2, keypos, key);
-    __ vperm (vKey5, vTmp1, vTmp2, keyPerm);
+    // load the 5th round key to vKey1
+    __ li (keypos, 64);
+    __ lvx (vKey2, keypos, key);
+    __ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
+
+    // load the 4th round key to vKey2
+    __ li (keypos, 48);
+    __ lvx (vKey3, keypos, key);
+    __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
+
+    // load the 3rd round key to vKey3
+    __ li (keypos, 32);
+    __ lvx (vKey4, keypos, key);
+    __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
+
+    // load the 2nd round key to vKey4
+    __ li (keypos, 16);
+    __ lvx (vKey5, keypos, key);
+    __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
+
+    // load the 1st round key to vKey5
+    __ lvx (vTmp1, key);
+    __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
 
     // last 5th - 1th rounds
     __ vncipher (vRet, vRet, vKey1);
@@ -2634,19 +2637,27 @@
     __ vncipher (vRet, vRet, vKey4);
     __ vncipherlast (vRet, vRet, vKey5);
 
-    __ neg (temp, to);
-    __ lvsr (toPerm, temp);
-    __ vspltisb (vTmp2, -1);
-    __ vxor (vTmp1, vTmp1, vTmp1);
-    __ vperm (vTmp2, vTmp2, vTmp1, toPerm);
-    __ vxor (toPerm, toPerm, fSplt);
+    // store result (unaligned)
+#ifdef VM_LITTLE_ENDIAN
+    __ lvsl (toPerm, to);
+#else
+    __ lvsr (toPerm, to);
+#endif
+    __ vspltisb (vTmp3, -1);
+    __ vspltisb (vTmp4, 0);
     __ lvx (vTmp1, to);
-    __ vperm (vRet, vRet, vRet, toPerm);
-    __ vsel (vTmp1, vTmp1, vRet, vTmp2);
-    __ lvx (vTmp4, fifteen, to);
+    __ lvx (vTmp2, fifteen, to);
+#ifdef VM_LITTLE_ENDIAN
+    __ vperm (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
+    __ vxor (toPerm, toPerm, fSplt); // swap bytes
+#else
+    __ vperm (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
+#endif
+    __ vperm (vTmp4, vRet, vRet, toPerm); // rotate data
+    __ vsel (vTmp2, vTmp4, vTmp2, vTmp3);
+    __ vsel (vTmp1, vTmp1, vTmp4, vTmp3);
+    __ stvx (vTmp2, fifteen, to); // store this one first (may alias)
     __ stvx (vTmp1, to);
-    __ vsel (vRet, vRet, vTmp4, vTmp2);
-    __ stvx (vRet, fifteen, to);
 
     __ blr();
     return start;
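The rewritten store tail in both stubs is a read-modify-write of the one or two aligned quadwords covering the 16 destination bytes: lvsl/lvsr derives a rotation from the low bits of the address, vperm applied to splats of -1 and 0 turns it into a byte select mask, another vperm rotates the cipher result to the destination's alignment, and vsel merges it into each quadword image. A plain C++ sketch of that merge logic, in memory order and ignoring the LE byte-swap detail (store_unaligned16 is a hypothetical helper written for this note, not HotSpot code):

#include <cassert>
#include <cstdint>
#include <cstring>

// Write data[0..15] to the possibly unaligned address p the way the stub
// does: merge into the aligned quadword(s) covering [p, p + 16).
static void store_unaligned16(uint8_t* p, const uint8_t* data) {
  uintptr_t a = (uintptr_t)p & 15;                                // misalignment
  uint8_t* q0 = p - a;                                            // quadword of p
  uint8_t* q1 = (uint8_t*)(((uintptr_t)p + 15) & ~(uintptr_t)15); // quadword of p+15
  uint8_t lo[16], hi[16];
  std::memcpy(lo, q0, 16);                 // lvx (vTmp1, to)
  std::memcpy(hi, q1, 16);                 // lvx (vTmp2, fifteen, to)
  for (unsigned i = 0; i < 16; i++) {
    if (i >= a) lo[i] = data[i - a];       // vsel, mask[i] = (i >= a)
    if (i <  a) hi[i] = data[16 - a + i];  // vsel, mask[i] = (i <  a)
  }
  std::memcpy(q1, hi, 16);  // stvx (vTmp2, fifteen, to): first, q1 may equal q0
  std::memcpy(q0, lo, 16);  // stvx (vTmp1, to)
}

int main() {
  alignas(16) uint8_t buf[48];
  uint8_t block[16];
  for (int i = 0; i < 16; i++) block[i] = uint8_t(0xA0 + i);
  for (int off = 0; off < 16; off++) {     // every misalignment, incl. aligned
    std::memset(buf, 0, sizeof buf);
    store_unaligned16(buf + 8 + off, block);
    for (int i = 0; i < 16; i++) assert(buf[8 + off + i] == block[i]);
  }
  return 0;
}

Storing q1 before q0 is what the "may alias" comment is about: when the destination is 16-byte aligned, both stvx instructions target the same quadword, and the second store (the fully merged low image) must win.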
--- a/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp	Tue Jan 29 08:28:24 2019 -0500
+++ b/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp	Sun Aug 11 19:11:08 2019 -0400
@@ -34,7 +34,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 20000, // simply increase if too small (assembler will crash if too small)
-  code_size2 = 22000  // simply increase if too small (assembler will crash if too small)
+  code_size2 = 24000  // simply increase if too small (assembler will crash if too small)
 };
 
 // CRC32 Intrinsics.
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Tue Jan 29 08:28:24 2019 -0500
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Sun Aug 11 19:11:08 2019 -0400
@@ -174,7 +174,6 @@
   }
 
   // The AES intrinsic stubs require AES instruction support.
-#if defined(VM_LITTLE_ENDIAN)
   if (has_vcipher()) {
     if (FLAG_IS_DEFAULT(UseAES)) {
       UseAES = true;
@@ -195,18 +194,6 @@
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
 
-#else
-  if (UseAES) {
-    warning("AES instructions are not available on this CPU");
-    FLAG_SET_DEFAULT(UseAES, false);
-  }
-  if (UseAESIntrinsics) {
-    if (!FLAG_IS_DEFAULT(UseAESIntrinsics))
-      warning("AES intrinsics are not available on this CPU");
-    FLAG_SET_DEFAULT(UseAESIntrinsics, false);
-  }
-#endif
-
  if (has_vshasig()) {
     if (FLAG_IS_DEFAULT(UseSHA)) {
       UseSHA = true;
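With the endian guard gone, UseAES and UseAESIntrinsics now default to true on any PPC64 CPU whose has_vcipher() probe succeeds, regardless of byte order. As a quick sanity check on a big-endian POWER8 or newer machine (a suggested verification step, not part of this change), running java -XX:+PrintFlagsFinal -version and inspecting the output should show both UseAES and UseAESIntrinsics as true, with the AES stubs generated inside the enlarged code_size2 buffer.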