1.1 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp Tue Jan 29 08:28:24 2019 -0500 1.2 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp Sun Aug 11 19:11:08 2019 -0400 1.3 @@ -2224,7 +2224,7 @@ 1.4 return start; 1.5 } 1.6 1.7 - // Arguments for generated stub (little endian only): 1.8 + // Arguments for generated stub: 1.9 // R3_ARG1 - source byte array address 1.10 // R4_ARG2 - destination byte array address 1.11 // R5_ARG3 - round key array 1.12 @@ -2243,7 +2243,6 @@ 1.13 Register keylen = R8; 1.14 Register temp = R9; 1.15 Register keypos = R10; 1.16 - Register hex = R11; 1.17 Register fifteen = R12; 1.18 1.19 VectorRegister vRet = VR0; 1.20 @@ -2263,164 +2262,170 @@ 1.21 VectorRegister vTmp3 = VR11; 1.22 VectorRegister vTmp4 = VR12; 1.23 1.24 - VectorRegister vLow = VR13; 1.25 - VectorRegister vHigh = VR14; 1.26 - 1.27 - __ li (hex, 16); 1.28 __ li (fifteen, 15); 1.29 - __ vspltisb (fSplt, 0x0f); 1.30 1.31 // load unaligned from[0-15] to vsRet 1.32 __ lvx (vRet, from); 1.33 __ lvx (vTmp1, fifteen, from); 1.34 __ lvsl (fromPerm, from); 1.35 +#ifdef VM_LITTLE_ENDIAN 1.36 + __ vspltisb (fSplt, 0x0f); 1.37 __ vxor (fromPerm, fromPerm, fSplt); 1.38 +#endif 1.39 __ vperm (vRet, vRet, vTmp1, fromPerm); 1.40 1.41 // load keylen (44 or 52 or 60) 1.42 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key); 1.43 1.44 // to load keys 1.45 - __ lvsr (keyPerm, key); 1.46 - __ vxor (vTmp2, vTmp2, vTmp2); 1.47 + __ load_perm (keyPerm, key); 1.48 +#ifdef VM_LITTLE_ENDIAN 1.49 __ vspltisb (vTmp2, -16); 1.50 __ vrld (keyPerm, keyPerm, vTmp2); 1.51 __ vrld (keyPerm, keyPerm, vTmp2); 1.52 __ vsldoi (keyPerm, keyPerm, keyPerm, 8); 1.53 - 1.54 - // load the 1st round key to vKey1 1.55 - __ li (keypos, 0); 1.56 +#endif 1.57 + 1.58 + // load the 1st round key to vTmp1 1.59 + __ lvx (vTmp1, key); 1.60 + __ li (keypos, 16); 1.61 __ lvx (vKey1, keypos, key); 1.62 - __ addi (keypos, keypos, 16); 1.63 + __ vec_perm (vTmp1, vKey1, keyPerm); 1.64 + 1.65 + // 1st round 1.66 + __ vxor (vRet, vRet, vTmp1); 1.67 + 1.68 + // load the 2nd round key to vKey1 1.69 + __ li (keypos, 32); 1.70 + __ lvx (vKey2, keypos, key); 1.71 + __ vec_perm (vKey1, vKey2, keyPerm); 1.72 + 1.73 + // load the 3rd round key to vKey2 1.74 + __ li (keypos, 48); 1.75 + __ lvx (vKey3, keypos, key); 1.76 + __ vec_perm (vKey2, vKey3, keyPerm); 1.77 + 1.78 + // load the 4th round key to vKey3 1.79 + __ li (keypos, 64); 1.80 + __ lvx (vKey4, keypos, key); 1.81 + __ vec_perm (vKey3, vKey4, keyPerm); 1.82 + 1.83 + // load the 5th round key to vKey4 1.84 + __ li (keypos, 80); 1.85 __ lvx (vTmp1, keypos, key); 1.86 - __ vperm (vKey1, vTmp1, vKey1, keyPerm); 1.87 - 1.88 - // 1st round 1.89 - __ vxor (vRet, vRet, vKey1); 1.90 - 1.91 - // load the 2nd round key to vKey1 1.92 - __ addi (keypos, keypos, 16); 1.93 - __ lvx (vTmp2, keypos, key); 1.94 - __ vperm (vKey1, vTmp2, vTmp1, keyPerm); 1.95 - 1.96 - // load the 3rd round key to vKey2 1.97 - __ addi (keypos, keypos, 16); 1.98 + __ vec_perm (vKey4, vTmp1, keyPerm); 1.99 + 1.100 + // 2nd - 5th rounds 1.101 + __ vcipher (vRet, vRet, vKey1); 1.102 + __ vcipher (vRet, vRet, vKey2); 1.103 + __ vcipher (vRet, vRet, vKey3); 1.104 + __ vcipher (vRet, vRet, vKey4); 1.105 + 1.106 + // load the 6th round key to vKey1 1.107 + __ li (keypos, 96); 1.108 + __ lvx (vKey2, keypos, key); 1.109 + __ vec_perm (vKey1, vTmp1, vKey2, keyPerm); 1.110 + 1.111 + // load the 7th round key to vKey2 1.112 + __ li (keypos, 112); 1.113 + __ lvx (vKey3, keypos, key); 1.114 + __ vec_perm (vKey2, vKey3, keyPerm); 1.115 + 1.116 + // load the 8th round key to vKey3 1.117 + __ li (keypos, 128); 1.118 + __ lvx (vKey4, keypos, key); 1.119 + __ vec_perm (vKey3, vKey4, keyPerm); 1.120 + 1.121 + // load the 9th round key to vKey4 1.122 + __ li (keypos, 144); 1.123 __ lvx (vTmp1, keypos, key); 1.124 - __ vperm (vKey2, vTmp1, vTmp2, keyPerm); 1.125 - 1.126 - // load the 4th round key to vKey3 1.127 - __ addi (keypos, keypos, 16); 1.128 - __ lvx (vTmp2, keypos, key); 1.129 - __ vperm (vKey3, vTmp2, vTmp1, keyPerm); 1.130 - 1.131 - // load the 5th round key to vKey4 1.132 - __ addi (keypos, keypos, 16); 1.133 + __ vec_perm (vKey4, vTmp1, keyPerm); 1.134 + 1.135 + // 6th - 9th rounds 1.136 + __ vcipher (vRet, vRet, vKey1); 1.137 + __ vcipher (vRet, vRet, vKey2); 1.138 + __ vcipher (vRet, vRet, vKey3); 1.139 + __ vcipher (vRet, vRet, vKey4); 1.140 + 1.141 + // load the 10th round key to vKey1 1.142 + __ li (keypos, 160); 1.143 + __ lvx (vKey2, keypos, key); 1.144 + __ vec_perm (vKey1, vTmp1, vKey2, keyPerm); 1.145 + 1.146 + // load the 11th round key to vKey2 1.147 + __ li (keypos, 176); 1.148 __ lvx (vTmp1, keypos, key); 1.149 - __ vperm (vKey4, vTmp1, vTmp2, keyPerm); 1.150 - 1.151 - // 2nd - 5th rounds 1.152 - __ vcipher (vRet, vRet, vKey1); 1.153 - __ vcipher (vRet, vRet, vKey2); 1.154 - __ vcipher (vRet, vRet, vKey3); 1.155 - __ vcipher (vRet, vRet, vKey4); 1.156 - 1.157 - // load the 6th round key to vKey1 1.158 - __ addi (keypos, keypos, 16); 1.159 - __ lvx (vTmp2, keypos, key); 1.160 - __ vperm (vKey1, vTmp2, vTmp1, keyPerm); 1.161 - 1.162 - // load the 7th round key to vKey2 1.163 - __ addi (keypos, keypos, 16); 1.164 - __ lvx (vTmp1, keypos, key); 1.165 - __ vperm (vKey2, vTmp1, vTmp2, keyPerm); 1.166 - 1.167 - // load the 8th round key to vKey3 1.168 - __ addi (keypos, keypos, 16); 1.169 - __ lvx (vTmp2, keypos, key); 1.170 - __ vperm (vKey3, vTmp2, vTmp1, keyPerm); 1.171 - 1.172 - // load the 9th round key to vKey4 1.173 - __ addi (keypos, keypos, 16); 1.174 - __ lvx (vTmp1, keypos, key); 1.175 - __ vperm (vKey4, vTmp1, vTmp2, keyPerm); 1.176 - 1.177 - // 6th - 9th rounds 1.178 - __ vcipher (vRet, vRet, vKey1); 1.179 - __ vcipher (vRet, vRet, vKey2); 1.180 - __ vcipher (vRet, vRet, vKey3); 1.181 - __ vcipher (vRet, vRet, vKey4); 1.182 - 1.183 - // load the 10th round key to vKey1 1.184 - __ addi (keypos, keypos, 16); 1.185 - __ lvx (vTmp2, keypos, key); 1.186 - __ vperm (vKey1, vTmp2, vTmp1, keyPerm); 1.187 - 1.188 - // load the 11th round key to vKey2 1.189 - __ addi (keypos, keypos, 16); 1.190 - __ lvx (vTmp1, keypos, key); 1.191 - __ vperm (vKey2, vTmp1, vTmp2, keyPerm); 1.192 + __ vec_perm (vKey2, vTmp1, keyPerm); 1.193 1.194 // if all round keys are loaded, skip next 4 rounds 1.195 __ cmpwi (CCR0, keylen, 44); 1.196 __ beq (CCR0, L_doLast); 1.197 1.198 // 10th - 11th rounds 1.199 - __ vcipher (vRet, vRet, vKey1); 1.200 - __ vcipher (vRet, vRet, vKey2); 1.201 + __ vcipher (vRet, vRet, vKey1); 1.202 + __ vcipher (vRet, vRet, vKey2); 1.203 1.204 // load the 12th round key to vKey1 1.205 - __ addi (keypos, keypos, 16); 1.206 - __ lvx (vTmp2, keypos, key); 1.207 - __ vperm (vKey1, vTmp2, vTmp1, keyPerm); 1.208 + __ li (keypos, 192); 1.209 + __ lvx (vKey2, keypos, key); 1.210 + __ vec_perm (vKey1, vTmp1, vKey2, keyPerm); 1.211 1.212 // load the 13th round key to vKey2 1.213 - __ addi (keypos, keypos, 16); 1.214 + __ li (keypos, 208); 1.215 __ lvx (vTmp1, keypos, key); 1.216 - __ vperm (vKey2, vTmp1, vTmp2, keyPerm); 1.217 + __ vec_perm (vKey2, vTmp1, keyPerm); 1.218 1.219 // if all round keys are loaded, skip next 2 rounds 1.220 __ cmpwi (CCR0, keylen, 52); 1.221 __ beq (CCR0, L_doLast); 1.222 1.223 // 12th - 13th rounds 1.224 - __ vcipher (vRet, vRet, vKey1); 1.225 - __ vcipher (vRet, vRet, vKey2); 1.226 + __ vcipher (vRet, vRet, vKey1); 1.227 + __ vcipher (vRet, vRet, vKey2); 1.228 1.229 // load the 14th round key to vKey1 1.230 - __ addi (keypos, keypos, 16); 1.231 - __ lvx (vTmp2, keypos, key); 1.232 - __ vperm (vKey1, vTmp2, vTmp1, keyPerm); 1.233 + __ li (keypos, 224); 1.234 + __ lvx (vKey2, keypos, key); 1.235 + __ vec_perm (vKey1, vTmp1, vKey2, keyPerm); 1.236 1.237 // load the 15th round key to vKey2 1.238 - __ addi (keypos, keypos, 16); 1.239 + __ li (keypos, 240); 1.240 __ lvx (vTmp1, keypos, key); 1.241 - __ vperm (vKey2, vTmp1, vTmp2, keyPerm); 1.242 + __ vec_perm (vKey2, vTmp1, keyPerm); 1.243 1.244 __ bind(L_doLast); 1.245 1.246 // last two rounds 1.247 - __ vcipher (vRet, vRet, vKey1); 1.248 - __ vcipherlast (vRet, vRet, vKey2); 1.249 - 1.250 - __ neg (temp, to); 1.251 - __ lvsr (toPerm, temp); 1.252 - __ vspltisb (vTmp2, -1); 1.253 - __ vxor (vTmp1, vTmp1, vTmp1); 1.254 - __ vperm (vTmp2, vTmp2, vTmp1, toPerm); 1.255 - __ vxor (toPerm, toPerm, fSplt); 1.256 + __ vcipher (vRet, vRet, vKey1); 1.257 + __ vcipherlast (vRet, vRet, vKey2); 1.258 + 1.259 + // store result (unaligned) 1.260 +#ifdef VM_LITTLE_ENDIAN 1.261 + __ lvsl (toPerm, to); 1.262 +#else 1.263 + __ lvsr (toPerm, to); 1.264 +#endif 1.265 + __ vspltisb (vTmp3, -1); 1.266 + __ vspltisb (vTmp4, 0); 1.267 __ lvx (vTmp1, to); 1.268 - __ vperm (vRet, vRet, vRet, toPerm); 1.269 - __ vsel (vTmp1, vTmp1, vRet, vTmp2); 1.270 - __ lvx (vTmp4, fifteen, to); 1.271 + __ lvx (vTmp2, fifteen, to); 1.272 +#ifdef VM_LITTLE_ENDIAN 1.273 + __ vperm (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask 1.274 + __ vxor (toPerm, toPerm, fSplt); // swap bytes 1.275 +#else 1.276 + __ vperm (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask 1.277 +#endif 1.278 + __ vperm (vTmp4, vRet, vRet, toPerm); // rotate data 1.279 + __ vsel (vTmp2, vTmp4, vTmp2, vTmp3); 1.280 + __ vsel (vTmp1, vTmp1, vTmp4, vTmp3); 1.281 + __ stvx (vTmp2, fifteen, to); // store this one first (may alias) 1.282 __ stvx (vTmp1, to); 1.283 - __ vsel (vRet, vRet, vTmp4, vTmp2); 1.284 - __ stvx (vRet, fifteen, to); 1.285 1.286 __ blr(); 1.287 return start; 1.288 } 1.289 1.290 - // Arguments for generated stub (little endian only): 1.291 + // Arguments for generated stub: 1.292 // R3_ARG1 - source byte array address 1.293 // R4_ARG2 - destination byte array address 1.294 // R5_ARG3 - K (key) in little endian int array 1.295 @@ -2442,7 +2447,6 @@ 1.296 Register keylen = R8; 1.297 Register temp = R9; 1.298 Register keypos = R10; 1.299 - Register hex = R11; 1.300 Register fifteen = R12; 1.301 1.302 VectorRegister vRet = VR0; 1.303 @@ -2463,30 +2467,30 @@ 1.304 VectorRegister vTmp3 = VR12; 1.305 VectorRegister vTmp4 = VR13; 1.306 1.307 - VectorRegister vLow = VR14; 1.308 - VectorRegister vHigh = VR15; 1.309 - 1.310 - __ li (hex, 16); 1.311 __ li (fifteen, 15); 1.312 - __ vspltisb (fSplt, 0x0f); 1.313 1.314 // load unaligned from[0-15] to vsRet 1.315 __ lvx (vRet, from); 1.316 __ lvx (vTmp1, fifteen, from); 1.317 __ lvsl (fromPerm, from); 1.318 +#ifdef VM_LITTLE_ENDIAN 1.319 + __ vspltisb (fSplt, 0x0f); 1.320 __ vxor (fromPerm, fromPerm, fSplt); 1.321 +#endif 1.322 __ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE] 1.323 1.324 // load keylen (44 or 52 or 60) 1.325 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key); 1.326 1.327 // to load keys 1.328 - __ lvsr (keyPerm, key); 1.329 + __ load_perm (keyPerm, key); 1.330 +#ifdef VM_LITTLE_ENDIAN 1.331 __ vxor (vTmp2, vTmp2, vTmp2); 1.332 __ vspltisb (vTmp2, -16); 1.333 __ vrld (keyPerm, keyPerm, vTmp2); 1.334 __ vrld (keyPerm, keyPerm, vTmp2); 1.335 __ vsldoi (keyPerm, keyPerm, keyPerm, 8); 1.336 +#endif 1.337 1.338 __ cmpwi (CCR0, keylen, 44); 1.339 __ beq (CCR0, L_do44); 1.340 @@ -2494,32 +2498,32 @@ 1.341 __ cmpwi (CCR0, keylen, 52); 1.342 __ beq (CCR0, L_do52); 1.343 1.344 - // load the 15th round key to vKey11 1.345 + // load the 15th round key to vKey1 1.346 __ li (keypos, 240); 1.347 + __ lvx (vKey1, keypos, key); 1.348 + __ li (keypos, 224); 1.349 + __ lvx (vKey2, keypos, key); 1.350 + __ vec_perm (vKey1, vKey2, vKey1, keyPerm); 1.351 + 1.352 + // load the 14th round key to vKey2 1.353 + __ li (keypos, 208); 1.354 + __ lvx (vKey3, keypos, key); 1.355 + __ vec_perm (vKey2, vKey3, vKey2, keyPerm); 1.356 + 1.357 + // load the 13th round key to vKey3 1.358 + __ li (keypos, 192); 1.359 + __ lvx (vKey4, keypos, key); 1.360 + __ vec_perm (vKey3, vKey4, vKey3, keyPerm); 1.361 + 1.362 + // load the 12th round key to vKey4 1.363 + __ li (keypos, 176); 1.364 + __ lvx (vKey5, keypos, key); 1.365 + __ vec_perm (vKey4, vKey5, vKey4, keyPerm); 1.366 + 1.367 + // load the 11th round key to vKey5 1.368 + __ li (keypos, 160); 1.369 __ lvx (vTmp1, keypos, key); 1.370 - __ addi (keypos, keypos, -16); 1.371 - __ lvx (vTmp2, keypos, key); 1.372 - __ vperm (vKey1, vTmp1, vTmp2, keyPerm); 1.373 - 1.374 - // load the 14th round key to vKey10 1.375 - __ addi (keypos, keypos, -16); 1.376 - __ lvx (vTmp1, keypos, key); 1.377 - __ vperm (vKey2, vTmp2, vTmp1, keyPerm); 1.378 - 1.379 - // load the 13th round key to vKey10 1.380 - __ addi (keypos, keypos, -16); 1.381 - __ lvx (vTmp2, keypos, key); 1.382 - __ vperm (vKey3, vTmp1, vTmp2, keyPerm); 1.383 - 1.384 - // load the 12th round key to vKey10 1.385 - __ addi (keypos, keypos, -16); 1.386 - __ lvx (vTmp1, keypos, key); 1.387 - __ vperm (vKey4, vTmp2, vTmp1, keyPerm); 1.388 - 1.389 - // load the 11th round key to vKey10 1.390 - __ addi (keypos, keypos, -16); 1.391 - __ lvx (vTmp2, keypos, key); 1.392 - __ vperm (vKey5, vTmp1, vTmp2, keyPerm); 1.393 + __ vec_perm (vKey5, vTmp1, vKey5, keyPerm); 1.394 1.395 // 1st - 5th rounds 1.396 __ vxor (vRet, vRet, vKey1); 1.397 @@ -2532,22 +2536,22 @@ 1.398 1.399 __ bind (L_do52); 1.400 1.401 - // load the 13th round key to vKey11 1.402 + // load the 13th round key to vKey1 1.403 __ li (keypos, 208); 1.404 + __ lvx (vKey1, keypos, key); 1.405 + __ li (keypos, 192); 1.406 + __ lvx (vKey2, keypos, key); 1.407 + __ vec_perm (vKey1, vKey2, vKey1, keyPerm); 1.408 + 1.409 + // load the 12th round key to vKey2 1.410 + __ li (keypos, 176); 1.411 + __ lvx (vKey3, keypos, key); 1.412 + __ vec_perm (vKey2, vKey3, vKey2, keyPerm); 1.413 + 1.414 + // load the 11th round key to vKey3 1.415 + __ li (keypos, 160); 1.416 __ lvx (vTmp1, keypos, key); 1.417 - __ addi (keypos, keypos, -16); 1.418 - __ lvx (vTmp2, keypos, key); 1.419 - __ vperm (vKey1, vTmp1, vTmp2, keyPerm); 1.420 - 1.421 - // load the 12th round key to vKey10 1.422 - __ addi (keypos, keypos, -16); 1.423 - __ lvx (vTmp1, keypos, key); 1.424 - __ vperm (vKey2, vTmp2, vTmp1, keyPerm); 1.425 - 1.426 - // load the 11th round key to vKey10 1.427 - __ addi (keypos, keypos, -16); 1.428 - __ lvx (vTmp2, keypos, key); 1.429 - __ vperm (vKey3, vTmp1, vTmp2, keyPerm); 1.430 + __ vec_perm (vKey3, vTmp1, vKey3, keyPerm); 1.431 1.432 // 1st - 3rd rounds 1.433 __ vxor (vRet, vRet, vKey1); 1.434 @@ -2558,42 +2562,42 @@ 1.435 1.436 __ bind (L_do44); 1.437 1.438 - // load the 11th round key to vKey11 1.439 + // load the 11th round key to vKey1 1.440 __ li (keypos, 176); 1.441 + __ lvx (vKey1, keypos, key); 1.442 + __ li (keypos, 160); 1.443 __ lvx (vTmp1, keypos, key); 1.444 - __ addi (keypos, keypos, -16); 1.445 - __ lvx (vTmp2, keypos, key); 1.446 - __ vperm (vKey1, vTmp1, vTmp2, keyPerm); 1.447 + __ vec_perm (vKey1, vTmp1, vKey1, keyPerm); 1.448 1.449 // 1st round 1.450 __ vxor (vRet, vRet, vKey1); 1.451 1.452 __ bind (L_doLast); 1.453 1.454 - // load the 10th round key to vKey10 1.455 - __ addi (keypos, keypos, -16); 1.456 + // load the 10th round key to vKey1 1.457 + __ li (keypos, 144); 1.458 + __ lvx (vKey2, keypos, key); 1.459 + __ vec_perm (vKey1, vKey2, vTmp1, keyPerm); 1.460 + 1.461 + // load the 9th round key to vKey2 1.462 + __ li (keypos, 128); 1.463 + __ lvx (vKey3, keypos, key); 1.464 + __ vec_perm (vKey2, vKey3, vKey2, keyPerm); 1.465 + 1.466 + // load the 8th round key to vKey3 1.467 + __ li (keypos, 112); 1.468 + __ lvx (vKey4, keypos, key); 1.469 + __ vec_perm (vKey3, vKey4, vKey3, keyPerm); 1.470 + 1.471 + // load the 7th round key to vKey4 1.472 + __ li (keypos, 96); 1.473 + __ lvx (vKey5, keypos, key); 1.474 + __ vec_perm (vKey4, vKey5, vKey4, keyPerm); 1.475 + 1.476 + // load the 6th round key to vKey5 1.477 + __ li (keypos, 80); 1.478 __ lvx (vTmp1, keypos, key); 1.479 - __ vperm (vKey1, vTmp2, vTmp1, keyPerm); 1.480 - 1.481 - // load the 9th round key to vKey10 1.482 - __ addi (keypos, keypos, -16); 1.483 - __ lvx (vTmp2, keypos, key); 1.484 - __ vperm (vKey2, vTmp1, vTmp2, keyPerm); 1.485 - 1.486 - // load the 8th round key to vKey10 1.487 - __ addi (keypos, keypos, -16); 1.488 - __ lvx (vTmp1, keypos, key); 1.489 - __ vperm (vKey3, vTmp2, vTmp1, keyPerm); 1.490 - 1.491 - // load the 7th round key to vKey10 1.492 - __ addi (keypos, keypos, -16); 1.493 - __ lvx (vTmp2, keypos, key); 1.494 - __ vperm (vKey4, vTmp1, vTmp2, keyPerm); 1.495 - 1.496 - // load the 6th round key to vKey10 1.497 - __ addi (keypos, keypos, -16); 1.498 - __ lvx (vTmp1, keypos, key); 1.499 - __ vperm (vKey5, vTmp2, vTmp1, keyPerm); 1.500 + __ vec_perm (vKey5, vTmp1, vKey5, keyPerm); 1.501 1.502 // last 10th - 6th rounds 1.503 __ vncipher (vRet, vRet, vKey1); 1.504 @@ -2602,30 +2606,29 @@ 1.505 __ vncipher (vRet, vRet, vKey4); 1.506 __ vncipher (vRet, vRet, vKey5); 1.507 1.508 - // load the 5th round key to vKey10 1.509 - __ addi (keypos, keypos, -16); 1.510 - __ lvx (vTmp2, keypos, key); 1.511 - __ vperm (vKey1, vTmp1, vTmp2, keyPerm); 1.512 - 1.513 - // load the 4th round key to vKey10 1.514 - __ addi (keypos, keypos, -16); 1.515 - __ lvx (vTmp1, keypos, key); 1.516 - __ vperm (vKey2, vTmp2, vTmp1, keyPerm); 1.517 - 1.518 - // load the 3rd round key to vKey10 1.519 - __ addi (keypos, keypos, -16); 1.520 - __ lvx (vTmp2, keypos, key); 1.521 - __ vperm (vKey3, vTmp1, vTmp2, keyPerm); 1.522 - 1.523 - // load the 2nd round key to vKey10 1.524 - __ addi (keypos, keypos, -16); 1.525 - __ lvx (vTmp1, keypos, key); 1.526 - __ vperm (vKey4, vTmp2, vTmp1, keyPerm); 1.527 - 1.528 - // load the 1st round key to vKey10 1.529 - __ addi (keypos, keypos, -16); 1.530 - __ lvx (vTmp2, keypos, key); 1.531 - __ vperm (vKey5, vTmp1, vTmp2, keyPerm); 1.532 + // load the 5th round key to vKey1 1.533 + __ li (keypos, 64); 1.534 + __ lvx (vKey2, keypos, key); 1.535 + __ vec_perm (vKey1, vKey2, vTmp1, keyPerm); 1.536 + 1.537 + // load the 4th round key to vKey2 1.538 + __ li (keypos, 48); 1.539 + __ lvx (vKey3, keypos, key); 1.540 + __ vec_perm (vKey2, vKey3, vKey2, keyPerm); 1.541 + 1.542 + // load the 3rd round key to vKey3 1.543 + __ li (keypos, 32); 1.544 + __ lvx (vKey4, keypos, key); 1.545 + __ vec_perm (vKey3, vKey4, vKey3, keyPerm); 1.546 + 1.547 + // load the 2nd round key to vKey4 1.548 + __ li (keypos, 16); 1.549 + __ lvx (vKey5, keypos, key); 1.550 + __ vec_perm (vKey4, vKey5, vKey4, keyPerm); 1.551 + 1.552 + // load the 1st round key to vKey5 1.553 + __ lvx (vTmp1, key); 1.554 + __ vec_perm (vKey5, vTmp1, vKey5, keyPerm); 1.555 1.556 // last 5th - 1th rounds 1.557 __ vncipher (vRet, vRet, vKey1); 1.558 @@ -2634,19 +2637,27 @@ 1.559 __ vncipher (vRet, vRet, vKey4); 1.560 __ vncipherlast (vRet, vRet, vKey5); 1.561 1.562 - __ neg (temp, to); 1.563 - __ lvsr (toPerm, temp); 1.564 - __ vspltisb (vTmp2, -1); 1.565 - __ vxor (vTmp1, vTmp1, vTmp1); 1.566 - __ vperm (vTmp2, vTmp2, vTmp1, toPerm); 1.567 - __ vxor (toPerm, toPerm, fSplt); 1.568 + // store result (unaligned) 1.569 +#ifdef VM_LITTLE_ENDIAN 1.570 + __ lvsl (toPerm, to); 1.571 +#else 1.572 + __ lvsr (toPerm, to); 1.573 +#endif 1.574 + __ vspltisb (vTmp3, -1); 1.575 + __ vspltisb (vTmp4, 0); 1.576 __ lvx (vTmp1, to); 1.577 - __ vperm (vRet, vRet, vRet, toPerm); 1.578 - __ vsel (vTmp1, vTmp1, vRet, vTmp2); 1.579 - __ lvx (vTmp4, fifteen, to); 1.580 + __ lvx (vTmp2, fifteen, to); 1.581 +#ifdef VM_LITTLE_ENDIAN 1.582 + __ vperm (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask 1.583 + __ vxor (toPerm, toPerm, fSplt); // swap bytes 1.584 +#else 1.585 + __ vperm (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask 1.586 +#endif 1.587 + __ vperm (vTmp4, vRet, vRet, toPerm); // rotate data 1.588 + __ vsel (vTmp2, vTmp4, vTmp2, vTmp3); 1.589 + __ vsel (vTmp1, vTmp1, vTmp4, vTmp3); 1.590 + __ stvx (vTmp2, fifteen, to); // store this one first (may alias) 1.591 __ stvx (vTmp1, to); 1.592 - __ vsel (vRet, vRet, vTmp4, vTmp2); 1.593 - __ stvx (vRet, fifteen, to); 1.594 1.595 __ blr(); 1.596 return start;