src/cpu/ppc/vm/stubGenerator_ppc.cpp

changeset 9730:42118db355f5
parent    9713:c4567d28f31f
child     9756:2be326848943
     1.1 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Tue Jan 29 08:28:24 2019 -0500
     1.2 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Sun Aug 11 19:11:08 2019 -0400
     1.3 @@ -2224,7 +2224,7 @@
     1.4      return start;
     1.5    }
     1.6  
     1.7 -  // Arguments for generated stub (little endian only):
     1.8 +  // Arguments for generated stub:
     1.9    //   R3_ARG1   - source byte array address
    1.10    //   R4_ARG2   - destination byte array address
    1.11    //   R5_ARG3   - round key array
    1.12 @@ -2243,7 +2243,6 @@
    1.13      Register keylen         = R8;
    1.14      Register temp           = R9;
    1.15      Register keypos         = R10;
    1.16 -    Register hex            = R11;
    1.17      Register fifteen        = R12;
    1.18  
    1.19      VectorRegister vRet     = VR0;
    1.20 @@ -2263,164 +2262,170 @@
    1.21      VectorRegister vTmp3    = VR11;
    1.22      VectorRegister vTmp4    = VR12;
    1.23  
    1.24 -    VectorRegister vLow     = VR13;
    1.25 -    VectorRegister vHigh    = VR14;
    1.26 -
    1.27 -    __ li              (hex, 16);
    1.28      __ li              (fifteen, 15);
    1.29 -    __ vspltisb        (fSplt, 0x0f);
    1.30  
    1.31      // load unaligned from[0-15] to vsRet
    1.32      __ lvx             (vRet, from);
    1.33      __ lvx             (vTmp1, fifteen, from);
    1.34      __ lvsl            (fromPerm, from);
    1.35 +#ifdef VM_LITTLE_ENDIAN
    1.36 +    __ vspltisb        (fSplt, 0x0f);
    1.37      __ vxor            (fromPerm, fromPerm, fSplt);
    1.38 +#endif
    1.39      __ vperm           (vRet, vRet, vTmp1, fromPerm);
    1.40  
    1.41      // load keylen (44 or 52 or 60)
    1.42      __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
    1.43  
    1.44      // to load keys
    1.45 -    __ lvsr            (keyPerm, key);
    1.46 -    __ vxor            (vTmp2, vTmp2, vTmp2);
    1.47 +    __ load_perm       (keyPerm, key);
    1.48 +#ifdef VM_LITTLE_ENDIAN
    1.49      __ vspltisb        (vTmp2, -16);
    1.50      __ vrld            (keyPerm, keyPerm, vTmp2);
    1.51      __ vrld            (keyPerm, keyPerm, vTmp2);
    1.52      __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
    1.53 -
    1.54 -    // load the 1st round key to vKey1
    1.55 -    __ li              (keypos, 0);
    1.56 +#endif
    1.57 +
    1.58 +    // load the 1st round key to vTmp1
    1.59 +    __ lvx             (vTmp1, key);
    1.60 +    __ li              (keypos, 16);
    1.61      __ lvx             (vKey1, keypos, key);
    1.62 -    __ addi            (keypos, keypos, 16);
    1.63 +    __ vec_perm        (vTmp1, vKey1, keyPerm);
    1.64 +
    1.65 +    // 1st round
    1.66 +    __ vxor            (vRet, vRet, vTmp1);
    1.67 +
    1.68 +    // load the 2nd round key to vKey1
    1.69 +    __ li              (keypos, 32);
    1.70 +    __ lvx             (vKey2, keypos, key);
    1.71 +    __ vec_perm        (vKey1, vKey2, keyPerm);
    1.72 +
    1.73 +    // load the 3rd round key to vKey2
    1.74 +    __ li              (keypos, 48);
    1.75 +    __ lvx             (vKey3, keypos, key);
    1.76 +    __ vec_perm        (vKey2, vKey3, keyPerm);
    1.77 +
    1.78 +    // load the 4th round key to vKey3
    1.79 +    __ li              (keypos, 64);
    1.80 +    __ lvx             (vKey4, keypos, key);
    1.81 +    __ vec_perm        (vKey3, vKey4, keyPerm);
    1.82 +
    1.83 +    // load the 5th round key to vKey4
    1.84 +    __ li              (keypos, 80);
    1.85      __ lvx             (vTmp1, keypos, key);
    1.86 -    __ vperm           (vKey1, vTmp1, vKey1, keyPerm);
    1.87 -
    1.88 -    // 1st round
    1.89 -    __ vxor (vRet, vRet, vKey1);
    1.90 -
    1.91 -    // load the 2nd round key to vKey1
    1.92 -    __ addi            (keypos, keypos, 16);
    1.93 -    __ lvx             (vTmp2, keypos, key);
    1.94 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
    1.95 -
    1.96 -    // load the 3rd round key to vKey2
    1.97 -    __ addi            (keypos, keypos, 16);
    1.98 +    __ vec_perm        (vKey4, vTmp1, keyPerm);
    1.99 +
   1.100 +    // 2nd - 5th rounds
   1.101 +    __ vcipher         (vRet, vRet, vKey1);
   1.102 +    __ vcipher         (vRet, vRet, vKey2);
   1.103 +    __ vcipher         (vRet, vRet, vKey3);
   1.104 +    __ vcipher         (vRet, vRet, vKey4);
   1.105 +
   1.106 +    // load the 6th round key to vKey1
   1.107 +    __ li              (keypos, 96);
   1.108 +    __ lvx             (vKey2, keypos, key);
   1.109 +    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
   1.110 +
   1.111 +    // load the 7th round key to vKey2
   1.112 +    __ li              (keypos, 112);
   1.113 +    __ lvx             (vKey3, keypos, key);
   1.114 +    __ vec_perm        (vKey2, vKey3, keyPerm);
   1.115 +
   1.116 +    // load the 8th round key to vKey3
   1.117 +    __ li              (keypos, 128);
   1.118 +    __ lvx             (vKey4, keypos, key);
   1.119 +    __ vec_perm        (vKey3, vKey4, keyPerm);
   1.120 +
   1.121 +    // load the 9th round key to vKey4
   1.122 +    __ li              (keypos, 144);
   1.123      __ lvx             (vTmp1, keypos, key);
   1.124 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   1.125 -
   1.126 -    // load the 4th round key to vKey3
   1.127 -    __ addi            (keypos, keypos, 16);
   1.128 -    __ lvx             (vTmp2, keypos, key);
   1.129 -    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
   1.130 -
   1.131 -    // load the 5th round key to vKey4
   1.132 -    __ addi            (keypos, keypos, 16);
   1.133 +    __ vec_perm        (vKey4, vTmp1, keyPerm);
   1.134 +
   1.135 +    // 6th - 9th rounds
   1.136 +    __ vcipher         (vRet, vRet, vKey1);
   1.137 +    __ vcipher         (vRet, vRet, vKey2);
   1.138 +    __ vcipher         (vRet, vRet, vKey3);
   1.139 +    __ vcipher         (vRet, vRet, vKey4);
   1.140 +
   1.141 +    // load the 10th round key to vKey1
   1.142 +    __ li              (keypos, 160);
   1.143 +    __ lvx             (vKey2, keypos, key);
   1.144 +    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
   1.145 +
   1.146 +    // load the 11th round key to vKey2
   1.147 +    __ li              (keypos, 176);
   1.148      __ lvx             (vTmp1, keypos, key);
   1.149 -    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
   1.150 -
   1.151 -    // 2nd - 5th rounds
   1.152 -    __ vcipher (vRet, vRet, vKey1);
   1.153 -    __ vcipher (vRet, vRet, vKey2);
   1.154 -    __ vcipher (vRet, vRet, vKey3);
   1.155 -    __ vcipher (vRet, vRet, vKey4);
   1.156 -
   1.157 -    // load the 6th round key to vKey1
   1.158 -    __ addi            (keypos, keypos, 16);
   1.159 -    __ lvx             (vTmp2, keypos, key);
   1.160 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
   1.161 -
   1.162 -    // load the 7th round key to vKey2
   1.163 -    __ addi            (keypos, keypos, 16);
   1.164 -    __ lvx             (vTmp1, keypos, key);
   1.165 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   1.166 -
   1.167 -    // load the 8th round key to vKey3
   1.168 -    __ addi            (keypos, keypos, 16);
   1.169 -    __ lvx             (vTmp2, keypos, key);
   1.170 -    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
   1.171 -
   1.172 -    // load the 9th round key to vKey4
   1.173 -    __ addi            (keypos, keypos, 16);
   1.174 -    __ lvx             (vTmp1, keypos, key);
   1.175 -    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
   1.176 -
   1.177 -    // 6th - 9th rounds
   1.178 -    __ vcipher (vRet, vRet, vKey1);
   1.179 -    __ vcipher (vRet, vRet, vKey2);
   1.180 -    __ vcipher (vRet, vRet, vKey3);
   1.181 -    __ vcipher (vRet, vRet, vKey4);
   1.182 -
   1.183 -    // load the 10th round key to vKey1
   1.184 -    __ addi            (keypos, keypos, 16);
   1.185 -    __ lvx             (vTmp2, keypos, key);
   1.186 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
   1.187 -
   1.188 -    // load the 11th round key to vKey2
   1.189 -    __ addi            (keypos, keypos, 16);
   1.190 -    __ lvx             (vTmp1, keypos, key);
   1.191 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   1.192 +    __ vec_perm        (vKey2, vTmp1, keyPerm);
   1.193  
   1.194      // if all round keys are loaded, skip next 4 rounds
   1.195      __ cmpwi           (CCR0, keylen, 44);
   1.196      __ beq             (CCR0, L_doLast);
   1.197  
   1.198      // 10th - 11th rounds
   1.199 -    __ vcipher (vRet, vRet, vKey1);
   1.200 -    __ vcipher (vRet, vRet, vKey2);
   1.201 +    __ vcipher         (vRet, vRet, vKey1);
   1.202 +    __ vcipher         (vRet, vRet, vKey2);
   1.203  
   1.204      // load the 12th round key to vKey1
   1.205 -    __ addi            (keypos, keypos, 16);
   1.206 -    __ lvx             (vTmp2, keypos, key);
   1.207 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
   1.208 +    __ li              (keypos, 192);
   1.209 +    __ lvx             (vKey2, keypos, key);
   1.210 +    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
   1.211  
   1.212      // load the 13th round key to vKey2
   1.213 -    __ addi            (keypos, keypos, 16);
   1.214 +    __ li              (keypos, 208);
   1.215      __ lvx             (vTmp1, keypos, key);
   1.216 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   1.217 +    __ vec_perm        (vKey2, vTmp1, keyPerm);
   1.218  
   1.219      // if all round keys are loaded, skip next 2 rounds
   1.220      __ cmpwi           (CCR0, keylen, 52);
   1.221      __ beq             (CCR0, L_doLast);
   1.222  
   1.223      // 12th - 13th rounds
   1.224 -    __ vcipher (vRet, vRet, vKey1);
   1.225 -    __ vcipher (vRet, vRet, vKey2);
   1.226 +    __ vcipher         (vRet, vRet, vKey1);
   1.227 +    __ vcipher         (vRet, vRet, vKey2);
   1.228  
   1.229      // load the 14th round key to vKey1
   1.230 -    __ addi            (keypos, keypos, 16);
   1.231 -    __ lvx             (vTmp2, keypos, key);
   1.232 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
   1.233 +    __ li              (keypos, 224);
   1.234 +    __ lvx             (vKey2, keypos, key);
   1.235 +    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
   1.236  
   1.237      // load the 15th round key to vKey2
   1.238 -    __ addi            (keypos, keypos, 16);
   1.239 +    __ li              (keypos, 240);
   1.240      __ lvx             (vTmp1, keypos, key);
   1.241 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   1.242 +    __ vec_perm        (vKey2, vTmp1, keyPerm);
   1.243  
   1.244      __ bind(L_doLast);
   1.245  
   1.246      // last two rounds
   1.247 -    __ vcipher (vRet, vRet, vKey1);
   1.248 -    __ vcipherlast (vRet, vRet, vKey2);
   1.249 -
   1.250 -    __ neg             (temp, to);
   1.251 -    __ lvsr            (toPerm, temp);
   1.252 -    __ vspltisb        (vTmp2, -1);
   1.253 -    __ vxor            (vTmp1, vTmp1, vTmp1);
   1.254 -    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
   1.255 -    __ vxor            (toPerm, toPerm, fSplt);
   1.256 +    __ vcipher         (vRet, vRet, vKey1);
   1.257 +    __ vcipherlast     (vRet, vRet, vKey2);
   1.258 +
   1.259 +    // store result (unaligned)
   1.260 +#ifdef VM_LITTLE_ENDIAN
   1.261 +    __ lvsl            (toPerm, to);
   1.262 +#else
   1.263 +    __ lvsr            (toPerm, to);
   1.264 +#endif
   1.265 +    __ vspltisb        (vTmp3, -1);
   1.266 +    __ vspltisb        (vTmp4, 0);
   1.267      __ lvx             (vTmp1, to);
   1.268 -    __ vperm           (vRet, vRet, vRet, toPerm);
   1.269 -    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
   1.270 -    __ lvx             (vTmp4, fifteen, to);
   1.271 +    __ lvx             (vTmp2, fifteen, to);
   1.272 +#ifdef VM_LITTLE_ENDIAN
   1.273 +    __ vperm           (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
   1.274 +    __ vxor            (toPerm, toPerm, fSplt);       // swap bytes
   1.275 +#else
   1.276 +    __ vperm           (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
   1.277 +#endif
   1.278 +    __ vperm           (vTmp4, vRet, vRet, toPerm);   // rotate data
   1.279 +    __ vsel            (vTmp2, vTmp4, vTmp2, vTmp3);
   1.280 +    __ vsel            (vTmp1, vTmp1, vTmp4, vTmp3);
   1.281 +    __ stvx            (vTmp2, fifteen, to);          // store this one first (may alias)
   1.282      __ stvx            (vTmp1, to);
   1.283 -    __ vsel            (vRet, vRet, vTmp4, vTmp2);
   1.284 -    __ stvx            (vRet, fifteen, to);
   1.285  
   1.286      __ blr();
   1.287       return start;
   1.288    }
   1.289  
   1.290 -  // Arguments for generated stub (little endian only):
   1.291 +  // Arguments for generated stub:
   1.292    //   R3_ARG1   - source byte array address
   1.293    //   R4_ARG2   - destination byte array address
   1.294    //   R5_ARG3   - K (key) in little endian int array
   1.295 @@ -2442,7 +2447,6 @@
   1.296      Register keylen         = R8;
   1.297      Register temp           = R9;
   1.298      Register keypos         = R10;
   1.299 -    Register hex            = R11;
   1.300      Register fifteen        = R12;
   1.301  
   1.302      VectorRegister vRet     = VR0;
   1.303 @@ -2463,30 +2467,30 @@
   1.304      VectorRegister vTmp3    = VR12;
   1.305      VectorRegister vTmp4    = VR13;
   1.306  
   1.307 -    VectorRegister vLow     = VR14;
   1.308 -    VectorRegister vHigh    = VR15;
   1.309 -
   1.310 -    __ li              (hex, 16);
   1.311      __ li              (fifteen, 15);
   1.312 -    __ vspltisb        (fSplt, 0x0f);
   1.313  
   1.314      // load unaligned from[0-15] to vsRet
   1.315      __ lvx             (vRet, from);
   1.316      __ lvx             (vTmp1, fifteen, from);
   1.317      __ lvsl            (fromPerm, from);
   1.318 +#ifdef VM_LITTLE_ENDIAN
   1.319 +    __ vspltisb        (fSplt, 0x0f);
   1.320      __ vxor            (fromPerm, fromPerm, fSplt);
   1.321 +#endif
   1.322      __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
   1.323  
   1.324      // load keylen (44 or 52 or 60)
   1.325      __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
   1.326  
   1.327      // to load keys
   1.328 -    __ lvsr            (keyPerm, key);
   1.329 +    __ load_perm       (keyPerm, key);
   1.330 +#ifdef VM_LITTLE_ENDIAN
   1.331      __ vxor            (vTmp2, vTmp2, vTmp2);
   1.332      __ vspltisb        (vTmp2, -16);
   1.333      __ vrld            (keyPerm, keyPerm, vTmp2);
   1.334      __ vrld            (keyPerm, keyPerm, vTmp2);
   1.335      __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
   1.336 +#endif
   1.337  
   1.338      __ cmpwi           (CCR0, keylen, 44);
   1.339      __ beq             (CCR0, L_do44);
   1.340 @@ -2494,32 +2498,32 @@
   1.341      __ cmpwi           (CCR0, keylen, 52);
   1.342      __ beq             (CCR0, L_do52);
   1.343  
   1.344 -    // load the 15th round key to vKey11
   1.345 +    // load the 15th round key to vKey1
   1.346      __ li              (keypos, 240);
   1.347 +    __ lvx             (vKey1, keypos, key);
   1.348 +    __ li              (keypos, 224);
   1.349 +    __ lvx             (vKey2, keypos, key);
   1.350 +    __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
   1.351 +
   1.352 +    // load the 14th round key to vKey2
   1.353 +    __ li              (keypos, 208);
   1.354 +    __ lvx             (vKey3, keypos, key);
   1.355 +    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
   1.356 +
   1.357 +    // load the 13th round key to vKey3
   1.358 +    __ li              (keypos, 192);
   1.359 +    __ lvx             (vKey4, keypos, key);
   1.360 +    __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
   1.361 +
   1.362 +    // load the 12th round key to vKey4
   1.363 +    __ li              (keypos, 176);
   1.364 +    __ lvx             (vKey5, keypos, key);
   1.365 +    __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
   1.366 +
   1.367 +    // load the 11th round key to vKey5
   1.368 +    __ li              (keypos, 160);
   1.369      __ lvx             (vTmp1, keypos, key);
   1.370 -    __ addi            (keypos, keypos, -16);
   1.371 -    __ lvx             (vTmp2, keypos, key);
   1.372 -    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
   1.373 -
   1.374 -    // load the 14th round key to vKey10
   1.375 -    __ addi            (keypos, keypos, -16);
   1.376 -    __ lvx             (vTmp1, keypos, key);
   1.377 -    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
   1.378 -
   1.379 -    // load the 13th round key to vKey10
   1.380 -    __ addi            (keypos, keypos, -16);
   1.381 -    __ lvx             (vTmp2, keypos, key);
   1.382 -    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
   1.383 -
   1.384 -    // load the 12th round key to vKey10
   1.385 -    __ addi            (keypos, keypos, -16);
   1.386 -    __ lvx             (vTmp1, keypos, key);
   1.387 -    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
   1.388 -
   1.389 -    // load the 11th round key to vKey10
   1.390 -    __ addi            (keypos, keypos, -16);
   1.391 -    __ lvx             (vTmp2, keypos, key);
   1.392 -    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
   1.393 +    __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
   1.394  
   1.395      // 1st - 5th rounds
   1.396      __ vxor            (vRet, vRet, vKey1);
   1.397 @@ -2532,22 +2536,22 @@
   1.398  
   1.399      __ bind            (L_do52);
   1.400  
   1.401 -    // load the 13th round key to vKey11
   1.402 +    // load the 13th round key to vKey1
   1.403      __ li              (keypos, 208);
   1.404 +    __ lvx             (vKey1, keypos, key);
   1.405 +    __ li              (keypos, 192);
   1.406 +    __ lvx             (vKey2, keypos, key);
   1.407 +    __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
   1.408 +
   1.409 +    // load the 12th round key to vKey2
   1.410 +    __ li              (keypos, 176);
   1.411 +    __ lvx             (vKey3, keypos, key);
   1.412 +    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
   1.413 +
   1.414 +    // load the 11th round key to vKey3
   1.415 +    __ li              (keypos, 160);
   1.416      __ lvx             (vTmp1, keypos, key);
   1.417 -    __ addi            (keypos, keypos, -16);
   1.418 -    __ lvx             (vTmp2, keypos, key);
   1.419 -    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
   1.420 -
   1.421 -    // load the 12th round key to vKey10
   1.422 -    __ addi            (keypos, keypos, -16);
   1.423 -    __ lvx             (vTmp1, keypos, key);
   1.424 -    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
   1.425 -
   1.426 -    // load the 11th round key to vKey10
   1.427 -    __ addi            (keypos, keypos, -16);
   1.428 -    __ lvx             (vTmp2, keypos, key);
   1.429 -    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
   1.430 +    __ vec_perm        (vKey3, vTmp1, vKey3, keyPerm);
   1.431  
   1.432      // 1st - 3rd rounds
   1.433      __ vxor            (vRet, vRet, vKey1);
   1.434 @@ -2558,42 +2562,42 @@
   1.435  
   1.436      __ bind            (L_do44);
   1.437  
   1.438 -    // load the 11th round key to vKey11
   1.439 +    // load the 11th round key to vKey1
   1.440      __ li              (keypos, 176);
   1.441 +    __ lvx             (vKey1, keypos, key);
   1.442 +    __ li              (keypos, 160);
   1.443      __ lvx             (vTmp1, keypos, key);
   1.444 -    __ addi            (keypos, keypos, -16);
   1.445 -    __ lvx             (vTmp2, keypos, key);
   1.446 -    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
   1.447 +    __ vec_perm        (vKey1, vTmp1, vKey1, keyPerm);
   1.448  
   1.449      // 1st round
   1.450      __ vxor            (vRet, vRet, vKey1);
   1.451  
   1.452      __ bind            (L_doLast);
   1.453  
   1.454 -    // load the 10th round key to vKey10
   1.455 -    __ addi            (keypos, keypos, -16);
   1.456 +    // load the 10th round key to vKey1
   1.457 +    __ li              (keypos, 144);
   1.458 +    __ lvx             (vKey2, keypos, key);
   1.459 +    __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
   1.460 +
   1.461 +    // load the 9th round key to vKey2
   1.462 +    __ li              (keypos, 128);
   1.463 +    __ lvx             (vKey3, keypos, key);
   1.464 +    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
   1.465 +
   1.466 +    // load the 8th round key to vKey3
   1.467 +    __ li              (keypos, 112);
   1.468 +    __ lvx             (vKey4, keypos, key);
   1.469 +    __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
   1.470 +
   1.471 +    // load the 7th round key to vKey4
   1.472 +    __ li              (keypos, 96);
   1.473 +    __ lvx             (vKey5, keypos, key);
   1.474 +    __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
   1.475 +
   1.476 +    // load the 6th round key to vKey5
   1.477 +    __ li              (keypos, 80);
   1.478      __ lvx             (vTmp1, keypos, key);
   1.479 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
   1.480 -
   1.481 -    // load the 9th round key to vKey10
   1.482 -    __ addi            (keypos, keypos, -16);
   1.483 -    __ lvx             (vTmp2, keypos, key);
   1.484 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   1.485 -
   1.486 -    // load the 8th round key to vKey10
   1.487 -    __ addi            (keypos, keypos, -16);
   1.488 -    __ lvx             (vTmp1, keypos, key);
   1.489 -    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
   1.490 -
   1.491 -    // load the 7th round key to vKey10
   1.492 -    __ addi            (keypos, keypos, -16);
   1.493 -    __ lvx             (vTmp2, keypos, key);
   1.494 -    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
   1.495 -
   1.496 -    // load the 6th round key to vKey10
   1.497 -    __ addi            (keypos, keypos, -16);
   1.498 -    __ lvx             (vTmp1, keypos, key);
   1.499 -    __ vperm           (vKey5, vTmp2, vTmp1, keyPerm);
   1.500 +    __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
   1.501  
   1.502      // last 10th - 6th rounds
   1.503      __ vncipher        (vRet, vRet, vKey1);
   1.504 @@ -2602,30 +2606,29 @@
   1.505      __ vncipher        (vRet, vRet, vKey4);
   1.506      __ vncipher        (vRet, vRet, vKey5);
   1.507  
   1.508 -    // load the 5th round key to vKey10
   1.509 -    __ addi            (keypos, keypos, -16);
   1.510 -    __ lvx             (vTmp2, keypos, key);
   1.511 -    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
   1.512 -
   1.513 -    // load the 4th round key to vKey10
   1.514 -    __ addi            (keypos, keypos, -16);
   1.515 -    __ lvx             (vTmp1, keypos, key);
   1.516 -    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
   1.517 -
   1.518 -    // load the 3rd round key to vKey10
   1.519 -    __ addi            (keypos, keypos, -16);
   1.520 -    __ lvx             (vTmp2, keypos, key);
   1.521 -    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
   1.522 -
   1.523 -    // load the 2nd round key to vKey10
   1.524 -    __ addi            (keypos, keypos, -16);
   1.525 -    __ lvx             (vTmp1, keypos, key);
   1.526 -    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
   1.527 -
   1.528 -    // load the 1st round key to vKey10
   1.529 -    __ addi            (keypos, keypos, -16);
   1.530 -    __ lvx             (vTmp2, keypos, key);
   1.531 -    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
   1.532 +    // load the 5th round key to vKey1
   1.533 +    __ li              (keypos, 64);
   1.534 +    __ lvx             (vKey2, keypos, key);
   1.535 +    __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
   1.536 +
   1.537 +    // load the 4th round key to vKey2
   1.538 +    __ li              (keypos, 48);
   1.539 +    __ lvx             (vKey3, keypos, key);
   1.540 +    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
   1.541 +
   1.542 +    // load the 3rd round key to vKey3
   1.543 +    __ li              (keypos, 32);
   1.544 +    __ lvx             (vKey4, keypos, key);
   1.545 +    __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
   1.546 +
   1.547 +    // load the 2nd round key to vKey4
   1.548 +    __ li              (keypos, 16);
   1.549 +    __ lvx             (vKey5, keypos, key);
   1.550 +    __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
   1.551 +
   1.552 +    // load the 1st round key to vKey5
   1.553 +    __ lvx             (vTmp1, key);
   1.554 +    __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
   1.555  
   1.556      // last 5th - 1th rounds
   1.557      __ vncipher        (vRet, vRet, vKey1);
   1.558 @@ -2634,19 +2637,27 @@
   1.559      __ vncipher        (vRet, vRet, vKey4);
   1.560      __ vncipherlast    (vRet, vRet, vKey5);
   1.561  
   1.562 -    __ neg             (temp, to);
   1.563 -    __ lvsr            (toPerm, temp);
   1.564 -    __ vspltisb        (vTmp2, -1);
   1.565 -    __ vxor            (vTmp1, vTmp1, vTmp1);
   1.566 -    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
   1.567 -    __ vxor            (toPerm, toPerm, fSplt);
   1.568 +    // store result (unaligned)
   1.569 +#ifdef VM_LITTLE_ENDIAN
   1.570 +    __ lvsl            (toPerm, to);
   1.571 +#else
   1.572 +    __ lvsr            (toPerm, to);
   1.573 +#endif
   1.574 +    __ vspltisb        (vTmp3, -1);
   1.575 +    __ vspltisb        (vTmp4, 0);
   1.576      __ lvx             (vTmp1, to);
   1.577 -    __ vperm           (vRet, vRet, vRet, toPerm);
   1.578 -    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
   1.579 -    __ lvx             (vTmp4, fifteen, to);
   1.580 +    __ lvx             (vTmp2, fifteen, to);
   1.581 +#ifdef VM_LITTLE_ENDIAN
   1.582 +    __ vperm           (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
   1.583 +    __ vxor            (toPerm, toPerm, fSplt);       // swap bytes
   1.584 +#else
   1.585 +    __ vperm           (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
   1.586 +#endif
   1.587 +    __ vperm           (vTmp4, vRet, vRet, toPerm);   // rotate data
   1.588 +    __ vsel            (vTmp2, vTmp4, vTmp2, vTmp3);
   1.589 +    __ vsel            (vTmp1, vTmp1, vTmp4, vTmp3);
   1.590 +    __ stvx            (vTmp2, fifteen, to);          // store this one first (may alias)
   1.591      __ stvx            (vTmp1, to);
   1.592 -    __ vsel            (vRet, vRet, vTmp4, vTmp2);
   1.593 -    __ stvx            (vRet, fifteen, to);
   1.594  
   1.595      __ blr();
   1.596       return start;
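
The rewritten stubs lean on two VSX idioms: round keys are fetched with aligned `lvx` loads and merged into the actual (possibly unaligned) 16-byte key via `vec_perm`, and the result is written back with a read-modify-write of the two aligned blocks covering the destination, using a `vsel` select mask and storing the upper block first because it may alias the lower one on an aligned destination. The C++ sketch below is not part of the changeset; it is a minimal scalar model of those two idioms (alignment handling only, ignoring the little-endian byte-swap `vxor` with `fSplt`), and all names in it (`load16_unaligned`, `store16_unaligned`, the demo buffer) are assumptions made for illustration.

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

// Scalar model of the unaligned load (lvx + lvx + lvsl/vperm, and of vec_perm
// for the round keys): read the two aligned 16-byte blocks that cover
// [from, from+16) and pick out the 16 wanted bytes.
static void load16_unaligned(const uint8_t* from, uint8_t result[16]) {
  uintptr_t addr = reinterpret_cast<uintptr_t>(from);
  const uint8_t* lo = reinterpret_cast<const uint8_t*>(addr & ~uintptr_t(15));        // block holding from[0]
  const uint8_t* hi = reinterpret_cast<const uint8_t*>((addr + 15) & ~uintptr_t(15)); // block holding from[15]
  unsigned offset = unsigned(addr & 15);

  uint8_t both[32];
  std::memcpy(both,      lo, 16);                      // models  lvx vRet, from
  std::memcpy(both + 16, hi, 16);                      // models  lvx vTmp1, fifteen, from
  for (unsigned i = 0; i < 16; i++) {
    result[i] = both[offset + i];                      // models  vperm vRet, vRet, vTmp1, fromPerm
  }
}

// Scalar model of the unaligned store: load the two aligned blocks covering
// [to, to+16), merge the 16 result bytes into them (vsel with a mask built by
// vspltisb/vperm), and store them back, upper block first ("may alias").
static void store16_unaligned(uint8_t* to, const uint8_t result[16]) {
  uintptr_t addr = reinterpret_cast<uintptr_t>(to);
  uint8_t* lo = reinterpret_cast<uint8_t*>(addr & ~uintptr_t(15));
  uint8_t* hi = reinterpret_cast<uint8_t*>((addr + 15) & ~uintptr_t(15));
  unsigned offset = unsigned(addr & 15);

  uint8_t low[16], high[16];
  std::memcpy(low,  lo, 16);                           // models  lvx vTmp1, to
  std::memcpy(high, hi, 16);                           // models  lvx vTmp2, fifteen, to

  // vsel: bytes [offset, 16) of the low block and [0, offset) of the high
  // block take the result; all other bytes keep their previous contents.
  for (unsigned i = 0; i < 16; i++) {
    if (i >= offset) low[i]  = result[i - offset];
    if (i <  offset) high[i] = result[16 - offset + i];
  }

  // Store the block at to+15 first, as the stub does: on an aligned
  // destination both pointers name the same block, and the lower store must
  // win so the merged result is not overwritten.
  std::memcpy(hi, high, 16);                           // models  stvx vTmp2, fifteen, to
  std::memcpy(lo, low,  16);                           // models  stvx vTmp1, to
}

int main() {
  alignas(16) uint8_t buf[48];
  for (int i = 0; i < 48; i++) buf[i] = uint8_t(i);

  uint8_t block[16];
  load16_unaligned(buf + 3, block);                    // unaligned source
  store16_unaligned(buf + 21, block);                  // unaligned destination

  // Bytes outside [21, 37) are untouched; bytes inside hold buf[3..18].
  std::printf("buf[20]=%02x buf[21]=%02x buf[36]=%02x buf[37]=%02x\n",
              buf[20], buf[21], buf[36], buf[37]);     // 14 03 12 25
  return 0;
}
```

The same select-mask merge is what lets the stub write only the 16 ciphertext bytes without disturbing neighbouring memory, which is why the patch can drop the old `vLow`/`vHigh` registers and the `hex` constant.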
