8188868: PPC64: Support AES intrinsics on Big Endian

Sun, 11 Aug 2019 19:11:08 -0400

author
mdoerr
date
Sun, 11 Aug 2019 19:11:08 -0400
changeset 9730
42118db355f5
parent 9728
fa7fe6dae563
child 9731
921c5ee7965f

8188868: PPC64: Support AES intrinsics on Big Endian
Reviewed-by: goetz

src/cpu/ppc/vm/assembler_ppc.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/assembler_ppc.inline.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/stubGenerator_ppc.cpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/stubRoutines_ppc_64.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/vm_version_ppc.cpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/cpu/ppc/vm/assembler_ppc.hpp	Tue Jan 29 08:28:24 2019 -0500
     1.2 +++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Sun Aug 11 19:11:08 2019 -0400
     1.3 @@ -2100,6 +2100,7 @@
     1.4    // Endianess specific concatenation of 2 loaded vectors.
     1.5    inline void load_perm(VectorRegister perm, Register addr);
     1.6    inline void vec_perm(VectorRegister first_dest, VectorRegister second, VectorRegister perm);
     1.7 +  inline void vec_perm(VectorRegister dest, VectorRegister first, VectorRegister second, VectorRegister perm);
     1.8  
     1.9    // RegisterOrConstant versions.
    1.10    // These emitters choose between the versions using two registers and
     2.1 --- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Tue Jan 29 08:28:24 2019 -0500
     2.2 +++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Sun Aug 11 19:11:08 2019 -0400
     2.3 @@ -904,6 +904,14 @@
     2.4  #endif
     2.5  }
     2.6  
     2.7 +inline void Assembler::vec_perm(VectorRegister dest, VectorRegister first, VectorRegister second, VectorRegister perm) {
     2.8 +#if defined(VM_LITTLE_ENDIAN)
     2.9 +  vperm(dest, second, first, perm);
    2.10 +#else
    2.11 +  vperm(dest, first, second, perm);
    2.12 +#endif
    2.13 +}
    2.14 +
    2.15  inline void Assembler::load_const(Register d, void* x, Register tmp) {
    2.16     load_const(d, (long)x, tmp);
    2.17  }
     3.1 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Tue Jan 29 08:28:24 2019 -0500
     3.2 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Sun Aug 11 19:11:08 2019 -0400
     3.3 @@ -2224,7 +2224,7 @@
     3.4      return start;
     3.5    }
     3.6  
     3.7 -  // Arguments for generated stub (little endian only):
     3.8 +  // Arguments for generated stub:
     3.9    //   R3_ARG1   - source byte array address
    3.10    //   R4_ARG2   - destination byte array address
    3.11    //   R5_ARG3   - round key array
    3.12 @@ -2243,7 +2243,6 @@
    3.13      Register keylen         = R8;
    3.14      Register temp           = R9;
    3.15      Register keypos         = R10;
    3.16 -    Register hex            = R11;
    3.17      Register fifteen        = R12;
    3.18  
    3.19      VectorRegister vRet     = VR0;
    3.20 @@ -2263,164 +2262,170 @@
    3.21      VectorRegister vTmp3    = VR11;
    3.22      VectorRegister vTmp4    = VR12;
    3.23  
    3.24 -    VectorRegister vLow     = VR13;
    3.25 -    VectorRegister vHigh    = VR14;
    3.26 -
    3.27 -    __ li              (hex, 16);
    3.28      __ li              (fifteen, 15);
    3.29 -    __ vspltisb        (fSplt, 0x0f);
    3.30  
    3.31      // load unaligned from[0-15] to vsRet
    3.32      __ lvx             (vRet, from);
    3.33      __ lvx             (vTmp1, fifteen, from);
    3.34      __ lvsl            (fromPerm, from);
    3.35 +#ifdef VM_LITTLE_ENDIAN
    3.36 +    __ vspltisb        (fSplt, 0x0f);
    3.37      __ vxor            (fromPerm, fromPerm, fSplt);
    3.38 +#endif
    3.39      __ vperm           (vRet, vRet, vTmp1, fromPerm);
    3.40  
    3.41      // load keylen (44 or 52 or 60)
    3.42      __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
    3.43  
    3.44      // to load keys
    3.45 -    __ lvsr            (keyPerm, key);
    3.46 -    __ vxor            (vTmp2, vTmp2, vTmp2);
    3.47 +    __ load_perm       (keyPerm, key);
    3.48 +#ifdef VM_LITTLE_ENDIAN
    3.49      __ vspltisb        (vTmp2, -16);
    3.50      __ vrld            (keyPerm, keyPerm, vTmp2);
    3.51      __ vrld            (keyPerm, keyPerm, vTmp2);
    3.52      __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
    3.53 -
    3.54 -    // load the 1st round key to vKey1
    3.55 -    __ li              (keypos, 0);
    3.56 +#endif
    3.57 +
    3.58 +    // load the 1st round key to vTmp1
    3.59 +    __ lvx             (vTmp1, key);
    3.60 +    __ li              (keypos, 16);
    3.61      __ lvx             (vKey1, keypos, key);
    3.62 -    __ addi            (keypos, keypos, 16);
    3.63 +    __ vec_perm        (vTmp1, vKey1, keyPerm);
    3.64 +
    3.65 +    // 1st round
    3.66 +    __ vxor            (vRet, vRet, vTmp1);
    3.67 +
    3.68 +    // load the 2nd round key to vKey1
    3.69 +    __ li              (keypos, 32);
    3.70 +    __ lvx             (vKey2, keypos, key);
    3.71 +    __ vec_perm        (vKey1, vKey2, keyPerm);
    3.72 +
    3.73 +    // load the 3rd round key to vKey2
    3.74 +    __ li              (keypos, 48);
    3.75 +    __ lvx             (vKey3, keypos, key);
    3.76 +    __ vec_perm        (vKey2, vKey3, keyPerm);
    3.77 +
    3.78 +    // load the 4th round key to vKey3
    3.79 +    __ li              (keypos, 64);
    3.80 +    __ lvx             (vKey4, keypos, key);
    3.81 +    __ vec_perm        (vKey3, vKey4, keyPerm);
    3.82 +
    3.83 +    // load the 5th round key to vKey4
    3.84 +    __ li              (keypos, 80);
    3.85      __ lvx             (vTmp1, keypos, key);
    3.86 -    __ vperm           (vKey1, vTmp1, vKey1, keyPerm);
    3.87 -
    3.88 -    // 1st round
    3.89 -    __ vxor (vRet, vRet, vKey1);
    3.90 -
    3.91 -    // load the 2nd round key to vKey1
    3.92 -    __ addi            (keypos, keypos, 16);
    3.93 -    __ lvx             (vTmp2, keypos, key);
    3.94 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
    3.95 -
    3.96 -    // load the 3rd round key to vKey2
    3.97 -    __ addi            (keypos, keypos, 16);
    3.98 +    __ vec_perm        (vKey4, vTmp1, keyPerm);
    3.99 +
   3.100 +    // 2nd - 5th rounds
   3.101 +    __ vcipher         (vRet, vRet, vKey1);
   3.102 +    __ vcipher         (vRet, vRet, vKey2);
   3.103 +    __ vcipher         (vRet, vRet, vKey3);
   3.104 +    __ vcipher         (vRet, vRet, vKey4);
   3.105 +
   3.106 +    // load the 6th round key to vKey1
   3.107 +    __ li              (keypos, 96);
   3.108 +    __ lvx             (vKey2, keypos, key);
   3.109 +    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
   3.110 +
   3.111 +    // load the 7th round key to vKey2
   3.112 +    __ li              (keypos, 112);
   3.113 +    __ lvx             (vKey3, keypos, key);
   3.114 +    __ vec_perm        (vKey2, vKey3, keyPerm);
   3.115 +
   3.116 +    // load the 8th round key to vKey3
   3.117 +    __ li              (keypos, 128);
   3.118 +    __ lvx             (vKey4, keypos, key);
   3.119 +    __ vec_perm        (vKey3, vKey4, keyPerm);
   3.120 +
   3.121 +    // load the 9th round key to vKey4
   3.122 +    __ li              (keypos, 144);
   3.123      __ lvx             (vTmp1, keypos, key);
   3.124 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   3.125 -
   3.126 -    // load the 4th round key to vKey3
   3.127 -    __ addi            (keypos, keypos, 16);
   3.128 -    __ lvx             (vTmp2, keypos, key);
   3.129 -    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
   3.130 -
   3.131 -    // load the 5th round key to vKey4
   3.132 -    __ addi            (keypos, keypos, 16);
   3.133 +    __ vec_perm        (vKey4, vTmp1, keyPerm);
   3.134 +
   3.135 +    // 6th - 9th rounds
   3.136 +    __ vcipher         (vRet, vRet, vKey1);
   3.137 +    __ vcipher         (vRet, vRet, vKey2);
   3.138 +    __ vcipher         (vRet, vRet, vKey3);
   3.139 +    __ vcipher         (vRet, vRet, vKey4);
   3.140 +
   3.141 +    // load the 10th round key to vKey1
   3.142 +    __ li              (keypos, 160);
   3.143 +    __ lvx             (vKey2, keypos, key);
   3.144 +    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
   3.145 +
   3.146 +    // load the 11th round key to vKey2
   3.147 +    __ li              (keypos, 176);
   3.148      __ lvx             (vTmp1, keypos, key);
   3.149 -    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
   3.150 -
   3.151 -    // 2nd - 5th rounds
   3.152 -    __ vcipher (vRet, vRet, vKey1);
   3.153 -    __ vcipher (vRet, vRet, vKey2);
   3.154 -    __ vcipher (vRet, vRet, vKey3);
   3.155 -    __ vcipher (vRet, vRet, vKey4);
   3.156 -
   3.157 -    // load the 6th round key to vKey1
   3.158 -    __ addi            (keypos, keypos, 16);
   3.159 -    __ lvx             (vTmp2, keypos, key);
   3.160 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
   3.161 -
   3.162 -    // load the 7th round key to vKey2
   3.163 -    __ addi            (keypos, keypos, 16);
   3.164 -    __ lvx             (vTmp1, keypos, key);
   3.165 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   3.166 -
   3.167 -    // load the 8th round key to vKey3
   3.168 -    __ addi            (keypos, keypos, 16);
   3.169 -    __ lvx             (vTmp2, keypos, key);
   3.170 -    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
   3.171 -
   3.172 -    // load the 9th round key to vKey4
   3.173 -    __ addi            (keypos, keypos, 16);
   3.174 -    __ lvx             (vTmp1, keypos, key);
   3.175 -    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
   3.176 -
   3.177 -    // 6th - 9th rounds
   3.178 -    __ vcipher (vRet, vRet, vKey1);
   3.179 -    __ vcipher (vRet, vRet, vKey2);
   3.180 -    __ vcipher (vRet, vRet, vKey3);
   3.181 -    __ vcipher (vRet, vRet, vKey4);
   3.182 -
   3.183 -    // load the 10th round key to vKey1
   3.184 -    __ addi            (keypos, keypos, 16);
   3.185 -    __ lvx             (vTmp2, keypos, key);
   3.186 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
   3.187 -
   3.188 -    // load the 11th round key to vKey2
   3.189 -    __ addi            (keypos, keypos, 16);
   3.190 -    __ lvx             (vTmp1, keypos, key);
   3.191 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   3.192 +    __ vec_perm        (vKey2, vTmp1, keyPerm);
   3.193  
   3.194      // if all round keys are loaded, skip next 4 rounds
   3.195      __ cmpwi           (CCR0, keylen, 44);
   3.196      __ beq             (CCR0, L_doLast);
   3.197  
   3.198      // 10th - 11th rounds
   3.199 -    __ vcipher (vRet, vRet, vKey1);
   3.200 -    __ vcipher (vRet, vRet, vKey2);
   3.201 +    __ vcipher         (vRet, vRet, vKey1);
   3.202 +    __ vcipher         (vRet, vRet, vKey2);
   3.203  
   3.204      // load the 12th round key to vKey1
   3.205 -    __ addi            (keypos, keypos, 16);
   3.206 -    __ lvx             (vTmp2, keypos, key);
   3.207 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
   3.208 +    __ li              (keypos, 192);
   3.209 +    __ lvx             (vKey2, keypos, key);
   3.210 +    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
   3.211  
   3.212      // load the 13th round key to vKey2
   3.213 -    __ addi            (keypos, keypos, 16);
   3.214 +    __ li              (keypos, 208);
   3.215      __ lvx             (vTmp1, keypos, key);
   3.216 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   3.217 +    __ vec_perm        (vKey2, vTmp1, keyPerm);
   3.218  
   3.219      // if all round keys are loaded, skip next 2 rounds
   3.220      __ cmpwi           (CCR0, keylen, 52);
   3.221      __ beq             (CCR0, L_doLast);
   3.222  
   3.223      // 12th - 13th rounds
   3.224 -    __ vcipher (vRet, vRet, vKey1);
   3.225 -    __ vcipher (vRet, vRet, vKey2);
   3.226 +    __ vcipher         (vRet, vRet, vKey1);
   3.227 +    __ vcipher         (vRet, vRet, vKey2);
   3.228  
   3.229      // load the 14th round key to vKey1
   3.230 -    __ addi            (keypos, keypos, 16);
   3.231 -    __ lvx             (vTmp2, keypos, key);
   3.232 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
   3.233 +    __ li              (keypos, 224);
   3.234 +    __ lvx             (vKey2, keypos, key);
   3.235 +    __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
   3.236  
   3.237      // load the 15th round key to vKey2
   3.238 -    __ addi            (keypos, keypos, 16);
   3.239 +    __ li              (keypos, 240);
   3.240      __ lvx             (vTmp1, keypos, key);
   3.241 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   3.242 +    __ vec_perm        (vKey2, vTmp1, keyPerm);
   3.243  
   3.244      __ bind(L_doLast);
   3.245  
   3.246      // last two rounds
   3.247 -    __ vcipher (vRet, vRet, vKey1);
   3.248 -    __ vcipherlast (vRet, vRet, vKey2);
   3.249 -
   3.250 -    __ neg             (temp, to);
   3.251 -    __ lvsr            (toPerm, temp);
   3.252 -    __ vspltisb        (vTmp2, -1);
   3.253 -    __ vxor            (vTmp1, vTmp1, vTmp1);
   3.254 -    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
   3.255 -    __ vxor            (toPerm, toPerm, fSplt);
   3.256 +    __ vcipher         (vRet, vRet, vKey1);
   3.257 +    __ vcipherlast     (vRet, vRet, vKey2);
   3.258 +
   3.259 +    // store result (unaligned)
   3.260 +#ifdef VM_LITTLE_ENDIAN
   3.261 +    __ lvsl            (toPerm, to);
   3.262 +#else
   3.263 +    __ lvsr            (toPerm, to);
   3.264 +#endif
   3.265 +    __ vspltisb        (vTmp3, -1);
   3.266 +    __ vspltisb        (vTmp4, 0);
   3.267      __ lvx             (vTmp1, to);
   3.268 -    __ vperm           (vRet, vRet, vRet, toPerm);
   3.269 -    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
   3.270 -    __ lvx             (vTmp4, fifteen, to);
   3.271 +    __ lvx             (vTmp2, fifteen, to);
   3.272 +#ifdef VM_LITTLE_ENDIAN
   3.273 +    __ vperm           (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
   3.274 +    __ vxor            (toPerm, toPerm, fSplt);       // swap bytes
   3.275 +#else
   3.276 +    __ vperm           (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
   3.277 +#endif
   3.278 +    __ vperm           (vTmp4, vRet, vRet, toPerm);   // rotate data
   3.279 +    __ vsel            (vTmp2, vTmp4, vTmp2, vTmp3);
   3.280 +    __ vsel            (vTmp1, vTmp1, vTmp4, vTmp3);
   3.281 +    __ stvx            (vTmp2, fifteen, to);          // store this one first (may alias)
   3.282      __ stvx            (vTmp1, to);
   3.283 -    __ vsel            (vRet, vRet, vTmp4, vTmp2);
   3.284 -    __ stvx            (vRet, fifteen, to);
   3.285  
   3.286      __ blr();
   3.287       return start;
   3.288    }
   3.289  
   3.290 -  // Arguments for generated stub (little endian only):
   3.291 +  // Arguments for generated stub:
   3.292    //   R3_ARG1   - source byte array address
   3.293    //   R4_ARG2   - destination byte array address
   3.294    //   R5_ARG3   - K (key) in little endian int array
   3.295 @@ -2442,7 +2447,6 @@
   3.296      Register keylen         = R8;
   3.297      Register temp           = R9;
   3.298      Register keypos         = R10;
   3.299 -    Register hex            = R11;
   3.300      Register fifteen        = R12;
   3.301  
   3.302      VectorRegister vRet     = VR0;
   3.303 @@ -2463,30 +2467,30 @@
   3.304      VectorRegister vTmp3    = VR12;
   3.305      VectorRegister vTmp4    = VR13;
   3.306  
   3.307 -    VectorRegister vLow     = VR14;
   3.308 -    VectorRegister vHigh    = VR15;
   3.309 -
   3.310 -    __ li              (hex, 16);
   3.311      __ li              (fifteen, 15);
   3.312 -    __ vspltisb        (fSplt, 0x0f);
   3.313  
   3.314      // load unaligned from[0-15] to vsRet
   3.315      __ lvx             (vRet, from);
   3.316      __ lvx             (vTmp1, fifteen, from);
   3.317      __ lvsl            (fromPerm, from);
   3.318 +#ifdef VM_LITTLE_ENDIAN
   3.319 +    __ vspltisb        (fSplt, 0x0f);
   3.320      __ vxor            (fromPerm, fromPerm, fSplt);
   3.321 +#endif
   3.322      __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
   3.323  
   3.324      // load keylen (44 or 52 or 60)
   3.325      __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
   3.326  
   3.327      // to load keys
   3.328 -    __ lvsr            (keyPerm, key);
   3.329 +    __ load_perm       (keyPerm, key);
   3.330 +#ifdef VM_LITTLE_ENDIAN
   3.331      __ vxor            (vTmp2, vTmp2, vTmp2);
   3.332      __ vspltisb        (vTmp2, -16);
   3.333      __ vrld            (keyPerm, keyPerm, vTmp2);
   3.334      __ vrld            (keyPerm, keyPerm, vTmp2);
   3.335      __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
   3.336 +#endif
   3.337  
   3.338      __ cmpwi           (CCR0, keylen, 44);
   3.339      __ beq             (CCR0, L_do44);
   3.340 @@ -2494,32 +2498,32 @@
   3.341      __ cmpwi           (CCR0, keylen, 52);
   3.342      __ beq             (CCR0, L_do52);
   3.343  
   3.344 -    // load the 15th round key to vKey11
   3.345 +    // load the 15th round key to vKey1
   3.346      __ li              (keypos, 240);
   3.347 +    __ lvx             (vKey1, keypos, key);
   3.348 +    __ li              (keypos, 224);
   3.349 +    __ lvx             (vKey2, keypos, key);
   3.350 +    __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
   3.351 +
   3.352 +    // load the 14th round key to vKey2
   3.353 +    __ li              (keypos, 208);
   3.354 +    __ lvx             (vKey3, keypos, key);
   3.355 +    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
   3.356 +
   3.357 +    // load the 13th round key to vKey3
   3.358 +    __ li              (keypos, 192);
   3.359 +    __ lvx             (vKey4, keypos, key);
   3.360 +    __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
   3.361 +
   3.362 +    // load the 12th round key to vKey4
   3.363 +    __ li              (keypos, 176);
   3.364 +    __ lvx             (vKey5, keypos, key);
   3.365 +    __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
   3.366 +
   3.367 +    // load the 11th round key to vKey5
   3.368 +    __ li              (keypos, 160);
   3.369      __ lvx             (vTmp1, keypos, key);
   3.370 -    __ addi            (keypos, keypos, -16);
   3.371 -    __ lvx             (vTmp2, keypos, key);
   3.372 -    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
   3.373 -
   3.374 -    // load the 14th round key to vKey10
   3.375 -    __ addi            (keypos, keypos, -16);
   3.376 -    __ lvx             (vTmp1, keypos, key);
   3.377 -    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
   3.378 -
   3.379 -    // load the 13th round key to vKey10
   3.380 -    __ addi            (keypos, keypos, -16);
   3.381 -    __ lvx             (vTmp2, keypos, key);
   3.382 -    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
   3.383 -
   3.384 -    // load the 12th round key to vKey10
   3.385 -    __ addi            (keypos, keypos, -16);
   3.386 -    __ lvx             (vTmp1, keypos, key);
   3.387 -    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
   3.388 -
   3.389 -    // load the 11th round key to vKey10
   3.390 -    __ addi            (keypos, keypos, -16);
   3.391 -    __ lvx             (vTmp2, keypos, key);
   3.392 -    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
   3.393 +    __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
   3.394  
   3.395      // 1st - 5th rounds
   3.396      __ vxor            (vRet, vRet, vKey1);
   3.397 @@ -2532,22 +2536,22 @@
   3.398  
   3.399      __ bind            (L_do52);
   3.400  
   3.401 -    // load the 13th round key to vKey11
   3.402 +    // load the 13th round key to vKey1
   3.403      __ li              (keypos, 208);
   3.404 +    __ lvx             (vKey1, keypos, key);
   3.405 +    __ li              (keypos, 192);
   3.406 +    __ lvx             (vKey2, keypos, key);
   3.407 +    __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
   3.408 +
   3.409 +    // load the 12th round key to vKey2
   3.410 +    __ li              (keypos, 176);
   3.411 +    __ lvx             (vKey3, keypos, key);
   3.412 +    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
   3.413 +
   3.414 +    // load the 11th round key to vKey3
   3.415 +    __ li              (keypos, 160);
   3.416      __ lvx             (vTmp1, keypos, key);
   3.417 -    __ addi            (keypos, keypos, -16);
   3.418 -    __ lvx             (vTmp2, keypos, key);
   3.419 -    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
   3.420 -
   3.421 -    // load the 12th round key to vKey10
   3.422 -    __ addi            (keypos, keypos, -16);
   3.423 -    __ lvx             (vTmp1, keypos, key);
   3.424 -    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
   3.425 -
   3.426 -    // load the 11th round key to vKey10
   3.427 -    __ addi            (keypos, keypos, -16);
   3.428 -    __ lvx             (vTmp2, keypos, key);
   3.429 -    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
   3.430 +    __ vec_perm        (vKey3, vTmp1, vKey3, keyPerm);
   3.431  
   3.432      // 1st - 3rd rounds
   3.433      __ vxor            (vRet, vRet, vKey1);
   3.434 @@ -2558,42 +2562,42 @@
   3.435  
   3.436      __ bind            (L_do44);
   3.437  
   3.438 -    // load the 11th round key to vKey11
   3.439 +    // load the 11th round key to vKey1
   3.440      __ li              (keypos, 176);
   3.441 +    __ lvx             (vKey1, keypos, key);
   3.442 +    __ li              (keypos, 160);
   3.443      __ lvx             (vTmp1, keypos, key);
   3.444 -    __ addi            (keypos, keypos, -16);
   3.445 -    __ lvx             (vTmp2, keypos, key);
   3.446 -    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
   3.447 +    __ vec_perm        (vKey1, vTmp1, vKey1, keyPerm);
   3.448  
   3.449      // 1st round
   3.450      __ vxor            (vRet, vRet, vKey1);
   3.451  
   3.452      __ bind            (L_doLast);
   3.453  
   3.454 -    // load the 10th round key to vKey10
   3.455 -    __ addi            (keypos, keypos, -16);
   3.456 +    // load the 10th round key to vKey1
   3.457 +    __ li              (keypos, 144);
   3.458 +    __ lvx             (vKey2, keypos, key);
   3.459 +    __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
   3.460 +
   3.461 +    // load the 9th round key to vKey2
   3.462 +    __ li              (keypos, 128);
   3.463 +    __ lvx             (vKey3, keypos, key);
   3.464 +    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
   3.465 +
   3.466 +    // load the 8th round key to vKey3
   3.467 +    __ li              (keypos, 112);
   3.468 +    __ lvx             (vKey4, keypos, key);
   3.469 +    __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
   3.470 +
   3.471 +    // load the 7th round key to vKey4
   3.472 +    __ li              (keypos, 96);
   3.473 +    __ lvx             (vKey5, keypos, key);
   3.474 +    __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
   3.475 +
   3.476 +    // load the 6th round key to vKey5
   3.477 +    __ li              (keypos, 80);
   3.478      __ lvx             (vTmp1, keypos, key);
   3.479 -    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
   3.480 -
   3.481 -    // load the 9th round key to vKey10
   3.482 -    __ addi            (keypos, keypos, -16);
   3.483 -    __ lvx             (vTmp2, keypos, key);
   3.484 -    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
   3.485 -
   3.486 -    // load the 8th round key to vKey10
   3.487 -    __ addi            (keypos, keypos, -16);
   3.488 -    __ lvx             (vTmp1, keypos, key);
   3.489 -    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
   3.490 -
   3.491 -    // load the 7th round key to vKey10
   3.492 -    __ addi            (keypos, keypos, -16);
   3.493 -    __ lvx             (vTmp2, keypos, key);
   3.494 -    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
   3.495 -
   3.496 -    // load the 6th round key to vKey10
   3.497 -    __ addi            (keypos, keypos, -16);
   3.498 -    __ lvx             (vTmp1, keypos, key);
   3.499 -    __ vperm           (vKey5, vTmp2, vTmp1, keyPerm);
   3.500 +    __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
   3.501  
   3.502      // last 10th - 6th rounds
   3.503      __ vncipher        (vRet, vRet, vKey1);
   3.504 @@ -2602,30 +2606,29 @@
   3.505      __ vncipher        (vRet, vRet, vKey4);
   3.506      __ vncipher        (vRet, vRet, vKey5);
   3.507  
   3.508 -    // load the 5th round key to vKey10
   3.509 -    __ addi            (keypos, keypos, -16);
   3.510 -    __ lvx             (vTmp2, keypos, key);
   3.511 -    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
   3.512 -
   3.513 -    // load the 4th round key to vKey10
   3.514 -    __ addi            (keypos, keypos, -16);
   3.515 -    __ lvx             (vTmp1, keypos, key);
   3.516 -    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
   3.517 -
   3.518 -    // load the 3rd round key to vKey10
   3.519 -    __ addi            (keypos, keypos, -16);
   3.520 -    __ lvx             (vTmp2, keypos, key);
   3.521 -    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
   3.522 -
   3.523 -    // load the 2nd round key to vKey10
   3.524 -    __ addi            (keypos, keypos, -16);
   3.525 -    __ lvx             (vTmp1, keypos, key);
   3.526 -    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
   3.527 -
   3.528 -    // load the 1st round key to vKey10
   3.529 -    __ addi            (keypos, keypos, -16);
   3.530 -    __ lvx             (vTmp2, keypos, key);
   3.531 -    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
   3.532 +    // load the 5th round key to vKey1
   3.533 +    __ li              (keypos, 64);
   3.534 +    __ lvx             (vKey2, keypos, key);
   3.535 +    __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
   3.536 +
   3.537 +    // load the 4th round key to vKey2
   3.538 +    __ li              (keypos, 48);
   3.539 +    __ lvx             (vKey3, keypos, key);
   3.540 +    __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
   3.541 +
   3.542 +    // load the 3rd round key to vKey3
   3.543 +    __ li              (keypos, 32);
   3.544 +    __ lvx             (vKey4, keypos, key);
   3.545 +    __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
   3.546 +
   3.547 +    // load the 2nd round key to vKey4
   3.548 +    __ li              (keypos, 16);
   3.549 +    __ lvx             (vKey5, keypos, key);
   3.550 +    __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
   3.551 +
   3.552 +    // load the 1st round key to vKey5
   3.553 +    __ lvx             (vTmp1, key);
   3.554 +    __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
   3.555  
   3.556      // last 5th - 1th rounds
   3.557      __ vncipher        (vRet, vRet, vKey1);
   3.558 @@ -2634,19 +2637,27 @@
   3.559      __ vncipher        (vRet, vRet, vKey4);
   3.560      __ vncipherlast    (vRet, vRet, vKey5);
   3.561  
   3.562 -    __ neg             (temp, to);
   3.563 -    __ lvsr            (toPerm, temp);
   3.564 -    __ vspltisb        (vTmp2, -1);
   3.565 -    __ vxor            (vTmp1, vTmp1, vTmp1);
   3.566 -    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
   3.567 -    __ vxor            (toPerm, toPerm, fSplt);
   3.568 +    // store result (unaligned)
   3.569 +#ifdef VM_LITTLE_ENDIAN
   3.570 +    __ lvsl            (toPerm, to);
   3.571 +#else
   3.572 +    __ lvsr            (toPerm, to);
   3.573 +#endif
   3.574 +    __ vspltisb        (vTmp3, -1);
   3.575 +    __ vspltisb        (vTmp4, 0);
   3.576      __ lvx             (vTmp1, to);
   3.577 -    __ vperm           (vRet, vRet, vRet, toPerm);
   3.578 -    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
   3.579 -    __ lvx             (vTmp4, fifteen, to);
   3.580 +    __ lvx             (vTmp2, fifteen, to);
   3.581 +#ifdef VM_LITTLE_ENDIAN
   3.582 +    __ vperm           (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
   3.583 +    __ vxor            (toPerm, toPerm, fSplt);       // swap bytes
   3.584 +#else
   3.585 +    __ vperm           (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
   3.586 +#endif
   3.587 +    __ vperm           (vTmp4, vRet, vRet, toPerm);   // rotate data
   3.588 +    __ vsel            (vTmp2, vTmp4, vTmp2, vTmp3);
   3.589 +    __ vsel            (vTmp1, vTmp1, vTmp4, vTmp3);
   3.590 +    __ stvx            (vTmp2, fifteen, to);          // store this one first (may alias)
   3.591      __ stvx            (vTmp1, to);
   3.592 -    __ vsel            (vRet, vRet, vTmp4, vTmp2);
   3.593 -    __ stvx            (vRet, fifteen, to);
   3.594  
   3.595      __ blr();
   3.596       return start;
     4.1 --- a/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp	Tue Jan 29 08:28:24 2019 -0500
     4.2 +++ b/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp	Sun Aug 11 19:11:08 2019 -0400
     4.3 @@ -34,7 +34,7 @@
     4.4  
     4.5  enum platform_dependent_constants {
     4.6    code_size1 = 20000,          // simply increase if too small (assembler will crash if too small)
     4.7 -  code_size2 = 22000           // simply increase if too small (assembler will crash if too small)
     4.8 +  code_size2 = 24000           // simply increase if too small (assembler will crash if too small)
     4.9  };
    4.10  
    4.11  // CRC32 Intrinsics.
     5.1 --- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Tue Jan 29 08:28:24 2019 -0500
     5.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Sun Aug 11 19:11:08 2019 -0400
     5.3 @@ -174,7 +174,6 @@
     5.4    }
     5.5  
     5.6    // The AES intrinsic stubs require AES instruction support.
     5.7 -#if defined(VM_LITTLE_ENDIAN)
     5.8    if (has_vcipher()) {
     5.9      if (FLAG_IS_DEFAULT(UseAES)) {
    5.10        UseAES = true;
    5.11 @@ -195,18 +194,6 @@
    5.12      FLAG_SET_DEFAULT(UseAESIntrinsics, false);
    5.13    }
    5.14  
    5.15 -#else
    5.16 -  if (UseAES) {
    5.17 -    warning("AES instructions are not available on this CPU");
    5.18 -    FLAG_SET_DEFAULT(UseAES, false);
    5.19 -  }
    5.20 -  if (UseAESIntrinsics) {
    5.21 -    if (!FLAG_IS_DEFAULT(UseAESIntrinsics))
    5.22 -      warning("AES intrinsics are not available on this CPU");
    5.23 -    FLAG_SET_DEFAULT(UseAESIntrinsics, false);
    5.24 -  }
    5.25 -#endif
    5.26 -
    5.27    if (has_vshasig()) {
    5.28      if (FLAG_IS_DEFAULT(UseSHA)) {
    5.29        UseSHA = true;

mercurial