8185979: PPC64: Implement SHA2 intrinsic

author:    ogatak
date:      Tue, 18 Jun 2019 09:33:34 -0400
changeset: 9713:c4567d28f31f
parent:    9712:d7e1e002b496
child:     9714:e49125a0c77c

8185979: PPC64: Implement SHA2 intrinsic
Reviewed-by: mdoerr, goetz
Contributed-by: Bruno Rosa <bruno.rosa@eldorado.org.br>, Gustavo Serra Scalet <gustavo.scalet@eldorado.org.br>, Igor Nunes <igor.nunes@eldorado.org.br>, Martin Doerr <martin.doerr@sap.com>

src/cpu/ppc/vm/assembler_ppc.hpp
src/cpu/ppc/vm/assembler_ppc.inline.hpp
src/cpu/ppc/vm/macroAssembler_ppc.hpp
src/cpu/ppc/vm/macroAssembler_ppc_sha.cpp
src/cpu/ppc/vm/stubGenerator_ppc.cpp
src/cpu/ppc/vm/stubRoutines_ppc_64.hpp
src/cpu/ppc/vm/vm_version_ppc.cpp
src/cpu/ppc/vm/vm_version_ppc.hpp
src/share/vm/opto/library_call.cpp
src/share/vm/opto/runtime.cpp
test/compiler/intrinsics/sha/cli/testcases/GenericTestCaseForOtherCPU.java
test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java
     1.1 --- a/src/cpu/ppc/vm/assembler_ppc.hpp	Mon Jun 17 17:20:10 2019 +0100
     1.2 +++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Tue Jun 18 09:33:34 2019 -0400
     1.3 @@ -2000,7 +2000,8 @@
     1.4    inline void vsbox(       VectorRegister d, VectorRegister a);
     1.5  
     1.6    // SHA (introduced with Power 8)
     1.7 -  // Not yet implemented.
     1.8 +  inline void vshasigmad(VectorRegister d, VectorRegister a, bool st, int six);
     1.9 +  inline void vshasigmaw(VectorRegister d, VectorRegister a, bool st, int six);
    1.10  
    1.11    // Vector Binary Polynomial Multiplication (introduced with Power 8)
    1.12    inline void vpmsumb(  VectorRegister d, VectorRegister a, VectorRegister b);
    1.13 @@ -2096,6 +2097,10 @@
    1.14    inline void lvsl(  VectorRegister d, Register s2);
    1.15    inline void lvsr(  VectorRegister d, Register s2);
    1.16  
     1.17 +  // Endianness-specific concatenation of 2 loaded vectors.
    1.18 +  inline void load_perm(VectorRegister perm, Register addr);
    1.19 +  inline void vec_perm(VectorRegister first_dest, VectorRegister second, VectorRegister perm);
    1.20 +
    1.21    // RegisterOrConstant versions.
    1.22    // These emitters choose between the versions using two registers and
    1.23    // those with register and immediate, depending on the content of roc.
     2.1 --- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Mon Jun 17 17:20:10 2019 +0100
     2.2 +++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Tue Jun 18 09:33:34 2019 -0400
     2.3 @@ -789,7 +789,8 @@
     2.4  inline void Assembler::vsbox(       VectorRegister d, VectorRegister a)                   { emit_int32( VSBOX_OPCODE        | vrt(d) | vra(a)         ); }
     2.5  
     2.6  // SHA (introduced with Power 8)
     2.7 -// Not yet implemented.
     2.8 +inline void Assembler::vshasigmad(VectorRegister d, VectorRegister a, bool st, int six) { emit_int32( VSHASIGMAD_OPCODE | vrt(d) | vra(a) | vst(st) | vsix(six)); }
     2.9 +inline void Assembler::vshasigmaw(VectorRegister d, VectorRegister a, bool st, int six) { emit_int32( VSHASIGMAW_OPCODE | vrt(d) | vra(a) | vst(st) | vsix(six)); }
    2.10  
    2.11  // Vector Binary Polynomial Multiplication (introduced with Power 8)
    2.12  inline void Assembler::vpmsumb(  VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPMSUMB_OPCODE | vrt(d) | vra(a) | vrb(b)); }
    2.13 @@ -887,6 +888,22 @@
    2.14  inline void Assembler::lvsl(  VectorRegister d, Register s2) { emit_int32( LVSL_OPCODE   | vrt(d) | rb(s2)); }
    2.15  inline void Assembler::lvsr(  VectorRegister d, Register s2) { emit_int32( LVSR_OPCODE   | vrt(d) | rb(s2)); }
    2.16  
    2.17 +inline void Assembler::load_perm(VectorRegister perm, Register addr) {
    2.18 +#if defined(VM_LITTLE_ENDIAN)
    2.19 +  lvsr(perm, addr);
    2.20 +#else
    2.21 +  lvsl(perm, addr);
    2.22 +#endif
    2.23 +}
    2.24 +
    2.25 +inline void Assembler::vec_perm(VectorRegister first_dest, VectorRegister second, VectorRegister perm) {
    2.26 +#if defined(VM_LITTLE_ENDIAN)
    2.27 +  vperm(first_dest, second, first_dest, perm);
    2.28 +#else
    2.29 +  vperm(first_dest, first_dest, second, perm);
    2.30 +#endif
    2.31 +}
    2.32 +
    2.33  inline void Assembler::load_const(Register d, void* x, Register tmp) {
    2.34     load_const(d, (long)x, tmp);
    2.35  }
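
The vshasigmad/vshasigmaw emitters above wrap the Power ISA 2.07 SHA-2 sigma instructions: st selects the "big" Sigma variants, and each bit of six selects the *1 function for the corresponding lane. A scalar model of one 32-bit lane of vshasigmaw (an illustrative reading of the ISA, not HotSpot code):

    static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

    // One lane of vshasigmaw; six_bit is the lane's bit within the 4-bit six field.
    static uint32_t vshasigmaw_lane(uint32_t w, bool st, bool six_bit) {
      if (!st) {  // small sigma, used in the message schedule
        return six_bit ? rotr32(w, 17) ^ rotr32(w, 19) ^ (w >> 10)    // sigma1
                       : rotr32(w,  7) ^ rotr32(w, 18) ^ (w >>  3);   // sigma0
      }           // big Sigma, used in the round function
      return six_bit ? rotr32(w,  6) ^ rotr32(w, 11) ^ rotr32(w, 25)  // Sigma1
                     : rotr32(w,  2) ^ rotr32(w, 13) ^ rotr32(w, 22); // Sigma0
    }

This matches how the new code uses them: vshasigmaw(bse, e, 1, 0xf) yields Sigma1(e) in every lane, and vshasigmaw(bsa, a, 1, 0) yields Sigma0(a).
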
     3.1 --- a/src/cpu/ppc/vm/macroAssembler_ppc.hpp	Mon Jun 17 17:20:10 2019 +0100
     3.2 +++ b/src/cpu/ppc/vm/macroAssembler_ppc.hpp	Tue Jun 18 09:33:34 2019 -0400
     3.3 @@ -667,6 +667,40 @@
     3.4  
     3.5    void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp);
     3.6  
     3.7 +  // SHA-2 auxiliary functions and public interfaces
     3.8 + private:
     3.9 +  void sha256_deque(const VectorRegister src,
    3.10 +      const VectorRegister dst1, const VectorRegister dst2, const VectorRegister dst3);
    3.11 +  void sha256_load_h_vec(const VectorRegister a, const VectorRegister e, const Register hptr);
    3.12 +  void sha256_round(const VectorRegister* hs, const int total_hs, int& h_cnt, const VectorRegister kpw);
    3.13 +  void sha256_load_w_plus_k_vec(const Register buf_in, const VectorRegister* ws,
    3.14 +      const int total_ws, const Register k, const VectorRegister* kpws,
    3.15 +      const int total_kpws);
    3.16 +  void sha256_calc_4w(const VectorRegister w0, const VectorRegister w1,
    3.17 +      const VectorRegister w2, const VectorRegister w3, const VectorRegister kpw0,
    3.18 +      const VectorRegister kpw1, const VectorRegister kpw2, const VectorRegister kpw3,
    3.19 +      const Register j, const Register k);
    3.20 +  void sha256_update_sha_state(const VectorRegister a, const VectorRegister b,
    3.21 +      const VectorRegister c, const VectorRegister d, const VectorRegister e,
    3.22 +      const VectorRegister f, const VectorRegister g, const VectorRegister h,
    3.23 +      const Register hptr);
    3.24 +
    3.25 +  void sha512_load_w_vec(const Register buf_in, const VectorRegister* ws, const int total_ws);
    3.26 +  void sha512_update_sha_state(const Register state, const VectorRegister* hs, const int total_hs);
    3.27 +  void sha512_round(const VectorRegister* hs, const int total_hs, int& h_cnt, const VectorRegister kpw);
    3.28 +  void sha512_load_h_vec(const Register state, const VectorRegister* hs, const int total_hs);
    3.29 +  void sha512_calc_2w(const VectorRegister w0, const VectorRegister w1,
    3.30 +      const VectorRegister w2, const VectorRegister w3,
    3.31 +      const VectorRegister w4, const VectorRegister w5,
    3.32 +      const VectorRegister w6, const VectorRegister w7,
    3.33 +      const VectorRegister kpw0, const VectorRegister kpw1, const Register j,
    3.34 +      const VectorRegister vRb, const Register k);
    3.35 +
    3.36 + public:
    3.37 +  void sha256(bool multi_block);
    3.38 +  void sha512(bool multi_block);
    3.39 +
    3.40 +
    3.41    //
    3.42    // Debugging
    3.43    //
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/src/cpu/ppc/vm/macroAssembler_ppc_sha.cpp	Tue Jun 18 09:33:34 2019 -0400
     4.3 @@ -0,0 +1,1136 @@
     4.4 +// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
     4.5 +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4.6 +//
     4.7 +// This code is free software; you can redistribute it and/or modify it
     4.8 +// under the terms of the GNU General Public License version 2 only, as
     4.9 +// published by the Free Software Foundation.
    4.10 +//
    4.11 +// This code is distributed in the hope that it will be useful, but WITHOUT
    4.12 +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    4.13 +// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    4.14 +// version 2 for more details (a copy is included in the LICENSE file that
    4.15 +// accompanied this code).
    4.16 +//
    4.17 +// You should have received a copy of the GNU General Public License version
    4.18 +// 2 along with this work; if not, write to the Free Software Foundation,
    4.19 +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    4.20 +//
    4.21 +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    4.22 +// or visit www.oracle.com if you need additional information or have any
    4.23 +// questions.
    4.24 +
    4.25 +// Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
    4.26 +// (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).
    4.27 +
    4.28 +#include "asm/macroAssembler.inline.hpp"
    4.29 +#include "runtime/stubRoutines.hpp"
    4.30 +
    4.31 +/**********************************************************************
    4.32 + * SHA 256
    4.33 + *********************************************************************/
    4.34 +
    4.35 +void MacroAssembler::sha256_deque(const VectorRegister src,
    4.36 +                                  const VectorRegister dst1,
    4.37 +                                  const VectorRegister dst2,
    4.38 +                                  const VectorRegister dst3) {
    4.39 +  vsldoi (dst1, src, src, 12);
    4.40 +  vsldoi (dst2, src, src, 8);
    4.41 +  vsldoi (dst3, src, src, 4);
    4.42 +}
    4.43 +
    4.44 +void MacroAssembler::sha256_round(const VectorRegister* hs,
    4.45 +                                  const int total_hs,
    4.46 +                                  int& h_cnt,
    4.47 +                                  const VectorRegister kpw) {
    4.48 +  // convenience registers: cycle from 0-7 downwards
    4.49 +  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
    4.50 +  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
    4.51 +  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
    4.52 +  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
    4.53 +  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
    4.54 +  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
    4.55 +  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
    4.56 +  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
    4.57 +  // temporaries
    4.58 +  VectorRegister ch  = VR0;
    4.59 +  VectorRegister maj = VR1;
    4.60 +  VectorRegister bsa = VR2;
    4.61 +  VectorRegister bse = VR3;
    4.62 +  VectorRegister vt0 = VR4;
    4.63 +  VectorRegister vt1 = VR5;
    4.64 +  VectorRegister vt2 = VR6;
    4.65 +  VectorRegister vt3 = VR7;
    4.66 +
    4.67 +  vsel       (ch,  g,   f, e);
    4.68 +  vxor       (maj, a,   b);
    4.69 +  vshasigmaw (bse, e,   1, 0xf);
    4.70 +  vadduwm    (vt2, ch,  kpw);
    4.71 +  vadduwm    (vt1, h,   bse);
    4.72 +  vsel       (maj, b,   c, maj);
    4.73 +  vadduwm    (vt3, vt1, vt2);
    4.74 +  vshasigmaw (bsa, a,   1, 0);
    4.75 +  vadduwm    (vt0, bsa, maj);
    4.76 +
    4.77 +  vadduwm    (d,   d,   vt3);
    4.78 +  vadduwm    (h,   vt3, vt0);
    4.79 +
    4.80 +  // advance vector pointer to the next iteration
    4.81 +  h_cnt++;
    4.82 +}
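
For orientation, a scalar sketch of what one round computes, with kpw standing for the precomputed K[t] + W[t] (illustrative C, not the vector implementation; the assembly avoids the a..h shuffle by rotating through the hs array instead):

    static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

    static void sha256_round_scalar(uint32_t s[8], uint32_t kpw) {
      uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
      uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
      uint32_t bse = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);  // vshasigmaw(bse, e, 1, 0xf)
      uint32_t bsa = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);  // vshasigmaw(bsa, a, 1, 0)
      uint32_t ch  = (e & f) ^ (~e & g);                            // vsel(ch, g, f, e)
      uint32_t maj = (a & b) ^ (a & c) ^ (b & c);                   // vxor + vsel
      uint32_t t1  = h + bse + ch + kpw;                            // vt3
      uint32_t t2  = bsa + maj;                                     // vt0
      s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;                  // vadduwm(d, d, vt3)
      s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;                 // vadduwm(h, vt3, vt0)
    }
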
    4.83 +
    4.84 +void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
    4.85 +                                       const VectorRegister e,
    4.86 +                                       const Register hptr) {
    4.87 +  // temporaries
    4.88 +  Register tmp = R8;
    4.89 +  VectorRegister vt0 = VR0;
    4.90 +  VectorRegister vRb = VR6;
    4.91 +  // labels
    4.92 +  Label sha256_aligned;
    4.93 +
    4.94 +  andi_  (tmp,  hptr, 0xf);
    4.95 +  lvx    (a,    hptr);
    4.96 +  addi   (tmp,  hptr, 16);
    4.97 +  lvx    (e,    tmp);
    4.98 +  beq    (CCR0, sha256_aligned);
    4.99 +
   4.100 +  // handle unaligned accesses
   4.101 +  load_perm(vRb, hptr);
   4.102 +  addi   (tmp, hptr, 32);
   4.103 +  vec_perm(a,   e,    vRb);
   4.104 +
   4.105 +  lvx    (vt0,  tmp);
   4.106 +  vec_perm(e,   vt0,  vRb);
   4.107 +
   4.108 +  // aligned accesses
   4.109 +  bind(sha256_aligned);
   4.110 +}
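
This is the lvx/load_perm/vec_perm idiom used throughout the file: lvx ignores the low four address bits, so unaligned state is read as two adjacent aligned quadwords and permuted into place (load_perm picks lvsr or lvsl so the same vec_perm works on either endianness). A scalar model of the net effect (illustrative, not HotSpot code):

    #include <stdint.h>

    // The 16 bytes starting at an arbitrary addr, assembled from the two
    // 16-byte-aligned quadwords that lvx would actually fetch.
    static void load_unaligned_16(const uint8_t* addr, uint8_t out[16]) {
      const uint8_t* base  = (const uint8_t*)((uintptr_t)addr & ~(uintptr_t)0xF);
      unsigned       shift = (unsigned)((uintptr_t)addr & 0xF);  // what lvsl/lvsr encode
      for (unsigned i = 0; i < 16; i++) {
        out[i] = base[shift + i];  // runs over into the second quadword
      }
    }
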
   4.111 +
   4.112 +void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
   4.113 +                                              const VectorRegister* ws,
   4.114 +                                              const int total_ws,
   4.115 +                                              const Register k,
   4.116 +                                              const VectorRegister* kpws,
   4.117 +                                              const int total_kpws) {
   4.118 +  Label w_aligned, after_w_load;
   4.119 +
   4.120 +  Register tmp       = R8;
   4.121 +  VectorRegister vt0 = VR0;
   4.122 +  VectorRegister vt1 = VR1;
   4.123 +  VectorRegister vRb = VR6;
   4.124 +
   4.125 +  andi_ (tmp, buf_in, 0xF);
   4.126 +  beq   (CCR0, w_aligned); // address ends with 0x0, not 0x8
   4.127 +
   4.128 +  // deal with unaligned addresses
   4.129 +  lvx    (ws[0], buf_in);
   4.130 +  load_perm(vRb, buf_in);
   4.131 +
   4.132 +  for (int n = 1; n < total_ws; n++) {
   4.133 +    VectorRegister w_cur = ws[n];
   4.134 +    VectorRegister w_prev = ws[n-1];
   4.135 +
   4.136 +    addi (tmp, buf_in, n * 16);
   4.137 +    lvx  (w_cur, tmp);
   4.138 +    vec_perm(w_prev, w_cur, vRb);
   4.139 +  }
   4.140 +  addi   (tmp, buf_in, total_ws * 16);
   4.141 +  lvx    (vt0, tmp);
   4.142 +  vec_perm(ws[total_ws-1], vt0, vRb);
   4.143 +  b      (after_w_load);
   4.144 +
   4.145 +  bind(w_aligned);
   4.146 +
   4.147 +  // deal with aligned addresses
   4.148 +  lvx(ws[0], buf_in);
   4.149 +  for (int n = 1; n < total_ws; n++) {
   4.150 +    VectorRegister w = ws[n];
   4.151 +    addi (tmp, buf_in, n * 16);
   4.152 +    lvx  (w, tmp);
   4.153 +  }
   4.154 +
   4.155 +  bind(after_w_load);
   4.156 +
   4.157 +#if defined(VM_LITTLE_ENDIAN)
   4.158 +  // Byte swapping within int values
   4.159 +  li       (tmp, 8);
   4.160 +  lvsl     (vt0, tmp);
   4.161 +  vspltisb (vt1, 0xb);
   4.162 +  vxor     (vt1, vt0, vt1);
   4.163 +  for (int n = 0; n < total_ws; n++) {
   4.164 +    VectorRegister w = ws[n];
   4.165 +    vec_perm(w, w, vt1);
   4.166 +  }
   4.167 +#endif
   4.168 +
    4.169 +  // Loading k, which is always 16-byte aligned
   4.170 +  lvx    (kpws[0], k);
   4.171 +  for (int n = 1; n < total_kpws; n++) {
   4.172 +    VectorRegister kpw = kpws[n];
   4.173 +    addi (tmp, k, 16 * n);
   4.174 +    lvx  (kpw, tmp);
   4.175 +  }
   4.176 +
   4.177 +  // Add w to K
   4.178 +  assert(total_ws == total_kpws, "Redesign the loop below");
   4.179 +  for (int n = 0; n < total_kpws; n++) {
   4.180 +    VectorRegister kpw = kpws[n];
   4.181 +    VectorRegister w   = ws[n];
   4.182 +
   4.183 +    vadduwm  (kpw, kpw, w);
   4.184 +  }
   4.185 +}
   4.186 +
   4.187 +void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
   4.188 +                                    const VectorRegister w1,
   4.189 +                                    const VectorRegister w2,
   4.190 +                                    const VectorRegister w3,
   4.191 +                                    const VectorRegister kpw0,
   4.192 +                                    const VectorRegister kpw1,
   4.193 +                                    const VectorRegister kpw2,
   4.194 +                                    const VectorRegister kpw3,
   4.195 +                                    const Register j,
   4.196 +                                    const Register k) {
   4.197 +  // Temporaries
   4.198 +  const VectorRegister  vt0  = VR0;
   4.199 +  const VectorRegister  vt1  = VR1;
   4.200 +  const VectorSRegister vsrt1 = vt1->to_vsr();
   4.201 +  const VectorRegister  vt2  = VR2;
   4.202 +  const VectorRegister  vt3  = VR3;
   4.203 +  const VectorSRegister vst3 = vt3->to_vsr();
   4.204 +  const VectorRegister  vt4  = VR4;
   4.205 +
   4.206 +  // load to k[j]
   4.207 +  lvx        (vt0, j,   k);
   4.208 +
   4.209 +  // advance j
   4.210 +  addi       (j,   j,   16); // 16 bytes were read
   4.211 +
   4.212 +#if defined(VM_LITTLE_ENDIAN)
   4.213 +  // b = w[j-15], w[j-14], w[j-13], w[j-12]
   4.214 +  vsldoi     (vt1, w1,  w0, 12);
   4.215 +
   4.216 +  // c = w[j-7], w[j-6], w[j-5], w[j-4]
   4.217 +  vsldoi     (vt2, w3,  w2, 12);
   4.218 +
   4.219 +#else
   4.220 +  // b = w[j-15], w[j-14], w[j-13], w[j-12]
   4.221 +  vsldoi     (vt1, w0,  w1, 4);
   4.222 +
   4.223 +  // c = w[j-7], w[j-6], w[j-5], w[j-4]
   4.224 +  vsldoi     (vt2, w2,  w3, 4);
   4.225 +#endif
   4.226 +
   4.227 +  // d = w[j-2], w[j-1], w[j-4], w[j-3]
   4.228 +  vsldoi     (vt3, w3,  w3, 8);
   4.229 +
   4.230 +  // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
   4.231 +  vshasigmaw (vt1, vt1, 0,  0);
   4.232 +
   4.233 +  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
   4.234 +  vshasigmaw (vt3, vt3, 0,  0xf);
   4.235 +
   4.236 +  // c = s0(w[j-15]) + w[j-7],
   4.237 +  //     s0(w[j-14]) + w[j-6],
   4.238 +  //     s0(w[j-13]) + w[j-5],
   4.239 +  //     s0(w[j-12]) + w[j-4]
   4.240 +  vadduwm    (vt2, vt1, vt2);
   4.241 +
   4.242 +  // c = s0(w[j-15]) + w[j-7] + w[j-16],
   4.243 +  //     s0(w[j-14]) + w[j-6] + w[j-15],
   4.244 +  //     s0(w[j-13]) + w[j-5] + w[j-14],
   4.245 +  //     s0(w[j-12]) + w[j-4] + w[j-13]
   4.246 +  vadduwm    (vt2, vt2, w0);
   4.247 +
   4.248 +  // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
   4.249 +  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
   4.250 +  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
   4.251 +  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
   4.252 +  vadduwm    (vt4, vt2, vt3);
   4.253 +
   4.254 +  // At this point, e[0] and e[1] are the correct values to be stored at w[j]
   4.255 +  // and w[j+1].
   4.256 +  // e[2] and e[3] are not considered.
    4.257 +  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
   4.258 +  vshasigmaw (vt1, vt4, 0,  0xf);
   4.259 +
   4.260 +  // v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
   4.261 +#if defined(VM_LITTLE_ENDIAN)
   4.262 +  xxmrgld    (vst3, vsrt1, vst3);
   4.263 +#else
   4.264 +  xxmrghd    (vst3, vst3, vsrt1);
   4.265 +#endif
   4.266 +
   4.267 +  // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
   4.268 +  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
   4.269 +  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
    4.270 +  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
   4.271 +  vadduwm    (vt2, vt2, vt3);
   4.272 +
   4.273 +  // Updating w0 to w3 to hold the new previous 16 values from w.
   4.274 +  vmr        (w0,  w1);
   4.275 +  vmr        (w1,  w2);
   4.276 +  vmr        (w2,  w3);
   4.277 +  vmr        (w3,  vt2);
   4.278 +
   4.279 +  // store k + w to v9 (4 values at once)
   4.280 +#if defined(VM_LITTLE_ENDIAN)
   4.281 +  vadduwm    (kpw0, vt2, vt0);
   4.282 +
   4.283 +  vsldoi     (kpw1, kpw0, kpw0, 12);
   4.284 +  vsldoi     (kpw2, kpw0, kpw0, 8);
   4.285 +  vsldoi     (kpw3, kpw0, kpw0, 4);
   4.286 +#else
   4.287 +  vadduwm    (kpw3, vt2, vt0);
   4.288 +
   4.289 +  vsldoi     (kpw2, kpw3, kpw3, 12);
   4.290 +  vsldoi     (kpw1, kpw3, kpw3, 8);
   4.291 +  vsldoi     (kpw0, kpw3, kpw3, 4);
   4.292 +#endif
   4.293 +}
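
The window arithmetic above implements the standard SHA-256 schedule recurrence four elements at a time; the second vshasigmaw pass is needed because w[j+2] and w[j+3] depend on the freshly produced w[j] and w[j+1]. The scalar recurrence, for reference (illustrative):

    static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

    // w[j] for 16 <= j < 64, from the 16 preceding schedule words.
    static uint32_t sha256_next_w(const uint32_t* w, int j) {
      uint32_t s0 = rotr32(w[j-15],  7) ^ rotr32(w[j-15], 18) ^ (w[j-15] >>  3);
      uint32_t s1 = rotr32(w[j- 2], 17) ^ rotr32(w[j- 2], 19) ^ (w[j- 2] >> 10);
      return w[j-16] + s0 + w[j-7] + s1;
    }
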
   4.294 +
   4.295 +void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
   4.296 +                                             const VectorRegister b_,
   4.297 +                                             const VectorRegister c,
   4.298 +                                             const VectorRegister d,
   4.299 +                                             const VectorRegister e,
   4.300 +                                             const VectorRegister f,
   4.301 +                                             const VectorRegister g,
   4.302 +                                             const VectorRegister h,
   4.303 +                                             const Register hptr) {
   4.304 +  // temporaries
   4.305 +  VectorRegister vt0  = VR0;
   4.306 +  VectorRegister vt1  = VR1;
   4.307 +  VectorRegister vt2  = VR2;
   4.308 +  VectorRegister vt3  = VR3;
   4.309 +  VectorRegister vt4  = VR4;
   4.310 +  VectorRegister vt5  = VR5;
   4.311 +  VectorRegister vaux = VR6;
   4.312 +  VectorRegister vRb  = VR6;
   4.313 +  Register tmp        = R8;
   4.314 +  Register of16       = R8;
   4.315 +  Register of32       = R9;
   4.316 +  Label state_load_aligned;
   4.317 +
   4.318 +  // Load hptr
   4.319 +  andi_   (tmp, hptr, 0xf);
   4.320 +  li      (of16, 16);
   4.321 +  lvx     (vt0, hptr);
   4.322 +  lvx     (vt5, of16, hptr);
   4.323 +  beq     (CCR0, state_load_aligned);
   4.324 +
   4.325 +  // handle unaligned accesses
   4.326 +  li      (of32, 32);
   4.327 +  load_perm(vRb, hptr);
   4.328 +
   4.329 +  vec_perm(vt0, vt5,  vRb);        // vt0 = hptr[0]..hptr[3]
   4.330 +
   4.331 +  lvx     (vt1, hptr, of32);
   4.332 +  vec_perm(vt5, vt1,  vRb);        // vt5 = hptr[4]..hptr[7]
   4.333 +
   4.334 +  // aligned accesses
   4.335 +  bind(state_load_aligned);
   4.336 +
   4.337 +#if defined(VM_LITTLE_ENDIAN)
   4.338 +  vmrglw  (vt1, b_, a);            // vt1 = {a, b, ?, ?}
   4.339 +  vmrglw  (vt2, d, c);             // vt2 = {c, d, ?, ?}
   4.340 +  vmrglw  (vt3, f, e);             // vt3 = {e, f, ?, ?}
   4.341 +  vmrglw  (vt4, h, g);             // vt4 = {g, h, ?, ?}
   4.342 +  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
   4.343 +  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
   4.344 +  vadduwm (a,   vt0, vt1);         // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
   4.345 +  vadduwm (e,   vt5, vt3);         // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
   4.346 +
   4.347 +  // Save hptr back, works for any alignment
   4.348 +  xxswapd (vt0->to_vsr(), a->to_vsr());
   4.349 +  stxvd2x (vt0->to_vsr(), hptr);
   4.350 +  xxswapd (vt5->to_vsr(), e->to_vsr());
   4.351 +  stxvd2x (vt5->to_vsr(), of16, hptr);
   4.352 +#else
   4.353 +  vmrglw  (vt1, a, b_);            // vt1 = {a, b, ?, ?}
   4.354 +  vmrglw  (vt2, c, d);             // vt2 = {c, d, ?, ?}
   4.355 +  vmrglw  (vt3, e, f);             // vt3 = {e, f, ?, ?}
   4.356 +  vmrglw  (vt4, g, h);             // vt4 = {g, h, ?, ?}
   4.357 +  xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
   4.358 +  xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
   4.359 +  vadduwm (d,   vt0, vt1);         // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
   4.360 +  vadduwm (h,   vt5, vt3);         // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
   4.361 +
   4.362 +  // Save hptr back, works for any alignment
   4.363 +  stxvd2x (d->to_vsr(), hptr);
   4.364 +  stxvd2x (h->to_vsr(), of16, hptr);
   4.365 +#endif
   4.366 +}
   4.367 +
   4.368 +static const uint32_t sha256_round_table[64] __attribute((aligned(16))) = {
   4.369 +  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
   4.370 +  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
   4.371 +  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
   4.372 +  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
   4.373 +  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
   4.374 +  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
   4.375 +  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
   4.376 +  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
   4.377 +  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
   4.378 +  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
   4.379 +  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
   4.380 +  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
   4.381 +  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
   4.382 +  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
   4.383 +  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
   4.384 +  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
   4.385 +};
   4.386 +static const uint32_t *sha256_round_consts = sha256_round_table;
   4.387 +
   4.388 +//   R3_ARG1   - byte[]  Input string with padding but in Big Endian
   4.389 +//   R4_ARG2   - int[]   SHA.state (at first, the root of primes)
   4.390 +//   R5_ARG3   - int     offset
   4.391 +//   R6_ARG4   - int     limit
   4.392 +//
   4.393 +//   Internal Register usage:
   4.394 +//   R7        - k
   4.395 +//   R8        - tmp | j | of16
   4.396 +//   R9        - of32
   4.397 +//   VR0-VR8   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
   4.398 +//   VR9-VR16  - a-h
   4.399 +//   VR17-VR20 - w0-w3
   4.400 +//   VR21-VR23 - vRb | vaux0-vaux2
   4.401 +//   VR24-VR27 - kpw0-kpw3
   4.402 +void MacroAssembler::sha256(bool multi_block) {
   4.403 +  static const ssize_t buf_size = 64;
   4.404 +  static const uint8_t w_size = sizeof(sha256_round_table)/sizeof(uint32_t);
   4.405 +#ifdef AIX
   4.406 +  // malloc provides 16 byte alignment
   4.407 +  if (((uintptr_t)sha256_round_consts & 0xF) != 0) {
   4.408 +    uint32_t *new_round_consts = (uint32_t*)malloc(sizeof(sha256_round_table));
   4.409 +    guarantee(new_round_consts, "oom");
   4.410 +    memcpy(new_round_consts, sha256_round_consts, sizeof(sha256_round_table));
   4.411 +    sha256_round_consts = (const uint32_t*)new_round_consts;
   4.412 +  }
   4.413 +#endif
   4.414 +
   4.415 +  Register buf_in = R3_ARG1;
   4.416 +  Register state  = R4_ARG2;
   4.417 +  Register ofs    = R5_ARG3;
   4.418 +  Register limit  = R6_ARG4;
   4.419 +
   4.420 +  Label sha_loop, core_loop;
   4.421 +
   4.422 +  // Save non-volatile vector registers in the red zone
   4.423 +  static const VectorRegister nv[] = {
   4.424 +    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
   4.425 +  };
   4.426 +  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
   4.427 +
   4.428 +  for (int c = 0; c < nv_size; c++) {
   4.429 +    Register tmp = R8;
   4.430 +    li  (tmp, (c - (nv_size)) * 16);
   4.431 +    stvx(nv[c], tmp, R1);
   4.432 +  }
   4.433 +
   4.434 +  // Load hash state to registers
   4.435 +  VectorRegister a = VR9;
   4.436 +  VectorRegister b = VR10;
   4.437 +  VectorRegister c = VR11;
   4.438 +  VectorRegister d = VR12;
   4.439 +  VectorRegister e = VR13;
   4.440 +  VectorRegister f = VR14;
   4.441 +  VectorRegister g = VR15;
   4.442 +  VectorRegister h = VR16;
   4.443 +  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
   4.444 +  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
   4.445 +  // counter for cycling through hs vector to avoid register moves between iterations
   4.446 +  int h_cnt = 0;
   4.447 +
   4.448 +  // Load a-h registers from the memory pointed by state
   4.449 +#if defined(VM_LITTLE_ENDIAN)
   4.450 +  sha256_load_h_vec(a, e, state);
   4.451 +#else
   4.452 +  sha256_load_h_vec(d, h, state);
   4.453 +#endif
   4.454 +
   4.455 +  // keep k loaded also during MultiBlock loops
   4.456 +  Register k = R7;
   4.457 +  assert(((uintptr_t)sha256_round_consts & 0xF) == 0, "k alignment");
   4.458 +  load_const_optimized(k, (address)sha256_round_consts, R0);
   4.459 +
   4.460 +  // Avoiding redundant loads
   4.461 +  if (multi_block) {
   4.462 +    align(OptoLoopAlignment);
   4.463 +  }
   4.464 +  bind(sha_loop);
   4.465 +#if defined(VM_LITTLE_ENDIAN)
   4.466 +  sha256_deque(a, b, c, d);
   4.467 +  sha256_deque(e, f, g, h);
   4.468 +#else
   4.469 +  sha256_deque(d, c, b, a);
   4.470 +  sha256_deque(h, g, f, e);
   4.471 +#endif
   4.472 +
   4.473 +  // Load 16 elements from w out of the loop.
    4.474 +  // Order of the int values is endianness-specific.
   4.475 +  VectorRegister w0 = VR17;
   4.476 +  VectorRegister w1 = VR18;
   4.477 +  VectorRegister w2 = VR19;
   4.478 +  VectorRegister w3 = VR20;
   4.479 +  static const VectorRegister ws[] = {w0, w1, w2, w3};
   4.480 +  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
   4.481 +
   4.482 +  VectorRegister kpw0 = VR24;
   4.483 +  VectorRegister kpw1 = VR25;
   4.484 +  VectorRegister kpw2 = VR26;
   4.485 +  VectorRegister kpw3 = VR27;
   4.486 +  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
   4.487 +  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);
   4.488 +
   4.489 +  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);
   4.490 +
   4.491 +  // Cycle through the first 16 elements
   4.492 +  assert(total_ws == total_kpws, "Redesign the loop below");
   4.493 +  for (int n = 0; n < total_ws; n++) {
   4.494 +    VectorRegister vaux0 = VR21;
   4.495 +    VectorRegister vaux1 = VR22;
   4.496 +    VectorRegister vaux2 = VR23;
   4.497 +
   4.498 +    sha256_deque(kpws[n], vaux0, vaux1, vaux2);
   4.499 +
   4.500 +#if defined(VM_LITTLE_ENDIAN)
   4.501 +    sha256_round(hs, total_hs, h_cnt, kpws[n]);
   4.502 +    sha256_round(hs, total_hs, h_cnt, vaux0);
   4.503 +    sha256_round(hs, total_hs, h_cnt, vaux1);
   4.504 +    sha256_round(hs, total_hs, h_cnt, vaux2);
   4.505 +#else
   4.506 +    sha256_round(hs, total_hs, h_cnt, vaux2);
   4.507 +    sha256_round(hs, total_hs, h_cnt, vaux1);
   4.508 +    sha256_round(hs, total_hs, h_cnt, vaux0);
   4.509 +    sha256_round(hs, total_hs, h_cnt, kpws[n]);
   4.510 +#endif
   4.511 +  }
   4.512 +
   4.513 +  Register tmp = R8;
   4.514 +  // loop the 16th to the 64th iteration by 8 steps
   4.515 +  li   (tmp, (w_size - 16) / total_hs);
   4.516 +  mtctr(tmp);
   4.517 +
   4.518 +  // j will be aligned to 4 for loading words.
   4.519 +  // Whenever read, advance the pointer (e.g: when j is used in a function)
   4.520 +  Register j = R8;
   4.521 +  li   (j, 16*4);
   4.522 +
   4.523 +  align(OptoLoopAlignment);
   4.524 +  bind(core_loop);
   4.525 +
   4.526 +  // due to VectorRegister rotate, always iterate in multiples of total_hs
   4.527 +  for (int n = 0; n < total_hs/4; n++) {
   4.528 +    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
   4.529 +    sha256_round(hs, total_hs, h_cnt, kpw0);
   4.530 +    sha256_round(hs, total_hs, h_cnt, kpw1);
   4.531 +    sha256_round(hs, total_hs, h_cnt, kpw2);
   4.532 +    sha256_round(hs, total_hs, h_cnt, kpw3);
   4.533 +  }
   4.534 +
   4.535 +  bdnz   (core_loop);
   4.536 +
   4.537 +  // Update hash state
   4.538 +  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);
   4.539 +
   4.540 +  if (multi_block) {
   4.541 +    addi(buf_in, buf_in, buf_size);
   4.542 +    addi(ofs, ofs, buf_size);
   4.543 +    cmplw(CCR0, ofs, limit);
   4.544 +    ble(CCR0, sha_loop);
   4.545 +
   4.546 +    // return ofs
   4.547 +    mr(R3_RET, ofs);
   4.548 +  }
   4.549 +
   4.550 +  // Restore non-volatile registers
   4.551 +  for (int c = 0; c < nv_size; c++) {
   4.552 +    Register tmp = R8;
   4.553 +    li  (tmp, (c - (nv_size)) * 16);
   4.554 +    lvx(nv[c], tmp, R1);
   4.555 +  }
   4.556 +}
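
A note on the save/restore loops: the non-volatile VRs are spilled at negative offsets from R1 instead of into a frame. Sketch of the addressing (illustrative arithmetic only):

    // slot for nv[c]:  R1 + (c - nv_size) * 16,  c in [0, nv_size)
    // With nv_size == 8 this spans R1-128 .. R1-16, inside the protected
    // zone the 64-bit PowerPC ABIs leave untouched below the stack
    // pointer, so the leaf stub needs no frame of its own.
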
   4.557 +
   4.558 +
   4.559 +/**********************************************************************
   4.560 + * SHA 512
   4.561 + *********************************************************************/
   4.562 +
   4.563 +void MacroAssembler::sha512_load_w_vec(const Register buf_in,
   4.564 +                                       const VectorRegister* ws,
   4.565 +                                       const int total_ws) {
   4.566 +  Register tmp       = R8;
   4.567 +  VectorRegister vRb = VR8;
   4.568 +  VectorRegister aux = VR9;
   4.569 +  Label is_aligned, after_alignment;
   4.570 +
   4.571 +  andi_  (tmp, buf_in, 0xF);
   4.572 +  beq    (CCR0, is_aligned); // address ends with 0x0, not 0x8
   4.573 +
   4.574 +  // deal with unaligned addresses
   4.575 +  lvx    (ws[0], buf_in);
   4.576 +  load_perm(vRb, buf_in);
   4.577 +
   4.578 +  for (int n = 1; n < total_ws; n++) {
   4.579 +    VectorRegister w_cur = ws[n];
   4.580 +    VectorRegister w_prev = ws[n-1];
   4.581 +    addi (tmp, buf_in, n * 16);
   4.582 +    lvx  (w_cur, tmp);
   4.583 +    vec_perm(w_prev, w_cur, vRb);
   4.584 +  }
   4.585 +  addi   (tmp, buf_in, total_ws * 16);
   4.586 +  lvx    (aux, tmp);
   4.587 +  vec_perm(ws[total_ws-1], aux, vRb);
   4.588 +  b      (after_alignment);
   4.589 +
   4.590 +  bind(is_aligned);
   4.591 +  lvx  (ws[0], buf_in);
   4.592 +  for (int n = 1; n < total_ws; n++) {
   4.593 +    VectorRegister w = ws[n];
   4.594 +    addi (tmp, buf_in, n * 16);
   4.595 +    lvx  (w, tmp);
   4.596 +  }
   4.597 +
   4.598 +  bind(after_alignment);
   4.599 +}
   4.600 +
   4.601 +// Update hash state
   4.602 +void MacroAssembler::sha512_update_sha_state(const Register state,
   4.603 +                                             const VectorRegister* hs,
   4.604 +                                             const int total_hs) {
   4.605 +
   4.606 +#if defined(VM_LITTLE_ENDIAN)
   4.607 +  int start_idx = 0;
   4.608 +#else
   4.609 +  int start_idx = 1;
   4.610 +#endif
   4.611 +
   4.612 +  // load initial hash from the memory pointed by state
   4.613 +  VectorRegister ini_a = VR10;
   4.614 +  VectorRegister ini_c = VR12;
   4.615 +  VectorRegister ini_e = VR14;
   4.616 +  VectorRegister ini_g = VR16;
   4.617 +  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
   4.618 +  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);
   4.619 +
   4.620 +  Label state_save_aligned, after_state_save_aligned;
   4.621 +
   4.622 +  Register addr      = R7;
   4.623 +  Register tmp       = R8;
   4.624 +  VectorRegister vRb = VR8;
   4.625 +  VectorRegister aux = VR9;
   4.626 +
   4.627 +  andi_(tmp, state, 0xf);
   4.628 +  beq(CCR0, state_save_aligned);
   4.629 +  // deal with unaligned addresses
   4.630 +
   4.631 +  {
   4.632 +    VectorRegister a = hs[0];
   4.633 +    VectorRegister b_ = hs[1];
   4.634 +    VectorRegister c = hs[2];
   4.635 +    VectorRegister d = hs[3];
   4.636 +    VectorRegister e = hs[4];
   4.637 +    VectorRegister f = hs[5];
   4.638 +    VectorRegister g = hs[6];
   4.639 +    VectorRegister h = hs[7];
   4.640 +    load_perm(vRb, state);
   4.641 +    lvx    (ini_a, state);
   4.642 +    addi   (addr, state, 16);
   4.643 +
   4.644 +    lvx    (ini_c, addr);
   4.645 +    addi   (addr, state, 32);
   4.646 +    vec_perm(ini_a, ini_c, vRb);
   4.647 +
   4.648 +    lvx    (ini_e, addr);
   4.649 +    addi   (addr, state, 48);
   4.650 +    vec_perm(ini_c, ini_e, vRb);
   4.651 +
   4.652 +    lvx    (ini_g, addr);
   4.653 +    addi   (addr, state, 64);
   4.654 +    vec_perm(ini_e, ini_g, vRb);
   4.655 +
   4.656 +    lvx    (aux, addr);
   4.657 +    vec_perm(ini_g, aux, vRb);
   4.658 +
   4.659 +#if defined(VM_LITTLE_ENDIAN)
   4.660 +    xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
   4.661 +    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
   4.662 +    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
   4.663 +    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
   4.664 +#else
   4.665 +    xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
   4.666 +    xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
   4.667 +    xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
   4.668 +    xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
   4.669 +#endif
   4.670 +
   4.671 +    for (int n = start_idx; n < total_hs; n += 2) {
   4.672 +      VectorRegister h_cur = hs[n];
   4.673 +      VectorRegister ini_cur = inis[n/2];
   4.674 +
   4.675 +      vaddudm(h_cur, ini_cur, h_cur);
   4.676 +    }
   4.677 +
   4.678 +    for (int n = start_idx; n < total_hs; n += 2) {
   4.679 +      VectorRegister h_cur = hs[n];
   4.680 +
   4.681 +      mfvrd  (tmp, h_cur);
   4.682 +#if defined(VM_LITTLE_ENDIAN)
   4.683 +      std    (tmp, 8*n + 8, state);
   4.684 +#else
   4.685 +      std    (tmp, 8*n - 8, state);
   4.686 +#endif
   4.687 +      vsldoi (aux, h_cur, h_cur, 8);
   4.688 +      mfvrd  (tmp, aux);
   4.689 +      std    (tmp, 8*n + 0, state);
   4.690 +    }
   4.691 +
   4.692 +    b      (after_state_save_aligned);
   4.693 +  }
   4.694 +
   4.695 +  bind(state_save_aligned);
   4.696 +  {
   4.697 +    for (int n = 0; n < total_hs; n += 2) {
   4.698 +#if defined(VM_LITTLE_ENDIAN)
   4.699 +      VectorRegister h_cur = hs[n];
   4.700 +      VectorRegister h_next = hs[n+1];
   4.701 +#else
   4.702 +      VectorRegister h_cur = hs[n+1];
   4.703 +      VectorRegister h_next = hs[n];
   4.704 +#endif
   4.705 +      VectorRegister ini_cur = inis[n/2];
   4.706 +
   4.707 +      if (n/2 == 0) {
   4.708 +        lvx(ini_cur, state);
   4.709 +      } else {
   4.710 +        addi(addr, state, (n/2) * 16);
   4.711 +        lvx(ini_cur, addr);
   4.712 +      }
   4.713 +      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
   4.714 +    }
   4.715 +
   4.716 +    for (int n = start_idx; n < total_hs; n += 2) {
   4.717 +      VectorRegister h_cur = hs[n];
   4.718 +      VectorRegister ini_cur = inis[n/2];
   4.719 +
   4.720 +      vaddudm(h_cur, ini_cur, h_cur);
   4.721 +    }
   4.722 +
   4.723 +    for (int n = start_idx; n < total_hs; n += 2) {
   4.724 +      VectorRegister h_cur = hs[n];
   4.725 +
   4.726 +      if (n/2 == 0) {
   4.727 +        stvx(h_cur, state);
   4.728 +      } else {
   4.729 +        addi(addr, state, (n/2) * 16);
   4.730 +        stvx(h_cur, addr);
   4.731 +      }
   4.732 +    }
   4.733 +  }
   4.734 +
   4.735 +  bind(after_state_save_aligned);
   4.736 +}
   4.737 +
   4.738 +// Use h_cnt to cycle through hs elements but also increment it at the end
   4.739 +void MacroAssembler::sha512_round(const VectorRegister* hs,
   4.740 +                                  const int total_hs, int& h_cnt,
   4.741 +                                  const VectorRegister kpw) {
   4.742 +
   4.743 +  // convenience registers: cycle from 0-7 downwards
   4.744 +  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
   4.745 +  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
   4.746 +  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
   4.747 +  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
   4.748 +  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
   4.749 +  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
   4.750 +  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
   4.751 +  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
   4.752 +  // temporaries
   4.753 +  const VectorRegister Ch   = VR20;
   4.754 +  const VectorRegister Maj  = VR21;
   4.755 +  const VectorRegister bsa  = VR22;
   4.756 +  const VectorRegister bse  = VR23;
   4.757 +  const VectorRegister tmp1 = VR24;
   4.758 +  const VectorRegister tmp2 = VR25;
   4.759 +
   4.760 +  vsel      (Ch,   g,    f,   e);
   4.761 +  vxor      (Maj,  a,    b);
   4.762 +  vshasigmad(bse,  e,    1,   0xf);
   4.763 +  vaddudm   (tmp2, Ch,   kpw);
   4.764 +  vaddudm   (tmp1, h,    bse);
   4.765 +  vsel      (Maj,  b,    c,   Maj);
   4.766 +  vaddudm   (tmp1, tmp1, tmp2);
   4.767 +  vshasigmad(bsa,  a,    1,   0);
   4.768 +  vaddudm   (tmp2, bsa,  Maj);
   4.769 +  vaddudm   (d,    d,    tmp1);
   4.770 +  vaddudm   (h,    tmp1, tmp2);
   4.771 +
   4.772 +  // advance vector pointer to the next iteration
   4.773 +  h_cnt++;
   4.774 +}
   4.775 +
   4.776 +void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
   4.777 +                                    const VectorRegister w1,
   4.778 +                                    const VectorRegister w2,
   4.779 +                                    const VectorRegister w3,
   4.780 +                                    const VectorRegister w4,
   4.781 +                                    const VectorRegister w5,
   4.782 +                                    const VectorRegister w6,
   4.783 +                                    const VectorRegister w7,
   4.784 +                                    const VectorRegister kpw0,
   4.785 +                                    const VectorRegister kpw1,
   4.786 +                                    const Register j,
   4.787 +                                    const VectorRegister vRb,
   4.788 +                                    const Register k) {
   4.789 +  // Temporaries
   4.790 +  const VectorRegister VR_a = VR20;
   4.791 +  const VectorRegister VR_b = VR21;
   4.792 +  const VectorRegister VR_c = VR22;
   4.793 +  const VectorRegister VR_d = VR23;
   4.794 +
   4.795 +  // load to k[j]
   4.796 +  lvx        (VR_a, j,    k);
   4.797 +  // advance j
   4.798 +  addi       (j,    j,    16); // 16 bytes were read
   4.799 +
   4.800 +#if defined(VM_LITTLE_ENDIAN)
   4.801 +  // v6 = w[j-15], w[j-14]
   4.802 +  vperm      (VR_b, w1,   w0,  vRb);
   4.803 +  // v12 = w[j-7], w[j-6]
   4.804 +  vperm      (VR_c, w5,   w4,  vRb);
   4.805 +#else
   4.806 +  // v6 = w[j-15], w[j-14]
   4.807 +  vperm      (VR_b, w0,   w1,  vRb);
   4.808 +  // v12 = w[j-7], w[j-6]
   4.809 +  vperm      (VR_c, w4,   w5,  vRb);
   4.810 +#endif
   4.811 +
   4.812 +  // v6 = s0(w[j-15]) , s0(w[j-14])
   4.813 +  vshasigmad (VR_b, VR_b,    0,   0);
   4.814 +  // v5 = s1(w[j-2]) , s1(w[j-1])
   4.815 +  vshasigmad (VR_d, w7,      0,   0xf);
   4.816 +  // v6 = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
   4.817 +  vaddudm    (VR_b, VR_b, VR_c);
   4.818 +  // v8 = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
   4.819 +  vaddudm    (VR_d, VR_d, w0);
   4.820 +  // v9 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
   4.821 +  //      s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
   4.822 +  vaddudm    (VR_c, VR_d, VR_b);
   4.823 +  // Updating w0 to w7 to hold the new previous 16 values from w.
   4.824 +  vmr        (w0,   w1);
   4.825 +  vmr        (w1,   w2);
   4.826 +  vmr        (w2,   w3);
   4.827 +  vmr        (w3,   w4);
   4.828 +  vmr        (w4,   w5);
   4.829 +  vmr        (w5,   w6);
   4.830 +  vmr        (w6,   w7);
   4.831 +  vmr        (w7,   VR_c);
   4.832 +
   4.833 +#if defined(VM_LITTLE_ENDIAN)
   4.834 +  // store k + w to kpw0 (2 values at once)
   4.835 +  vaddudm    (kpw0, VR_c, VR_a);
   4.836 +  // kpw1 holds (k + w)[1]
   4.837 +  vsldoi     (kpw1, kpw0, kpw0, 8);
   4.838 +#else
   4.839 +  // store k + w to kpw0 (2 values at once)
   4.840 +  vaddudm    (kpw1, VR_c, VR_a);
   4.841 +  // kpw1 holds (k + w)[1]
   4.842 +  vsldoi     (kpw0, kpw1, kpw1, 8);
   4.843 +#endif
   4.844 +}
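
sha512_calc_2w applies the same recurrence with 64-bit words and SHA-512 rotation counts, two lanes per call. Scalar form (illustrative):

    static inline uint64_t rotr64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }

    // w[j] for 16 <= j < 80, from the 16 preceding schedule words.
    static uint64_t sha512_next_w(const uint64_t* w, int j) {
      uint64_t s0 = rotr64(w[j-15],  1) ^ rotr64(w[j-15],  8) ^ (w[j-15] >> 7);
      uint64_t s1 = rotr64(w[j- 2], 19) ^ rotr64(w[j- 2], 61) ^ (w[j- 2] >> 6);
      return w[j-16] + s0 + w[j-7] + s1;
    }
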
   4.845 +
   4.846 +void MacroAssembler::sha512_load_h_vec(const Register state,
   4.847 +                                       const VectorRegister* hs,
   4.848 +                                       const int total_hs) {
   4.849 +#if defined(VM_LITTLE_ENDIAN)
   4.850 +  VectorRegister a   = hs[0];
   4.851 +  VectorRegister g   = hs[6];
   4.852 +  int start_idx = 0;
   4.853 +#else
   4.854 +  VectorRegister a   = hs[1];
   4.855 +  VectorRegister g   = hs[7];
   4.856 +  int start_idx = 1;
   4.857 +#endif
   4.858 +
   4.859 +  Register addr      = R7;
   4.860 +  VectorRegister vRb = VR8;
   4.861 +  Register tmp       = R8;
   4.862 +  Label state_aligned, after_state_aligned;
   4.863 +
   4.864 +  andi_(tmp, state, 0xf);
   4.865 +  beq(CCR0, state_aligned);
   4.866 +
   4.867 +  // deal with unaligned addresses
   4.868 +  VectorRegister aux = VR9;
   4.869 +
   4.870 +  lvx(hs[start_idx], state);
   4.871 +  load_perm(vRb, state);
   4.872 +
   4.873 +  for (int n = start_idx + 2; n < total_hs; n += 2) {
   4.874 +    VectorRegister h_cur   = hs[n];
   4.875 +    VectorRegister h_prev2 = hs[n - 2];
   4.876 +    addi(addr, state, (n/2) * 16);
   4.877 +    lvx(h_cur, addr);
   4.878 +    vec_perm(h_prev2, h_cur, vRb);
   4.879 +  }
   4.880 +  addi(addr, state, (total_hs/2) * 16);
   4.881 +  lvx    (aux, addr);
   4.882 +  vec_perm(hs[total_hs - 2 + start_idx], aux, vRb);
   4.883 +  b      (after_state_aligned);
   4.884 +
   4.885 +  bind(state_aligned);
   4.886 +
   4.887 +  // deal with aligned addresses
   4.888 +  lvx(hs[start_idx], state);
   4.889 +
   4.890 +  for (int n = start_idx + 2; n < total_hs; n += 2) {
   4.891 +    VectorRegister h_cur = hs[n];
   4.892 +    addi(addr, state, (n/2) * 16);
   4.893 +    lvx(h_cur, addr);
   4.894 +  }
   4.895 +
   4.896 +  bind(after_state_aligned);
   4.897 +}
   4.898 +
   4.899 +static const uint64_t sha512_round_table[80] __attribute((aligned(16))) = {
   4.900 +  0x428a2f98d728ae22, 0x7137449123ef65cd,
   4.901 +  0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
   4.902 +  0x3956c25bf348b538, 0x59f111f1b605d019,
   4.903 +  0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
   4.904 +  0xd807aa98a3030242, 0x12835b0145706fbe,
   4.905 +  0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
   4.906 +  0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
   4.907 +  0x9bdc06a725c71235, 0xc19bf174cf692694,
   4.908 +  0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
   4.909 +  0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
   4.910 +  0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
   4.911 +  0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
   4.912 +  0x983e5152ee66dfab, 0xa831c66d2db43210,
   4.913 +  0xb00327c898fb213f, 0xbf597fc7beef0ee4,
   4.914 +  0xc6e00bf33da88fc2, 0xd5a79147930aa725,
   4.915 +  0x06ca6351e003826f, 0x142929670a0e6e70,
   4.916 +  0x27b70a8546d22ffc, 0x2e1b21385c26c926,
   4.917 +  0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
   4.918 +  0x650a73548baf63de, 0x766a0abb3c77b2a8,
   4.919 +  0x81c2c92e47edaee6, 0x92722c851482353b,
   4.920 +  0xa2bfe8a14cf10364, 0xa81a664bbc423001,
   4.921 +  0xc24b8b70d0f89791, 0xc76c51a30654be30,
   4.922 +  0xd192e819d6ef5218, 0xd69906245565a910,
   4.923 +  0xf40e35855771202a, 0x106aa07032bbd1b8,
   4.924 +  0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
   4.925 +  0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
   4.926 +  0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
   4.927 +  0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
   4.928 +  0x748f82ee5defb2fc, 0x78a5636f43172f60,
   4.929 +  0x84c87814a1f0ab72, 0x8cc702081a6439ec,
   4.930 +  0x90befffa23631e28, 0xa4506cebde82bde9,
   4.931 +  0xbef9a3f7b2c67915, 0xc67178f2e372532b,
   4.932 +  0xca273eceea26619c, 0xd186b8c721c0c207,
   4.933 +  0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
   4.934 +  0x06f067aa72176fba, 0x0a637dc5a2c898a6,
   4.935 +  0x113f9804bef90dae, 0x1b710b35131c471b,
   4.936 +  0x28db77f523047d84, 0x32caab7b40c72493,
   4.937 +  0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
   4.938 +  0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
   4.939 +  0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
   4.940 +};
   4.941 +static const uint64_t *sha512_round_consts = sha512_round_table;
   4.942 +
   4.943 +//   R3_ARG1   - byte[]  Input string with padding but in Big Endian
   4.944 +//   R4_ARG2   - int[]   SHA.state (at first, the root of primes)
   4.945 +//   R5_ARG3   - int     offset
   4.946 +//   R6_ARG4   - int     limit
   4.947 +//
   4.948 +//   Internal Register usage:
   4.949 +//   R7 R8 R9  - volatile temporaries
   4.950 +//   VR0-VR7   - a-h
   4.951 +//   VR8       - vRb
   4.952 +//   VR9       - aux (highly volatile, use with care)
   4.953 +//   VR10-VR17 - w0-w7 | ini_a-ini_h
   4.954 +//   VR18      - vsp16 | kplusw0
   4.955 +//   VR19      - vsp32 | kplusw1
   4.956 +//   VR20-VR25 - sha512_calc_2w and sha512_round temporaries
   4.957 +void MacroAssembler::sha512(bool multi_block) {
   4.958 +  static const ssize_t buf_size = 128;
   4.959 +  static const uint8_t w_size = sizeof(sha512_round_table)/sizeof(uint64_t);
   4.960 +#ifdef AIX
   4.961 +  // malloc provides 16 byte alignment
   4.962 +  if (((uintptr_t)sha512_round_consts & 0xF) != 0) {
   4.963 +    uint64_t *new_round_consts = (uint64_t*)malloc(sizeof(sha512_round_table));
   4.964 +    guarantee(new_round_consts, "oom");
   4.965 +    memcpy(new_round_consts, sha512_round_consts, sizeof(sha512_round_table));
   4.966 +    sha512_round_consts = (const uint64_t*)new_round_consts;
   4.967 +  }
   4.968 +#endif
   4.969 +
   4.970 +  Register buf_in = R3_ARG1;
   4.971 +  Register state  = R4_ARG2;
   4.972 +  Register ofs    = R5_ARG3;
   4.973 +  Register limit  = R6_ARG4;
   4.974 +
   4.975 +  Label sha_loop, core_loop;
   4.976 +
   4.977 +  // Save non-volatile vector registers in the red zone
   4.978 +  static const VectorRegister nv[] = {
   4.979 +    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
   4.980 +  };
   4.981 +  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
   4.982 +
   4.983 +  for (int c = 0; c < nv_size; c++) {
   4.984 +    Register idx = R7;
   4.985 +    li  (idx, (c - (nv_size)) * 16);
   4.986 +    stvx(nv[c], idx, R1);
   4.987 +  }
   4.988 +
   4.989 +  // Load hash state to registers
   4.990 +  VectorRegister a = VR0;
   4.991 +  VectorRegister b = VR1;
   4.992 +  VectorRegister c = VR2;
   4.993 +  VectorRegister d = VR3;
   4.994 +  VectorRegister e = VR4;
   4.995 +  VectorRegister f = VR5;
   4.996 +  VectorRegister g = VR6;
   4.997 +  VectorRegister h = VR7;
   4.998 +  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
   4.999 +  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  4.1000 +  // counter for cycling through hs vector to avoid register moves between iterations
  4.1001 +  int h_cnt = 0;
  4.1002 +
  4.1003 +  // Load a-h registers from the memory pointed by state
  4.1004 +  sha512_load_h_vec(state, hs, total_hs);
  4.1005 +
  4.1006 +  Register k = R9;
  4.1007 +  assert(((uintptr_t)sha512_round_consts & 0xF) == 0, "k alignment");
  4.1008 +  load_const_optimized(k, (address)sha512_round_consts, R0);
  4.1009 +
  4.1010 +  if (multi_block) {
  4.1011 +    align(OptoLoopAlignment);
  4.1012 +  }
  4.1013 +  bind(sha_loop);
  4.1014 +
  4.1015 +  for (int n = 0; n < total_hs; n += 2) {
  4.1016 +#if defined(VM_LITTLE_ENDIAN)
  4.1017 +    VectorRegister h_cur = hs[n];
  4.1018 +    VectorRegister h_next = hs[n + 1];
  4.1019 +#else
  4.1020 +    VectorRegister h_cur = hs[n + 1];
  4.1021 +    VectorRegister h_next = hs[n];
  4.1022 +#endif
  4.1023 +    vsldoi (h_next, h_cur, h_cur, 8);
  4.1024 +  }
  4.1025 +
  4.1026 +  // Load 16 elements from w out of the loop.
   4.1027 +  // Order of the long values is endianness-specific.
  4.1028 +  VectorRegister w0 = VR10;
  4.1029 +  VectorRegister w1 = VR11;
  4.1030 +  VectorRegister w2 = VR12;
  4.1031 +  VectorRegister w3 = VR13;
  4.1032 +  VectorRegister w4 = VR14;
  4.1033 +  VectorRegister w5 = VR15;
  4.1034 +  VectorRegister w6 = VR16;
  4.1035 +  VectorRegister w7 = VR17;
  4.1036 +  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  4.1037 +  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
  4.1038 +
  4.1039 +  // Load 16 w into vectors and setup vsl for vperm
  4.1040 +  sha512_load_w_vec(buf_in, ws, total_ws);
  4.1041 +
  4.1042 +#if defined(VM_LITTLE_ENDIAN)
  4.1043 +  VectorRegister vsp16 = VR18;
  4.1044 +  VectorRegister vsp32 = VR19;
  4.1045 +  VectorRegister shiftarg = VR9;
  4.1046 +
  4.1047 +  vspltisw(vsp16,    8);
  4.1048 +  vspltisw(shiftarg, 1);
  4.1049 +  vsl     (vsp16,    vsp16, shiftarg);
  4.1050 +  vsl     (vsp32,    vsp16, shiftarg);
  4.1051 +
  4.1052 +  VectorRegister vsp8 = VR9;
  4.1053 +  vspltish(vsp8,     8);
  4.1054 +
  4.1055 +  // Convert input from Big Endian to Little Endian
  4.1056 +  for (int c = 0; c < total_ws; c++) {
  4.1057 +    VectorRegister w = ws[c];
  4.1058 +    vrlh  (w, w, vsp8);
  4.1059 +  }
  4.1060 +  for (int c = 0; c < total_ws; c++) {
  4.1061 +    VectorRegister w = ws[c];
  4.1062 +    vrlw  (w, w, vsp16);
  4.1063 +  }
  4.1064 +  for (int c = 0; c < total_ws; c++) {
  4.1065 +    VectorRegister w = ws[c];
  4.1066 +    vrld  (w, w, vsp32);
  4.1067 +  }
  4.1068 +#endif
  4.1069 +
  4.1070 +  Register Rb        = R10;
  4.1071 +  VectorRegister vRb = VR8;
  4.1072 +  li      (Rb, 8);
  4.1073 +  load_perm(vRb, Rb);
  4.1074 +
  4.1075 +  VectorRegister kplusw0 = VR18;
  4.1076 +  VectorRegister kplusw1 = VR19;
  4.1077 +
  4.1078 +  Register addr      = R7;
  4.1079 +
  4.1080 +  for (int n = 0; n < total_ws; n++) {
  4.1081 +    VectorRegister w = ws[n];
  4.1082 +
  4.1083 +    if (n == 0) {
  4.1084 +      lvx  (kplusw0, k);
  4.1085 +    } else {
  4.1086 +      addi (addr, k, n * 16);
  4.1087 +      lvx  (kplusw0, addr);
  4.1088 +    }
  4.1089 +#if defined(VM_LITTLE_ENDIAN)
  4.1090 +    vaddudm(kplusw0, kplusw0, w);
  4.1091 +    vsldoi (kplusw1, kplusw0, kplusw0, 8);
  4.1092 +#else
  4.1093 +    vaddudm(kplusw1, kplusw0, w);
  4.1094 +    vsldoi (kplusw0, kplusw1, kplusw1, 8);
  4.1095 +#endif
  4.1096 +
  4.1097 +    sha512_round(hs, total_hs, h_cnt, kplusw0);
  4.1098 +    sha512_round(hs, total_hs, h_cnt, kplusw1);
  4.1099 +  }
  4.1100 +
  4.1101 +  Register tmp       = R8;
  4.1102 +  li    (tmp, (w_size-16)/total_hs);
  4.1103 +  mtctr (tmp);
  4.1104 +  // j will be aligned to 4 for loading words.
  4.1105 +  // Whenever read, advance the pointer (e.g: when j is used in a function)
  4.1106 +  Register j = tmp;
  4.1107 +  li     (j, 8*16);
  4.1108 +
  4.1109 +  align(OptoLoopAlignment);
  4.1110 +  bind(core_loop);
  4.1111 +
  4.1112 +  // due to VectorRegister rotate, always iterate in multiples of total_hs
  4.1113 +  for (int n = 0; n < total_hs/2; n++) {
  4.1114 +    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
  4.1115 +    sha512_round(hs, total_hs, h_cnt, kplusw0);
  4.1116 +    sha512_round(hs, total_hs, h_cnt, kplusw1);
  4.1117 +  }
  4.1118 +
  4.1119 +  bdnz   (core_loop);
  4.1120 +
  4.1121 +  sha512_update_sha_state(state, hs, total_hs);
  4.1122 +
  4.1123 +  if (multi_block) {
  4.1124 +    addi(buf_in, buf_in, buf_size);
  4.1125 +    addi(ofs, ofs, buf_size);
  4.1126 +    cmplw(CCR0, ofs, limit);
  4.1127 +    ble(CCR0, sha_loop);
  4.1128 +
  4.1129 +    // return ofs
  4.1130 +    mr(R3_RET, ofs);
  4.1131 +  }
  4.1132 +
  4.1133 +  // Restore non-volatile registers
  4.1134 +  for (int c = 0; c < nv_size; c++) {
  4.1135 +    Register idx = R7;
  4.1136 +    li  (idx, (c - (nv_size)) * 16);
  4.1137 +    lvx(nv[c], idx, R1);
  4.1138 +  }
  4.1139 +}
     5.1 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon Jun 17 17:20:10 2019 +0100
     5.2 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Tue Jun 18 09:33:34 2019 -0400
     5.3 @@ -2652,6 +2652,28 @@
     5.4       return start;
     5.5    }
     5.6  
     5.7 +  address generate_sha256_implCompress(bool multi_block, const char *name) {
     5.8 +    assert(UseSHA, "need SHA instructions");
     5.9 +    StubCodeMark mark(this, "StubRoutines", name);
    5.10 +    address start = __ function_entry();
    5.11 +
    5.12 +    __ sha256 (multi_block);
    5.13 +
    5.14 +    __ blr();
    5.15 +    return start;
    5.16 +  }
    5.17 +
    5.18 +  address generate_sha512_implCompress(bool multi_block, const char *name) {
    5.19 +    assert(UseSHA, "need SHA instructions");
    5.20 +    StubCodeMark mark(this, "StubRoutines", name);
    5.21 +    address start = __ function_entry();
    5.22 +
    5.23 +    __ sha512 (multi_block);
    5.24 +
    5.25 +    __ blr();
    5.26 +    return start;
    5.27 +  }
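
Both generators emit the shared MacroAssembler body and differ only in the multi_block tail. As a hedged sketch of the contracts these stub addresses are invoked with (parameter names illustrative; the SHA-512 pair takes a long[] state and 128-byte blocks):

    // void sha256_implCompress  (jbyte* buf, jint* state);
    // jint sha256_implCompressMB(jbyte* buf, jint* state, jint ofs, jint limit);
    //
    // The MB form keeps consuming input blocks while ofs <= limit and
    // returns the updated ofs in R3_RET, matching the tail of sha256()/sha512().
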
    5.28 +
    5.29    void generate_arraycopy_stubs() {
    5.30      // Note: the disjoint stubs must be generated first, some of
    5.31      // the conjoint stubs use them.
    5.32 @@ -2881,6 +2903,15 @@
    5.33        StubRoutines::_montgomerySquare
    5.34          = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
    5.35      }
    5.36 +
    5.37 +    if (UseSHA256Intrinsics) {
    5.38 +      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
    5.39 +      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    5.40 +    }
    5.41 +    if (UseSHA512Intrinsics) {
    5.42 +      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
    5.43 +      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    5.44 +    }
    5.45    }
    5.46  
    5.47   public:
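
Both generators wrap the MacroAssembler sha256/sha512 routines in a leaf stub; the multi-block (MB) variants keep compressing while ofs <= limit and return the updated ofs in R3_RET. A hedged sketch of the resulting calling contract, with illustrative typedefs rather than HotSpot declarations:

    #include <stdint.h>

    // Single-block stubs compress one 64-byte (SHA-256) or 128-byte
    // (SHA-512) block; the MB stubs loop over blocks and return ofs.
    typedef void (*sha256_fn)   (uint8_t* buf, uint32_t state[8]);
    typedef int  (*sha256_mb_fn)(uint8_t* buf, uint32_t state[8], int ofs, int limit);
    typedef void (*sha512_fn)   (uint8_t* buf, uint64_t state[8]);
    typedef int  (*sha512_mb_fn)(uint8_t* buf, uint64_t state[8], int ofs, int limit);
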
     6.1 --- a/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp	Mon Jun 17 17:20:10 2019 +0100
     6.2 +++ b/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp	Tue Jun 18 09:33:34 2019 -0400
     6.3 @@ -34,7 +34,7 @@
     6.4  
     6.5  enum platform_dependent_constants {
     6.6    code_size1 = 20000,          // simply increase if too small (assembler will crash if too small)
     6.7 -  code_size2 = 20000           // simply increase if too small (assembler will crash if too small)
     6.8 +  code_size2 = 22000           // simply increase if too small (assembler will crash if too small)
     6.9  };
    6.10  
    6.11  // CRC32 Intrinsics.
     7.1 --- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Jun 17 17:20:10 2019 +0100
     7.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Tue Jun 18 09:33:34 2019 -0400
     7.3 @@ -110,7 +110,7 @@
     7.4    // Create and print feature-string.
     7.5    char buf[(num_features+1) * 16]; // Max 16 chars per feature.
     7.6    jio_snprintf(buf, sizeof(buf),
     7.7 -               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s",
     7.8 +               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
     7.9                 (has_fsqrt()   ? " fsqrt"   : ""),
    7.10                 (has_isel()    ? " isel"    : ""),
    7.11                 (has_lxarxeh() ? " lxarxeh" : ""),
    7.12 @@ -124,7 +124,8 @@
    7.13                 (has_vcipher() ? " aes"     : ""),
    7.14                 (has_vpmsumb() ? " vpmsumb" : ""),
    7.15                 (has_mfdscr()  ? " mfdscr"  : ""),
    7.16 -               (has_vsx()     ? " vsx"     : "")
    7.17 +               (has_vsx()     ? " vsx"     : ""),
    7.18 +               (has_vshasig() ? " sha"     : "")
    7.19                 // Make sure number of %s matches num_features!
    7.20                );
    7.21    _features_str = strdup(buf);
    7.22 @@ -206,17 +207,43 @@
    7.23    }
    7.24  #endif
    7.25  
    7.26 -  if (UseSHA) {
    7.27 -    warning("SHA instructions are not available on this CPU");
    7.28 +  if (has_vshasig()) {
    7.29 +    if (FLAG_IS_DEFAULT(UseSHA)) {
    7.30 +      UseSHA = true;
    7.31 +    }
    7.32 +  } else if (UseSHA) {
    7.33 +    if (!FLAG_IS_DEFAULT(UseSHA))
    7.34 +      warning("SHA instructions are not available on this CPU");
    7.35      FLAG_SET_DEFAULT(UseSHA, false);
    7.36    }
    7.37 -  if (UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics) {
    7.38 -    warning("SHA intrinsics are not available on this CPU");
    7.39 +
    7.40 +  if (UseSHA1Intrinsics) {
    7.41 +    warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
    7.42      FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
    7.43 +  }
    7.44 +
    7.45 +  if (UseSHA && has_vshasig()) {
    7.46 +    if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
    7.47 +      FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
    7.48 +    }
    7.49 +  } else if (UseSHA256Intrinsics) {
    7.50 +    warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU.");
    7.51      FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
    7.52 +  }
    7.53 +
    7.54 +  if (UseSHA && has_vshasig()) {
    7.55 +    if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
    7.56 +      FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
    7.57 +    }
    7.58 +  } else if (UseSHA512Intrinsics) {
    7.59 +    warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU.");
    7.60      FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
    7.61    }
    7.62  
    7.63 +  if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics)) {
    7.64 +    FLAG_SET_DEFAULT(UseSHA, false);
    7.65 +  }
    7.66 +
    7.67    if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
    7.68      UseMontgomeryMultiplyIntrinsic = true;
    7.69    }
    7.70 @@ -503,6 +530,7 @@
    7.71    a->vpmsumb(VR0, VR1, VR2);                   // code[12] -> vpmsumb
    7.72    a->mfdscr(R0);                               // code[13] -> mfdscr
    7.73    a->lxvd2x(VSR0, R3_ARG1);                    // code[14] -> vsx
    7.74 +  a->vshasigmaw(VR0, VR1, 1, 0xF);             // code[15] -> vshasig
    7.75    a->blr();
    7.76  
    7.77    // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
    7.78 @@ -551,6 +579,7 @@
    7.79    if (code[feature_cntr++]) features |= vpmsumb_m;
    7.80    if (code[feature_cntr++]) features |= mfdscr_m;
    7.81    if (code[feature_cntr++]) features |= vsx_m;
    7.82 +  if (code[feature_cntr++]) features |= vshasig_m;
    7.83  
    7.84    // Print the detection code.
    7.85    if (PrintAssembly) {
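
The probe added at code[15] executes a single vshasigmaw; if the CPU accepts it, the vshasig feature bit is set, which in turn drives the UseSHA defaults above. Per Power ISA 2.07 the instruction applies one SHA-2 sigma function per 32-bit element, selected by the st flag and that element's six bit; a hedged scalar model (illustrative, not HotSpot code):

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

    // One element of vshasigmaw(d, a, st, six): st chooses lower/upper-case
    // sigma, the element's six bit chooses variant 0 or 1 (FIPS 180-4).
    uint32_t vshasigmaw_element(uint32_t x, bool st, bool six_bit) {
      if (!st) {
        return six_bit ? rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10)   // sigma1
                       : rotr32(x, 7)  ^ rotr32(x, 18) ^ (x >> 3);   // sigma0
      }
      return six_bit ? rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25)  // Sigma1
                     : rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22); // Sigma0
    }

Under this reading, the probe's arguments (st=1, six=0xF) apply Sigma1 to all four word elements.
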
     8.1 --- a/src/cpu/ppc/vm/vm_version_ppc.hpp	Mon Jun 17 17:20:10 2019 +0100
     8.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.hpp	Tue Jun 18 09:33:34 2019 -0400
     8.3 @@ -47,6 +47,7 @@
     8.4      vpmsumb,
     8.5      mfdscr,
     8.6      vsx,
     8.7 +    vshasig,
     8.8      num_features // last entry to count features
     8.9    };
    8.10    enum Feature_Flag_Set {
    8.11 @@ -63,6 +64,7 @@
    8.12      dcba_m                = (1 << dcba   ),
    8.13      lqarx_m               = (1 << lqarx  ),
    8.14      vcipher_m             = (1 << vcipher),
    8.15 +    vshasig_m             = (1 << vshasig),
    8.16      vpmsumb_m             = (1 << vpmsumb),
    8.17      mfdscr_m              = (1 << mfdscr ),
    8.18      vsx_m                 = (1 << vsx    ),
    8.19 @@ -99,6 +101,7 @@
    8.20    static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
    8.21    static bool has_mfdscr()  { return (_features & mfdscr_m) != 0; }
    8.22    static bool has_vsx()     { return (_features & vsx_m) != 0; }
    8.23 +  static bool has_vshasig() { return (_features & vshasig_m) != 0; }
    8.24  
    8.25    static const char* cpu_features() { return _features_str; }
    8.26  
     9.1 --- a/src/share/vm/opto/library_call.cpp	Mon Jun 17 17:20:10 2019 +0100
     9.2 +++ b/src/share/vm/opto/library_call.cpp	Tue Jun 18 09:33:34 2019 -0400
     9.3 @@ -1,5 +1,5 @@
     9.4  /*
     9.5 - * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved.
     9.6 + * Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved.
     9.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     9.8   *
     9.9   * This code is free software; you can redistribute it and/or modify it
    9.10 @@ -6759,10 +6759,18 @@
    9.11    if (state == NULL) return false;
    9.12  
    9.13    // Call the stub.
    9.14 -  Node* call = make_runtime_call(RC_LEAF|RC_NO_FP,
    9.15 -                                 OptoRuntime::digestBase_implCompressMB_Type(),
    9.16 -                                 stubAddr, stubName, TypePtr::BOTTOM,
    9.17 -                                 src_start, state, ofs, limit);
    9.18 +  Node *call;
    9.19 +  if (CCallingConventionRequiresIntsAsLongs) {
    9.20 +    call = make_runtime_call(RC_LEAF|RC_NO_FP,
    9.21 +                             OptoRuntime::digestBase_implCompressMB_Type(),
    9.22 +                             stubAddr, stubName, TypePtr::BOTTOM,
    9.23 +                             src_start, state, ofs XTOP, limit XTOP);
    9.24 +  } else {
    9.25 +    call = make_runtime_call(RC_LEAF|RC_NO_FP,
    9.26 +                             OptoRuntime::digestBase_implCompressMB_Type(),
    9.27 +                             stubAddr, stubName, TypePtr::BOTTOM,
    9.28 +                             src_start, state, ofs, limit);
    9.29 +  }
    9.30    // return ofs (int)
    9.31    Node* result = _gvn.transform(new (C) ProjNode(call, TypeFunc::Parms));
    9.32    set_result(result);
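
On PPC64 CCallingConventionRequiresIntsAsLongs is true, so the two int arguments are widened to longs and XTOP (HotSpot's shorthand, roughly LP64_ONLY(COMMA top())) appends the ideal-graph placeholder for each long's high half. The stub therefore receives ofs and limit in full 64-bit registers; a hedged C-level view, assuming that widening (illustrative typedef only):

    #include <stdint.h>

    // The MB stub as seen through the PPC64 ABI, with the Java ints
    // occupying complete 64-bit argument registers.
    typedef int (*sha_compress_mb_ppc64_fn)(uint8_t* buf, void* state,
                                            int64_t ofs, int64_t limit);
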
    10.1 --- a/src/share/vm/opto/runtime.cpp	Mon Jun 17 17:20:10 2019 +0100
    10.2 +++ b/src/share/vm/opto/runtime.cpp	Tue Jun 18 09:33:34 2019 -0400
    10.3 @@ -1,5 +1,5 @@
    10.4  /*
    10.5 - * Copyright (c) 1998, 2018, Oracle and/or its affiliates. All rights reserved.
    10.6 + * Copyright (c) 1998, 2019, Oracle and/or its affiliates. All rights reserved.
    10.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    10.8   *
    10.9   * This code is free software; you can redistribute it and/or modify it
   10.10 @@ -930,12 +930,24 @@
   10.11    // create input type (domain)
   10.12    int num_args = 4;
   10.13    int argcnt = num_args;
   10.14 +  if (CCallingConventionRequiresIntsAsLongs) {
   10.15 +    argcnt += 2;
   10.16 +  }
   10.17    const Type** fields = TypeTuple::fields(argcnt);
   10.18    int argp = TypeFunc::Parms;
   10.19 -  fields[argp++] = TypePtr::NOTNULL; // buf
   10.20 -  fields[argp++] = TypePtr::NOTNULL; // state
   10.21 -  fields[argp++] = TypeInt::INT;     // ofs
   10.22 -  fields[argp++] = TypeInt::INT;     // limit
   10.23 +  if (CCallingConventionRequiresIntsAsLongs) {
   10.24 +    fields[argp++] = TypePtr::NOTNULL; // buf
   10.25 +    fields[argp++] = TypePtr::NOTNULL; // state
   10.26 +    fields[argp++] = TypeLong::LONG;   // ofs
   10.27 +    fields[argp++] = Type::HALF;
   10.28 +    fields[argp++] = TypeLong::LONG;   // limit
   10.29 +    fields[argp++] = Type::HALF;
   10.30 +  } else {
   10.31 +    fields[argp++] = TypePtr::NOTNULL; // buf
   10.32 +    fields[argp++] = TypePtr::NOTNULL; // state
   10.33 +    fields[argp++] = TypeInt::INT;     // ofs
   10.34 +    fields[argp++] = TypeInt::INT;     // limit
   10.35 +  }
   10.36    assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
   10.37    const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
   10.38  
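With the Type::HALF fillers, each widened int occupies two ideal slots, which is exactly what the XTOP halves in library_call.cpp supply. A worked layout of the two domains (illustrative enum, not HotSpot's Type machinery):

    enum Slot { PTR, INT, LONG, HALF };

    // digestBase_implCompressMB domain, default ABI: argcnt == 4
    static const Slot default_domain[]       = { PTR, PTR, INT,  INT };
    // ints-as-longs ABI (e.g. PPC64): argcnt == 6; argp == Parms+argcnt still holds
    static const Slot ints_as_longs_domain[] = { PTR, PTR, LONG, HALF, LONG, HALF };
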
    11.1 --- a/test/compiler/intrinsics/sha/cli/testcases/GenericTestCaseForOtherCPU.java	Mon Jun 17 17:20:10 2019 +0100
    11.2 +++ b/test/compiler/intrinsics/sha/cli/testcases/GenericTestCaseForOtherCPU.java	Tue Jun 18 09:33:34 2019 -0400
    11.3 @@ -36,7 +36,8 @@
    11.4      public GenericTestCaseForOtherCPU(String optionName) {
    11.5          // Execute the test case on any CPU except SPARC and X86
    11.6          super(optionName, new NotPredicate(new OrPredicate(Platform::isSparc,
    11.7 -                new OrPredicate(Platform::isX64, Platform::isX86))));
    11.8 +                new OrPredicate(Platform::isPPC,
    11.9 +                new OrPredicate(Platform::isX64, Platform::isX86)))));
   11.10      }
   11.11  
   11.12      @Override
    12.1 --- a/test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java	Mon Jun 17 17:20:10 2019 +0100
    12.2 +++ b/test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java	Tue Jun 18 09:33:34 2019 -0400
    12.3 @@ -63,12 +63,20 @@
    12.4                      null);
    12.5  
    12.6      public static final BooleanSupplier SHA256_INSTRUCTION_AVAILABLE
    12.7 -            = new CPUSpecificPredicate("sparc.*", new String[] { "sha256" },
    12.8 -                    null);
    12.9 +            = new OrPredicate(new CPUSpecificPredicate("sparc.*",   new String[] { "sha256" },
   12.10 +                                                       null),
   12.11 +              new OrPredicate(new CPUSpecificPredicate("ppc64.*",   new String[] { "sha"    },
   12.12 +                                                       null),
   12.13 +                              new CPUSpecificPredicate("ppc64le.*", new String[] { "sha"    },
   12.14 +                                                       null)));
   12.15  
   12.16      public static final BooleanSupplier SHA512_INSTRUCTION_AVAILABLE
   12.17 -            = new CPUSpecificPredicate("sparc.*", new String[] { "sha512" },
   12.18 -                    null);
   12.19 +            = new OrPredicate(new CPUSpecificPredicate("sparc.*",   new String[] { "sha512" },
   12.20 +                                                       null),
   12.21 +              new OrPredicate(new CPUSpecificPredicate("ppc64.*",   new String[] { "sha"    },
   12.22 +                                                       null),
   12.23 +                              new CPUSpecificPredicate("ppc64le.*", new String[] { "sha"    },
   12.24 +                                                       null)));
   12.25  
   12.26      public static final BooleanSupplier ANY_SHA_INSTRUCTION_AVAILABLE
   12.27              = new OrPredicate(IntrinsicPredicates.SHA1_INSTRUCTION_AVAILABLE,
