Tue, 18 Jun 2019 09:33:34 -0400
8185979: PPC64: Implement SHA2 intrinsic
Reviewed-by: mdoerr, goetz
Contributed-by: Bruno Rosa <bruno.rosa@eldorado.org.br>, Gustavo Serra Scalet <gustavo.scalet@eldorado.org.br>, Igor Nunes <igor.nunes@eldorado.org.br>, Martin Doerr <martin.doerr@sap.com>
1.1 --- a/src/cpu/ppc/vm/assembler_ppc.hpp Mon Jun 17 17:20:10 2019 +0100 1.2 +++ b/src/cpu/ppc/vm/assembler_ppc.hpp Tue Jun 18 09:33:34 2019 -0400 1.3 @@ -2000,7 +2000,8 @@ 1.4 inline void vsbox( VectorRegister d, VectorRegister a); 1.5 1.6 // SHA (introduced with Power 8) 1.7 - // Not yet implemented. 1.8 + inline void vshasigmad(VectorRegister d, VectorRegister a, bool st, int six); 1.9 + inline void vshasigmaw(VectorRegister d, VectorRegister a, bool st, int six); 1.10 1.11 // Vector Binary Polynomial Multiplication (introduced with Power 8) 1.12 inline void vpmsumb( VectorRegister d, VectorRegister a, VectorRegister b); 1.13 @@ -2096,6 +2097,10 @@ 1.14 inline void lvsl( VectorRegister d, Register s2); 1.15 inline void lvsr( VectorRegister d, Register s2); 1.16 1.17 + // Endianness-specific concatenation of 2 loaded vectors. 1.18 + inline void load_perm(VectorRegister perm, Register addr); 1.19 + inline void vec_perm(VectorRegister first_dest, VectorRegister second, VectorRegister perm); 1.20 + 1.21 // RegisterOrConstant versions. 1.22 // These emitters choose between the versions using two registers and 1.23 // those with register and immediate, depending on the content of roc.
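For reference, here is a minimal scalar sketch (not part of the patch) of the four SHA-256 sigma functions that vshasigmaw selects between per 32-bit lane; the st/six mapping in the comment is inferred from how the patch uses the instruction below, and vshasigmad computes the 64-bit SHA-512 analogues.

#include <cstdint>

// Scalar model of the SHA-256 sigma functions (FIPS 180-4). Assumed lane
// semantics, matching the patch's usage: vshasigmaw(d, a, 1, 0) -> Sigma0,
// (1, 0xf) -> Sigma1, (0, 0) -> sigma0, (0, 0xf) -> sigma1.
static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
uint32_t Sigma0(uint32_t x) { return rotr32(x,  2) ^ rotr32(x, 13) ^ rotr32(x, 22); }
uint32_t Sigma1(uint32_t x) { return rotr32(x,  6) ^ rotr32(x, 11) ^ rotr32(x, 25); }
uint32_t sigma0(uint32_t x) { return rotr32(x,  7) ^ rotr32(x, 18) ^ (x >>  3); }
uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }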
2.1 --- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp Mon Jun 17 17:20:10 2019 +0100 2.2 +++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp Tue Jun 18 09:33:34 2019 -0400 2.3 @@ -789,7 +789,8 @@ 2.4 inline void Assembler::vsbox( VectorRegister d, VectorRegister a) { emit_int32( VSBOX_OPCODE | vrt(d) | vra(a) ); } 2.5 2.6 // SHA (introduced with Power 8) 2.7 -// Not yet implemented. 2.8 +inline void Assembler::vshasigmad(VectorRegister d, VectorRegister a, bool st, int six) { emit_int32( VSHASIGMAD_OPCODE | vrt(d) | vra(a) | vst(st) | vsix(six)); } 2.9 +inline void Assembler::vshasigmaw(VectorRegister d, VectorRegister a, bool st, int six) { emit_int32( VSHASIGMAW_OPCODE | vrt(d) | vra(a) | vst(st) | vsix(six)); } 2.10 2.11 // Vector Binary Polynomial Multiplication (introduced with Power 8) 2.12 inline void Assembler::vpmsumb( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPMSUMB_OPCODE | vrt(d) | vra(a) | vrb(b)); } 2.13 @@ -887,6 +888,22 @@ 2.14 inline void Assembler::lvsl( VectorRegister d, Register s2) { emit_int32( LVSL_OPCODE | vrt(d) | rb(s2)); } 2.15 inline void Assembler::lvsr( VectorRegister d, Register s2) { emit_int32( LVSR_OPCODE | vrt(d) | rb(s2)); } 2.16 2.17 +inline void Assembler::load_perm(VectorRegister perm, Register addr) { 2.18 +#if defined(VM_LITTLE_ENDIAN) 2.19 + lvsr(perm, addr); 2.20 +#else 2.21 + lvsl(perm, addr); 2.22 +#endif 2.23 +} 2.24 + 2.25 +inline void Assembler::vec_perm(VectorRegister first_dest, VectorRegister second, VectorRegister perm) { 2.26 +#if defined(VM_LITTLE_ENDIAN) 2.27 + vperm(first_dest, second, first_dest, perm); 2.28 +#else 2.29 + vperm(first_dest, first_dest, second, perm); 2.30 +#endif 2.31 +} 2.32 + 2.33 inline void Assembler::load_const(Register d, void* x, Register tmp) { 2.34 load_const(d, (long)x, tmp); 2.35 }
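The load_perm/vec_perm pair implements the classic AltiVec unaligned-load idiom: lvx ignores the low four address bits, so two aligned loads bracketing an unaligned address are concatenated and shifted by addr & 0xF, which is exactly what the loaded permute vector encodes. A scalar model of one such reconstruction, as a sketch under that assumption rather than the emitted code (on little endian, lvsr plus the swapped vperm operands achieve the mirrored result):

#include <cstdint>
#include <cstring>

// out receives the 16 bytes at an arbitrarily aligned addr, rebuilt from
// the two aligned quadwords that contain them (big-endian lvsl view).
void unaligned_vec_load_model(const uint8_t* addr, uint8_t out[16]) {
  const uint8_t* aligned = (const uint8_t*)((uintptr_t)addr & ~(uintptr_t)0xF);
  int sh = (int)((uintptr_t)addr & 0xF);  // encoded by the lvsl/lvsr permute vector
  uint8_t lo[16], hi[16];
  memcpy(lo, aligned, 16);                // first lvx
  memcpy(hi, aligned + 16, 16);           // second lvx
  for (int i = 0; i < 16; i++) {          // vperm selects bytes from the pair
    out[i] = (i + sh < 16) ? lo[i + sh] : hi[i + sh - 16];
  }
}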
3.1 --- a/src/cpu/ppc/vm/macroAssembler_ppc.hpp Mon Jun 17 17:20:10 2019 +0100 3.2 +++ b/src/cpu/ppc/vm/macroAssembler_ppc.hpp Tue Jun 18 09:33:34 2019 -0400 3.3 @@ -667,6 +667,40 @@ 3.4 3.5 void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp); 3.6 3.7 + // SHA-2 auxiliary functions and public interfaces 3.8 + private: 3.9 + void sha256_deque(const VectorRegister src, 3.10 + const VectorRegister dst1, const VectorRegister dst2, const VectorRegister dst3); 3.11 + void sha256_load_h_vec(const VectorRegister a, const VectorRegister e, const Register hptr); 3.12 + void sha256_round(const VectorRegister* hs, const int total_hs, int& h_cnt, const VectorRegister kpw); 3.13 + void sha256_load_w_plus_k_vec(const Register buf_in, const VectorRegister* ws, 3.14 + const int total_ws, const Register k, const VectorRegister* kpws, 3.15 + const int total_kpws); 3.16 + void sha256_calc_4w(const VectorRegister w0, const VectorRegister w1, 3.17 + const VectorRegister w2, const VectorRegister w3, const VectorRegister kpw0, 3.18 + const VectorRegister kpw1, const VectorRegister kpw2, const VectorRegister kpw3, 3.19 + const Register j, const Register k); 3.20 + void sha256_update_sha_state(const VectorRegister a, const VectorRegister b, 3.21 + const VectorRegister c, const VectorRegister d, const VectorRegister e, 3.22 + const VectorRegister f, const VectorRegister g, const VectorRegister h, 3.23 + const Register hptr); 3.24 + 3.25 + void sha512_load_w_vec(const Register buf_in, const VectorRegister* ws, const int total_ws); 3.26 + void sha512_update_sha_state(const Register state, const VectorRegister* hs, const int total_hs); 3.27 + void sha512_round(const VectorRegister* hs, const int total_hs, int& h_cnt, const VectorRegister kpw); 3.28 + void sha512_load_h_vec(const Register state, const VectorRegister* hs, const int total_hs); 3.29 + void sha512_calc_2w(const VectorRegister w0, const VectorRegister w1, 3.30 + const VectorRegister w2, const VectorRegister w3, 3.31 + const VectorRegister w4, const VectorRegister w5, 3.32 + const VectorRegister w6, const VectorRegister w7, 3.33 + const VectorRegister kpw0, const VectorRegister kpw1, const Register j, 3.34 + const VectorRegister vRb, const Register k); 3.35 + 3.36 + public: 3.37 + void sha256(bool multi_block); 3.38 + void sha512(bool multi_block); 3.39 + 3.40 + 3.41 // 3.42 // Debugging 3.43 //
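Each sha256_round call corresponds to one textbook SHA-256 round; the vector code cycles register names (the hs array) instead of moving data, and sha512_round has the same shape with 64-bit operations. A hedged scalar reference of the round itself:

#include <cstdint>

static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static inline uint32_t big_sigma0(uint32_t x) { return rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22); }
static inline uint32_t big_sigma1(uint32_t x) { return rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25); }

// One SHA-256 round over the state s = {a,b,c,d,e,f,g,h}; kpw is the
// precomputed k[j] + w[j], matching the kpw vectors in this patch.
void sha256_round_scalar(uint32_t s[8], uint32_t kpw) {
  uint32_t ch  = (s[4] & s[5]) ^ (~s[4] & s[6]);                  // vsel(ch, g, f, e)
  uint32_t maj = (s[0] & s[1]) ^ (s[0] & s[2]) ^ (s[1] & s[2]);   // the vxor + vsel pair
  uint32_t t1  = s[7] + big_sigma1(s[4]) + ch + kpw;
  uint32_t t2  = big_sigma0(s[0]) + maj;
  for (int i = 7; i > 0; i--) s[i] = s[i - 1];                    // rotate b..h
  s[4] += t1;                                                     // e = d + t1
  s[0]  = t1 + t2;                                                // a = t1 + t2
}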
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/src/cpu/ppc/vm/macroAssembler_ppc_sha.cpp Tue Jun 18 09:33:34 2019 -0400 4.3 @@ -0,0 +1,1136 @@ 4.4 +// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved. 4.5 +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4.6 +// 4.7 +// This code is free software; you can redistribute it and/or modify it 4.8 +// under the terms of the GNU General Public License version 2 only, as 4.9 +// published by the Free Software Foundation. 4.10 +// 4.11 +// This code is distributed in the hope that it will be useful, but WITHOUT 4.12 +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 4.13 +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 4.14 +// version 2 for more details (a copy is included in the LICENSE file that 4.15 +// accompanied this code). 4.16 +// 4.17 +// You should have received a copy of the GNU General Public License version 4.18 +// 2 along with this work; if not, write to the Free Software Foundation, 4.19 +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 4.20 +// 4.21 +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 4.22 +// or visit www.oracle.com if you need additional information or have any 4.23 +// questions. 4.24 + 4.25 +// Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512" 4.26 +// (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf). 4.27 + 4.28 +#include "asm/macroAssembler.inline.hpp" 4.29 +#include "runtime/stubRoutines.hpp" 4.30 + 4.31 +/********************************************************************** 4.32 + * SHA 256 4.33 + *********************************************************************/ 4.34 + 4.35 +void MacroAssembler::sha256_deque(const VectorRegister src, 4.36 + const VectorRegister dst1, 4.37 + const VectorRegister dst2, 4.38 + const VectorRegister dst3) { 4.39 + vsldoi (dst1, src, src, 12); 4.40 + vsldoi (dst2, src, src, 8); 4.41 + vsldoi (dst3, src, src, 4); 4.42 +} 4.43 + 4.44 +void MacroAssembler::sha256_round(const VectorRegister* hs, 4.45 + const int total_hs, 4.46 + int& h_cnt, 4.47 + const VectorRegister kpw) { 4.48 + // convenience registers: cycle from 0-7 downwards 4.49 + const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs]; 4.50 + const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs]; 4.51 + const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs]; 4.52 + const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs]; 4.53 + const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs]; 4.54 + const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs]; 4.55 + const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs]; 4.56 + const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs]; 4.57 + // temporaries 4.58 + VectorRegister ch = VR0; 4.59 + VectorRegister maj = VR1; 4.60 + VectorRegister bsa = VR2; 4.61 + VectorRegister bse = VR3; 4.62 + VectorRegister vt0 = VR4; 4.63 + VectorRegister vt1 = VR5; 4.64 + VectorRegister vt2 = VR6; 4.65 + VectorRegister vt3 = VR7; 4.66 + 4.67 + vsel (ch, g, f, e); 4.68 + vxor (maj, a, b); 4.69 + vshasigmaw (bse, e, 1, 0xf); 4.70 + vadduwm (vt2, ch, kpw); 4.71 + vadduwm (vt1, h, bse); 4.72 + vsel (maj, b, c, maj); 4.73 + vadduwm (vt3, vt1, vt2); 4.74 + vshasigmaw (bsa, a, 1, 0); 4.75 + vadduwm (vt0, bsa, maj); 4.76 + 4.77 + vadduwm (d, d, vt3); 4.78 + vadduwm (h, 
vt3, vt0); 4.79 + 4.80 + // advance vector pointer to the next iteration 4.81 + h_cnt++; 4.82 +} 4.83 + 4.84 +void MacroAssembler::sha256_load_h_vec(const VectorRegister a, 4.85 + const VectorRegister e, 4.86 + const Register hptr) { 4.87 + // temporaries 4.88 + Register tmp = R8; 4.89 + VectorRegister vt0 = VR0; 4.90 + VectorRegister vRb = VR6; 4.91 + // labels 4.92 + Label sha256_aligned; 4.93 + 4.94 + andi_ (tmp, hptr, 0xf); 4.95 + lvx (a, hptr); 4.96 + addi (tmp, hptr, 16); 4.97 + lvx (e, tmp); 4.98 + beq (CCR0, sha256_aligned); 4.99 + 4.100 + // handle unaligned accesses 4.101 + load_perm(vRb, hptr); 4.102 + addi (tmp, hptr, 32); 4.103 + vec_perm(a, e, vRb); 4.104 + 4.105 + lvx (vt0, tmp); 4.106 + vec_perm(e, vt0, vRb); 4.107 + 4.108 + // aligned accesses 4.109 + bind(sha256_aligned); 4.110 +} 4.111 + 4.112 +void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in, 4.113 + const VectorRegister* ws, 4.114 + const int total_ws, 4.115 + const Register k, 4.116 + const VectorRegister* kpws, 4.117 + const int total_kpws) { 4.118 + Label w_aligned, after_w_load; 4.119 + 4.120 + Register tmp = R8; 4.121 + VectorRegister vt0 = VR0; 4.122 + VectorRegister vt1 = VR1; 4.123 + VectorRegister vRb = VR6; 4.124 + 4.125 + andi_ (tmp, buf_in, 0xF); 4.126 + beq (CCR0, w_aligned); // address ends with 0x0, not 0x8 4.127 + 4.128 + // deal with unaligned addresses 4.129 + lvx (ws[0], buf_in); 4.130 + load_perm(vRb, buf_in); 4.131 + 4.132 + for (int n = 1; n < total_ws; n++) { 4.133 + VectorRegister w_cur = ws[n]; 4.134 + VectorRegister w_prev = ws[n-1]; 4.135 + 4.136 + addi (tmp, buf_in, n * 16); 4.137 + lvx (w_cur, tmp); 4.138 + vec_perm(w_prev, w_cur, vRb); 4.139 + } 4.140 + addi (tmp, buf_in, total_ws * 16); 4.141 + lvx (vt0, tmp); 4.142 + vec_perm(ws[total_ws-1], vt0, vRb); 4.143 + b (after_w_load); 4.144 + 4.145 + bind(w_aligned); 4.146 + 4.147 + // deal with aligned addresses 4.148 + lvx(ws[0], buf_in); 4.149 + for (int n = 1; n < total_ws; n++) { 4.150 + VectorRegister w = ws[n]; 4.151 + addi (tmp, buf_in, n * 16); 4.152 + lvx (w, tmp); 4.153 + } 4.154 + 4.155 + bind(after_w_load); 4.156 + 4.157 +#if defined(VM_LITTLE_ENDIAN) 4.158 + // Byte swapping within int values 4.159 + li (tmp, 8); 4.160 + lvsl (vt0, tmp); 4.161 + vspltisb (vt1, 0xb); 4.162 + vxor (vt1, vt0, vt1); 4.163 + for (int n = 0; n < total_ws; n++) { 4.164 + VectorRegister w = ws[n]; 4.165 + vec_perm(w, w, vt1); 4.166 + } 4.167 +#endif 4.168 + 4.169 + // Loading k, which is always aligned to 16-bytes 4.170 + lvx (kpws[0], k); 4.171 + for (int n = 1; n < total_kpws; n++) { 4.172 + VectorRegister kpw = kpws[n]; 4.173 + addi (tmp, k, 16 * n); 4.174 + lvx (kpw, tmp); 4.175 + } 4.176 + 4.177 + // Add w to K 4.178 + assert(total_ws == total_kpws, "Redesign the loop below"); 4.179 + for (int n = 0; n < total_kpws; n++) { 4.180 + VectorRegister kpw = kpws[n]; 4.181 + VectorRegister w = ws[n]; 4.182 + 4.183 + vadduwm (kpw, kpw, w); 4.184 + } 4.185 +} 4.186 + 4.187 +void MacroAssembler::sha256_calc_4w(const VectorRegister w0, 4.188 + const VectorRegister w1, 4.189 + const VectorRegister w2, 4.190 + const VectorRegister w3, 4.191 + const VectorRegister kpw0, 4.192 + const VectorRegister kpw1, 4.193 + const VectorRegister kpw2, 4.194 + const VectorRegister kpw3, 4.195 + const Register j, 4.196 + const Register k) { 4.197 + // Temporaries 4.198 + const VectorRegister vt0 = VR0; 4.199 + const VectorRegister vt1 = VR1; 4.200 + const VectorSRegister vsrt1 = vt1->to_vsr(); 4.201 + const VectorRegister vt2 = VR2; 4.202 + const 
VectorRegister vt3 = VR3; 4.203 + const VectorSRegister vst3 = vt3->to_vsr(); 4.204 + const VectorRegister vt4 = VR4; 4.205 + 4.206 + // load to k[j] 4.207 + lvx (vt0, j, k); 4.208 + 4.209 + // advance j 4.210 + addi (j, j, 16); // 16 bytes were read 4.211 + 4.212 +#if defined(VM_LITTLE_ENDIAN) 4.213 + // b = w[j-15], w[j-14], w[j-13], w[j-12] 4.214 + vsldoi (vt1, w1, w0, 12); 4.215 + 4.216 + // c = w[j-7], w[j-6], w[j-5], w[j-4] 4.217 + vsldoi (vt2, w3, w2, 12); 4.218 + 4.219 +#else 4.220 + // b = w[j-15], w[j-14], w[j-13], w[j-12] 4.221 + vsldoi (vt1, w0, w1, 4); 4.222 + 4.223 + // c = w[j-7], w[j-6], w[j-5], w[j-4] 4.224 + vsldoi (vt2, w2, w3, 4); 4.225 +#endif 4.226 + 4.227 + // d = w[j-2], w[j-1], w[j-4], w[j-3] 4.228 + vsldoi (vt3, w3, w3, 8); 4.229 + 4.230 + // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12]) 4.231 + vshasigmaw (vt1, vt1, 0, 0); 4.232 + 4.233 + // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3]) 4.234 + vshasigmaw (vt3, vt3, 0, 0xf); 4.235 + 4.236 + // c = s0(w[j-15]) + w[j-7], 4.237 + // s0(w[j-14]) + w[j-6], 4.238 + // s0(w[j-13]) + w[j-5], 4.239 + // s0(w[j-12]) + w[j-4] 4.240 + vadduwm (vt2, vt1, vt2); 4.241 + 4.242 + // c = s0(w[j-15]) + w[j-7] + w[j-16], 4.243 + // s0(w[j-14]) + w[j-6] + w[j-15], 4.244 + // s0(w[j-13]) + w[j-5] + w[j-14], 4.245 + // s0(w[j-12]) + w[j-4] + w[j-13] 4.246 + vadduwm (vt2, vt2, w0); 4.247 + 4.248 + // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] 4.249 + // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] 4.250 + // s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED 4.251 + // s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3]) // UNDEFINED 4.252 + vadduwm (vt4, vt2, vt3); 4.253 + 4.254 + // At this point, e[0] and e[1] are the correct values to be stored at w[j] 4.255 + // and w[j+1]. 4.256 + // e[2] and e[3] are not considered. 4.257 + // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED 4.258 + vshasigmaw (vt1, vt4, 0, 0xf); 4.259 + 4.260 + // v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1]) 4.261 +#if defined(VM_LITTLE_ENDIAN) 4.262 + xxmrgld (vst3, vsrt1, vst3); 4.263 +#else 4.264 + xxmrghd (vst3, vst3, vsrt1); 4.265 +#endif 4.266 + 4.267 + // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] 4.268 + // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] 4.269 + // s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]), // w[j+2] 4.270 + // s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1]) // w[j+3] 4.271 + vadduwm (vt2, vt2, vt3); 4.272 + 4.273 + // Updating w0 to w3 to hold the new previous 16 values from w.
4.274 + vmr (w0, w1); 4.275 + vmr (w1, w2); 4.276 + vmr (w2, w3); 4.277 + vmr (w3, vt2); 4.278 + 4.279 + // store k + w to v9 (4 values at once) 4.280 +#if defined(VM_LITTLE_ENDIAN) 4.281 + vadduwm (kpw0, vt2, vt0); 4.282 + 4.283 + vsldoi (kpw1, kpw0, kpw0, 12); 4.284 + vsldoi (kpw2, kpw0, kpw0, 8); 4.285 + vsldoi (kpw3, kpw0, kpw0, 4); 4.286 +#else 4.287 + vadduwm (kpw3, vt2, vt0); 4.288 + 4.289 + vsldoi (kpw2, kpw3, kpw3, 12); 4.290 + vsldoi (kpw1, kpw3, kpw3, 8); 4.291 + vsldoi (kpw0, kpw3, kpw3, 4); 4.292 +#endif 4.293 +} 4.294 + 4.295 +void MacroAssembler::sha256_update_sha_state(const VectorRegister a, 4.296 + const VectorRegister b_, 4.297 + const VectorRegister c, 4.298 + const VectorRegister d, 4.299 + const VectorRegister e, 4.300 + const VectorRegister f, 4.301 + const VectorRegister g, 4.302 + const VectorRegister h, 4.303 + const Register hptr) { 4.304 + // temporaries 4.305 + VectorRegister vt0 = VR0; 4.306 + VectorRegister vt1 = VR1; 4.307 + VectorRegister vt2 = VR2; 4.308 + VectorRegister vt3 = VR3; 4.309 + VectorRegister vt4 = VR4; 4.310 + VectorRegister vt5 = VR5; 4.311 + VectorRegister vaux = VR6; 4.312 + VectorRegister vRb = VR6; 4.313 + Register tmp = R8; 4.314 + Register of16 = R8; 4.315 + Register of32 = R9; 4.316 + Label state_load_aligned; 4.317 + 4.318 + // Load hptr 4.319 + andi_ (tmp, hptr, 0xf); 4.320 + li (of16, 16); 4.321 + lvx (vt0, hptr); 4.322 + lvx (vt5, of16, hptr); 4.323 + beq (CCR0, state_load_aligned); 4.324 + 4.325 + // handle unaligned accesses 4.326 + li (of32, 32); 4.327 + load_perm(vRb, hptr); 4.328 + 4.329 + vec_perm(vt0, vt5, vRb); // vt0 = hptr[0]..hptr[3] 4.330 + 4.331 + lvx (vt1, hptr, of32); 4.332 + vec_perm(vt5, vt1, vRb); // vt5 = hptr[4]..hptr[7] 4.333 + 4.334 + // aligned accesses 4.335 + bind(state_load_aligned); 4.336 + 4.337 +#if defined(VM_LITTLE_ENDIAN) 4.338 + vmrglw (vt1, b_, a); // vt1 = {a, b, ?, ?} 4.339 + vmrglw (vt2, d, c); // vt2 = {c, d, ?, ?} 4.340 + vmrglw (vt3, f, e); // vt3 = {e, f, ?, ?} 4.341 + vmrglw (vt4, h, g); // vt4 = {g, h, ?, ?} 4.342 + xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d} 4.343 + xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h} 4.344 + vadduwm (a, vt0, vt1); // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]} 4.345 + vadduwm (e, vt5, vt3); // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]} 4.346 + 4.347 + // Save hptr back, works for any alignment 4.348 + xxswapd (vt0->to_vsr(), a->to_vsr()); 4.349 + stxvd2x (vt0->to_vsr(), hptr); 4.350 + xxswapd (vt5->to_vsr(), e->to_vsr()); 4.351 + stxvd2x (vt5->to_vsr(), of16, hptr); 4.352 +#else 4.353 + vmrglw (vt1, a, b_); // vt1 = {a, b, ?, ?} 4.354 + vmrglw (vt2, c, d); // vt2 = {c, d, ?, ?} 4.355 + vmrglw (vt3, e, f); // vt3 = {e, f, ?, ?} 4.356 + vmrglw (vt4, g, h); // vt4 = {g, h, ?, ?} 4.357 + xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d} 4.358 + xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h} 4.359 + vadduwm (d, vt0, vt1); // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]} 4.360 + vadduwm (h, vt5, vt3); // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]} 4.361 + 4.362 + // Save hptr back, works for any alignment 4.363 + stxvd2x (d->to_vsr(), hptr); 4.364 + stxvd2x (h->to_vsr(), of16, hptr); 4.365 +#endif 4.366 +} 4.367 + 4.368 +static const uint32_t sha256_round_table[64] __attribute((aligned(16))) = { 4.369 + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 4.370 + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 4.371 + 0xd807aa98, 
0x12835b01, 0x243185be, 0x550c7dc3, 4.372 + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 4.373 + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 4.374 + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 4.375 + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 4.376 + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 4.377 + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 4.378 + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 4.379 + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 4.380 + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 4.381 + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 4.382 + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 4.383 + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 4.384 + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 4.385 +}; 4.386 +static const uint32_t *sha256_round_consts = sha256_round_table; 4.387 + 4.388 +// R3_ARG1 - byte[] Input string with padding but in Big Endian 4.389 +// R4_ARG2 - int[] SHA.state (at first, the root of primes) 4.390 +// R5_ARG3 - int offset 4.391 +// R6_ARG4 - int limit 4.392 +// 4.393 +// Internal Register usage: 4.394 +// R7 - k 4.395 +// R8 - tmp | j | of16 4.396 +// R9 - of32 4.397 +// VR0-VR8 - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb 4.398 +// VR9-VR16 - a-h 4.399 +// VR17-VR20 - w0-w3 4.400 +// VR21-VR23 - vRb | vaux0-vaux2 4.401 +// VR24-VR27 - kpw0-kpw3 4.402 +void MacroAssembler::sha256(bool multi_block) { 4.403 + static const ssize_t buf_size = 64; 4.404 + static const uint8_t w_size = sizeof(sha256_round_table)/sizeof(uint32_t); 4.405 +#ifdef AIX 4.406 + // malloc provides 16 byte alignment 4.407 + if (((uintptr_t)sha256_round_consts & 0xF) != 0) { 4.408 + uint32_t *new_round_consts = (uint32_t*)malloc(sizeof(sha256_round_table)); 4.409 + guarantee(new_round_consts, "oom"); 4.410 + memcpy(new_round_consts, sha256_round_consts, sizeof(sha256_round_table)); 4.411 + sha256_round_consts = (const uint32_t*)new_round_consts; 4.412 + } 4.413 +#endif 4.414 + 4.415 + Register buf_in = R3_ARG1; 4.416 + Register state = R4_ARG2; 4.417 + Register ofs = R5_ARG3; 4.418 + Register limit = R6_ARG4; 4.419 + 4.420 + Label sha_loop, core_loop; 4.421 + 4.422 + // Save non-volatile vector registers in the red zone 4.423 + static const VectorRegister nv[] = { 4.424 + VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/ 4.425 + }; 4.426 + static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister); 4.427 + 4.428 + for (int c = 0; c < nv_size; c++) { 4.429 + Register tmp = R8; 4.430 + li (tmp, (c - (nv_size)) * 16); 4.431 + stvx(nv[c], tmp, R1); 4.432 + } 4.433 + 4.434 + // Load hash state to registers 4.435 + VectorRegister a = VR9; 4.436 + VectorRegister b = VR10; 4.437 + VectorRegister c = VR11; 4.438 + VectorRegister d = VR12; 4.439 + VectorRegister e = VR13; 4.440 + VectorRegister f = VR14; 4.441 + VectorRegister g = VR15; 4.442 + VectorRegister h = VR16; 4.443 + static const VectorRegister hs[] = {a, b, c, d, e, f, g, h}; 4.444 + static const int total_hs = sizeof(hs)/sizeof(VectorRegister); 4.445 + // counter for cycling through hs vector to avoid register moves between iterations 4.446 + int h_cnt = 0; 4.447 + 4.448 + // Load a-h registers from the memory pointed by state 4.449 +#if defined(VM_LITTLE_ENDIAN) 4.450 + sha256_load_h_vec(a, e, state); 4.451 +#else 4.452 + sha256_load_h_vec(d, h, state); 4.453 +#endif 4.454 + 4.455 + // keep k loaded also during MultiBlock loops 4.456 + Register k = R7; 4.457 + assert(((uintptr_t)sha256_round_consts & 0xF) == 0, "k alignment"); 4.458 + 
load_const_optimized(k, (address)sha256_round_consts, R0); 4.459 + 4.460 + // Avoiding redundant loads 4.461 + if (multi_block) { 4.462 + align(OptoLoopAlignment); 4.463 + } 4.464 + bind(sha_loop); 4.465 +#if defined(VM_LITTLE_ENDIAN) 4.466 + sha256_deque(a, b, c, d); 4.467 + sha256_deque(e, f, g, h); 4.468 +#else 4.469 + sha256_deque(d, c, b, a); 4.470 + sha256_deque(h, g, f, e); 4.471 +#endif 4.472 + 4.473 + // Load 16 elements from w out of the loop. 4.474 + // Order of the int values is endianness-specific. 4.475 + VectorRegister w0 = VR17; 4.476 + VectorRegister w1 = VR18; 4.477 + VectorRegister w2 = VR19; 4.478 + VectorRegister w3 = VR20; 4.479 + static const VectorRegister ws[] = {w0, w1, w2, w3}; 4.480 + static const int total_ws = sizeof(ws)/sizeof(VectorRegister); 4.481 + 4.482 + VectorRegister kpw0 = VR24; 4.483 + VectorRegister kpw1 = VR25; 4.484 + VectorRegister kpw2 = VR26; 4.485 + VectorRegister kpw3 = VR27; 4.486 + static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3}; 4.487 + static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister); 4.488 + 4.489 + sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws); 4.490 + 4.491 + // Cycle through the first 16 elements 4.492 + assert(total_ws == total_kpws, "Redesign the loop below"); 4.493 + for (int n = 0; n < total_ws; n++) { 4.494 + VectorRegister vaux0 = VR21; 4.495 + VectorRegister vaux1 = VR22; 4.496 + VectorRegister vaux2 = VR23; 4.497 + 4.498 + sha256_deque(kpws[n], vaux0, vaux1, vaux2); 4.499 + 4.500 +#if defined(VM_LITTLE_ENDIAN) 4.501 + sha256_round(hs, total_hs, h_cnt, kpws[n]); 4.502 + sha256_round(hs, total_hs, h_cnt, vaux0); 4.503 + sha256_round(hs, total_hs, h_cnt, vaux1); 4.504 + sha256_round(hs, total_hs, h_cnt, vaux2); 4.505 +#else 4.506 + sha256_round(hs, total_hs, h_cnt, vaux2); 4.507 + sha256_round(hs, total_hs, h_cnt, vaux1); 4.508 + sha256_round(hs, total_hs, h_cnt, vaux0); 4.509 + sha256_round(hs, total_hs, h_cnt, kpws[n]); 4.510 +#endif 4.511 + } 4.512 + 4.513 + Register tmp = R8; 4.514 + // loop from the 16th to the 64th iteration in steps of 8 4.515 + li (tmp, (w_size - 16) / total_hs); 4.516 + mtctr(tmp); 4.517 + 4.518 + // j will be aligned to 4 for loading words.
4.519 + // Whenever read, advance the pointer (e.g: when j is used in a function) 4.520 + Register j = R8; 4.521 + li (j, 16*4); 4.522 + 4.523 + align(OptoLoopAlignment); 4.524 + bind(core_loop); 4.525 + 4.526 + // due to VectorRegister rotate, always iterate in multiples of total_hs 4.527 + for (int n = 0; n < total_hs/4; n++) { 4.528 + sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k); 4.529 + sha256_round(hs, total_hs, h_cnt, kpw0); 4.530 + sha256_round(hs, total_hs, h_cnt, kpw1); 4.531 + sha256_round(hs, total_hs, h_cnt, kpw2); 4.532 + sha256_round(hs, total_hs, h_cnt, kpw3); 4.533 + } 4.534 + 4.535 + bdnz (core_loop); 4.536 + 4.537 + // Update hash state 4.538 + sha256_update_sha_state(a, b, c, d, e, f, g, h, state); 4.539 + 4.540 + if (multi_block) { 4.541 + addi(buf_in, buf_in, buf_size); 4.542 + addi(ofs, ofs, buf_size); 4.543 + cmplw(CCR0, ofs, limit); 4.544 + ble(CCR0, sha_loop); 4.545 + 4.546 + // return ofs 4.547 + mr(R3_RET, ofs); 4.548 + } 4.549 + 4.550 + // Restore non-volatile registers 4.551 + for (int c = 0; c < nv_size; c++) { 4.552 + Register tmp = R8; 4.553 + li (tmp, (c - (nv_size)) * 16); 4.554 + lvx(nv[c], tmp, R1); 4.555 + } 4.556 +} 4.557 + 4.558 + 4.559 +/********************************************************************** 4.560 + * SHA 512 4.561 + *********************************************************************/ 4.562 + 4.563 +void MacroAssembler::sha512_load_w_vec(const Register buf_in, 4.564 + const VectorRegister* ws, 4.565 + const int total_ws) { 4.566 + Register tmp = R8; 4.567 + VectorRegister vRb = VR8; 4.568 + VectorRegister aux = VR9; 4.569 + Label is_aligned, after_alignment; 4.570 + 4.571 + andi_ (tmp, buf_in, 0xF); 4.572 + beq (CCR0, is_aligned); // address ends with 0x0, not 0x8 4.573 + 4.574 + // deal with unaligned addresses 4.575 + lvx (ws[0], buf_in); 4.576 + load_perm(vRb, buf_in); 4.577 + 4.578 + for (int n = 1; n < total_ws; n++) { 4.579 + VectorRegister w_cur = ws[n]; 4.580 + VectorRegister w_prev = ws[n-1]; 4.581 + addi (tmp, buf_in, n * 16); 4.582 + lvx (w_cur, tmp); 4.583 + vec_perm(w_prev, w_cur, vRb); 4.584 + } 4.585 + addi (tmp, buf_in, total_ws * 16); 4.586 + lvx (aux, tmp); 4.587 + vec_perm(ws[total_ws-1], aux, vRb); 4.588 + b (after_alignment); 4.589 + 4.590 + bind(is_aligned); 4.591 + lvx (ws[0], buf_in); 4.592 + for (int n = 1; n < total_ws; n++) { 4.593 + VectorRegister w = ws[n]; 4.594 + addi (tmp, buf_in, n * 16); 4.595 + lvx (w, tmp); 4.596 + } 4.597 + 4.598 + bind(after_alignment); 4.599 +} 4.600 + 4.601 +// Update hash state 4.602 +void MacroAssembler::sha512_update_sha_state(const Register state, 4.603 + const VectorRegister* hs, 4.604 + const int total_hs) { 4.605 + 4.606 +#if defined(VM_LITTLE_ENDIAN) 4.607 + int start_idx = 0; 4.608 +#else 4.609 + int start_idx = 1; 4.610 +#endif 4.611 + 4.612 + // load initial hash from the memory pointed by state 4.613 + VectorRegister ini_a = VR10; 4.614 + VectorRegister ini_c = VR12; 4.615 + VectorRegister ini_e = VR14; 4.616 + VectorRegister ini_g = VR16; 4.617 + static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g}; 4.618 + static const int total_inis = sizeof(inis)/sizeof(VectorRegister); 4.619 + 4.620 + Label state_save_aligned, after_state_save_aligned; 4.621 + 4.622 + Register addr = R7; 4.623 + Register tmp = R8; 4.624 + VectorRegister vRb = VR8; 4.625 + VectorRegister aux = VR9; 4.626 + 4.627 + andi_(tmp, state, 0xf); 4.628 + beq(CCR0, state_save_aligned); 4.629 + // deal with unaligned addresses 4.630 + 4.631 + { 4.632 + VectorRegister a = 
hs[0]; 4.633 + VectorRegister b_ = hs[1]; 4.634 + VectorRegister c = hs[2]; 4.635 + VectorRegister d = hs[3]; 4.636 + VectorRegister e = hs[4]; 4.637 + VectorRegister f = hs[5]; 4.638 + VectorRegister g = hs[6]; 4.639 + VectorRegister h = hs[7]; 4.640 + load_perm(vRb, state); 4.641 + lvx (ini_a, state); 4.642 + addi (addr, state, 16); 4.643 + 4.644 + lvx (ini_c, addr); 4.645 + addi (addr, state, 32); 4.646 + vec_perm(ini_a, ini_c, vRb); 4.647 + 4.648 + lvx (ini_e, addr); 4.649 + addi (addr, state, 48); 4.650 + vec_perm(ini_c, ini_e, vRb); 4.651 + 4.652 + lvx (ini_g, addr); 4.653 + addi (addr, state, 64); 4.654 + vec_perm(ini_e, ini_g, vRb); 4.655 + 4.656 + lvx (aux, addr); 4.657 + vec_perm(ini_g, aux, vRb); 4.658 + 4.659 +#if defined(VM_LITTLE_ENDIAN) 4.660 + xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr()); 4.661 + xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr()); 4.662 + xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr()); 4.663 + xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr()); 4.664 +#else 4.665 + xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr()); 4.666 + xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr()); 4.667 + xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr()); 4.668 + xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr()); 4.669 +#endif 4.670 + 4.671 + for (int n = start_idx; n < total_hs; n += 2) { 4.672 + VectorRegister h_cur = hs[n]; 4.673 + VectorRegister ini_cur = inis[n/2]; 4.674 + 4.675 + vaddudm(h_cur, ini_cur, h_cur); 4.676 + } 4.677 + 4.678 + for (int n = start_idx; n < total_hs; n += 2) { 4.679 + VectorRegister h_cur = hs[n]; 4.680 + 4.681 + mfvrd (tmp, h_cur); 4.682 +#if defined(VM_LITTLE_ENDIAN) 4.683 + std (tmp, 8*n + 8, state); 4.684 +#else 4.685 + std (tmp, 8*n - 8, state); 4.686 +#endif 4.687 + vsldoi (aux, h_cur, h_cur, 8); 4.688 + mfvrd (tmp, aux); 4.689 + std (tmp, 8*n + 0, state); 4.690 + } 4.691 + 4.692 + b (after_state_save_aligned); 4.693 + } 4.694 + 4.695 + bind(state_save_aligned); 4.696 + { 4.697 + for (int n = 0; n < total_hs; n += 2) { 4.698 +#if defined(VM_LITTLE_ENDIAN) 4.699 + VectorRegister h_cur = hs[n]; 4.700 + VectorRegister h_next = hs[n+1]; 4.701 +#else 4.702 + VectorRegister h_cur = hs[n+1]; 4.703 + VectorRegister h_next = hs[n]; 4.704 +#endif 4.705 + VectorRegister ini_cur = inis[n/2]; 4.706 + 4.707 + if (n/2 == 0) { 4.708 + lvx(ini_cur, state); 4.709 + } else { 4.710 + addi(addr, state, (n/2) * 16); 4.711 + lvx(ini_cur, addr); 4.712 + } 4.713 + xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr()); 4.714 + } 4.715 + 4.716 + for (int n = start_idx; n < total_hs; n += 2) { 4.717 + VectorRegister h_cur = hs[n]; 4.718 + VectorRegister ini_cur = inis[n/2]; 4.719 + 4.720 + vaddudm(h_cur, ini_cur, h_cur); 4.721 + } 4.722 + 4.723 + for (int n = start_idx; n < total_hs; n += 2) { 4.724 + VectorRegister h_cur = hs[n]; 4.725 + 4.726 + if (n/2 == 0) { 4.727 + stvx(h_cur, state); 4.728 + } else { 4.729 + addi(addr, state, (n/2) * 16); 4.730 + stvx(h_cur, addr); 4.731 + } 4.732 + } 4.733 + } 4.734 + 4.735 + bind(after_state_save_aligned); 4.736 +} 4.737 + 4.738 +// Use h_cnt to cycle through hs elements but also increment it at the end 4.739 +void MacroAssembler::sha512_round(const VectorRegister* hs, 4.740 + const int total_hs, int& h_cnt, 4.741 + const VectorRegister kpw) { 4.742 + 4.743 + // convenience registers: cycle from 0-7 downwards 4.744 + const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs]; 4.745 + const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs]; 4.746 + const VectorRegister c = hs[(total_hs + 2 - (h_cnt % 
total_hs)) % total_hs]; 4.747 + const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs]; 4.748 + const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs]; 4.749 + const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs]; 4.750 + const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs]; 4.751 + const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs]; 4.752 + // temporaries 4.753 + const VectorRegister Ch = VR20; 4.754 + const VectorRegister Maj = VR21; 4.755 + const VectorRegister bsa = VR22; 4.756 + const VectorRegister bse = VR23; 4.757 + const VectorRegister tmp1 = VR24; 4.758 + const VectorRegister tmp2 = VR25; 4.759 + 4.760 + vsel (Ch, g, f, e); 4.761 + vxor (Maj, a, b); 4.762 + vshasigmad(bse, e, 1, 0xf); 4.763 + vaddudm (tmp2, Ch, kpw); 4.764 + vaddudm (tmp1, h, bse); 4.765 + vsel (Maj, b, c, Maj); 4.766 + vaddudm (tmp1, tmp1, tmp2); 4.767 + vshasigmad(bsa, a, 1, 0); 4.768 + vaddudm (tmp2, bsa, Maj); 4.769 + vaddudm (d, d, tmp1); 4.770 + vaddudm (h, tmp1, tmp2); 4.771 + 4.772 + // advance vector pointer to the next iteration 4.773 + h_cnt++; 4.774 +} 4.775 + 4.776 +void MacroAssembler::sha512_calc_2w(const VectorRegister w0, 4.777 + const VectorRegister w1, 4.778 + const VectorRegister w2, 4.779 + const VectorRegister w3, 4.780 + const VectorRegister w4, 4.781 + const VectorRegister w5, 4.782 + const VectorRegister w6, 4.783 + const VectorRegister w7, 4.784 + const VectorRegister kpw0, 4.785 + const VectorRegister kpw1, 4.786 + const Register j, 4.787 + const VectorRegister vRb, 4.788 + const Register k) { 4.789 + // Temporaries 4.790 + const VectorRegister VR_a = VR20; 4.791 + const VectorRegister VR_b = VR21; 4.792 + const VectorRegister VR_c = VR22; 4.793 + const VectorRegister VR_d = VR23; 4.794 + 4.795 + // load to k[j] 4.796 + lvx (VR_a, j, k); 4.797 + // advance j 4.798 + addi (j, j, 16); // 16 bytes were read 4.799 + 4.800 +#if defined(VM_LITTLE_ENDIAN) 4.801 + // v6 = w[j-15], w[j-14] 4.802 + vperm (VR_b, w1, w0, vRb); 4.803 + // v12 = w[j-7], w[j-6] 4.804 + vperm (VR_c, w5, w4, vRb); 4.805 +#else 4.806 + // v6 = w[j-15], w[j-14] 4.807 + vperm (VR_b, w0, w1, vRb); 4.808 + // v12 = w[j-7], w[j-6] 4.809 + vperm (VR_c, w4, w5, vRb); 4.810 +#endif 4.811 + 4.812 + // v6 = s0(w[j-15]) , s0(w[j-14]) 4.813 + vshasigmad (VR_b, VR_b, 0, 0); 4.814 + // v5 = s1(w[j-2]) , s1(w[j-1]) 4.815 + vshasigmad (VR_d, w7, 0, 0xf); 4.816 + // v6 = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6] 4.817 + vaddudm (VR_b, VR_b, VR_c); 4.818 + // v8 = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15] 4.819 + vaddudm (VR_d, VR_d, w0); 4.820 + // v9 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] 4.821 + // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] 4.822 + vaddudm (VR_c, VR_d, VR_b); 4.823 + // Updating w0 to w7 to hold the new previous 16 values from w. 
4.824 + vmr (w0, w1); 4.825 + vmr (w1, w2); 4.826 + vmr (w2, w3); 4.827 + vmr (w3, w4); 4.828 + vmr (w4, w5); 4.829 + vmr (w5, w6); 4.830 + vmr (w6, w7); 4.831 + vmr (w7, VR_c); 4.832 + 4.833 +#if defined(VM_LITTLE_ENDIAN) 4.834 + // store k + w to kpw0 (2 values at once) 4.835 + vaddudm (kpw0, VR_c, VR_a); 4.836 + // kpw1 holds (k + w)[1] 4.837 + vsldoi (kpw1, kpw0, kpw0, 8); 4.838 +#else 4.839 + // store k + w to kpw0 (2 values at once) 4.840 + vaddudm (kpw1, VR_c, VR_a); 4.841 + // kpw1 holds (k + w)[1] 4.842 + vsldoi (kpw0, kpw1, kpw1, 8); 4.843 +#endif 4.844 +} 4.845 + 4.846 +void MacroAssembler::sha512_load_h_vec(const Register state, 4.847 + const VectorRegister* hs, 4.848 + const int total_hs) { 4.849 +#if defined(VM_LITTLE_ENDIAN) 4.850 + VectorRegister a = hs[0]; 4.851 + VectorRegister g = hs[6]; 4.852 + int start_idx = 0; 4.853 +#else 4.854 + VectorRegister a = hs[1]; 4.855 + VectorRegister g = hs[7]; 4.856 + int start_idx = 1; 4.857 +#endif 4.858 + 4.859 + Register addr = R7; 4.860 + VectorRegister vRb = VR8; 4.861 + Register tmp = R8; 4.862 + Label state_aligned, after_state_aligned; 4.863 + 4.864 + andi_(tmp, state, 0xf); 4.865 + beq(CCR0, state_aligned); 4.866 + 4.867 + // deal with unaligned addresses 4.868 + VectorRegister aux = VR9; 4.869 + 4.870 + lvx(hs[start_idx], state); 4.871 + load_perm(vRb, state); 4.872 + 4.873 + for (int n = start_idx + 2; n < total_hs; n += 2) { 4.874 + VectorRegister h_cur = hs[n]; 4.875 + VectorRegister h_prev2 = hs[n - 2]; 4.876 + addi(addr, state, (n/2) * 16); 4.877 + lvx(h_cur, addr); 4.878 + vec_perm(h_prev2, h_cur, vRb); 4.879 + } 4.880 + addi(addr, state, (total_hs/2) * 16); 4.881 + lvx (aux, addr); 4.882 + vec_perm(hs[total_hs - 2 + start_idx], aux, vRb); 4.883 + b (after_state_aligned); 4.884 + 4.885 + bind(state_aligned); 4.886 + 4.887 + // deal with aligned addresses 4.888 + lvx(hs[start_idx], state); 4.889 + 4.890 + for (int n = start_idx + 2; n < total_hs; n += 2) { 4.891 + VectorRegister h_cur = hs[n]; 4.892 + addi(addr, state, (n/2) * 16); 4.893 + lvx(h_cur, addr); 4.894 + } 4.895 + 4.896 + bind(after_state_aligned); 4.897 +} 4.898 + 4.899 +static const uint64_t sha512_round_table[80] __attribute((aligned(16))) = { 4.900 + 0x428a2f98d728ae22, 0x7137449123ef65cd, 4.901 + 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, 4.902 + 0x3956c25bf348b538, 0x59f111f1b605d019, 4.903 + 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 4.904 + 0xd807aa98a3030242, 0x12835b0145706fbe, 4.905 + 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, 4.906 + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 4.907 + 0x9bdc06a725c71235, 0xc19bf174cf692694, 4.908 + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 4.909 + 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 4.910 + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 4.911 + 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, 4.912 + 0x983e5152ee66dfab, 0xa831c66d2db43210, 4.913 + 0xb00327c898fb213f, 0xbf597fc7beef0ee4, 4.914 + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 4.915 + 0x06ca6351e003826f, 0x142929670a0e6e70, 4.916 + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 4.917 + 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, 4.918 + 0x650a73548baf63de, 0x766a0abb3c77b2a8, 4.919 + 0x81c2c92e47edaee6, 0x92722c851482353b, 4.920 + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 4.921 + 0xc24b8b70d0f89791, 0xc76c51a30654be30, 4.922 + 0xd192e819d6ef5218, 0xd69906245565a910, 4.923 + 0xf40e35855771202a, 0x106aa07032bbd1b8, 4.924 + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 4.925 + 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, 4.926 + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 4.927 + 0x5b9cca4f7763e373, 
0x682e6ff3d6b2b8a3, 4.928 + 0x748f82ee5defb2fc, 0x78a5636f43172f60, 4.929 + 0x84c87814a1f0ab72, 0x8cc702081a6439ec, 4.930 + 0x90befffa23631e28, 0xa4506cebde82bde9, 4.931 + 0xbef9a3f7b2c67915, 0xc67178f2e372532b, 4.932 + 0xca273eceea26619c, 0xd186b8c721c0c207, 4.933 + 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 4.934 + 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 4.935 + 0x113f9804bef90dae, 0x1b710b35131c471b, 4.936 + 0x28db77f523047d84, 0x32caab7b40c72493, 4.937 + 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, 4.938 + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 4.939 + 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, 4.940 +}; 4.941 +static const uint64_t *sha512_round_consts = sha512_round_table; 4.942 + 4.943 +// R3_ARG1 - byte[] Input string with padding but in Big Endian 4.944 +// R4_ARG2 - int[] SHA.state (at first, the root of primes) 4.945 +// R5_ARG3 - int offset 4.946 +// R6_ARG4 - int limit 4.947 +// 4.948 +// Internal Register usage: 4.949 +// R7 R8 R9 - volatile temporaries 4.950 +// VR0-VR7 - a-h 4.951 +// VR8 - vRb 4.952 +// VR9 - aux (highly volatile, use with care) 4.953 +// VR10-VR17 - w0-w7 | ini_a-ini_h 4.954 +// VR18 - vsp16 | kplusw0 4.955 +// VR19 - vsp32 | kplusw1 4.956 +// VR20-VR25 - sha512_calc_2w and sha512_round temporaries 4.957 +void MacroAssembler::sha512(bool multi_block) { 4.958 + static const ssize_t buf_size = 128; 4.959 + static const uint8_t w_size = sizeof(sha512_round_table)/sizeof(uint64_t); 4.960 +#ifdef AIX 4.961 + // malloc provides 16 byte alignment 4.962 + if (((uintptr_t)sha512_round_consts & 0xF) != 0) { 4.963 + uint64_t *new_round_consts = (uint64_t*)malloc(sizeof(sha512_round_table)); 4.964 + guarantee(new_round_consts, "oom"); 4.965 + memcpy(new_round_consts, sha512_round_consts, sizeof(sha512_round_table)); 4.966 + sha512_round_consts = (const uint64_t*)new_round_consts; 4.967 + } 4.968 +#endif 4.969 + 4.970 + Register buf_in = R3_ARG1; 4.971 + Register state = R4_ARG2; 4.972 + Register ofs = R5_ARG3; 4.973 + Register limit = R6_ARG4; 4.974 + 4.975 + Label sha_loop, core_loop; 4.976 + 4.977 + // Save non-volatile vector registers in the red zone 4.978 + static const VectorRegister nv[] = { 4.979 + VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/ 4.980 + }; 4.981 + static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister); 4.982 + 4.983 + for (int c = 0; c < nv_size; c++) { 4.984 + Register idx = R7; 4.985 + li (idx, (c - (nv_size)) * 16); 4.986 + stvx(nv[c], idx, R1); 4.987 + } 4.988 + 4.989 + // Load hash state to registers 4.990 + VectorRegister a = VR0; 4.991 + VectorRegister b = VR1; 4.992 + VectorRegister c = VR2; 4.993 + VectorRegister d = VR3; 4.994 + VectorRegister e = VR4; 4.995 + VectorRegister f = VR5; 4.996 + VectorRegister g = VR6; 4.997 + VectorRegister h = VR7; 4.998 + static const VectorRegister hs[] = {a, b, c, d, e, f, g, h}; 4.999 + static const int total_hs = sizeof(hs)/sizeof(VectorRegister); 4.1000 + // counter for cycling through hs vector to avoid register moves between iterations 4.1001 + int h_cnt = 0; 4.1002 + 4.1003 + // Load a-h registers from the memory pointed by state 4.1004 + sha512_load_h_vec(state, hs, total_hs); 4.1005 + 4.1006 + Register k = R9; 4.1007 + assert(((uintptr_t)sha512_round_consts & 0xF) == 0, "k alignment"); 4.1008 + load_const_optimized(k, (address)sha512_round_consts, R0); 4.1009 + 4.1010 + if (multi_block) { 4.1011 + align(OptoLoopAlignment); 4.1012 + } 4.1013 + bind(sha_loop); 4.1014 + 4.1015 + for (int n = 0; n < total_hs; n += 2) { 4.1016 +#if defined(VM_LITTLE_ENDIAN) 4.1017 + 
VectorRegister h_cur = hs[n]; 4.1018 + VectorRegister h_next = hs[n + 1]; 4.1019 +#else 4.1020 + VectorRegister h_cur = hs[n + 1]; 4.1021 + VectorRegister h_next = hs[n]; 4.1022 +#endif 4.1023 + vsldoi (h_next, h_cur, h_cur, 8); 4.1024 + } 4.1025 + 4.1026 + // Load 16 elements from w out of the loop. 4.1027 + // Order of the long values is endianness-specific. 4.1028 + VectorRegister w0 = VR10; 4.1029 + VectorRegister w1 = VR11; 4.1030 + VectorRegister w2 = VR12; 4.1031 + VectorRegister w3 = VR13; 4.1032 + VectorRegister w4 = VR14; 4.1033 + VectorRegister w5 = VR15; 4.1034 + VectorRegister w6 = VR16; 4.1035 + VectorRegister w7 = VR17; 4.1036 + static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7}; 4.1037 + static const int total_ws = sizeof(ws)/sizeof(VectorRegister); 4.1038 + 4.1039 + // Load 16 w into vectors and set up vsl for vperm 4.1040 + sha512_load_w_vec(buf_in, ws, total_ws); 4.1041 + 4.1042 +#if defined(VM_LITTLE_ENDIAN) 4.1043 + VectorRegister vsp16 = VR18; 4.1044 + VectorRegister vsp32 = VR19; 4.1045 + VectorRegister shiftarg = VR9; 4.1046 + 4.1047 + vspltisw(vsp16, 8); 4.1048 + vspltisw(shiftarg, 1); 4.1049 + vsl (vsp16, vsp16, shiftarg); 4.1050 + vsl (vsp32, vsp16, shiftarg); 4.1051 + 4.1052 + VectorRegister vsp8 = VR9; 4.1053 + vspltish(vsp8, 8); 4.1054 + 4.1055 + // Convert input from Big Endian to Little Endian 4.1056 + for (int c = 0; c < total_ws; c++) { 4.1057 + VectorRegister w = ws[c]; 4.1058 + vrlh (w, w, vsp8); 4.1059 + } 4.1060 + for (int c = 0; c < total_ws; c++) { 4.1061 + VectorRegister w = ws[c]; 4.1062 + vrlw (w, w, vsp16); 4.1063 + } 4.1064 + for (int c = 0; c < total_ws; c++) { 4.1065 + VectorRegister w = ws[c]; 4.1066 + vrld (w, w, vsp32); 4.1067 + } 4.1068 +#endif 4.1069 + 4.1070 + Register Rb = R10; 4.1071 + VectorRegister vRb = VR8; 4.1072 + li (Rb, 8); 4.1073 + load_perm(vRb, Rb); 4.1074 + 4.1075 + VectorRegister kplusw0 = VR18; 4.1076 + VectorRegister kplusw1 = VR19; 4.1077 + 4.1078 + Register addr = R7; 4.1079 + 4.1080 + for (int n = 0; n < total_ws; n++) { 4.1081 + VectorRegister w = ws[n]; 4.1082 + 4.1083 + if (n == 0) { 4.1084 + lvx (kplusw0, k); 4.1085 + } else { 4.1086 + addi (addr, k, n * 16); 4.1087 + lvx (kplusw0, addr); 4.1088 + } 4.1089 +#if defined(VM_LITTLE_ENDIAN) 4.1090 + vaddudm(kplusw0, kplusw0, w); 4.1091 + vsldoi (kplusw1, kplusw0, kplusw0, 8); 4.1092 +#else 4.1093 + vaddudm(kplusw1, kplusw0, w); 4.1094 + vsldoi (kplusw0, kplusw1, kplusw1, 8); 4.1095 +#endif 4.1096 + 4.1097 + sha512_round(hs, total_hs, h_cnt, kplusw0); 4.1098 + sha512_round(hs, total_hs, h_cnt, kplusw1); 4.1099 + } 4.1100 + 4.1101 + Register tmp = R8; 4.1102 + li (tmp, (w_size-16)/total_hs); 4.1103 + mtctr (tmp); 4.1104 + // j will be aligned to 4 for loading words.
4.1105 + // Whenever read, advance the pointer (e.g: when j is used in a function) 4.1106 + Register j = tmp; 4.1107 + li (j, 8*16); 4.1108 + 4.1109 + align(OptoLoopAlignment); 4.1110 + bind(core_loop); 4.1111 + 4.1112 + // due to VectorRegister rotate, always iterate in multiples of total_hs 4.1113 + for (int n = 0; n < total_hs/2; n++) { 4.1114 + sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k); 4.1115 + sha512_round(hs, total_hs, h_cnt, kplusw0); 4.1116 + sha512_round(hs, total_hs, h_cnt, kplusw1); 4.1117 + } 4.1118 + 4.1119 + bdnz (core_loop); 4.1120 + 4.1121 + sha512_update_sha_state(state, hs, total_hs); 4.1122 + 4.1123 + if (multi_block) { 4.1124 + addi(buf_in, buf_in, buf_size); 4.1125 + addi(ofs, ofs, buf_size); 4.1126 + cmplw(CCR0, ofs, limit); 4.1127 + ble(CCR0, sha_loop); 4.1128 + 4.1129 + // return ofs 4.1130 + mr(R3_RET, ofs); 4.1131 + } 4.1132 + 4.1133 + // Restore non-volatile registers 4.1134 + for (int c = 0; c < nv_size; c++) { 4.1135 + Register idx = R7; 4.1136 + li (idx, (c - (nv_size)) * 16); 4.1137 + lvx(nv[c], idx, R1); 4.1138 + } 4.1139 +}
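The schedule expansion in sha256_calc_4w (and sha512_calc_2w) vectorizes the standard message-schedule recurrence, computing four (respectively two) new words per call; the s1(w[j-2]) dependency on just-produced words is why two lanes are recomputed with a second vshasigmaw and merged via xxmrgld/xxmrghd. The scalar recurrence, as a sketch for SHA-256 (SHA-512 is identical with 64-bit words and its own rotate amounts):

#include <cstdint>

static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static inline uint32_t s0(uint32_t x) { return rotr32(x,  7) ^ rotr32(x, 18) ^ (x >>  3); }
static inline uint32_t s1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

// w[0..15] hold the input block; the loop fills w[16..63], four of which
// sha256_calc_4w produces per invocation.
void sha256_expand_schedule(uint32_t w[64]) {
  for (int j = 16; j < 64; j++) {
    w[j] = s1(w[j - 2]) + w[j - 7] + s0(w[j - 15]) + w[j - 16];
  }
}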
5.1 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp Mon Jun 17 17:20:10 2019 +0100 5.2 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp Tue Jun 18 09:33:34 2019 -0400 5.3 @@ -2652,6 +2652,28 @@ 5.4 return start; 5.5 } 5.6 5.7 + address generate_sha256_implCompress(bool multi_block, const char *name) { 5.8 + assert(UseSHA, "need SHA instructions"); 5.9 + StubCodeMark mark(this, "StubRoutines", name); 5.10 + address start = __ function_entry(); 5.11 + 5.12 + __ sha256 (multi_block); 5.13 + 5.14 + __ blr(); 5.15 + return start; 5.16 + } 5.17 + 5.18 + address generate_sha512_implCompress(bool multi_block, const char *name) { 5.19 + assert(UseSHA, "need SHA instructions"); 5.20 + StubCodeMark mark(this, "StubRoutines", name); 5.21 + address start = __ function_entry(); 5.22 + 5.23 + __ sha512 (multi_block); 5.24 + 5.25 + __ blr(); 5.26 + return start; 5.27 + } 5.28 + 5.29 void generate_arraycopy_stubs() { 5.30 // Note: the disjoint stubs must be generated first, some of 5.31 // the conjoint stubs use them. 5.32 @@ -2881,6 +2903,15 @@ 5.33 StubRoutines::_montgomerySquare 5.34 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); 5.35 } 5.36 + 5.37 + if (UseSHA256Intrinsics) { 5.38 + StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5.39 + StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5.40 + } 5.41 + if (UseSHA512Intrinsics) { 5.42 + StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 5.43 + StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 5.44 + } 5.45 } 5.46 5.47 public:
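Both stubs follow the DigestBase compress contract (buf, state, ofs, limit); the MB variants loop over consecutive blocks and return the updated offset in R3_RET. A hedged C++ model of the loop that sha256(true)/sha512(true) emit, where the compress_fn callback is a hypothetical stand-in for one pass of sha_loop:

#include <cstdint>

typedef void (*compress_fn)(const uint8_t* block, uint32_t* state);

// Model of the multi-block tail: consume 64-byte blocks (128 for SHA-512)
// while ofs <= limit, then return the new offset.
int implCompressMB_model(compress_fn one_block, const uint8_t* buf,
                         uint32_t* state, int ofs, int limit) {
  do {
    one_block(buf, state);  // one trip through sha_loop
    buf += 64;
    ofs += 64;
  } while (ofs <= limit);
  return ofs;
}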
6.1 --- a/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp Mon Jun 17 17:20:10 2019 +0100 6.2 +++ b/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp Tue Jun 18 09:33:34 2019 -0400 6.3 @@ -34,7 +34,7 @@ 6.4 6.5 enum platform_dependent_constants { 6.6 code_size1 = 20000, // simply increase if too small (assembler will crash if too small) 6.7 - code_size2 = 20000 // simply increase if too small (assembler will crash if too small) 6.8 + code_size2 = 22000 // simply increase if too small (assembler will crash if too small) 6.9 }; 6.10 6.11 // CRC32 Intrinsics.
7.1 --- a/src/cpu/ppc/vm/vm_version_ppc.cpp Mon Jun 17 17:20:10 2019 +0100 7.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp Tue Jun 18 09:33:34 2019 -0400 7.3 @@ -110,7 +110,7 @@ 7.4 // Create and print feature-string. 7.5 char buf[(num_features+1) * 16]; // Max 16 chars per feature. 7.6 jio_snprintf(buf, sizeof(buf), 7.7 - "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s", 7.8 + "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s", 7.9 (has_fsqrt() ? " fsqrt" : ""), 7.10 (has_isel() ? " isel" : ""), 7.11 (has_lxarxeh() ? " lxarxeh" : ""), 7.12 @@ -124,7 +124,8 @@ 7.13 (has_vcipher() ? " aes" : ""), 7.14 (has_vpmsumb() ? " vpmsumb" : ""), 7.15 (has_mfdscr() ? " mfdscr" : ""), 7.16 - (has_vsx() ? " vsx" : "") 7.17 + (has_vsx() ? " vsx" : ""), 7.18 + (has_vshasig() ? " sha" : "") 7.19 // Make sure number of %s matches num_features! 7.20 ); 7.21 _features_str = strdup(buf); 7.22 @@ -206,17 +207,43 @@ 7.23 } 7.24 #endif 7.25 7.26 - if (UseSHA) { 7.27 - warning("SHA instructions are not available on this CPU"); 7.28 + if (has_vshasig()) { 7.29 + if (FLAG_IS_DEFAULT(UseSHA)) { 7.30 + UseSHA = true; 7.31 + } 7.32 + } else if (UseSHA) { 7.33 + if (!FLAG_IS_DEFAULT(UseSHA)) 7.34 + warning("SHA instructions are not available on this CPU"); 7.35 FLAG_SET_DEFAULT(UseSHA, false); 7.36 } 7.37 - if (UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics) { 7.38 - warning("SHA intrinsics are not available on this CPU"); 7.39 + 7.40 + if (UseSHA1Intrinsics) { 7.41 + warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU."); 7.42 FLAG_SET_DEFAULT(UseSHA1Intrinsics, false); 7.43 + } 7.44 + 7.45 + if (UseSHA && has_vshasig()) { 7.46 + if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) { 7.47 + FLAG_SET_DEFAULT(UseSHA256Intrinsics, true); 7.48 + } 7.49 + } else if (UseSHA256Intrinsics) { 7.50 + warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU."); 7.51 FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); 7.52 + } 7.53 + 7.54 + if (UseSHA && has_vshasig()) { 7.55 + if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) { 7.56 + FLAG_SET_DEFAULT(UseSHA512Intrinsics, true); 7.57 + } 7.58 + } else if (UseSHA512Intrinsics) { 7.59 + warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU."); 7.60 FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); 7.61 } 7.62 7.63 + if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics)) { 7.64 + FLAG_SET_DEFAULT(UseSHA, false); 7.65 + } 7.66 + 7.67 if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) { 7.68 UseMontgomeryMultiplyIntrinsic = true; 7.69 } 7.70 @@ -503,6 +530,7 @@ 7.71 a->vpmsumb(VR0, VR1, VR2); // code[12] -> vpmsumb 7.72 a->mfdscr(R0); // code[13] -> mfdscr 7.73 a->lxvd2x(VSR0, R3_ARG1); // code[14] -> vsx 7.74 + a->vshasigmaw(VR0, VR1, 1, 0xF); // code[15] -> vshasig 7.75 a->blr(); 7.76 7.77 // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it. 7.78 @@ -551,6 +579,7 @@ 7.79 if (code[feature_cntr++]) features |= vpmsumb_m; 7.80 if (code[feature_cntr++]) features |= mfdscr_m; 7.81 if (code[feature_cntr++]) features |= vsx_m; 7.82 + if (code[feature_cntr++]) features |= vshasig_m; 7.83 7.84 // Print the detection code. 7.85 if (PrintAssembly) {
8.1 --- a/src/cpu/ppc/vm/vm_version_ppc.hpp Mon Jun 17 17:20:10 2019 +0100 8.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.hpp Tue Jun 18 09:33:34 2019 -0400 8.3 @@ -47,6 +47,7 @@ 8.4 vpmsumb, 8.5 mfdscr, 8.6 vsx, 8.7 + vshasig, 8.8 num_features // last entry to count features 8.9 }; 8.10 enum Feature_Flag_Set { 8.11 @@ -63,6 +64,7 @@ 8.12 dcba_m = (1 << dcba ), 8.13 lqarx_m = (1 << lqarx ), 8.14 vcipher_m = (1 << vcipher), 8.15 + vshasig_m = (1 << vshasig), 8.16 vpmsumb_m = (1 << vpmsumb), 8.17 mfdscr_m = (1 << mfdscr ), 8.18 vsx_m = (1 << vsx ), 8.19 @@ -99,6 +101,7 @@ 8.20 static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; } 8.21 static bool has_mfdscr() { return (_features & mfdscr_m) != 0; } 8.22 static bool has_vsx() { return (_features & vsx_m) != 0; } 8.23 + static bool has_vshasig() { return (_features & vshasig_m) != 0; } 8.24 8.25 static const char* cpu_features() { return _features_str; } 8.26
9.1 --- a/src/share/vm/opto/library_call.cpp Mon Jun 17 17:20:10 2019 +0100 9.2 +++ b/src/share/vm/opto/library_call.cpp Tue Jun 18 09:33:34 2019 -0400 9.3 @@ -1,5 +1,5 @@ 9.4 /* 9.5 - * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved. 9.6 + * Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved. 9.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 9.8 * 9.9 * This code is free software; you can redistribute it and/or modify it 9.10 @@ -6759,10 +6759,18 @@ 9.11 if (state == NULL) return false; 9.12 9.13 // Call the stub. 9.14 - Node* call = make_runtime_call(RC_LEAF|RC_NO_FP, 9.15 - OptoRuntime::digestBase_implCompressMB_Type(), 9.16 - stubAddr, stubName, TypePtr::BOTTOM, 9.17 - src_start, state, ofs, limit); 9.18 + Node *call; 9.19 + if (CCallingConventionRequiresIntsAsLongs) { 9.20 + call = make_runtime_call(RC_LEAF|RC_NO_FP, 9.21 + OptoRuntime::digestBase_implCompressMB_Type(), 9.22 + stubAddr, stubName, TypePtr::BOTTOM, 9.23 + src_start, state, ofs XTOP, limit XTOP); 9.24 + } else { 9.25 + call = make_runtime_call(RC_LEAF|RC_NO_FP, 9.26 + OptoRuntime::digestBase_implCompressMB_Type(), 9.27 + stubAddr, stubName, TypePtr::BOTTOM, 9.28 + src_start, state, ofs, limit); 9.29 + } 9.30 // return ofs (int) 9.31 Node* result = _gvn.transform(new (C) ProjNode(call, TypeFunc::Parms)); 9.32 set_result(result);
10.1 --- a/src/share/vm/opto/runtime.cpp Mon Jun 17 17:20:10 2019 +0100 10.2 +++ b/src/share/vm/opto/runtime.cpp Tue Jun 18 09:33:34 2019 -0400 10.3 @@ -1,5 +1,5 @@ 10.4 /* 10.5 - * Copyright (c) 1998, 2018, Oracle and/or its affiliates. All rights reserved. 10.6 + * Copyright (c) 1998, 2019, Oracle and/or its affiliates. All rights reserved. 10.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 10.8 * 10.9 * This code is free software; you can redistribute it and/or modify it 10.10 @@ -930,12 +930,24 @@ 10.11 // create input type (domain) 10.12 int num_args = 4; 10.13 int argcnt = num_args; 10.14 + if(CCallingConventionRequiresIntsAsLongs) { 10.15 + argcnt += 2; 10.16 + } 10.17 const Type** fields = TypeTuple::fields(argcnt); 10.18 int argp = TypeFunc::Parms; 10.19 - fields[argp++] = TypePtr::NOTNULL; // buf 10.20 - fields[argp++] = TypePtr::NOTNULL; // state 10.21 - fields[argp++] = TypeInt::INT; // ofs 10.22 - fields[argp++] = TypeInt::INT; // limit 10.23 + if(CCallingConventionRequiresIntsAsLongs) { 10.24 + fields[argp++] = TypePtr::NOTNULL; // buf 10.25 + fields[argp++] = TypePtr::NOTNULL; // state 10.26 + fields[argp++] = TypeLong::LONG; // ofs 10.27 + fields[argp++] = Type::HALF; 10.28 + fields[argp++] = TypeLong::LONG; // limit 10.29 + fields[argp++] = Type::HALF; 10.30 + } else { 10.31 + fields[argp++] = TypePtr::NOTNULL; // buf 10.32 + fields[argp++] = TypePtr::NOTNULL; // state 10.33 + fields[argp++] = TypeInt::INT; // ofs 10.34 + fields[argp++] = TypeInt::INT; // limit 10.35 + } 10.36 assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); 10.37 const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); 10.38
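On targets where CCallingConventionRequiresIntsAsLongs is set (PPC64 among them), int arguments are passed as 64-bit values, and a LONG occupies two slots in the ideal-graph signature; the Type::HALF entries here are those second slots, mirrored by the XTOP padding at the call site in library_call.cpp. An illustrative sketch of the two C-level shapes (the typedef names are hypothetical, not HotSpot declarations):

#include <cstdint>

// Narrow form used where ints stay ints; wide form matches the widened
// TypeFunc domain built above (ofs and limit promoted to 64 bits).
typedef int (*compress_mb_narrow_fn)(void* buf, void* state, int ofs, int limit);
typedef int (*compress_mb_wide_fn)(void* buf, void* state, int64_t ofs, int64_t limit);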
11.1 --- a/test/compiler/intrinsics/sha/cli/testcases/GenericTestCaseForOtherCPU.java Mon Jun 17 17:20:10 2019 +0100 11.2 +++ b/test/compiler/intrinsics/sha/cli/testcases/GenericTestCaseForOtherCPU.java Tue Jun 18 09:33:34 2019 -0400 11.3 @@ -36,7 +36,8 @@ 11.4 public GenericTestCaseForOtherCPU(String optionName) { 11.5 // Execute the test case on any CPU except SPARC and X86 11.6 super(optionName, new NotPredicate(new OrPredicate(Platform::isSparc, 11.7 - new OrPredicate(Platform::isX64, Platform::isX86)))); 11.8 + new OrPredicate(Platform::isPPC, 11.9 + new OrPredicate(Platform::isX64, Platform::isX86))))); 11.10 } 11.11 11.12 @Override
12.1 --- a/test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java Mon Jun 17 17:20:10 2019 +0100 12.2 +++ b/test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java Tue Jun 18 09:33:34 2019 -0400 12.3 @@ -63,12 +63,20 @@ 12.4 null); 12.5 12.6 public static final BooleanSupplier SHA256_INSTRUCTION_AVAILABLE 12.7 - = new CPUSpecificPredicate("sparc.*", new String[] { "sha256" }, 12.8 - null); 12.9 + = new OrPredicate(new CPUSpecificPredicate("sparc.*", new String[] { "sha256" }, 12.10 + null), 12.11 + new OrPredicate(new CPUSpecificPredicate("ppc64.*", new String[] { "sha" }, 12.12 + null), 12.13 + new CPUSpecificPredicate("ppc64le.*", new String[] { "sha" }, 12.14 + null))); 12.15 12.16 public static final BooleanSupplier SHA512_INSTRUCTION_AVAILABLE 12.17 - = new CPUSpecificPredicate("sparc.*", new String[] { "sha512" }, 12.18 - null); 12.19 + = new OrPredicate(new CPUSpecificPredicate("sparc.*", new String[] { "sha512" }, 12.20 + null), 12.21 + new OrPredicate(new CPUSpecificPredicate("ppc64.*", new String[] { "sha" }, 12.22 + null), 12.23 + new CPUSpecificPredicate("ppc64le.*", new String[] { "sha" }, 12.24 + null))); 12.25 12.26 public static final BooleanSupplier ANY_SHA_INSTRUCTION_AVAILABLE 12.27 = new OrPredicate(IntrinsicPredicates.SHA1_INSTRUCTION_AVAILABLE,