Tue, 18 Jun 2019 09:33:34 -0400
8185979: PPC64: Implement SHA2 intrinsic
Reviewed-by: mdoerr, goetz
Contributed-by: Bruno Rosa <bruno.rosa@eldorado.org.br>, Gustavo Serra Scalet <gustavo.scalet@eldorado.org.br>, Igor Nunes <igor.nunes@eldorado.org.br>, Martin Doerr <martin.doerr@sap.com>
ogatak@9713 | 1 | // Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved. |
ogatak@9713 | 2 | // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
ogatak@9713 | 3 | // |
ogatak@9713 | 4 | // This code is free software; you can redistribute it and/or modify it |
ogatak@9713 | 5 | // under the terms of the GNU General Public License version 2 only, as |
ogatak@9713 | 6 | // published by the Free Software Foundation. |
ogatak@9713 | 7 | // |
ogatak@9713 | 8 | // This code is distributed in the hope that it will be useful, but WITHOUT |
ogatak@9713 | 9 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
ogatak@9713 | 10 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
ogatak@9713 | 11 | // version 2 for more details (a copy is included in the LICENSE file that |
ogatak@9713 | 12 | // accompanied this code). |
ogatak@9713 | 13 | // |
ogatak@9713 | 14 | // You should have received a copy of the GNU General Public License version |
ogatak@9713 | 15 | // 2 along with this work; if not, write to the Free Software Foundation, |
ogatak@9713 | 16 | // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
ogatak@9713 | 17 | // |
ogatak@9713 | 18 | // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
ogatak@9713 | 19 | // or visit www.oracle.com if you need additional information or have any |
ogatak@9713 | 20 | // questions. |
ogatak@9713 | 21 | |
ogatak@9713 | 22 | // Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512" |
ogatak@9713 | 23 | // (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf). |
ogatak@9713 | 24 | |
ogatak@9713 | 25 | #include "asm/macroAssembler.inline.hpp" |
ogatak@9713 | 26 | #include "runtime/stubRoutines.hpp" |
ogatak@9713 | 27 | |
ogatak@9713 | 28 | /********************************************************************** |
ogatak@9713 | 29 | * SHA 256 |
ogatak@9713 | 30 | *********************************************************************/ |
ogatak@9713 | 31 | |
// Emits three byte-rotations of src: dst1, dst2 and dst3 receive src rotated
// left by 12, 8 and 4 bytes respectively (vsldoi with both inputs equal to src).
// src is only read here; it keeps its value as long as it does not alias one
// of the destination registers.
ogatak@9713 | 32 | void MacroAssembler::sha256_deque(const VectorRegister src, |
ogatak@9713 | 33 | const VectorRegister dst1, |
ogatak@9713 | 34 | const VectorRegister dst2, |
ogatak@9713 | 35 | const VectorRegister dst3) { |
ogatak@9713 | 36 | vsldoi (dst1, src, src, 12); |
ogatak@9713 | 37 | vsldoi (dst2, src, src, 8); |
ogatak@9713 | 38 | vsldoi (dst3, src, src, 4); |
ogatak@9713 | 39 | } |
ogatak@9713 | 40 | |
// Emits the instructions for one SHA-256 compression round.
//
// hs       - the working-variable registers, in a..h order when h_cnt == 0
// total_hs - number of entries in hs
// h_cnt    - assembly-time round counter (a C++ int, not a machine register);
//            it selects which physical register currently plays the role of
//            a..h and is incremented once per emitted round, so no register
//            moves are needed between rounds
// kpw      - register holding k + w for this round (the lane layout is
//            prepared by the callers)
//
// Writes VR0-VR7 as temporaries; among hs only the registers currently acting
// as d and h are written.
ogatak@9713 | 41 | void MacroAssembler::sha256_round(const VectorRegister* hs, |
ogatak@9713 | 42 | const int total_hs, |
ogatak@9713 | 43 | int& h_cnt, |
ogatak@9713 | 44 | const VectorRegister kpw) { |
ogatak@9713 | 45 | // convenience registers: cycle from 0-7 downwards |
ogatak@9713 | 46 | const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 47 | const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 48 | const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 49 | const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 50 | const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 51 | const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 52 | const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 53 | const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 54 | // temporaries |
ogatak@9713 | 55 | VectorRegister ch = VR0; |
ogatak@9713 | 56 | VectorRegister maj = VR1; |
ogatak@9713 | 57 | VectorRegister bsa = VR2; |
ogatak@9713 | 58 | VectorRegister bse = VR3; |
ogatak@9713 | 59 | VectorRegister vt0 = VR4; |
ogatak@9713 | 60 | VectorRegister vt1 = VR5; |
ogatak@9713 | 61 | VectorRegister vt2 = VR6; |
ogatak@9713 | 62 | VectorRegister vt3 = VR7; |
ogatak@9713 | 63 | |
// ch  = Ch(e, f, g): f where e has a 1 bit, g elsewhere
// maj = Maj(a, b, c): built as sel(b, c, a ^ b)
// bse = big Sigma1(e), bsa = big Sigma0(a)
// vt3 = h + Sigma1(e) + ch + kpw   (T1)
// vt0 = Sigma0(a) + maj            (T2)
ogatak@9713 | 64 | vsel (ch, g, f, e); |
ogatak@9713 | 65 | vxor (maj, a, b); |
ogatak@9713 | 66 | vshasigmaw (bse, e, 1, 0xf); |
ogatak@9713 | 67 | vadduwm (vt2, ch, kpw); |
ogatak@9713 | 68 | vadduwm (vt1, h, bse); |
ogatak@9713 | 69 | vsel (maj, b, c, maj); |
ogatak@9713 | 70 | vadduwm (vt3, vt1, vt2); |
ogatak@9713 | 71 | vshasigmaw (bsa, a, 1, 0); |
ogatak@9713 | 72 | vadduwm (vt0, bsa, maj); |
ogatak@9713 | 73 | |
// d += T1; h = T1 + T2. After h_cnt is incremented the register written as h
// here is the one selected as a by the next emitted round.
ogatak@9713 | 74 | vadduwm (d, d, vt3); |
ogatak@9713 | 75 | vadduwm (h, vt3, vt0); |
ogatak@9713 | 76 | |
ogatak@9713 | 77 | // advance vector pointer to the next iteration |
ogatak@9713 | 78 | h_cnt++; |
ogatak@9713 | 79 | } |
ogatak@9713 | 80 | |
// Emits code that loads the 32 bytes at hptr into a (first quadword) and
// e (second quadword), for any alignment of hptr.
//
// a, e  - destination vector registers
// hptr  - address of the hash state; not modified
//
// Uses R8 and CR0, and on the unaligned path also VR0 and VR6.
ogatak@9713 | 81 | void MacroAssembler::sha256_load_h_vec(const VectorRegister a, |
ogatak@9713 | 82 | const VectorRegister e, |
ogatak@9713 | 83 | const Register hptr) { |
ogatak@9713 | 84 | // temporaries |
ogatak@9713 | 85 | Register tmp = R8; |
ogatak@9713 | 86 | VectorRegister vt0 = VR0; |
ogatak@9713 | 87 | VectorRegister vRb = VR6; |
ogatak@9713 | 88 | // labels |
ogatak@9713 | 89 | Label sha256_aligned; |
ogatak@9713 | 90 | |
// andi_ records the alignment test in CR0. tmp is then reused as an address
// scratch register by addi, which leaves CR0 untouched, so the beq below
// still branches on the alignment test.
ogatak@9713 | 91 | andi_ (tmp, hptr, 0xf); |
ogatak@9713 | 92 | lvx (a, hptr); |
ogatak@9713 | 93 | addi (tmp, hptr, 16); |
ogatak@9713 | 94 | lvx (e, tmp); |
ogatak@9713 | 95 | beq (CCR0, sha256_aligned); |
ogatak@9713 | 96 | |
ogatak@9713 | 97 | // handle unaligned accesses |
// lvx ignores the low four address bits, so a third quadword is fetched and
// the wanted bytes are assembled with vec_perm. load_perm presumably builds
// the permute control vector for hptr's misalignment (helper defined
// elsewhere - confirm there).
ogatak@9713 | 98 | load_perm(vRb, hptr); |
ogatak@9713 | 99 | addi (tmp, hptr, 32); |
ogatak@9713 | 100 | vec_perm(a, e, vRb); |
ogatak@9713 | 101 | |
ogatak@9713 | 102 | lvx (vt0, tmp); |
ogatak@9713 | 103 | vec_perm(e, vt0, vRb); |
ogatak@9713 | 104 | |
ogatak@9713 | 105 | // aligned accesses |
ogatak@9713 | 106 | bind(sha256_aligned); |
ogatak@9713 | 107 | } |
ogatak@9713 | 108 | |
// Emits code that loads the first total_ws quadwords of the input block into
// ws, loads the first total_kpws quadwords of the round-constant table into
// kpws, and then adds ws[n] into kpws[n] word-wise.
//
// buf_in     - address of the input block (any alignment); not modified
// ws         - destination registers for the message words
// total_ws   - number of entries in ws
// k          - address of the round constants (16-byte aligned); not modified
// kpws       - destination registers for k + w
// total_kpws - number of entries in kpws; must equal total_ws
//
// Uses R8, VR0, VR1, VR6 and CR0 as scratch.
ogatak@9713 | 109 | void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in, |
ogatak@9713 | 110 | const VectorRegister* ws, |
ogatak@9713 | 111 | const int total_ws, |
ogatak@9713 | 112 | const Register k, |
ogatak@9713 | 113 | const VectorRegister* kpws, |
ogatak@9713 | 114 | const int total_kpws) { |
ogatak@9713 | 115 | Label w_aligned, after_w_load; |
ogatak@9713 | 116 | |
ogatak@9713 | 117 | Register tmp = R8; |
ogatak@9713 | 118 | VectorRegister vt0 = VR0; |
ogatak@9713 | 119 | VectorRegister vt1 = VR1; |
ogatak@9713 | 120 | VectorRegister vRb = VR6; |
ogatak@9713 | 121 | |
ogatak@9713 | 122 | andi_ (tmp, buf_in, 0xF); |
ogatak@9713 | 123 | beq (CCR0, w_aligned); // address ends with 0x0, not 0x8 |
ogatak@9713 | 124 | |
ogatak@9713 | 125 | // deal with unaligned addresses |
// Each quadword is combined with its successor through vec_perm, so one
// quadword more than total_ws is read (the last one into vt0).
ogatak@9713 | 126 | lvx (ws[0], buf_in); |
ogatak@9713 | 127 | load_perm(vRb, buf_in); |
ogatak@9713 | 128 | |
ogatak@9713 | 129 | for (int n = 1; n < total_ws; n++) { |
ogatak@9713 | 130 | VectorRegister w_cur = ws[n]; |
ogatak@9713 | 131 | VectorRegister w_prev = ws[n-1]; |
ogatak@9713 | 132 | |
ogatak@9713 | 133 | addi (tmp, buf_in, n * 16); |
ogatak@9713 | 134 | lvx (w_cur, tmp); |
ogatak@9713 | 135 | vec_perm(w_prev, w_cur, vRb); |
ogatak@9713 | 136 | } |
ogatak@9713 | 137 | addi (tmp, buf_in, total_ws * 16); |
ogatak@9713 | 138 | lvx (vt0, tmp); |
ogatak@9713 | 139 | vec_perm(ws[total_ws-1], vt0, vRb); |
ogatak@9713 | 140 | b (after_w_load); |
ogatak@9713 | 141 | |
ogatak@9713 | 142 | bind(w_aligned); |
ogatak@9713 | 143 | |
ogatak@9713 | 144 | // deal with aligned addresses |
ogatak@9713 | 145 | lvx(ws[0], buf_in); |
ogatak@9713 | 146 | for (int n = 1; n < total_ws; n++) { |
ogatak@9713 | 147 | VectorRegister w = ws[n]; |
ogatak@9713 | 148 | addi (tmp, buf_in, n * 16); |
ogatak@9713 | 149 | lvx (w, tmp); |
ogatak@9713 | 150 | } |
ogatak@9713 | 151 | |
ogatak@9713 | 152 | bind(after_w_load); |
ogatak@9713 | 153 | |
ogatak@9713 | 154 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 155 | // Byte swapping within int values |
// vt1 becomes the permute control vector applied to every ws register below.
ogatak@9713 | 156 | li (tmp, 8); |
ogatak@9713 | 157 | lvsl (vt0, tmp); |
ogatak@9713 | 158 | vspltisb (vt1, 0xb); |
ogatak@9713 | 159 | vxor (vt1, vt0, vt1); |
ogatak@9713 | 160 | for (int n = 0; n < total_ws; n++) { |
ogatak@9713 | 161 | VectorRegister w = ws[n]; |
ogatak@9713 | 162 | vec_perm(w, w, vt1); |
ogatak@9713 | 163 | } |
ogatak@9713 | 164 | #endif |
ogatak@9713 | 165 | |
ogatak@9713 | 166 | // Loading k, which is always aligned to 16-bytes |
ogatak@9713 | 167 | lvx (kpws[0], k); |
ogatak@9713 | 168 | for (int n = 1; n < total_kpws; n++) { |
ogatak@9713 | 169 | VectorRegister kpw = kpws[n]; |
ogatak@9713 | 170 | addi (tmp, k, 16 * n); |
ogatak@9713 | 171 | lvx (kpw, tmp); |
ogatak@9713 | 172 | } |
ogatak@9713 | 173 | |
ogatak@9713 | 174 | // Add w to K |
ogatak@9713 | 175 | assert(total_ws == total_kpws, "Redesign the loop below"); |
ogatak@9713 | 176 | for (int n = 0; n < total_kpws; n++) { |
ogatak@9713 | 177 | VectorRegister kpw = kpws[n]; |
ogatak@9713 | 178 | VectorRegister w = ws[n]; |
ogatak@9713 | 179 | |
ogatak@9713 | 180 | vadduwm (kpw, kpw, w); |
ogatak@9713 | 181 | } |
ogatak@9713 | 182 | } |
ogatak@9713 | 183 | |
// Emits code that computes the next four message-schedule words from the
// previous sixteen (held in w0..w3), adds the matching round constants and
// leaves one rotation of the k + w vector in each of kpw0..kpw3.
//
// w0..w3     - the previous 16 words; on exit they are shifted by one
//              register (w0 <- w1, w1 <- w2, w2 <- w3) and w3 holds the four
//              new words
// kpw0..kpw3 - outputs: k + w for the four new words and its 12/8/4-byte
//              rotations (which register gets the unrotated sum depends on
//              endianness)
// j          - byte offset into the round-constant table; advanced by 16
// k          - base address of the round-constant table; not modified
//
// In the comments below b, c, d and e name vt1, vt2, vt3 and vt4.
// Uses VR0-VR4 as scratch.
ogatak@9713 | 184 | void MacroAssembler::sha256_calc_4w(const VectorRegister w0, |
ogatak@9713 | 185 | const VectorRegister w1, |
ogatak@9713 | 186 | const VectorRegister w2, |
ogatak@9713 | 187 | const VectorRegister w3, |
ogatak@9713 | 188 | const VectorRegister kpw0, |
ogatak@9713 | 189 | const VectorRegister kpw1, |
ogatak@9713 | 190 | const VectorRegister kpw2, |
ogatak@9713 | 191 | const VectorRegister kpw3, |
ogatak@9713 | 192 | const Register j, |
ogatak@9713 | 193 | const Register k) { |
ogatak@9713 | 194 | // Temporaries |
ogatak@9713 | 195 | const VectorRegister vt0 = VR0; |
ogatak@9713 | 196 | const VectorRegister vt1 = VR1; |
ogatak@9713 | 197 | const VectorSRegister vsrt1 = vt1->to_vsr(); |
ogatak@9713 | 198 | const VectorRegister vt2 = VR2; |
ogatak@9713 | 199 | const VectorRegister vt3 = VR3; |
ogatak@9713 | 200 | const VectorSRegister vst3 = vt3->to_vsr(); |
ogatak@9713 | 201 | const VectorRegister vt4 = VR4; |
ogatak@9713 | 202 | |
ogatak@9713 | 203 | // load the four round constants at byte offset j of k |
ogatak@9713 | 204 | lvx (vt0, j, k); |
ogatak@9713 | 205 | |
ogatak@9713 | 206 | // advance j |
ogatak@9713 | 207 | addi (j, j, 16); // 16 bytes were read |
ogatak@9713 | 208 | |
ogatak@9713 | 209 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 210 | // b = w[j-15], w[j-14], w[j-13], w[j-12] |
ogatak@9713 | 211 | vsldoi (vt1, w1, w0, 12); |
ogatak@9713 | 212 | |
ogatak@9713 | 213 | // c = w[j-7], w[j-6], w[j-5], w[j-4] |
ogatak@9713 | 214 | vsldoi (vt2, w3, w2, 12); |
ogatak@9713 | 215 | |
ogatak@9713 | 216 | #else |
ogatak@9713 | 217 | // b = w[j-15], w[j-14], w[j-13], w[j-12] |
ogatak@9713 | 218 | vsldoi (vt1, w0, w1, 4); |
ogatak@9713 | 219 | |
ogatak@9713 | 220 | // c = w[j-7], w[j-6], w[j-5], w[j-4] |
ogatak@9713 | 221 | vsldoi (vt2, w2, w3, 4); |
ogatak@9713 | 222 | #endif |
ogatak@9713 | 223 | |
ogatak@9713 | 224 | // d = w[j-2], w[j-1], w[j-4], w[j-3] |
ogatak@9713 | 225 | vsldoi (vt3, w3, w3, 8); |
ogatak@9713 | 226 | |
ogatak@9713 | 227 | // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12]) |
ogatak@9713 | 228 | vshasigmaw (vt1, vt1, 0, 0); |
ogatak@9713 | 229 | |
ogatak@9713 | 230 | // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3]) |
ogatak@9713 | 231 | vshasigmaw (vt3, vt3, 0, 0xf); |
ogatak@9713 | 232 | |
ogatak@9713 | 233 | // c = s0(w[j-15]) + w[j-7], |
ogatak@9713 | 234 | // s0(w[j-14]) + w[j-6], |
ogatak@9713 | 235 | // s0(w[j-13]) + w[j-5], |
ogatak@9713 | 236 | // s0(w[j-12]) + w[j-4] |
ogatak@9713 | 237 | vadduwm (vt2, vt1, vt2); |
ogatak@9713 | 238 | |
ogatak@9713 | 239 | // c = s0(w[j-15]) + w[j-7] + w[j-16], |
ogatak@9713 | 240 | // s0(w[j-14]) + w[j-6] + w[j-15], |
ogatak@9713 | 241 | // s0(w[j-13]) + w[j-5] + w[j-14], |
ogatak@9713 | 242 | // s0(w[j-12]) + w[j-4] + w[j-13] |
ogatak@9713 | 243 | vadduwm (vt2, vt2, w0); |
ogatak@9713 | 244 | |
ogatak@9713 | 245 | // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] |
ogatak@9713 | 246 | // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] |
ogatak@9713 | 247 | // s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED |
ogatak@9713 | 248 | // s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3]) // UNDEFINED |
ogatak@9713 | 249 | vadduwm (vt4, vt2, vt3); |
ogatak@9713 | 250 | |
ogatak@9713 | 251 | // At this point, e[0] and e[1] are the correct values to be stored at w[j] |
ogatak@9713 | 252 | // and w[j+1]. |
ogatak@9713 | 253 | // e[2] and e[3] are not considered. |
ogatak@9713 | 254 | // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED |
ogatak@9713 | 255 | vshasigmaw (vt1, vt4, 0, 0xf); |
ogatak@9713 | 256 | |
ogatak@9713 | 257 | // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1]) |
ogatak@9713 | 258 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 259 | xxmrgld (vst3, vsrt1, vst3); |
ogatak@9713 | 260 | #else |
ogatak@9713 | 261 | xxmrghd (vst3, vst3, vsrt1); |
ogatak@9713 | 262 | #endif |
ogatak@9713 | 263 | |
ogatak@9713 | 264 | // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] |
ogatak@9713 | 265 | // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] |
ogatak@9713 | 266 | // s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]), // w[j+2] |
ogatak@9713 | 267 | // s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1]) // w[j+3] |
ogatak@9713 | 268 | vadduwm (vt2, vt2, vt3); |
ogatak@9713 | 269 | |
ogatak@9713 | 270 | // Updating w0 to w3 to hold the new previous 16 values from w. |
ogatak@9713 | 271 | vmr (w0, w1); |
ogatak@9713 | 272 | vmr (w1, w2); |
ogatak@9713 | 273 | vmr (w2, w3); |
ogatak@9713 | 274 | vmr (w3, vt2); |
ogatak@9713 | 275 | |
ogatak@9713 | 276 | // k + w (4 values at once) goes to kpw0 (LE) / kpw3 (BE); the other kpw registers get its rotations |
ogatak@9713 | 277 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 278 | vadduwm (kpw0, vt2, vt0); |
ogatak@9713 | 279 | |
ogatak@9713 | 280 | vsldoi (kpw1, kpw0, kpw0, 12); |
ogatak@9713 | 281 | vsldoi (kpw2, kpw0, kpw0, 8); |
ogatak@9713 | 282 | vsldoi (kpw3, kpw0, kpw0, 4); |
ogatak@9713 | 283 | #else |
ogatak@9713 | 284 | vadduwm (kpw3, vt2, vt0); |
ogatak@9713 | 285 | |
ogatak@9713 | 286 | vsldoi (kpw2, kpw3, kpw3, 12); |
ogatak@9713 | 287 | vsldoi (kpw1, kpw3, kpw3, 8); |
ogatak@9713 | 288 | vsldoi (kpw0, kpw3, kpw3, 4); |
ogatak@9713 | 289 | #endif |
ogatak@9713 | 290 | } |
ogatak@9713 | 291 | |
// Emits code that adds the working variables a..h to the eight 32-bit state
// words at hptr and stores the sums back to hptr.
//
// a, b_, c, d, e, f, g, h - registers holding the working variables
// hptr                    - address of the hash state (any alignment)
//
// The two result vectors are left in a and e on little endian and in d and h
// on big endian.
// Uses R8, R9 (unaligned path only), VR0-VR6 and CR0 as scratch.
ogatak@9713 | 292 | void MacroAssembler::sha256_update_sha_state(const VectorRegister a, |
ogatak@9713 | 293 | const VectorRegister b_, |
ogatak@9713 | 294 | const VectorRegister c, |
ogatak@9713 | 295 | const VectorRegister d, |
ogatak@9713 | 296 | const VectorRegister e, |
ogatak@9713 | 297 | const VectorRegister f, |
ogatak@9713 | 298 | const VectorRegister g, |
ogatak@9713 | 299 | const VectorRegister h, |
ogatak@9713 | 300 | const Register hptr) { |
ogatak@9713 | 301 | // temporaries |
// Note: vaux is declared but not referenced in this function, and tmp and
// of16 are two names for the same register (R8).
ogatak@9713 | 302 | VectorRegister vt0 = VR0; |
ogatak@9713 | 303 | VectorRegister vt1 = VR1; |
ogatak@9713 | 304 | VectorRegister vt2 = VR2; |
ogatak@9713 | 305 | VectorRegister vt3 = VR3; |
ogatak@9713 | 306 | VectorRegister vt4 = VR4; |
ogatak@9713 | 307 | VectorRegister vt5 = VR5; |
ogatak@9713 | 308 | VectorRegister vaux = VR6; |
ogatak@9713 | 309 | VectorRegister vRb = VR6; |
ogatak@9713 | 310 | Register tmp = R8; |
ogatak@9713 | 311 | Register of16 = R8; |
ogatak@9713 | 312 | Register of32 = R9; |
ogatak@9713 | 313 | Label state_load_aligned; |
ogatak@9713 | 314 | |
ogatak@9713 | 315 | // Load hptr |
// andi_ records the alignment test in CR0; the li that follows overwrites R8
// but not CR0, so the beq below still branches on the alignment test.
ogatak@9713 | 316 | andi_ (tmp, hptr, 0xf); |
ogatak@9713 | 317 | li (of16, 16); |
ogatak@9713 | 318 | lvx (vt0, hptr); |
ogatak@9713 | 319 | lvx (vt5, of16, hptr); |
ogatak@9713 | 320 | beq (CCR0, state_load_aligned); |
ogatak@9713 | 321 | |
ogatak@9713 | 322 | // handle unaligned accesses |
ogatak@9713 | 323 | li (of32, 32); |
ogatak@9713 | 324 | load_perm(vRb, hptr); |
ogatak@9713 | 325 | |
ogatak@9713 | 326 | vec_perm(vt0, vt5, vRb); // vt0 = hptr[0]..hptr[3] |
ogatak@9713 | 327 | |
ogatak@9713 | 328 | lvx (vt1, hptr, of32); |
ogatak@9713 | 329 | vec_perm(vt5, vt1, vRb); // vt5 = hptr[4]..hptr[7] |
ogatak@9713 | 330 | |
ogatak@9713 | 331 | // aligned accesses |
ogatak@9713 | 332 | bind(state_load_aligned); |
ogatak@9713 | 333 | |
ogatak@9713 | 334 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 335 | vmrglw (vt1, b_, a); // vt1 = {a, b, ?, ?} |
ogatak@9713 | 336 | vmrglw (vt2, d, c); // vt2 = {c, d, ?, ?} |
ogatak@9713 | 337 | vmrglw (vt3, f, e); // vt3 = {e, f, ?, ?} |
ogatak@9713 | 338 | vmrglw (vt4, h, g); // vt4 = {g, h, ?, ?} |
ogatak@9713 | 339 | xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d} |
ogatak@9713 | 340 | xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h} |
ogatak@9713 | 341 | vadduwm (a, vt0, vt1); // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]} |
ogatak@9713 | 342 | vadduwm (e, vt5, vt3); // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]} |
ogatak@9713 | 343 | |
ogatak@9713 | 344 | // Save hptr back, works for any alignment |
// The doublewords are swapped into vt0/vt5 before each stxvd2x, so a and e
// keep the unswapped sums.
ogatak@9713 | 345 | xxswapd (vt0->to_vsr(), a->to_vsr()); |
ogatak@9713 | 346 | stxvd2x (vt0->to_vsr(), hptr); |
ogatak@9713 | 347 | xxswapd (vt5->to_vsr(), e->to_vsr()); |
ogatak@9713 | 348 | stxvd2x (vt5->to_vsr(), of16, hptr); |
ogatak@9713 | 349 | #else |
ogatak@9713 | 350 | vmrglw (vt1, a, b_); // vt1 = {a, b, ?, ?} |
ogatak@9713 | 351 | vmrglw (vt2, c, d); // vt2 = {c, d, ?, ?} |
ogatak@9713 | 352 | vmrglw (vt3, e, f); // vt3 = {e, f, ?, ?} |
ogatak@9713 | 353 | vmrglw (vt4, g, h); // vt4 = {g, h, ?, ?} |
ogatak@9713 | 354 | xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d} |
ogatak@9713 | 355 | xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h} |
ogatak@9713 | 356 | vadduwm (d, vt0, vt1); // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]} |
ogatak@9713 | 357 | vadduwm (h, vt5, vt3); // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]} |
ogatak@9713 | 358 | |
ogatak@9713 | 359 | // Save hptr back, works for any alignment |
ogatak@9713 | 360 | stxvd2x (d->to_vsr(), hptr); |
ogatak@9713 | 361 | stxvd2x (h->to_vsr(), of16, hptr); |
ogatak@9713 | 362 | #endif |
ogatak@9713 | 363 | } |
ogatak@9713 | 364 | |
// The 64 SHA-256 round constants, one 32-bit word per round. The table is
// declared 16-byte aligned because the generated code reads it with lvx,
// four words at a time.
ogatak@9713 | 365 | static const uint32_t sha256_round_table[64] __attribute((aligned(16))) = { |
ogatak@9713 | 366 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
ogatak@9713 | 367 | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
ogatak@9713 | 368 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
ogatak@9713 | 369 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, |
ogatak@9713 | 370 | 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, |
ogatak@9713 | 371 | 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, |
ogatak@9713 | 372 | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, |
ogatak@9713 | 373 | 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, |
ogatak@9713 | 374 | 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, |
ogatak@9713 | 375 | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, |
ogatak@9713 | 376 | 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, |
ogatak@9713 | 377 | 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, |
ogatak@9713 | 378 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, |
ogatak@9713 | 379 | 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, |
ogatak@9713 | 380 | 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, |
ogatak@9713 | 381 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, |
ogatak@9713 | 382 | }; |
// Address of the constants actually used by the generated code. It is not a
// const pointer because sha256() re-points it to an aligned heap copy on AIX
// when the static table turns out not to be 16-byte aligned.
ogatak@9713 | 383 | static const uint32_t *sha256_round_consts = sha256_round_table; |
ogatak@9713 | 384 | |
// Emits the code that runs the SHA-256 compression function over one 64-byte
// input block and adds the result into the hash state in memory.
// With multi_block set, the emitted code additionally advances buf_in and ofs
// by 64 after each block, repeats while ofs <= limit (unsigned word compare)
// and finally copies ofs to R3_RET.
//
ogatak@9713 | 385 | // R3_ARG1 - byte[] Input string with padding but in Big Endian |
ogatak@9713 | 386 | // R4_ARG2 - int[] SHA.state (at first, the root of primes) |
ogatak@9713 | 387 | // R5_ARG3 - int offset |
ogatak@9713 | 388 | // R6_ARG4 - int limit |
ogatak@9713 | 389 | // |
ogatak@9713 | 390 | // Internal Register usage: |
ogatak@9713 | 391 | // R7 - k |
ogatak@9713 | 392 | // R8 - tmp | j | of16 |
ogatak@9713 | 393 | // R9 - of32 |
ogatak@9713 | 394 | // VR0-VR7 - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb |
ogatak@9713 | 395 | // VR9-VR16 - a-h |
ogatak@9713 | 396 | // VR17-VR20 - w0-w3 |
ogatak@9713 | 397 | // VR21-VR23 - vaux0-vaux2 |
ogatak@9713 | 398 | // VR24-VR27 - kpw0-kpw3 |
ogatak@9713 | 399 | void MacroAssembler::sha256(bool multi_block) { |
ogatak@9713 | 400 | static const ssize_t buf_size = 64; |
ogatak@9713 | 401 | static const uint8_t w_size = sizeof(sha256_round_table)/sizeof(uint32_t); |
ogatak@9713 | 402 | #ifdef AIX |
ogatak@9713 | 403 | // malloc provides 16 byte alignment |
// If the static table is misaligned, copy it once into malloc'ed memory and
// use the copy from then on (the copy is never freed).
ogatak@9713 | 404 | if (((uintptr_t)sha256_round_consts & 0xF) != 0) { |
ogatak@9713 | 405 | uint32_t *new_round_consts = (uint32_t*)malloc(sizeof(sha256_round_table)); |
ogatak@9713 | 406 | guarantee(new_round_consts, "oom"); |
ogatak@9713 | 407 | memcpy(new_round_consts, sha256_round_consts, sizeof(sha256_round_table)); |
ogatak@9713 | 408 | sha256_round_consts = (const uint32_t*)new_round_consts; |
ogatak@9713 | 409 | } |
ogatak@9713 | 410 | #endif |
ogatak@9713 | 411 | |
ogatak@9713 | 412 | Register buf_in = R3_ARG1; |
ogatak@9713 | 413 | Register state = R4_ARG2; |
ogatak@9713 | 414 | Register ofs = R5_ARG3; |
ogatak@9713 | 415 | Register limit = R6_ARG4; |
ogatak@9713 | 416 | |
ogatak@9713 | 417 | Label sha_loop, core_loop; |
ogatak@9713 | 418 | |
ogatak@9713 | 419 | // Save non-volatile vector registers in the red zone |
// VR20-VR27 are the ones this code overwrites (w3, vaux0-vaux2, kpw0-kpw3).
// They are stored at offsets -nv_size*16 .. -16 relative to R1.
ogatak@9713 | 420 | static const VectorRegister nv[] = { |
ogatak@9713 | 421 | VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/ |
ogatak@9713 | 422 | }; |
ogatak@9713 | 423 | static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister); |
ogatak@9713 | 424 | |
ogatak@9713 | 425 | for (int c = 0; c < nv_size; c++) { |
ogatak@9713 | 426 | Register tmp = R8; |
ogatak@9713 | 427 | li (tmp, (c - (nv_size)) * 16); |
ogatak@9713 | 428 | stvx(nv[c], tmp, R1); |
ogatak@9713 | 429 | } |
ogatak@9713 | 430 | |
ogatak@9713 | 431 | // Load hash state to registers |
ogatak@9713 | 432 | VectorRegister a = VR9; |
ogatak@9713 | 433 | VectorRegister b = VR10; |
ogatak@9713 | 434 | VectorRegister c = VR11; |
ogatak@9713 | 435 | VectorRegister d = VR12; |
ogatak@9713 | 436 | VectorRegister e = VR13; |
ogatak@9713 | 437 | VectorRegister f = VR14; |
ogatak@9713 | 438 | VectorRegister g = VR15; |
ogatak@9713 | 439 | VectorRegister h = VR16; |
ogatak@9713 | 440 | static const VectorRegister hs[] = {a, b, c, d, e, f, g, h}; |
ogatak@9713 | 441 | static const int total_hs = sizeof(hs)/sizeof(VectorRegister); |
ogatak@9713 | 442 | // counter for cycling through hs vector to avoid register moves between iterations |
ogatak@9713 | 443 | int h_cnt = 0; |
ogatak@9713 | 444 | |
ogatak@9713 | 445 | // Load a-h registers from the memory pointed by state |
// The packed state lands in a/e on little endian and in d/h on big endian,
// the same registers in which sha256_update_sha_state leaves its sums.
ogatak@9713 | 446 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 447 | sha256_load_h_vec(a, e, state); |
ogatak@9713 | 448 | #else |
ogatak@9713 | 449 | sha256_load_h_vec(d, h, state); |
ogatak@9713 | 450 | #endif |
ogatak@9713 | 451 | |
ogatak@9713 | 452 | // keep k loaded also during MultiBlock loops |
ogatak@9713 | 453 | Register k = R7; |
ogatak@9713 | 454 | assert(((uintptr_t)sha256_round_consts & 0xF) == 0, "k alignment"); |
ogatak@9713 | 455 | load_const_optimized(k, (address)sha256_round_consts, R0); |
ogatak@9713 | 456 | |
ogatak@9713 | 457 | // Avoiding redundant loads |
ogatak@9713 | 458 | if (multi_block) { |
ogatak@9713 | 459 | align(OptoLoopAlignment); |
ogatak@9713 | 460 | } |
ogatak@9713 | 461 | bind(sha_loop); |
// Spread the two packed state vectors over a..h, one rotation per register.
ogatak@9713 | 462 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 463 | sha256_deque(a, b, c, d); |
ogatak@9713 | 464 | sha256_deque(e, f, g, h); |
ogatak@9713 | 465 | #else |
ogatak@9713 | 466 | sha256_deque(d, c, b, a); |
ogatak@9713 | 467 | sha256_deque(h, g, f, e); |
ogatak@9713 | 468 | #endif |
ogatak@9713 | 469 | |
ogatak@9713 | 470 | // Load 16 elements from w out of the loop. |
ogatak@9713 | 471 | // Order of the int values is Endianess specific. |
ogatak@9713 | 472 | VectorRegister w0 = VR17; |
ogatak@9713 | 473 | VectorRegister w1 = VR18; |
ogatak@9713 | 474 | VectorRegister w2 = VR19; |
ogatak@9713 | 475 | VectorRegister w3 = VR20; |
ogatak@9713 | 476 | static const VectorRegister ws[] = {w0, w1, w2, w3}; |
ogatak@9713 | 477 | static const int total_ws = sizeof(ws)/sizeof(VectorRegister); |
ogatak@9713 | 478 | |
ogatak@9713 | 479 | VectorRegister kpw0 = VR24; |
ogatak@9713 | 480 | VectorRegister kpw1 = VR25; |
ogatak@9713 | 481 | VectorRegister kpw2 = VR26; |
ogatak@9713 | 482 | VectorRegister kpw3 = VR27; |
ogatak@9713 | 483 | static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3}; |
ogatak@9713 | 484 | static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister); |
ogatak@9713 | 485 | |
ogatak@9713 | 486 | sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws); |
ogatak@9713 | 487 | |
ogatak@9713 | 488 | // Cycle through the first 16 elements |
// Each kpws[n] holds four k + w values; its three rotations go to
// vaux0-vaux2 and the four rounds are emitted in endianness-specific order.
ogatak@9713 | 489 | assert(total_ws == total_kpws, "Redesign the loop below"); |
ogatak@9713 | 490 | for (int n = 0; n < total_ws; n++) { |
ogatak@9713 | 491 | VectorRegister vaux0 = VR21; |
ogatak@9713 | 492 | VectorRegister vaux1 = VR22; |
ogatak@9713 | 493 | VectorRegister vaux2 = VR23; |
ogatak@9713 | 494 | |
ogatak@9713 | 495 | sha256_deque(kpws[n], vaux0, vaux1, vaux2); |
ogatak@9713 | 496 | |
ogatak@9713 | 497 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 498 | sha256_round(hs, total_hs, h_cnt, kpws[n]); |
ogatak@9713 | 499 | sha256_round(hs, total_hs, h_cnt, vaux0); |
ogatak@9713 | 500 | sha256_round(hs, total_hs, h_cnt, vaux1); |
ogatak@9713 | 501 | sha256_round(hs, total_hs, h_cnt, vaux2); |
ogatak@9713 | 502 | #else |
ogatak@9713 | 503 | sha256_round(hs, total_hs, h_cnt, vaux2); |
ogatak@9713 | 504 | sha256_round(hs, total_hs, h_cnt, vaux1); |
ogatak@9713 | 505 | sha256_round(hs, total_hs, h_cnt, vaux0); |
ogatak@9713 | 506 | sha256_round(hs, total_hs, h_cnt, kpws[n]); |
ogatak@9713 | 507 | #endif |
ogatak@9713 | 508 | } |
ogatak@9713 | 509 | |
ogatak@9713 | 510 | Register tmp = R8; |
ogatak@9713 | 511 | // loop the 16th to the 64th iteration by 8 steps |
// CTR = (w_size - 16) / total_hs runtime iterations; each iteration of the
// emitted loop body performs total_hs rounds.
ogatak@9713 | 512 | li (tmp, (w_size - 16) / total_hs); |
ogatak@9713 | 513 | mtctr(tmp); |
ogatak@9713 | 514 | |
ogatak@9713 | 515 | // j will be aligned to 4 for loading words. |
ogatak@9713 | 516 | // Whenever read, advance the pointer (e.g: when j is used in a function) |
// j shares R8 with tmp; tmp's value has already been moved to CTR above.
// j starts at the byte offset of the 17th round constant (16 words * 4 bytes).
ogatak@9713 | 517 | Register j = R8; |
ogatak@9713 | 518 | li (j, 16*4); |
ogatak@9713 | 519 | |
ogatak@9713 | 520 | align(OptoLoopAlignment); |
ogatak@9713 | 521 | bind(core_loop); |
ogatak@9713 | 522 | |
ogatak@9713 | 523 | // due to VectorRegister rotate, always iterate in multiples of total_hs |
ogatak@9713 | 524 | for (int n = 0; n < total_hs/4; n++) { |
ogatak@9713 | 525 | sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k); |
ogatak@9713 | 526 | sha256_round(hs, total_hs, h_cnt, kpw0); |
ogatak@9713 | 527 | sha256_round(hs, total_hs, h_cnt, kpw1); |
ogatak@9713 | 528 | sha256_round(hs, total_hs, h_cnt, kpw2); |
ogatak@9713 | 529 | sha256_round(hs, total_hs, h_cnt, kpw3); |
ogatak@9713 | 530 | } |
ogatak@9713 | 531 | |
ogatak@9713 | 532 | bdnz (core_loop); |
ogatak@9713 | 533 | |
ogatak@9713 | 534 | // Update hash state |
ogatak@9713 | 535 | sha256_update_sha_state(a, b, c, d, e, f, g, h, state); |
ogatak@9713 | 536 | |
ogatak@9713 | 537 | if (multi_block) { |
// Advance to the next block and loop while ofs <= limit.
ogatak@9713 | 538 | addi(buf_in, buf_in, buf_size); |
ogatak@9713 | 539 | addi(ofs, ofs, buf_size); |
ogatak@9713 | 540 | cmplw(CCR0, ofs, limit); |
ogatak@9713 | 541 | ble(CCR0, sha_loop); |
ogatak@9713 | 542 | |
ogatak@9713 | 543 | // return ofs |
ogatak@9713 | 544 | mr(R3_RET, ofs); |
ogatak@9713 | 545 | } |
ogatak@9713 | 546 | |
ogatak@9713 | 547 | // Restore non-volatile registers |
ogatak@9713 | 548 | for (int c = 0; c < nv_size; c++) { |
ogatak@9713 | 549 | Register tmp = R8; |
ogatak@9713 | 550 | li (tmp, (c - (nv_size)) * 16); |
ogatak@9713 | 551 | lvx(nv[c], tmp, R1); |
ogatak@9713 | 552 | } |
ogatak@9713 | 553 | } |
ogatak@9713 | 554 | |
ogatak@9713 | 555 | |
ogatak@9713 | 556 | /********************************************************************** |
ogatak@9713 | 557 | * SHA 512 |
ogatak@9713 | 558 | *********************************************************************/ |
ogatak@9713 | 559 | |
// Emits code that loads total_ws consecutive quadwords starting at buf_in
// into the registers ws[0..total_ws-1], for any alignment of buf_in.
//
// buf_in   - address of the input block; not modified
// ws       - destination vector registers
// total_ws - number of entries in ws
//
// Uses R8 and CR0, and on the unaligned path also VR8 and VR9.
ogatak@9713 | 560 | void MacroAssembler::sha512_load_w_vec(const Register buf_in, |
ogatak@9713 | 561 | const VectorRegister* ws, |
ogatak@9713 | 562 | const int total_ws) { |
ogatak@9713 | 563 | Register tmp = R8; |
ogatak@9713 | 564 | VectorRegister vRb = VR8; |
ogatak@9713 | 565 | VectorRegister aux = VR9; |
ogatak@9713 | 566 | Label is_aligned, after_alignment; |
ogatak@9713 | 567 | |
ogatak@9713 | 568 | andi_ (tmp, buf_in, 0xF); |
ogatak@9713 | 569 | beq (CCR0, is_aligned); // address ends with 0x0, not 0x8 |
ogatak@9713 | 570 | |
ogatak@9713 | 571 | // deal with unaligned addresses |
// Each quadword is combined with its successor through vec_perm, so one
// quadword more than total_ws is read (the last one into aux).
ogatak@9713 | 572 | lvx (ws[0], buf_in); |
ogatak@9713 | 573 | load_perm(vRb, buf_in); |
ogatak@9713 | 574 | |
ogatak@9713 | 575 | for (int n = 1; n < total_ws; n++) { |
ogatak@9713 | 576 | VectorRegister w_cur = ws[n]; |
ogatak@9713 | 577 | VectorRegister w_prev = ws[n-1]; |
ogatak@9713 | 578 | addi (tmp, buf_in, n * 16); |
ogatak@9713 | 579 | lvx (w_cur, tmp); |
ogatak@9713 | 580 | vec_perm(w_prev, w_cur, vRb); |
ogatak@9713 | 581 | } |
ogatak@9713 | 582 | addi (tmp, buf_in, total_ws * 16); |
ogatak@9713 | 583 | lvx (aux, tmp); |
ogatak@9713 | 584 | vec_perm(ws[total_ws-1], aux, vRb); |
ogatak@9713 | 585 | b (after_alignment); |
ogatak@9713 | 586 | |
ogatak@9713 | 587 | bind(is_aligned); |
// aligned case: plain quadword loads, no permutation needed
ogatak@9713 | 588 | lvx (ws[0], buf_in); |
ogatak@9713 | 589 | for (int n = 1; n < total_ws; n++) { |
ogatak@9713 | 590 | VectorRegister w = ws[n]; |
ogatak@9713 | 591 | addi (tmp, buf_in, n * 16); |
ogatak@9713 | 592 | lvx (w, tmp); |
ogatak@9713 | 593 | } |
ogatak@9713 | 594 | |
ogatak@9713 | 595 | bind(after_alignment); |
ogatak@9713 | 596 | } |
ogatak@9713 | 597 | |
ogatak@9713 | 598 | // Update hash state |
ogatak@9713 | 599 | void MacroAssembler::sha512_update_sha_state(const Register state, |
ogatak@9713 | 600 | const VectorRegister* hs, |
ogatak@9713 | 601 | const int total_hs) { |
ogatak@9713 | 602 | |
ogatak@9713 | 603 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 604 | int start_idx = 0; |
ogatak@9713 | 605 | #else |
ogatak@9713 | 606 | int start_idx = 1; |
ogatak@9713 | 607 | #endif |
ogatak@9713 | 608 | |
ogatak@9713 | 609 | // load initial hash from the memory pointed by state |
ogatak@9713 | 610 | VectorRegister ini_a = VR10; |
ogatak@9713 | 611 | VectorRegister ini_c = VR12; |
ogatak@9713 | 612 | VectorRegister ini_e = VR14; |
ogatak@9713 | 613 | VectorRegister ini_g = VR16; |
ogatak@9713 | 614 | static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g}; |
ogatak@9713 | 615 | static const int total_inis = sizeof(inis)/sizeof(VectorRegister); |
ogatak@9713 | 616 | |
ogatak@9713 | 617 | Label state_save_aligned, after_state_save_aligned; |
ogatak@9713 | 618 | |
ogatak@9713 | 619 | Register addr = R7; |
ogatak@9713 | 620 | Register tmp = R8; |
ogatak@9713 | 621 | VectorRegister vRb = VR8; |
ogatak@9713 | 622 | VectorRegister aux = VR9; |
ogatak@9713 | 623 | |
ogatak@9713 | 624 | andi_(tmp, state, 0xf); |
ogatak@9713 | 625 | beq(CCR0, state_save_aligned); |
ogatak@9713 | 626 | // deal with unaligned addresses |
ogatak@9713 | 627 | |
ogatak@9713 | 628 | { |
ogatak@9713 | 629 | VectorRegister a = hs[0]; |
ogatak@9713 | 630 | VectorRegister b_ = hs[1]; |
ogatak@9713 | 631 | VectorRegister c = hs[2]; |
ogatak@9713 | 632 | VectorRegister d = hs[3]; |
ogatak@9713 | 633 | VectorRegister e = hs[4]; |
ogatak@9713 | 634 | VectorRegister f = hs[5]; |
ogatak@9713 | 635 | VectorRegister g = hs[6]; |
ogatak@9713 | 636 | VectorRegister h = hs[7]; |
ogatak@9713 | 637 | load_perm(vRb, state); |
ogatak@9713 | 638 | lvx (ini_a, state); |
ogatak@9713 | 639 | addi (addr, state, 16); |
ogatak@9713 | 640 | |
ogatak@9713 | 641 | lvx (ini_c, addr); |
ogatak@9713 | 642 | addi (addr, state, 32); |
ogatak@9713 | 643 | vec_perm(ini_a, ini_c, vRb); |
ogatak@9713 | 644 | |
ogatak@9713 | 645 | lvx (ini_e, addr); |
ogatak@9713 | 646 | addi (addr, state, 48); |
ogatak@9713 | 647 | vec_perm(ini_c, ini_e, vRb); |
ogatak@9713 | 648 | |
ogatak@9713 | 649 | lvx (ini_g, addr); |
ogatak@9713 | 650 | addi (addr, state, 64); |
ogatak@9713 | 651 | vec_perm(ini_e, ini_g, vRb); |
ogatak@9713 | 652 | |
ogatak@9713 | 653 | lvx (aux, addr); |
ogatak@9713 | 654 | vec_perm(ini_g, aux, vRb); |
ogatak@9713 | 655 | |
ogatak@9713 | 656 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 657 | xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr()); |
ogatak@9713 | 658 | xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr()); |
ogatak@9713 | 659 | xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr()); |
ogatak@9713 | 660 | xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr()); |
ogatak@9713 | 661 | #else |
ogatak@9713 | 662 | xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr()); |
ogatak@9713 | 663 | xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr()); |
ogatak@9713 | 664 | xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr()); |
ogatak@9713 | 665 | xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr()); |
ogatak@9713 | 666 | #endif |
ogatak@9713 | 667 | |
ogatak@9713 | 668 | for (int n = start_idx; n < total_hs; n += 2) { |
ogatak@9713 | 669 | VectorRegister h_cur = hs[n]; |
ogatak@9713 | 670 | VectorRegister ini_cur = inis[n/2]; |
ogatak@9713 | 671 | |
ogatak@9713 | 672 | vaddudm(h_cur, ini_cur, h_cur); |
ogatak@9713 | 673 | } |
ogatak@9713 | 674 | |
ogatak@9713 | 675 | for (int n = start_idx; n < total_hs; n += 2) { |
ogatak@9713 | 676 | VectorRegister h_cur = hs[n]; |
ogatak@9713 | 677 | |
ogatak@9713 | 678 | mfvrd (tmp, h_cur); |
ogatak@9713 | 679 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 680 | std (tmp, 8*n + 8, state); |
ogatak@9713 | 681 | #else |
ogatak@9713 | 682 | std (tmp, 8*n - 8, state); |
ogatak@9713 | 683 | #endif |
ogatak@9713 | 684 | vsldoi (aux, h_cur, h_cur, 8); |
ogatak@9713 | 685 | mfvrd (tmp, aux); |
ogatak@9713 | 686 | std (tmp, 8*n + 0, state); |
ogatak@9713 | 687 | } |
ogatak@9713 | 688 | |
ogatak@9713 | 689 | b (after_state_save_aligned); |
ogatak@9713 | 690 | } |
ogatak@9713 | 691 | |
ogatak@9713 | 692 | bind(state_save_aligned); |
ogatak@9713 | 693 | { |
ogatak@9713 | 694 | for (int n = 0; n < total_hs; n += 2) { |
ogatak@9713 | 695 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 696 | VectorRegister h_cur = hs[n]; |
ogatak@9713 | 697 | VectorRegister h_next = hs[n+1]; |
ogatak@9713 | 698 | #else |
ogatak@9713 | 699 | VectorRegister h_cur = hs[n+1]; |
ogatak@9713 | 700 | VectorRegister h_next = hs[n]; |
ogatak@9713 | 701 | #endif |
ogatak@9713 | 702 | VectorRegister ini_cur = inis[n/2]; |
ogatak@9713 | 703 | |
ogatak@9713 | 704 | if (n/2 == 0) { |
ogatak@9713 | 705 | lvx(ini_cur, state); |
ogatak@9713 | 706 | } else { |
ogatak@9713 | 707 | addi(addr, state, (n/2) * 16); |
ogatak@9713 | 708 | lvx(ini_cur, addr); |
ogatak@9713 | 709 | } |
ogatak@9713 | 710 | xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr()); |
ogatak@9713 | 711 | } |
ogatak@9713 | 712 | |
ogatak@9713 | 713 | for (int n = start_idx; n < total_hs; n += 2) { |
ogatak@9713 | 714 | VectorRegister h_cur = hs[n]; |
ogatak@9713 | 715 | VectorRegister ini_cur = inis[n/2]; |
ogatak@9713 | 716 | |
ogatak@9713 | 717 | vaddudm(h_cur, ini_cur, h_cur); |
ogatak@9713 | 718 | } |
ogatak@9713 | 719 | |
ogatak@9713 | 720 | for (int n = start_idx; n < total_hs; n += 2) { |
ogatak@9713 | 721 | VectorRegister h_cur = hs[n]; |
ogatak@9713 | 722 | |
ogatak@9713 | 723 | if (n/2 == 0) { |
ogatak@9713 | 724 | stvx(h_cur, state); |
ogatak@9713 | 725 | } else { |
ogatak@9713 | 726 | addi(addr, state, (n/2) * 16); |
ogatak@9713 | 727 | stvx(h_cur, addr); |
ogatak@9713 | 728 | } |
ogatak@9713 | 729 | } |
ogatak@9713 | 730 | } |
ogatak@9713 | 731 | |
ogatak@9713 | 732 | bind(after_state_save_aligned); |
ogatak@9713 | 733 | } |
ogatak@9713 | 734 | |
ogatak@9713 | 735 | // Use h_cnt to cycle through hs elements but also increment it at the end |
ogatak@9713 | 736 | void MacroAssembler::sha512_round(const VectorRegister* hs, |
ogatak@9713 | 737 | const int total_hs, int& h_cnt, |
ogatak@9713 | 738 | const VectorRegister kpw) { |
ogatak@9713 | 739 | |
ogatak@9713 | 740 | // convenience registers: cycle from 0-7 downwards |
ogatak@9713 | 741 | const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 742 | const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 743 | const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 744 | const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 745 | const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 746 | const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 747 | const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 748 | const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs]; |
ogatak@9713 | 749 | // temporaries |
ogatak@9713 | 750 | const VectorRegister Ch = VR20; |
ogatak@9713 | 751 | const VectorRegister Maj = VR21; |
ogatak@9713 | 752 | const VectorRegister bsa = VR22; |
ogatak@9713 | 753 | const VectorRegister bse = VR23; |
ogatak@9713 | 754 | const VectorRegister tmp1 = VR24; |
ogatak@9713 | 755 | const VectorRegister tmp2 = VR25; |
ogatak@9713 | 756 | |
ogatak@9713 | 757 | vsel (Ch, g, f, e); |
ogatak@9713 | 758 | vxor (Maj, a, b); |
ogatak@9713 | 759 | vshasigmad(bse, e, 1, 0xf); |
ogatak@9713 | 760 | vaddudm (tmp2, Ch, kpw); |
ogatak@9713 | 761 | vaddudm (tmp1, h, bse); |
ogatak@9713 | 762 | vsel (Maj, b, c, Maj); |
ogatak@9713 | 763 | vaddudm (tmp1, tmp1, tmp2); |
ogatak@9713 | 764 | vshasigmad(bsa, a, 1, 0); |
ogatak@9713 | 765 | vaddudm (tmp2, bsa, Maj); |
ogatak@9713 | 766 | vaddudm (d, d, tmp1); |
ogatak@9713 | 767 | vaddudm (h, tmp1, tmp2); |
ogatak@9713 | 768 | |
ogatak@9713 | 769 | // advance vector pointer to the next iteration |
ogatak@9713 | 770 | h_cnt++; |
ogatak@9713 | 771 | } |
ogatak@9713 | 772 | |
ogatak@9713 | 773 | void MacroAssembler::sha512_calc_2w(const VectorRegister w0, |
ogatak@9713 | 774 | const VectorRegister w1, |
ogatak@9713 | 775 | const VectorRegister w2, |
ogatak@9713 | 776 | const VectorRegister w3, |
ogatak@9713 | 777 | const VectorRegister w4, |
ogatak@9713 | 778 | const VectorRegister w5, |
ogatak@9713 | 779 | const VectorRegister w6, |
ogatak@9713 | 780 | const VectorRegister w7, |
ogatak@9713 | 781 | const VectorRegister kpw0, |
ogatak@9713 | 782 | const VectorRegister kpw1, |
ogatak@9713 | 783 | const Register j, |
ogatak@9713 | 784 | const VectorRegister vRb, |
ogatak@9713 | 785 | const Register k) { |
ogatak@9713 | 786 | // Temporaries |
ogatak@9713 | 787 | const VectorRegister VR_a = VR20; |
ogatak@9713 | 788 | const VectorRegister VR_b = VR21; |
ogatak@9713 | 789 | const VectorRegister VR_c = VR22; |
ogatak@9713 | 790 | const VectorRegister VR_d = VR23; |
ogatak@9713 | 791 | |
ogatak@9713 | 792 | // load to k[j] |
ogatak@9713 | 793 | lvx (VR_a, j, k); |
ogatak@9713 | 794 | // advance j |
ogatak@9713 | 795 | addi (j, j, 16); // 16 bytes were read |
ogatak@9713 | 796 | |
ogatak@9713 | 797 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 798 | // v6 = w[j-15], w[j-14] |
ogatak@9713 | 799 | vperm (VR_b, w1, w0, vRb); |
ogatak@9713 | 800 | // v12 = w[j-7], w[j-6] |
ogatak@9713 | 801 | vperm (VR_c, w5, w4, vRb); |
ogatak@9713 | 802 | #else |
ogatak@9713 | 803 | // v6 = w[j-15], w[j-14] |
ogatak@9713 | 804 | vperm (VR_b, w0, w1, vRb); |
ogatak@9713 | 805 | // v12 = w[j-7], w[j-6] |
ogatak@9713 | 806 | vperm (VR_c, w4, w5, vRb); |
ogatak@9713 | 807 | #endif |
ogatak@9713 | 808 | |
ogatak@9713 | 809 | // v6 = s0(w[j-15]) , s0(w[j-14]) |
ogatak@9713 | 810 | vshasigmad (VR_b, VR_b, 0, 0); |
ogatak@9713 | 811 | // v5 = s1(w[j-2]) , s1(w[j-1]) |
ogatak@9713 | 812 | vshasigmad (VR_d, w7, 0, 0xf); |
ogatak@9713 | 813 | // v6 = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6] |
ogatak@9713 | 814 | vaddudm (VR_b, VR_b, VR_c); |
ogatak@9713 | 815 | // v8 = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15] |
ogatak@9713 | 816 | vaddudm (VR_d, VR_d, w0); |
ogatak@9713 | 817 | // v9 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] |
ogatak@9713 | 818 | // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] |
ogatak@9713 | 819 | vaddudm (VR_c, VR_d, VR_b); |
ogatak@9713 | 820 | // Updating w0 to w7 to hold the new previous 16 values from w. |
ogatak@9713 | 821 | vmr (w0, w1); |
ogatak@9713 | 822 | vmr (w1, w2); |
ogatak@9713 | 823 | vmr (w2, w3); |
ogatak@9713 | 824 | vmr (w3, w4); |
ogatak@9713 | 825 | vmr (w4, w5); |
ogatak@9713 | 826 | vmr (w5, w6); |
ogatak@9713 | 827 | vmr (w6, w7); |
ogatak@9713 | 828 | vmr (w7, VR_c); |
ogatak@9713 | 829 | |
ogatak@9713 | 830 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 831 | // store k + w to kpw0 (2 values at once) |
ogatak@9713 | 832 | vaddudm (kpw0, VR_c, VR_a); |
ogatak@9713 | 833 | // kpw1 holds (k + w)[1] |
ogatak@9713 | 834 | vsldoi (kpw1, kpw0, kpw0, 8); |
ogatak@9713 | 835 | #else |
ogatak@9713 | 836 | // store k + w to kpw0 (2 values at once) |
ogatak@9713 | 837 | vaddudm (kpw1, VR_c, VR_a); |
ogatak@9713 | 838 | // kpw1 holds (k + w)[1] |
ogatak@9713 | 839 | vsldoi (kpw0, kpw1, kpw1, 8); |
ogatak@9713 | 840 | #endif |
ogatak@9713 | 841 | } |
ogatak@9713 | 842 | |
ogatak@9713 | 843 | void MacroAssembler::sha512_load_h_vec(const Register state, |
ogatak@9713 | 844 | const VectorRegister* hs, |
ogatak@9713 | 845 | const int total_hs) { |
ogatak@9713 | 846 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 847 | VectorRegister a = hs[0]; |
ogatak@9713 | 848 | VectorRegister g = hs[6]; |
ogatak@9713 | 849 | int start_idx = 0; |
ogatak@9713 | 850 | #else |
ogatak@9713 | 851 | VectorRegister a = hs[1]; |
ogatak@9713 | 852 | VectorRegister g = hs[7]; |
ogatak@9713 | 853 | int start_idx = 1; |
ogatak@9713 | 854 | #endif |
ogatak@9713 | 855 | |
ogatak@9713 | 856 | Register addr = R7; |
ogatak@9713 | 857 | VectorRegister vRb = VR8; |
ogatak@9713 | 858 | Register tmp = R8; |
ogatak@9713 | 859 | Label state_aligned, after_state_aligned; |
ogatak@9713 | 860 | |
ogatak@9713 | 861 | andi_(tmp, state, 0xf); |
ogatak@9713 | 862 | beq(CCR0, state_aligned); |
ogatak@9713 | 863 | |
ogatak@9713 | 864 | // deal with unaligned addresses |
ogatak@9713 | 865 | VectorRegister aux = VR9; |
ogatak@9713 | 866 | |
ogatak@9713 | 867 | lvx(hs[start_idx], state); |
ogatak@9713 | 868 | load_perm(vRb, state); |
ogatak@9713 | 869 | |
ogatak@9713 | 870 | for (int n = start_idx + 2; n < total_hs; n += 2) { |
ogatak@9713 | 871 | VectorRegister h_cur = hs[n]; |
ogatak@9713 | 872 | VectorRegister h_prev2 = hs[n - 2]; |
ogatak@9713 | 873 | addi(addr, state, (n/2) * 16); |
ogatak@9713 | 874 | lvx(h_cur, addr); |
ogatak@9713 | 875 | vec_perm(h_prev2, h_cur, vRb); |
ogatak@9713 | 876 | } |
ogatak@9713 | 877 | addi(addr, state, (total_hs/2) * 16); |
ogatak@9713 | 878 | lvx (aux, addr); |
ogatak@9713 | 879 | vec_perm(hs[total_hs - 2 + start_idx], aux, vRb); |
ogatak@9713 | 880 | b (after_state_aligned); |
ogatak@9713 | 881 | |
ogatak@9713 | 882 | bind(state_aligned); |
ogatak@9713 | 883 | |
ogatak@9713 | 884 | // deal with aligned addresses |
ogatak@9713 | 885 | lvx(hs[start_idx], state); |
ogatak@9713 | 886 | |
ogatak@9713 | 887 | for (int n = start_idx + 2; n < total_hs; n += 2) { |
ogatak@9713 | 888 | VectorRegister h_cur = hs[n]; |
ogatak@9713 | 889 | addi(addr, state, (n/2) * 16); |
ogatak@9713 | 890 | lvx(h_cur, addr); |
ogatak@9713 | 891 | } |
ogatak@9713 | 892 | |
ogatak@9713 | 893 | bind(after_state_aligned); |
ogatak@9713 | 894 | } |
ogatak@9713 | 895 | |
// The 80 SHA-512 round constants. The table is 16-byte aligned because the
// generated code reads it with lvx, two constants per load.
static const uint64_t sha512_round_table[80] __attribute((aligned(16))) = {
  // [0..15]
  0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
  0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
  0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
  0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694,
  // [16..31]
  0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
  0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
  0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4,
  0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70,
  // [32..47]
  0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
  0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b,
  0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30,
  0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
  // [48..63]
  0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
  0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
  0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
  0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b,
  // [64..79]
  0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
  0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
  0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
  0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
};
// Address actually handed to the generated code; it is a separate pointer so
// that it can be redirected to an aligned copy of the table.
static const uint64_t *sha512_round_consts = sha512_round_table;
ogatak@9713 | 939 | |
ogatak@9713 | 940 | // R3_ARG1 - byte[] Input string with padding but in Big Endian |
ogatak@9713 | 941 | // R4_ARG2 - int[] SHA.state (at first, the root of primes) |
ogatak@9713 | 942 | // R5_ARG3 - int offset |
ogatak@9713 | 943 | // R6_ARG4 - int limit |
ogatak@9713 | 944 | // |
ogatak@9713 | 945 | // Internal Register usage: |
ogatak@9713 | 946 | // R7 R8 R9 - volatile temporaries |
ogatak@9713 | 947 | // VR0-VR7 - a-h |
ogatak@9713 | 948 | // VR8 - vRb |
ogatak@9713 | 949 | // VR9 - aux (highly volatile, use with care) |
ogatak@9713 | 950 | // VR10-VR17 - w0-w7 | ini_a-ini_h |
ogatak@9713 | 951 | // VR18 - vsp16 | kplusw0 |
ogatak@9713 | 952 | // VR19 - vsp32 | kplusw1 |
ogatak@9713 | 953 | // VR20-VR25 - sha512_calc_2w and sha512_round temporaries |
ogatak@9713 | 954 | void MacroAssembler::sha512(bool multi_block) { |
ogatak@9713 | 955 | static const ssize_t buf_size = 128; |
ogatak@9713 | 956 | static const uint8_t w_size = sizeof(sha512_round_table)/sizeof(uint64_t); |
ogatak@9713 | 957 | #ifdef AIX |
ogatak@9713 | 958 | // malloc provides 16 byte alignment |
ogatak@9713 | 959 | if (((uintptr_t)sha512_round_consts & 0xF) != 0) { |
ogatak@9713 | 960 | uint64_t *new_round_consts = (uint64_t*)malloc(sizeof(sha512_round_table)); |
ogatak@9713 | 961 | guarantee(new_round_consts, "oom"); |
ogatak@9713 | 962 | memcpy(new_round_consts, sha512_round_consts, sizeof(sha512_round_table)); |
ogatak@9713 | 963 | sha512_round_consts = (const uint64_t*)new_round_consts; |
ogatak@9713 | 964 | } |
ogatak@9713 | 965 | #endif |
ogatak@9713 | 966 | |
ogatak@9713 | 967 | Register buf_in = R3_ARG1; |
ogatak@9713 | 968 | Register state = R4_ARG2; |
ogatak@9713 | 969 | Register ofs = R5_ARG3; |
ogatak@9713 | 970 | Register limit = R6_ARG4; |
ogatak@9713 | 971 | |
ogatak@9713 | 972 | Label sha_loop, core_loop; |
ogatak@9713 | 973 | |
ogatak@9713 | 974 | // Save non-volatile vector registers in the red zone |
ogatak@9713 | 975 | static const VectorRegister nv[] = { |
ogatak@9713 | 976 | VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/ |
ogatak@9713 | 977 | }; |
ogatak@9713 | 978 | static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister); |
ogatak@9713 | 979 | |
ogatak@9713 | 980 | for (int c = 0; c < nv_size; c++) { |
ogatak@9713 | 981 | Register idx = R7; |
ogatak@9713 | 982 | li (idx, (c - (nv_size)) * 16); |
ogatak@9713 | 983 | stvx(nv[c], idx, R1); |
ogatak@9713 | 984 | } |
ogatak@9713 | 985 | |
ogatak@9713 | 986 | // Load hash state to registers |
ogatak@9713 | 987 | VectorRegister a = VR0; |
ogatak@9713 | 988 | VectorRegister b = VR1; |
ogatak@9713 | 989 | VectorRegister c = VR2; |
ogatak@9713 | 990 | VectorRegister d = VR3; |
ogatak@9713 | 991 | VectorRegister e = VR4; |
ogatak@9713 | 992 | VectorRegister f = VR5; |
ogatak@9713 | 993 | VectorRegister g = VR6; |
ogatak@9713 | 994 | VectorRegister h = VR7; |
ogatak@9713 | 995 | static const VectorRegister hs[] = {a, b, c, d, e, f, g, h}; |
ogatak@9713 | 996 | static const int total_hs = sizeof(hs)/sizeof(VectorRegister); |
ogatak@9713 | 997 | // counter for cycling through hs vector to avoid register moves between iterations |
ogatak@9713 | 998 | int h_cnt = 0; |
ogatak@9713 | 999 | |
ogatak@9713 | 1000 | // Load a-h registers from the memory pointed by state |
ogatak@9713 | 1001 | sha512_load_h_vec(state, hs, total_hs); |
ogatak@9713 | 1002 | |
ogatak@9713 | 1003 | Register k = R9; |
ogatak@9713 | 1004 | assert(((uintptr_t)sha512_round_consts & 0xF) == 0, "k alignment"); |
ogatak@9713 | 1005 | load_const_optimized(k, (address)sha512_round_consts, R0); |
ogatak@9713 | 1006 | |
ogatak@9713 | 1007 | if (multi_block) { |
ogatak@9713 | 1008 | align(OptoLoopAlignment); |
ogatak@9713 | 1009 | } |
ogatak@9713 | 1010 | bind(sha_loop); |
ogatak@9713 | 1011 | |
ogatak@9713 | 1012 | for (int n = 0; n < total_hs; n += 2) { |
ogatak@9713 | 1013 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 1014 | VectorRegister h_cur = hs[n]; |
ogatak@9713 | 1015 | VectorRegister h_next = hs[n + 1]; |
ogatak@9713 | 1016 | #else |
ogatak@9713 | 1017 | VectorRegister h_cur = hs[n + 1]; |
ogatak@9713 | 1018 | VectorRegister h_next = hs[n]; |
ogatak@9713 | 1019 | #endif |
ogatak@9713 | 1020 | vsldoi (h_next, h_cur, h_cur, 8); |
ogatak@9713 | 1021 | } |
ogatak@9713 | 1022 | |
ogatak@9713 | 1023 | // Load 16 elements from w out of the loop. |
ogatak@9713 | 1024 | // Order of the long values is Endianess specific. |
ogatak@9713 | 1025 | VectorRegister w0 = VR10; |
ogatak@9713 | 1026 | VectorRegister w1 = VR11; |
ogatak@9713 | 1027 | VectorRegister w2 = VR12; |
ogatak@9713 | 1028 | VectorRegister w3 = VR13; |
ogatak@9713 | 1029 | VectorRegister w4 = VR14; |
ogatak@9713 | 1030 | VectorRegister w5 = VR15; |
ogatak@9713 | 1031 | VectorRegister w6 = VR16; |
ogatak@9713 | 1032 | VectorRegister w7 = VR17; |
ogatak@9713 | 1033 | static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7}; |
ogatak@9713 | 1034 | static const int total_ws = sizeof(ws)/sizeof(VectorRegister); |
ogatak@9713 | 1035 | |
ogatak@9713 | 1036 | // Load 16 w into vectors and setup vsl for vperm |
ogatak@9713 | 1037 | sha512_load_w_vec(buf_in, ws, total_ws); |
ogatak@9713 | 1038 | |
ogatak@9713 | 1039 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 1040 | VectorRegister vsp16 = VR18; |
ogatak@9713 | 1041 | VectorRegister vsp32 = VR19; |
ogatak@9713 | 1042 | VectorRegister shiftarg = VR9; |
ogatak@9713 | 1043 | |
ogatak@9713 | 1044 | vspltisw(vsp16, 8); |
ogatak@9713 | 1045 | vspltisw(shiftarg, 1); |
ogatak@9713 | 1046 | vsl (vsp16, vsp16, shiftarg); |
ogatak@9713 | 1047 | vsl (vsp32, vsp16, shiftarg); |
ogatak@9713 | 1048 | |
ogatak@9713 | 1049 | VectorRegister vsp8 = VR9; |
ogatak@9713 | 1050 | vspltish(vsp8, 8); |
ogatak@9713 | 1051 | |
ogatak@9713 | 1052 | // Convert input from Big Endian to Little Endian |
ogatak@9713 | 1053 | for (int c = 0; c < total_ws; c++) { |
ogatak@9713 | 1054 | VectorRegister w = ws[c]; |
ogatak@9713 | 1055 | vrlh (w, w, vsp8); |
ogatak@9713 | 1056 | } |
ogatak@9713 | 1057 | for (int c = 0; c < total_ws; c++) { |
ogatak@9713 | 1058 | VectorRegister w = ws[c]; |
ogatak@9713 | 1059 | vrlw (w, w, vsp16); |
ogatak@9713 | 1060 | } |
ogatak@9713 | 1061 | for (int c = 0; c < total_ws; c++) { |
ogatak@9713 | 1062 | VectorRegister w = ws[c]; |
ogatak@9713 | 1063 | vrld (w, w, vsp32); |
ogatak@9713 | 1064 | } |
ogatak@9713 | 1065 | #endif |
ogatak@9713 | 1066 | |
ogatak@9713 | 1067 | Register Rb = R10; |
ogatak@9713 | 1068 | VectorRegister vRb = VR8; |
ogatak@9713 | 1069 | li (Rb, 8); |
ogatak@9713 | 1070 | load_perm(vRb, Rb); |
ogatak@9713 | 1071 | |
ogatak@9713 | 1072 | VectorRegister kplusw0 = VR18; |
ogatak@9713 | 1073 | VectorRegister kplusw1 = VR19; |
ogatak@9713 | 1074 | |
ogatak@9713 | 1075 | Register addr = R7; |
ogatak@9713 | 1076 | |
ogatak@9713 | 1077 | for (int n = 0; n < total_ws; n++) { |
ogatak@9713 | 1078 | VectorRegister w = ws[n]; |
ogatak@9713 | 1079 | |
ogatak@9713 | 1080 | if (n == 0) { |
ogatak@9713 | 1081 | lvx (kplusw0, k); |
ogatak@9713 | 1082 | } else { |
ogatak@9713 | 1083 | addi (addr, k, n * 16); |
ogatak@9713 | 1084 | lvx (kplusw0, addr); |
ogatak@9713 | 1085 | } |
ogatak@9713 | 1086 | #if defined(VM_LITTLE_ENDIAN) |
ogatak@9713 | 1087 | vaddudm(kplusw0, kplusw0, w); |
ogatak@9713 | 1088 | vsldoi (kplusw1, kplusw0, kplusw0, 8); |
ogatak@9713 | 1089 | #else |
ogatak@9713 | 1090 | vaddudm(kplusw1, kplusw0, w); |
ogatak@9713 | 1091 | vsldoi (kplusw0, kplusw1, kplusw1, 8); |
ogatak@9713 | 1092 | #endif |
ogatak@9713 | 1093 | |
ogatak@9713 | 1094 | sha512_round(hs, total_hs, h_cnt, kplusw0); |
ogatak@9713 | 1095 | sha512_round(hs, total_hs, h_cnt, kplusw1); |
ogatak@9713 | 1096 | } |
ogatak@9713 | 1097 | |
ogatak@9713 | 1098 | Register tmp = R8; |
ogatak@9713 | 1099 | li (tmp, (w_size-16)/total_hs); |
ogatak@9713 | 1100 | mtctr (tmp); |
ogatak@9713 | 1101 | // j will be aligned to 4 for loading words. |
ogatak@9713 | 1102 | // Whenever read, advance the pointer (e.g: when j is used in a function) |
ogatak@9713 | 1103 | Register j = tmp; |
ogatak@9713 | 1104 | li (j, 8*16); |
ogatak@9713 | 1105 | |
ogatak@9713 | 1106 | align(OptoLoopAlignment); |
ogatak@9713 | 1107 | bind(core_loop); |
ogatak@9713 | 1108 | |
ogatak@9713 | 1109 | // due to VectorRegister rotate, always iterate in multiples of total_hs |
ogatak@9713 | 1110 | for (int n = 0; n < total_hs/2; n++) { |
ogatak@9713 | 1111 | sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k); |
ogatak@9713 | 1112 | sha512_round(hs, total_hs, h_cnt, kplusw0); |
ogatak@9713 | 1113 | sha512_round(hs, total_hs, h_cnt, kplusw1); |
ogatak@9713 | 1114 | } |
ogatak@9713 | 1115 | |
ogatak@9713 | 1116 | bdnz (core_loop); |
ogatak@9713 | 1117 | |
ogatak@9713 | 1118 | sha512_update_sha_state(state, hs, total_hs); |
ogatak@9713 | 1119 | |
ogatak@9713 | 1120 | if (multi_block) { |
ogatak@9713 | 1121 | addi(buf_in, buf_in, buf_size); |
ogatak@9713 | 1122 | addi(ofs, ofs, buf_size); |
ogatak@9713 | 1123 | cmplw(CCR0, ofs, limit); |
ogatak@9713 | 1124 | ble(CCR0, sha_loop); |
ogatak@9713 | 1125 | |
ogatak@9713 | 1126 | // return ofs |
ogatak@9713 | 1127 | mr(R3_RET, ofs); |
ogatak@9713 | 1128 | } |
ogatak@9713 | 1129 | |
ogatak@9713 | 1130 | // Restore non-volatile registers |
ogatak@9713 | 1131 | for (int c = 0; c < nv_size; c++) { |
ogatak@9713 | 1132 | Register idx = R7; |
ogatak@9713 | 1133 | li (idx, (c - (nv_size)) * 16); |
ogatak@9713 | 1134 | lvx(nv[c], idx, R1); |
ogatak@9713 | 1135 | } |
ogatak@9713 | 1136 | } |