src/cpu/ppc/vm/macroAssembler_ppc_sha.cpp

changeset 9713:c4567d28f31f (parent 9712:d7e1e002b496)
1 // Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
2 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3 //
4 // This code is free software; you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License version 2 only, as
6 // published by the Free Software Foundation.
7 //
8 // This code is distributed in the hope that it will be useful, but WITHOUT
9 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11 // version 2 for more details (a copy is included in the LICENSE file that
12 // accompanied this code).
13 //
14 // You should have received a copy of the GNU General Public License version
15 // 2 along with this work; if not, write to the Free Software Foundation,
16 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
17 //
18 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
19 // or visit www.oracle.com if you need additional information or have any
20 // questions.
21
22 // Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
23 // (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).
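// For reference, the primitive functions used below (standard SHA-2
// definitions, restated here to ease reading of the vector code):
//   Ch(e,f,g)  = (e & f) ^ (~e & g)              -> one vsel
//   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)     -> vxor + vsel
//   Sigma0/Sigma1 and sigma0/sigma1 are the rotate/shift/xor combinations
//   computed directly in hardware by vshasigmaw (SHA-256) and vshasigmad
//   (SHA-512).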
24
25 #include "asm/macroAssembler.inline.hpp"
26 #include "runtime/stubRoutines.hpp"
27
28 /**********************************************************************
29 * SHA 256
30 *********************************************************************/
31
32 void MacroAssembler::sha256_deque(const VectorRegister src,
33 const VectorRegister dst1,
34 const VectorRegister dst2,
35 const VectorRegister dst3) {
36 vsldoi (dst1, src, src, 12);
37 vsldoi (dst2, src, src, 8);
38 vsldoi (dst3, src, src, 4);
39 }
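// sha256_deque produces the three remaining word rotations of a vector that
// packs four 32-bit values. It is used to spread the hash words across the
// a..d / e..h registers (each with its significant word in a fixed lane) and
// to hand sha256_round one (k + w) word per call out of a packed vector.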
40
41 void MacroAssembler::sha256_round(const VectorRegister* hs,
42 const int total_hs,
43 int& h_cnt,
44 const VectorRegister kpw) {
45 // convenience registers: cycle from 0-7 downwards
46 const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
47 const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
48 const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
49 const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
50 const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
51 const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
52 const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
53 const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
54 // temporaries
55 VectorRegister ch = VR0;
56 VectorRegister maj = VR1;
57 VectorRegister bsa = VR2;
58 VectorRegister bse = VR3;
59 VectorRegister vt0 = VR4;
60 VectorRegister vt1 = VR5;
61 VectorRegister vt2 = VR6;
62 VectorRegister vt3 = VR7;
63
64 vsel (ch, g, f, e);
65 vxor (maj, a, b);
66 vshasigmaw (bse, e, 1, 0xf);
67 vadduwm (vt2, ch, kpw);
68 vadduwm (vt1, h, bse);
69 vsel (maj, b, c, maj);
70 vadduwm (vt3, vt1, vt2);
71 vshasigmaw (bsa, a, 1, 0);
72 vadduwm (vt0, bsa, maj);
73
74 vadduwm (d, d, vt3);
75 vadduwm (h, vt3, vt0);
76
77 // advance vector pointer to the next iteration
78 h_cnt++;
79 }
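// Each sha256_round call computes, for the current round t (all four lanes in
// parallel, only the significant lane matters):
//   T1 = h + Sigma1(e) + Ch(e,f,g) + (K[t] + W[t])
//   T2 = Sigma0(a) + Maj(a,b,c)
//   d  = d + T1
//   h  = T1 + T2
// Rather than moving values through eight registers, the a..h mapping onto
// hs[] is rotated logically via h_cnt, so the next call simply addresses a
// different register as 'a'.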
80
81 void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
82 const VectorRegister e,
83 const Register hptr) {
84 // temporaries
85 Register tmp = R8;
86 VectorRegister vt0 = VR0;
87 VectorRegister vRb = VR6;
88 // labels
89 Label sha256_aligned;
90
91 andi_ (tmp, hptr, 0xf);
92 lvx (a, hptr);
93 addi (tmp, hptr, 16);
94 lvx (e, tmp);
95 beq (CCR0, sha256_aligned);
96
97 // handle unaligned accesses
98 load_perm(vRb, hptr);
99 addi (tmp, hptr, 32);
100 vec_perm(a, e, vRb);
101
102 lvx (vt0, tmp);
103 vec_perm(e, vt0, vRb);
104
105 // aligned accesses
106 bind(sha256_aligned);
107 }
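// Note on the unaligned path above: lvx ignores the low four address bits, so
// for a misaligned hptr two adjacent quadwords are loaded and vec_perm (using
// the permute mask produced by load_perm) extracts the 16 bytes that actually
// start at hptr. The same pattern recurs in the other load helpers below.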
108
109 void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
110 const VectorRegister* ws,
111 const int total_ws,
112 const Register k,
113 const VectorRegister* kpws,
114 const int total_kpws) {
115 Label w_aligned, after_w_load;
116
117 Register tmp = R8;
118 VectorRegister vt0 = VR0;
119 VectorRegister vt1 = VR1;
120 VectorRegister vRb = VR6;
121
122 andi_ (tmp, buf_in, 0xF);
123 beq (CCR0, w_aligned); // address ends with 0x0, not 0x8
124
125 // deal with unaligned addresses
126 lvx (ws[0], buf_in);
127 load_perm(vRb, buf_in);
128
129 for (int n = 1; n < total_ws; n++) {
130 VectorRegister w_cur = ws[n];
131 VectorRegister w_prev = ws[n-1];
132
133 addi (tmp, buf_in, n * 16);
134 lvx (w_cur, tmp);
135 vec_perm(w_prev, w_cur, vRb);
136 }
137 addi (tmp, buf_in, total_ws * 16);
138 lvx (vt0, tmp);
139 vec_perm(ws[total_ws-1], vt0, vRb);
140 b (after_w_load);
141
142 bind(w_aligned);
143
144 // deal with aligned addresses
145 lvx(ws[0], buf_in);
146 for (int n = 1; n < total_ws; n++) {
147 VectorRegister w = ws[n];
148 addi (tmp, buf_in, n * 16);
149 lvx (w, tmp);
150 }
151
152 bind(after_w_load);
153
154 #if defined(VM_LITTLE_ENDIAN)
155 // Byte swapping within int values
156 li (tmp, 8);
157 lvsl (vt0, tmp);
158 vspltisb (vt1, 0xb);
159 vxor (vt1, vt0, vt1);
160 for (int n = 0; n < total_ws; n++) {
161 VectorRegister w = ws[n];
162 vec_perm(w, w, vt1);
163 }
164 #endif
165
166 // Load k, which is always aligned to 16 bytes
167 lvx (kpws[0], k);
168 for (int n = 1; n < total_kpws; n++) {
169 VectorRegister kpw = kpws[n];
170 addi (tmp, k, 16 * n);
171 lvx (kpw, tmp);
172 }
173
174 // Add w to K
175 assert(total_ws == total_kpws, "Redesign the loop below");
176 for (int n = 0; n < total_kpws; n++) {
177 VectorRegister kpw = kpws[n];
178 VectorRegister w = ws[n];
179
180 vadduwm (kpw, kpw, w);
181 }
182 }
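// On return, ws[0..3] hold the 16 message words of the block (byte-swapped to
// host order on little-endian) and kpws[0..3] hold K[0..15] + W[0..15], so the
// first 16 rounds need no further schedule computation.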
183
184 void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
185 const VectorRegister w1,
186 const VectorRegister w2,
187 const VectorRegister w3,
188 const VectorRegister kpw0,
189 const VectorRegister kpw1,
190 const VectorRegister kpw2,
191 const VectorRegister kpw3,
192 const Register j,
193 const Register k) {
194 // Temporaries
195 const VectorRegister vt0 = VR0;
196 const VectorRegister vt1 = VR1;
197 const VectorSRegister vsrt1 = vt1->to_vsr();
198 const VectorRegister vt2 = VR2;
199 const VectorRegister vt3 = VR3;
200 const VectorSRegister vst3 = vt3->to_vsr();
201 const VectorRegister vt4 = VR4;
202
203 // load k[j]
204 lvx (vt0, j, k);
205
206 // advance j
207 addi (j, j, 16); // 16 bytes were read
208
209 #if defined(VM_LITTLE_ENDIAN)
210 // b = w[j-15], w[j-14], w[j-13], w[j-12]
211 vsldoi (vt1, w1, w0, 12);
212
213 // c = w[j-7], w[j-6], w[j-5], w[j-4]
214 vsldoi (vt2, w3, w2, 12);
215
216 #else
217 // b = w[j-15], w[j-14], w[j-13], w[j-12]
218 vsldoi (vt1, w0, w1, 4);
219
220 // c = w[j-7], w[j-6], w[j-5], w[j-4]
221 vsldoi (vt2, w2, w3, 4);
222 #endif
223
224 // d = w[j-2], w[j-1], w[j-4], w[j-3]
225 vsldoi (vt3, w3, w3, 8);
226
227 // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
228 vshasigmaw (vt1, vt1, 0, 0);
229
230 // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
231 vshasigmaw (vt3, vt3, 0, 0xf);
232
233 // c = s0(w[j-15]) + w[j-7],
234 // s0(w[j-14]) + w[j-6],
235 // s0(w[j-13]) + w[j-5],
236 // s0(w[j-12]) + w[j-4]
237 vadduwm (vt2, vt1, vt2);
238
239 // c = s0(w[j-15]) + w[j-7] + w[j-16],
240 // s0(w[j-14]) + w[j-6] + w[j-15],
241 // s0(w[j-13]) + w[j-5] + w[j-14],
242 // s0(w[j-12]) + w[j-4] + w[j-13]
243 vadduwm (vt2, vt2, w0);
244
245 // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
246 // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
247 // s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
248 // s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3]) // UNDEFINED
249 vadduwm (vt4, vt2, vt3);
250
251 // At this point, e[0] and e[1] are the correct values to be stored at w[j]
252 // and w[j+1].
253 // e[2] and e[3] are not considered.
254 // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
255 vshasigmaw (vt1, vt4, 0, 0xf);
256
257 // vt3 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
258 #if defined(VM_LITTLE_ENDIAN)
259 xxmrgld (vst3, vsrt1, vst3);
260 #else
261 xxmrghd (vst3, vst3, vsrt1);
262 #endif
263
264 // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
265 // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
266 // s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]), // w[j+2]
267 // s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1]) // w[j+3]
268 vadduwm (vt2, vt2, vt3);
269
270 // Updating w0 to w3 to hold the new previous 16 values from w.
271 vmr (w0, w1);
272 vmr (w1, w2);
273 vmr (w2, w3);
274 vmr (w3, vt2);
275
276 // store k + w to kpw0..kpw3 (4 values at once)
277 #if defined(VM_LITTLE_ENDIAN)
278 vadduwm (kpw0, vt2, vt0);
279
280 vsldoi (kpw1, kpw0, kpw0, 12);
281 vsldoi (kpw2, kpw0, kpw0, 8);
282 vsldoi (kpw3, kpw0, kpw0, 4);
283 #else
284 vadduwm (kpw3, vt2, vt0);
285
286 vsldoi (kpw2, kpw3, kpw3, 12);
287 vsldoi (kpw1, kpw3, kpw3, 8);
288 vsldoi (kpw0, kpw3, kpw3, 4);
289 #endif
290 }
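// sha256_calc_4w extends the message schedule by four words at a time using
// the standard recurrence
//   w[j] = sigma1(w[j-2]) + w[j-7] + sigma0(w[j-15]) + w[j-16]
// Because w[j+2] and w[j+3] depend on the freshly computed w[j] and w[j+1],
// sigma1 is applied twice (once to the old tail, once to the partial result)
// and the two halves are merged with xxmrgld/xxmrghd. The helper also leaves
// K[j..j+3] + w[j..j+3] split across kpw0..kpw3 for the next four rounds.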
291
292 void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
293 const VectorRegister b_,
294 const VectorRegister c,
295 const VectorRegister d,
296 const VectorRegister e,
297 const VectorRegister f,
298 const VectorRegister g,
299 const VectorRegister h,
300 const Register hptr) {
301 // temporaries
302 VectorRegister vt0 = VR0;
303 VectorRegister vt1 = VR1;
304 VectorRegister vt2 = VR2;
305 VectorRegister vt3 = VR3;
306 VectorRegister vt4 = VR4;
307 VectorRegister vt5 = VR5;
308 VectorRegister vaux = VR6;
309 VectorRegister vRb = VR6;
310 Register tmp = R8;
311 Register of16 = R8;
312 Register of32 = R9;
313 Label state_load_aligned;
314
315 // Load hptr
316 andi_ (tmp, hptr, 0xf);
317 li (of16, 16);
318 lvx (vt0, hptr);
319 lvx (vt5, of16, hptr);
320 beq (CCR0, state_load_aligned);
321
322 // handle unaligned accesses
323 li (of32, 32);
324 load_perm(vRb, hptr);
325
326 vec_perm(vt0, vt5, vRb); // vt0 = hptr[0]..hptr[3]
327
328 lvx (vt1, hptr, of32);
329 vec_perm(vt5, vt1, vRb); // vt5 = hptr[4]..hptr[7]
330
331 // aligned accesses
332 bind(state_load_aligned);
333
334 #if defined(VM_LITTLE_ENDIAN)
335 vmrglw (vt1, b_, a); // vt1 = {a, b, ?, ?}
336 vmrglw (vt2, d, c); // vt2 = {c, d, ?, ?}
337 vmrglw (vt3, f, e); // vt3 = {e, f, ?, ?}
338 vmrglw (vt4, h, g); // vt4 = {g, h, ?, ?}
339 xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
340 xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
341 vadduwm (a, vt0, vt1); // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
342 vadduwm (e, vt5, vt3); // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
343
344 // Save hptr back, works for any alignment
345 xxswapd (vt0->to_vsr(), a->to_vsr());
346 stxvd2x (vt0->to_vsr(), hptr);
347 xxswapd (vt5->to_vsr(), e->to_vsr());
348 stxvd2x (vt5->to_vsr(), of16, hptr);
349 #else
350 vmrglw (vt1, a, b_); // vt1 = {a, b, ?, ?}
351 vmrglw (vt2, c, d); // vt2 = {c, d, ?, ?}
352 vmrglw (vt3, e, f); // vt3 = {e, f, ?, ?}
353 vmrglw (vt4, g, h); // vt4 = {g, h, ?, ?}
354 xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
355 xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
356 vadduwm (d, vt0, vt1); // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
357 vadduwm (h, vt5, vt3); // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
358
359 // Save hptr back, works for any alignment
360 stxvd2x (d->to_vsr(), hptr);
361 stxvd2x (h->to_vsr(), of16, hptr);
362 #endif
363 }
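// After the 64 rounds, the working variables are added back into the previous
// hash value (the h[i] += {a..h} step of SHA-256) and the updated state is
// stored through stxvd2x, which, as noted above, works for any alignment of
// hptr.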
364
365 static const uint32_t sha256_round_table[64] __attribute((aligned(16))) = {
366 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
367 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
368 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
369 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
370 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
371 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
372 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
373 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
374 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
375 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
376 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
377 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
378 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
379 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
380 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
381 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
382 };
383 static const uint32_t *sha256_round_consts = sha256_round_table;
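// These are the standard SHA-256 round constants K[0..63]: the first 32 bits
// of the fractional parts of the cube roots of the first 64 primes.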
384
385 // R3_ARG1 - byte[] input data, already padded, in big-endian byte order
386 // R4_ARG2 - int[] SHA.state (initially the fractional parts of the square roots of the first eight primes)
387 // R5_ARG3 - int offset
388 // R6_ARG4 - int limit
389 //
390 // Internal Register usage:
391 // R7 - k
392 // R8 - tmp | j | of16
393 // R9 - of32
394 // VR0-VR8 - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
395 // VR9-VR16 - a-h
396 // VR17-VR20 - w0-w3
397 // VR21-VR23 - vRb | vaux0-vaux2
398 // VR24-VR27 - kpw0-kpw3
399 void MacroAssembler::sha256(bool multi_block) {
400 static const ssize_t buf_size = 64;
401 static const uint8_t w_size = sizeof(sha256_round_table)/sizeof(uint32_t);
402 #ifdef AIX
403 // malloc provides 16 byte alignment
404 if (((uintptr_t)sha256_round_consts & 0xF) != 0) {
405 uint32_t *new_round_consts = (uint32_t*)malloc(sizeof(sha256_round_table));
406 guarantee(new_round_consts, "oom");
407 memcpy(new_round_consts, sha256_round_consts, sizeof(sha256_round_table));
408 sha256_round_consts = (const uint32_t*)new_round_consts;
409 }
410 #endif
411
412 Register buf_in = R3_ARG1;
413 Register state = R4_ARG2;
414 Register ofs = R5_ARG3;
415 Register limit = R6_ARG4;
416
417 Label sha_loop, core_loop;
418
419 // Save non-volatile vector registers in the red zone
420 static const VectorRegister nv[] = {
421 VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
422 };
423 static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
424
425 for (int c = 0; c < nv_size; c++) {
426 Register tmp = R8;
427 li (tmp, (c - (nv_size)) * 16);
428 stvx(nv[c], tmp, R1);
429 }
430
431 // Load hash state to registers
432 VectorRegister a = VR9;
433 VectorRegister b = VR10;
434 VectorRegister c = VR11;
435 VectorRegister d = VR12;
436 VectorRegister e = VR13;
437 VectorRegister f = VR14;
438 VectorRegister g = VR15;
439 VectorRegister h = VR16;
440 static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
441 static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
442 // counter for cycling through hs vector to avoid register moves between iterations
443 int h_cnt = 0;
444
445 // Load a-h registers from the memory pointed by state
446 #if defined(VM_LITTLE_ENDIAN)
447 sha256_load_h_vec(a, e, state);
448 #else
449 sha256_load_h_vec(d, h, state);
450 #endif
451
452 // keep k loaded also during MultiBlock loops
453 Register k = R7;
454 assert(((uintptr_t)sha256_round_consts & 0xF) == 0, "k alignment");
455 load_const_optimized(k, (address)sha256_round_consts, R0);
456
457 // Avoiding redundant loads
458 if (multi_block) {
459 align(OptoLoopAlignment);
460 }
461 bind(sha_loop);
462 #if defined(VM_LITTLE_ENDIAN)
463 sha256_deque(a, b, c, d);
464 sha256_deque(e, f, g, h);
465 #else
466 sha256_deque(d, c, b, a);
467 sha256_deque(h, g, f, e);
468 #endif
469
470 // Load the first 16 w elements outside the core loop.
471 // The order of the int values within each vector is endianness-specific.
472 VectorRegister w0 = VR17;
473 VectorRegister w1 = VR18;
474 VectorRegister w2 = VR19;
475 VectorRegister w3 = VR20;
476 static const VectorRegister ws[] = {w0, w1, w2, w3};
477 static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
478
479 VectorRegister kpw0 = VR24;
480 VectorRegister kpw1 = VR25;
481 VectorRegister kpw2 = VR26;
482 VectorRegister kpw3 = VR27;
483 static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
484 static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);
485
486 sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);
487
488 // Cycle through the first 16 elements
489 assert(total_ws == total_kpws, "Redesign the loop below");
490 for (int n = 0; n < total_ws; n++) {
491 VectorRegister vaux0 = VR21;
492 VectorRegister vaux1 = VR22;
493 VectorRegister vaux2 = VR23;
494
495 sha256_deque(kpws[n], vaux0, vaux1, vaux2);
496
497 #if defined(VM_LITTLE_ENDIAN)
498 sha256_round(hs, total_hs, h_cnt, kpws[n]);
499 sha256_round(hs, total_hs, h_cnt, vaux0);
500 sha256_round(hs, total_hs, h_cnt, vaux1);
501 sha256_round(hs, total_hs, h_cnt, vaux2);
502 #else
503 sha256_round(hs, total_hs, h_cnt, vaux2);
504 sha256_round(hs, total_hs, h_cnt, vaux1);
505 sha256_round(hs, total_hs, h_cnt, vaux0);
506 sha256_round(hs, total_hs, h_cnt, kpws[n]);
507 #endif
508 }
509
510 Register tmp = R8;
511 // loop over rounds 16 to 63, eight rounds (total_hs) per iteration
512 li (tmp, (w_size - 16) / total_hs);
513 mtctr(tmp);
514
515 // j is the byte offset into k, kept 16-byte aligned for the vector loads.
516 // It is advanced whenever it is consumed (e.g. inside sha256_calc_4w).
517 Register j = R8;
518 li (j, 16*4);
519
520 align(OptoLoopAlignment);
521 bind(core_loop);
522
523 // because the a-h registers are rotated logically (via h_cnt), each iteration must execute a multiple of total_hs rounds
524 for (int n = 0; n < total_hs/4; n++) {
525 sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
526 sha256_round(hs, total_hs, h_cnt, kpw0);
527 sha256_round(hs, total_hs, h_cnt, kpw1);
528 sha256_round(hs, total_hs, h_cnt, kpw2);
529 sha256_round(hs, total_hs, h_cnt, kpw3);
530 }
531
532 bdnz (core_loop);
533
534 // Update hash state
535 sha256_update_sha_state(a, b, c, d, e, f, g, h, state);
536
537 if (multi_block) {
538 addi(buf_in, buf_in, buf_size);
539 addi(ofs, ofs, buf_size);
540 cmplw(CCR0, ofs, limit);
541 ble(CCR0, sha_loop);
542
543 // return ofs
544 mr(R3_RET, ofs);
545 }
546
547 // Restore non-volatile registers
548 for (int c = 0; c < nv_size; c++) {
549 Register tmp = R8;
550 li (tmp, (c - (nv_size)) * 16);
551 lvx(nv[c], tmp, R1);
552 }
553 }
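// Note: this emitter is expected to be invoked from the PPC stub generator to
// build the sha256 implCompress (multi_block == false) and implCompressMB
// (multi_block == true) intrinsic stubs; in the multi-block case the code
// loops until ofs > limit and returns the final ofs in R3_RET.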
554
555
556 /**********************************************************************
557 * SHA 512
558 *********************************************************************/
559
560 void MacroAssembler::sha512_load_w_vec(const Register buf_in,
561 const VectorRegister* ws,
562 const int total_ws) {
563 Register tmp = R8;
564 VectorRegister vRb = VR8;
565 VectorRegister aux = VR9;
566 Label is_aligned, after_alignment;
567
568 andi_ (tmp, buf_in, 0xF);
569 beq (CCR0, is_aligned); // address ends with 0x0, not 0x8
570
571 // deal with unaligned addresses
572 lvx (ws[0], buf_in);
573 load_perm(vRb, buf_in);
574
575 for (int n = 1; n < total_ws; n++) {
576 VectorRegister w_cur = ws[n];
577 VectorRegister w_prev = ws[n-1];
578 addi (tmp, buf_in, n * 16);
579 lvx (w_cur, tmp);
580 vec_perm(w_prev, w_cur, vRb);
581 }
582 addi (tmp, buf_in, total_ws * 16);
583 lvx (aux, tmp);
584 vec_perm(ws[total_ws-1], aux, vRb);
585 b (after_alignment);
586
587 bind(is_aligned);
588 lvx (ws[0], buf_in);
589 for (int n = 1; n < total_ws; n++) {
590 VectorRegister w = ws[n];
591 addi (tmp, buf_in, n * 16);
592 lvx (w, tmp);
593 }
594
595 bind(after_alignment);
596 }
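// Same unaligned-load strategy as the SHA-256 helpers: a chain of lvx loads
// combined through vec_perm (with the mask from load_perm) reconstructs the
// eight quadwords (128 bytes) of the block regardless of buf_in's alignment.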
597
598 // Update hash state
599 void MacroAssembler::sha512_update_sha_state(const Register state,
600 const VectorRegister* hs,
601 const int total_hs) {
602
603 #if defined(VM_LITTLE_ENDIAN)
604 int start_idx = 0;
605 #else
606 int start_idx = 1;
607 #endif
608
609 // load initial hash from the memory pointed by state
610 VectorRegister ini_a = VR10;
611 VectorRegister ini_c = VR12;
612 VectorRegister ini_e = VR14;
613 VectorRegister ini_g = VR16;
614 static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
615 static const int total_inis = sizeof(inis)/sizeof(VectorRegister);
616
617 Label state_save_aligned, after_state_save_aligned;
618
619 Register addr = R7;
620 Register tmp = R8;
621 VectorRegister vRb = VR8;
622 VectorRegister aux = VR9;
623
624 andi_(tmp, state, 0xf);
625 beq(CCR0, state_save_aligned);
626 // deal with unaligned addresses
627
628 {
629 VectorRegister a = hs[0];
630 VectorRegister b_ = hs[1];
631 VectorRegister c = hs[2];
632 VectorRegister d = hs[3];
633 VectorRegister e = hs[4];
634 VectorRegister f = hs[5];
635 VectorRegister g = hs[6];
636 VectorRegister h = hs[7];
637 load_perm(vRb, state);
638 lvx (ini_a, state);
639 addi (addr, state, 16);
640
641 lvx (ini_c, addr);
642 addi (addr, state, 32);
643 vec_perm(ini_a, ini_c, vRb);
644
645 lvx (ini_e, addr);
646 addi (addr, state, 48);
647 vec_perm(ini_c, ini_e, vRb);
648
649 lvx (ini_g, addr);
650 addi (addr, state, 64);
651 vec_perm(ini_e, ini_g, vRb);
652
653 lvx (aux, addr);
654 vec_perm(ini_g, aux, vRb);
655
656 #if defined(VM_LITTLE_ENDIAN)
657 xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
658 xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
659 xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
660 xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
661 #else
662 xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
663 xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
664 xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
665 xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
666 #endif
667
668 for (int n = start_idx; n < total_hs; n += 2) {
669 VectorRegister h_cur = hs[n];
670 VectorRegister ini_cur = inis[n/2];
671
672 vaddudm(h_cur, ini_cur, h_cur);
673 }
674
675 for (int n = start_idx; n < total_hs; n += 2) {
676 VectorRegister h_cur = hs[n];
677
678 mfvrd (tmp, h_cur);
679 #if defined(VM_LITTLE_ENDIAN)
680 std (tmp, 8*n + 8, state);
681 #else
682 std (tmp, 8*n - 8, state);
683 #endif
684 vsldoi (aux, h_cur, h_cur, 8);
685 mfvrd (tmp, aux);
686 std (tmp, 8*n + 0, state);
687 }
688
689 b (after_state_save_aligned);
690 }
691
692 bind(state_save_aligned);
693 {
694 for (int n = 0; n < total_hs; n += 2) {
695 #if defined(VM_LITTLE_ENDIAN)
696 VectorRegister h_cur = hs[n];
697 VectorRegister h_next = hs[n+1];
698 #else
699 VectorRegister h_cur = hs[n+1];
700 VectorRegister h_next = hs[n];
701 #endif
702 VectorRegister ini_cur = inis[n/2];
703
704 if (n/2 == 0) {
705 lvx(ini_cur, state);
706 } else {
707 addi(addr, state, (n/2) * 16);
708 lvx(ini_cur, addr);
709 }
710 xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
711 }
712
713 for (int n = start_idx; n < total_hs; n += 2) {
714 VectorRegister h_cur = hs[n];
715 VectorRegister ini_cur = inis[n/2];
716
717 vaddudm(h_cur, ini_cur, h_cur);
718 }
719
720 for (int n = start_idx; n < total_hs; n += 2) {
721 VectorRegister h_cur = hs[n];
722
723 if (n/2 == 0) {
724 stvx(h_cur, state);
725 } else {
726 addi(addr, state, (n/2) * 16);
727 stvx(h_cur, addr);
728 }
729 }
730 }
731
732 bind(after_state_save_aligned);
733 }
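// As in the SHA-256 case, this adds the working variables into the previous
// hash value and stores the result back to state; the unaligned path writes
// the eight 64-bit words with mfvrd/std, while the aligned path uses stvx.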
734
735 // Use h_cnt to cycle through hs elements but also increment it at the end
736 void MacroAssembler::sha512_round(const VectorRegister* hs,
737 const int total_hs, int& h_cnt,
738 const VectorRegister kpw) {
739
740 // convenience registers: cycle from 0-7 downwards
741 const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
742 const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
743 const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
744 const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
745 const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
746 const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
747 const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
748 const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
749 // temporaries
750 const VectorRegister Ch = VR20;
751 const VectorRegister Maj = VR21;
752 const VectorRegister bsa = VR22;
753 const VectorRegister bse = VR23;
754 const VectorRegister tmp1 = VR24;
755 const VectorRegister tmp2 = VR25;
756
757 vsel (Ch, g, f, e);
758 vxor (Maj, a, b);
759 vshasigmad(bse, e, 1, 0xf);
760 vaddudm (tmp2, Ch, kpw);
761 vaddudm (tmp1, h, bse);
762 vsel (Maj, b, c, Maj);
763 vaddudm (tmp1, tmp1, tmp2);
764 vshasigmad(bsa, a, 1, 0);
765 vaddudm (tmp2, bsa, Maj);
766 vaddudm (d, d, tmp1);
767 vaddudm (h, tmp1, tmp2);
768
769 // advance vector pointer to the next iteration
770 h_cnt++;
771 }
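// sha512_round is the 64-bit analogue of sha256_round: the same T1/T2 round
// equations, with vshasigmad computing the SHA-512 big-sigma functions (the
// 0x0 / 0xf masks select Sigma0 vs. Sigma1) and vaddudm performing the
// modulo 2^64 additions.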
772
773 void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
774 const VectorRegister w1,
775 const VectorRegister w2,
776 const VectorRegister w3,
777 const VectorRegister w4,
778 const VectorRegister w5,
779 const VectorRegister w6,
780 const VectorRegister w7,
781 const VectorRegister kpw0,
782 const VectorRegister kpw1,
783 const Register j,
784 const VectorRegister vRb,
785 const Register k) {
786 // Temporaries
787 const VectorRegister VR_a = VR20;
788 const VectorRegister VR_b = VR21;
789 const VectorRegister VR_c = VR22;
790 const VectorRegister VR_d = VR23;
791
792 // load k[j]
793 lvx (VR_a, j, k);
794 // advance j
795 addi (j, j, 16); // 16 bytes were read
796
797 #if defined(VM_LITTLE_ENDIAN)
798 // VR_b = w[j-15], w[j-14]
799 vperm (VR_b, w1, w0, vRb);
800 // VR_c = w[j-7], w[j-6]
801 vperm (VR_c, w5, w4, vRb);
802 #else
803 // VR_b = w[j-15], w[j-14]
804 vperm (VR_b, w0, w1, vRb);
805 // VR_c = w[j-7], w[j-6]
806 vperm (VR_c, w4, w5, vRb);
807 #endif
808
809 // VR_b = s0(w[j-15]) , s0(w[j-14])
810 vshasigmad (VR_b, VR_b, 0, 0);
811 // VR_d = s1(w[j-2]) , s1(w[j-1])
812 vshasigmad (VR_d, w7, 0, 0xf);
813 // VR_b = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
814 vaddudm (VR_b, VR_b, VR_c);
815 // VR_d = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
816 vaddudm (VR_d, VR_d, w0);
817 // VR_c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
818 // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
819 vaddudm (VR_c, VR_d, VR_b);
820 // Updating w0 to w7 to hold the new previous 16 values from w.
821 vmr (w0, w1);
822 vmr (w1, w2);
823 vmr (w2, w3);
824 vmr (w3, w4);
825 vmr (w4, w5);
826 vmr (w5, w6);
827 vmr (w6, w7);
828 vmr (w7, VR_c);
829
830 #if defined(VM_LITTLE_ENDIAN)
831 // store k + w to kpw0 (2 values at once)
832 vaddudm (kpw0, VR_c, VR_a);
833 // kpw1 holds (k + w)[1]
834 vsldoi (kpw1, kpw0, kpw0, 8);
835 #else
836 // store k + w to kpw0 (2 values at once)
837 vaddudm (kpw1, VR_c, VR_a);
838 // kpw1 holds (k + w)[1]
839 vsldoi (kpw0, kpw1, kpw1, 8);
840 #endif
841 }
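// sha512_calc_2w is the 64-bit message-schedule step. It produces two new
// schedule words per call via
//   w[j] = sigma1(w[j-2]) + w[j-7] + sigma0(w[j-15]) + w[j-16]
// (two at a time, since a 128-bit vector holds two 64-bit words) and returns
// K[j] + w[j] and K[j+1] + w[j+1] in kpw0/kpw1.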
842
843 void MacroAssembler::sha512_load_h_vec(const Register state,
844 const VectorRegister* hs,
845 const int total_hs) {
846 #if defined(VM_LITTLE_ENDIAN)
847 VectorRegister a = hs[0];
848 VectorRegister g = hs[6];
849 int start_idx = 0;
850 #else
851 VectorRegister a = hs[1];
852 VectorRegister g = hs[7];
853 int start_idx = 1;
854 #endif
855
856 Register addr = R7;
857 VectorRegister vRb = VR8;
858 Register tmp = R8;
859 Label state_aligned, after_state_aligned;
860
861 andi_(tmp, state, 0xf);
862 beq(CCR0, state_aligned);
863
864 // deal with unaligned addresses
865 VectorRegister aux = VR9;
866
867 lvx(hs[start_idx], state);
868 load_perm(vRb, state);
869
870 for (int n = start_idx + 2; n < total_hs; n += 2) {
871 VectorRegister h_cur = hs[n];
872 VectorRegister h_prev2 = hs[n - 2];
873 addi(addr, state, (n/2) * 16);
874 lvx(h_cur, addr);
875 vec_perm(h_prev2, h_cur, vRb);
876 }
877 addi(addr, state, (total_hs/2) * 16);
878 lvx (aux, addr);
879 vec_perm(hs[total_hs - 2 + start_idx], aux, vRb);
880 b (after_state_aligned);
881
882 bind(state_aligned);
883
884 // deal with aligned addresses
885 lvx(hs[start_idx], state);
886
887 for (int n = start_idx + 2; n < total_hs; n += 2) {
888 VectorRegister h_cur = hs[n];
889 addi(addr, state, (n/2) * 16);
890 lvx(h_cur, addr);
891 }
892
893 bind(after_state_aligned);
894 }
895
896 static const uint64_t sha512_round_table[80] __attribute((aligned(16))) = {
897 0x428a2f98d728ae22, 0x7137449123ef65cd,
898 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
899 0x3956c25bf348b538, 0x59f111f1b605d019,
900 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
901 0xd807aa98a3030242, 0x12835b0145706fbe,
902 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
903 0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
904 0x9bdc06a725c71235, 0xc19bf174cf692694,
905 0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
906 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
907 0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
908 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
909 0x983e5152ee66dfab, 0xa831c66d2db43210,
910 0xb00327c898fb213f, 0xbf597fc7beef0ee4,
911 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
912 0x06ca6351e003826f, 0x142929670a0e6e70,
913 0x27b70a8546d22ffc, 0x2e1b21385c26c926,
914 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
915 0x650a73548baf63de, 0x766a0abb3c77b2a8,
916 0x81c2c92e47edaee6, 0x92722c851482353b,
917 0xa2bfe8a14cf10364, 0xa81a664bbc423001,
918 0xc24b8b70d0f89791, 0xc76c51a30654be30,
919 0xd192e819d6ef5218, 0xd69906245565a910,
920 0xf40e35855771202a, 0x106aa07032bbd1b8,
921 0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
922 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
923 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
924 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
925 0x748f82ee5defb2fc, 0x78a5636f43172f60,
926 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
927 0x90befffa23631e28, 0xa4506cebde82bde9,
928 0xbef9a3f7b2c67915, 0xc67178f2e372532b,
929 0xca273eceea26619c, 0xd186b8c721c0c207,
930 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
931 0x06f067aa72176fba, 0x0a637dc5a2c898a6,
932 0x113f9804bef90dae, 0x1b710b35131c471b,
933 0x28db77f523047d84, 0x32caab7b40c72493,
934 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
935 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
936 0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
937 };
938 static const uint64_t *sha512_round_consts = sha512_round_table;
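// The standard SHA-512 round constants K[0..79]: the first 64 bits of the
// fractional parts of the cube roots of the first 80 primes.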
939
940 // R3_ARG1 - byte[] input data, already padded, in big-endian byte order
941 // R4_ARG2 - long[] SHA.state (initially the fractional parts of the square roots of the first eight primes)
942 // R5_ARG3 - int offset
943 // R6_ARG4 - int limit
944 //
945 // Internal Register usage:
946 // R7 R8 R9 - volatile temporaries
947 // VR0-VR7 - a-h
948 // VR8 - vRb
949 // VR9 - aux (highly volatile, use with care)
950 // VR10-VR17 - w0-w7 | ini_a-ini_h
951 // VR18 - vsp16 | kplusw0
952 // VR19 - vsp32 | kplusw1
953 // VR20-VR25 - sha512_calc_2w and sha512_round temporaries
954 void MacroAssembler::sha512(bool multi_block) {
955 static const ssize_t buf_size = 128;
956 static const uint8_t w_size = sizeof(sha512_round_table)/sizeof(uint64_t);
957 #ifdef AIX
958 // malloc provides 16 byte alignment
959 if (((uintptr_t)sha512_round_consts & 0xF) != 0) {
960 uint64_t *new_round_consts = (uint64_t*)malloc(sizeof(sha512_round_table));
961 guarantee(new_round_consts, "oom");
962 memcpy(new_round_consts, sha512_round_consts, sizeof(sha512_round_table));
963 sha512_round_consts = (const uint64_t*)new_round_consts;
964 }
965 #endif
966
967 Register buf_in = R3_ARG1;
968 Register state = R4_ARG2;
969 Register ofs = R5_ARG3;
970 Register limit = R6_ARG4;
971
972 Label sha_loop, core_loop;
973
974 // Save non-volatile vector registers in the red zone
975 static const VectorRegister nv[] = {
976 VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
977 };
978 static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
979
980 for (int c = 0; c < nv_size; c++) {
981 Register idx = R7;
982 li (idx, (c - (nv_size)) * 16);
983 stvx(nv[c], idx, R1);
984 }
985
986 // Load hash state to registers
987 VectorRegister a = VR0;
988 VectorRegister b = VR1;
989 VectorRegister c = VR2;
990 VectorRegister d = VR3;
991 VectorRegister e = VR4;
992 VectorRegister f = VR5;
993 VectorRegister g = VR6;
994 VectorRegister h = VR7;
995 static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
996 static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
997 // counter for cycling through hs vector to avoid register moves between iterations
998 int h_cnt = 0;
999
1000 // Load a-h registers from the memory pointed by state
1001 sha512_load_h_vec(state, hs, total_hs);
1002
1003 Register k = R9;
1004 assert(((uintptr_t)sha512_round_consts & 0xF) == 0, "k alignment");
1005 load_const_optimized(k, (address)sha512_round_consts, R0);
1006
1007 if (multi_block) {
1008 align(OptoLoopAlignment);
1009 }
1010 bind(sha_loop);
1011
1012 for (int n = 0; n < total_hs; n += 2) {
1013 #if defined(VM_LITTLE_ENDIAN)
1014 VectorRegister h_cur = hs[n];
1015 VectorRegister h_next = hs[n + 1];
1016 #else
1017 VectorRegister h_cur = hs[n + 1];
1018 VectorRegister h_next = hs[n];
1019 #endif
1020 vsldoi (h_next, h_cur, h_cur, 8);
1021 }
1022
1023 // Load the first 16 w elements outside the core loop.
1024 // The order of the long values within each vector is endianness-specific.
1025 VectorRegister w0 = VR10;
1026 VectorRegister w1 = VR11;
1027 VectorRegister w2 = VR12;
1028 VectorRegister w3 = VR13;
1029 VectorRegister w4 = VR14;
1030 VectorRegister w5 = VR15;
1031 VectorRegister w6 = VR16;
1032 VectorRegister w7 = VR17;
1033 static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
1034 static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
1035
1036 // Load the 16 w values into vectors
1037 sha512_load_w_vec(buf_in, ws, total_ws);
1038
1039 #if defined(VM_LITTLE_ENDIAN)
1040 VectorRegister vsp16 = VR18;
1041 VectorRegister vsp32 = VR19;
1042 VectorRegister shiftarg = VR9;
1043
1044 vspltisw(vsp16, 8);
1045 vspltisw(shiftarg, 1);
1046 vsl (vsp16, vsp16, shiftarg);
1047 vsl (vsp32, vsp16, shiftarg);
1048
1049 VectorRegister vsp8 = VR9;
1050 vspltish(vsp8, 8);
1051
1052 // Convert input from Big Endian to Little Endian
1053 for (int c = 0; c < total_ws; c++) {
1054 VectorRegister w = ws[c];
1055 vrlh (w, w, vsp8);
1056 }
1057 for (int c = 0; c < total_ws; c++) {
1058 VectorRegister w = ws[c];
1059 vrlw (w, w, vsp16);
1060 }
1061 for (int c = 0; c < total_ws; c++) {
1062 VectorRegister w = ws[c];
1063 vrld (w, w, vsp32);
1064 }
1065 #endif
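// The three rotate passes above (little-endian builds only) amount to a byte
// reversal of each 64-bit word without needing a permute: rotating halfwords
// by 8 swaps the bytes in each halfword, rotating words by 16 swaps the
// halfwords, and rotating doublewords by 32 swaps the two 32-bit halves.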
1066
1067 Register Rb = R10;
1068 VectorRegister vRb = VR8;
1069 li (Rb, 8);
1070 load_perm(vRb, Rb);
1071
1072 VectorRegister kplusw0 = VR18;
1073 VectorRegister kplusw1 = VR19;
1074
1075 Register addr = R7;
1076
1077 for (int n = 0; n < total_ws; n++) {
1078 VectorRegister w = ws[n];
1079
1080 if (n == 0) {
1081 lvx (kplusw0, k);
1082 } else {
1083 addi (addr, k, n * 16);
1084 lvx (kplusw0, addr);
1085 }
1086 #if defined(VM_LITTLE_ENDIAN)
1087 vaddudm(kplusw0, kplusw0, w);
1088 vsldoi (kplusw1, kplusw0, kplusw0, 8);
1089 #else
1090 vaddudm(kplusw1, kplusw0, w);
1091 vsldoi (kplusw0, kplusw1, kplusw1, 8);
1092 #endif
1093
1094 sha512_round(hs, total_hs, h_cnt, kplusw0);
1095 sha512_round(hs, total_hs, h_cnt, kplusw1);
1096 }
1097
1098 Register tmp = R8;
1099 li (tmp, (w_size-16)/total_hs);
1100 mtctr (tmp);
1101 // j is the byte offset into k, kept 16-byte aligned for the vector loads.
1102 // It is advanced whenever it is consumed (e.g. inside sha512_calc_2w).
1103 Register j = tmp;
1104 li (j, 8*16);
1105
1106 align(OptoLoopAlignment);
1107 bind(core_loop);
1108
1109 // because the a-h registers are rotated logically (via h_cnt), each iteration must execute a multiple of total_hs rounds
1110 for (int n = 0; n < total_hs/2; n++) {
1111 sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
1112 sha512_round(hs, total_hs, h_cnt, kplusw0);
1113 sha512_round(hs, total_hs, h_cnt, kplusw1);
1114 }
1115
1116 bdnz (core_loop);
1117
1118 sha512_update_sha_state(state, hs, total_hs);
1119
1120 if (multi_block) {
1121 addi(buf_in, buf_in, buf_size);
1122 addi(ofs, ofs, buf_size);
1123 cmplw(CCR0, ofs, limit);
1124 ble(CCR0, sha_loop);
1125
1126 // return ofs
1127 mr(R3_RET, ofs);
1128 }
1129
1130 // Restore non-volatile registers
1131 for (int c = 0; c < nv_size; c++) {
1132 Register idx = R7;
1133 li (idx, (c - (nv_size)) * 16);
1134 lvx(nv[c], idx, R1);
1135 }
1136 }
