src/cpu/ppc/vm/macroAssembler_ppc_sha.cpp

author      ogatak
date        Tue, 18 Jun 2019 09:33:34 -0400
changeset   9713:c4567d28f31f
permissions -rw-r--r--

8185979: PPC64: Implement SHA2 intrinsic
Reviewed-by: mdoerr, goetz
Contributed-by: Bruno Rosa <bruno.rosa@eldorado.org.br>, Gustavo Serra Scalet <gustavo.scalet@eldorado.org.br>, Igor Nunes <igor.nunes@eldorado.org.br>, Martin Doerr <martin.doerr@sap.com>

     1 // Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
     2 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     3 //
     4 // This code is free software; you can redistribute it and/or modify it
     5 // under the terms of the GNU General Public License version 2 only, as
     6 // published by the Free Software Foundation.
     7 //
     8 // This code is distributed in the hope that it will be useful, but WITHOUT
     9 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    10 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    11 // version 2 for more details (a copy is included in the LICENSE file that
    12 // accompanied this code).
    13 //
    14 // You should have received a copy of the GNU General Public License version
    15 // 2 along with this work; if not, write to the Free Software Foundation,
    16 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    17 //
    18 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    19 // or visit www.oracle.com if you need additional information or have any
    20 // questions.
    22 // Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
    23 // (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).
    25 #include "asm/macroAssembler.inline.hpp"
    26 #include "runtime/stubRoutines.hpp"
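// For reference while reading the vector code below, here is a minimal
// scalar sketch of the SHA-2 sigma functions from the document cited above
// (FIPS 180-4 uses the same definitions).  It is documentation only: the
// guard macro SHA2_PPC_REFERENCE_MODEL is hypothetical and never defined,
// and none of this is used by the assembler.  As far as we read the ISA,
// vshasigmaw/vshasigmad compute these functions per element, the third
// operand selecting upper-case (1) vs lower-case (0) sigma and the fourth
// operand selecting sigma0 vs sigma1 per element.
#ifdef SHA2_PPC_REFERENCE_MODEL
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static inline uint64_t ror64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }

// SHA-256 sigma functions ("bsa"/"bse" below are Sigma0(a)/Sigma1(e)).
static inline uint32_t S0_32(uint32_t x) { return ror32(x,  2) ^ ror32(x, 13) ^ ror32(x, 22); }
static inline uint32_t S1_32(uint32_t x) { return ror32(x,  6) ^ ror32(x, 11) ^ ror32(x, 25); }
static inline uint32_t s0_32(uint32_t x) { return ror32(x,  7) ^ ror32(x, 18) ^ (x >>  3); }
static inline uint32_t s1_32(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

// SHA-512 sigma functions.
static inline uint64_t S0_64(uint64_t x) { return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39); }
static inline uint64_t S1_64(uint64_t x) { return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41); }
static inline uint64_t s0_64(uint64_t x) { return ror64(x,  1) ^ ror64(x,  8) ^ (x >>  7); }
static inline uint64_t s1_64(uint64_t x) { return ror64(x, 19) ^ ror64(x, 61) ^ (x >>  6); }
#endif // SHA2_PPC_REFERENCE_MODEL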
    28 /**********************************************************************
    29  * SHA 256
    30  *********************************************************************/
    32 void MacroAssembler::sha256_deque(const VectorRegister src,
    33                                   const VectorRegister dst1,
    34                                   const VectorRegister dst2,
    35                                   const VectorRegister dst3) {
    36   vsldoi (dst1, src, src, 12);
    37   vsldoi (dst2, src, src, 8);
    38   vsldoi (dst3, src, src, 4);
    39 }
    41 void MacroAssembler::sha256_round(const VectorRegister* hs,
    42                                   const int total_hs,
    43                                   int& h_cnt,
    44                                   const VectorRegister kpw) {
    45   // convenience registers: cycle from 0-7 downwards
    46   const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
    47   const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
    48   const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
    49   const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
    50   const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
    51   const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
    52   const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
    53   const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
    54   // temporaries
    55   VectorRegister ch  = VR0;
    56   VectorRegister maj = VR1;
    57   VectorRegister bsa = VR2;
    58   VectorRegister bse = VR3;
    59   VectorRegister vt0 = VR4;
    60   VectorRegister vt1 = VR5;
    61   VectorRegister vt2 = VR6;
    62   VectorRegister vt3 = VR7;
    64   vsel       (ch,  g,   f, e);
    65   vxor       (maj, a,   b);
    66   vshasigmaw (bse, e,   1, 0xf);
    67   vadduwm    (vt2, ch,  kpw);
    68   vadduwm    (vt1, h,   bse);
    69   vsel       (maj, b,   c, maj);
    70   vadduwm    (vt3, vt1, vt2);
    71   vshasigmaw (bsa, a,   1, 0);
    72   vadduwm    (vt0, bsa, maj);
    74   vadduwm    (d,   d,   vt3);
    75   vadduwm    (h,   vt3, vt0);
    77   // advance vector pointer to the next iteration
    78   h_cnt++;
    79 }
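// A scalar sketch of the round sha256_round() implements (documentation
// only, guarded by the same hypothetical SHA2_PPC_REFERENCE_MODEL macro and
// reusing the helpers defined near the top of this file).  In the vector
// code the eight working variables each live in a VectorRegister and their
// roles rotate via h_cnt instead of being moved; kpw is the pre-added
// K[t] + W[t] for this round.  Note that vsel(ch, g, f, e) yields Ch(e,f,g)
// and the vxor/vsel pair yields Maj(a,b,c).
#ifdef SHA2_PPC_REFERENCE_MODEL
static void sha256_ref_round(uint32_t st[8], uint32_t kpw) {
  uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
  uint32_t e = st[4], f = st[5], g = st[6], h = st[7];
  uint32_t ch  = (e & f) ^ (~e & g);            // vsel(ch, g, f, e)
  uint32_t maj = (a & b) ^ (a & c) ^ (b & c);   // vxor + vsel above
  uint32_t t1  = h + S1_32(e) + ch + kpw;       // vt3 in the code above
  uint32_t t2  = S0_32(a) + maj;                // vt0 in the code above
  st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
  st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + t2;
}
#endif // SHA2_PPC_REFERENCE_MODEL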
    81 void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
    82                                        const VectorRegister e,
    83                                        const Register hptr) {
    84   // temporaries
    85   Register tmp = R8;
    86   VectorRegister vt0 = VR0;
    87   VectorRegister vRb = VR6;
    88   // labels
    89   Label sha256_aligned;
    91   andi_  (tmp,  hptr, 0xf);
    92   lvx    (a,    hptr);
    93   addi   (tmp,  hptr, 16);
    94   lvx    (e,    tmp);
    95   beq    (CCR0, sha256_aligned);
    97   // handle unaligned accesses
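  // Note: lvx ignores the low four bits of the effective address, so the
  // lvx loads above and below fetch the aligned quadwords that straddle the
  // state; load_perm derives a permute control from hptr and vec_perm then
  // shifts the neighbouring quadwords so that a and e end up holding
  // state[0..3] and state[4..7] for any alignment of hptr.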
    98   load_perm(vRb, hptr);
    99   addi   (tmp, hptr, 32);
   100   vec_perm(a,   e,    vRb);
   102   lvx    (vt0,  tmp);
   103   vec_perm(e,   vt0,  vRb);
   105   // aligned accesses
   106   bind(sha256_aligned);
   107 }
   109 void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
   110                                               const VectorRegister* ws,
   111                                               const int total_ws,
   112                                               const Register k,
   113                                               const VectorRegister* kpws,
   114                                               const int total_kpws) {
   115   Label w_aligned, after_w_load;
   117   Register tmp       = R8;
   118   VectorRegister vt0 = VR0;
   119   VectorRegister vt1 = VR1;
   120   VectorRegister vRb = VR6;
   122   andi_ (tmp, buf_in, 0xF);
   123   beq   (CCR0, w_aligned); // address ends with 0x0, not 0x8
   125   // deal with unaligned addresses
   126   lvx    (ws[0], buf_in);
   127   load_perm(vRb, buf_in);
   129   for (int n = 1; n < total_ws; n++) {
   130     VectorRegister w_cur = ws[n];
   131     VectorRegister w_prev = ws[n-1];
   133     addi (tmp, buf_in, n * 16);
   134     lvx  (w_cur, tmp);
   135     vec_perm(w_prev, w_cur, vRb);
   136   }
   137   addi   (tmp, buf_in, total_ws * 16);
   138   lvx    (vt0, tmp);
   139   vec_perm(ws[total_ws-1], vt0, vRb);
   140   b      (after_w_load);
   142   bind(w_aligned);
   144   // deal with aligned addresses
   145   lvx(ws[0], buf_in);
   146   for (int n = 1; n < total_ws; n++) {
   147     VectorRegister w = ws[n];
   148     addi (tmp, buf_in, n * 16);
   149     lvx  (w, tmp);
   150   }
   152   bind(after_w_load);
   154 #if defined(VM_LITTLE_ENDIAN)
   155   // Byte swapping within int values
   156   li       (tmp, 8);
   157   lvsl     (vt0, tmp);
   158   vspltisb (vt1, 0xb);
   159   vxor     (vt1, vt0, vt1);
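  // vt1 now holds the permute control {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}:
  // lvsl with an effective address of 8 produces the byte indices 0x08..0x17,
  // and xor-ing each with 0x0b reverses the order within every 4-byte group,
  // so the vec_perm below byte-swaps each big-endian int of the block.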
   160   for (int n = 0; n < total_ws; n++) {
   161     VectorRegister w = ws[n];
   162     vec_perm(w, w, vt1);
   163   }
   164 #endif
   166   // Loading k, which is always aligned to 16-bytes
   167   lvx    (kpws[0], k);
   168   for (int n = 1; n < total_kpws; n++) {
   169     VectorRegister kpw = kpws[n];
   170     addi (tmp, k, 16 * n);
   171     lvx  (kpw, tmp);
   172   }
   174   // Add w to K
   175   assert(total_ws == total_kpws, "Redesign the loop below");
   176   for (int n = 0; n < total_kpws; n++) {
   177     VectorRegister kpw = kpws[n];
   178     VectorRegister w   = ws[n];
   180     vadduwm  (kpw, kpw, w);
   181   }
   182 }
   184 void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
   185                                     const VectorRegister w1,
   186                                     const VectorRegister w2,
   187                                     const VectorRegister w3,
   188                                     const VectorRegister kpw0,
   189                                     const VectorRegister kpw1,
   190                                     const VectorRegister kpw2,
   191                                     const VectorRegister kpw3,
   192                                     const Register j,
   193                                     const Register k) {
   194   // Temporaries
   195   const VectorRegister  vt0  = VR0;
   196   const VectorRegister  vt1  = VR1;
   197   const VectorSRegister vsrt1 = vt1->to_vsr();
   198   const VectorRegister  vt2  = VR2;
   199   const VectorRegister  vt3  = VR3;
   200   const VectorSRegister vst3 = vt3->to_vsr();
   201   const VectorRegister  vt4  = VR4;
   203   // load to k[j]
   204   lvx        (vt0, j,   k);
   206   // advance j
   207   addi       (j,   j,   16); // 16 bytes were read
   209 #if defined(VM_LITTLE_ENDIAN)
   210   // b = w[j-15], w[j-14], w[j-13], w[j-12]
   211   vsldoi     (vt1, w1,  w0, 12);
   213   // c = w[j-7], w[j-6], w[j-5], w[j-4]
   214   vsldoi     (vt2, w3,  w2, 12);
   216 #else
   217   // b = w[j-15], w[j-14], w[j-13], w[j-12]
   218   vsldoi     (vt1, w0,  w1, 4);
   220   // c = w[j-7], w[j-6], w[j-5], w[j-4]
   221   vsldoi     (vt2, w2,  w3, 4);
   222 #endif
   224   // d = w[j-2], w[j-1], w[j-4], w[j-3]
   225   vsldoi     (vt3, w3,  w3, 8);
   227   // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
   228   vshasigmaw (vt1, vt1, 0,  0);
   230   // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
   231   vshasigmaw (vt3, vt3, 0,  0xf);
   233   // c = s0(w[j-15]) + w[j-7],
   234   //     s0(w[j-14]) + w[j-6],
   235   //     s0(w[j-13]) + w[j-5],
   236   //     s0(w[j-12]) + w[j-4]
   237   vadduwm    (vt2, vt1, vt2);
   239   // c = s0(w[j-15]) + w[j-7] + w[j-16],
   240   //     s0(w[j-14]) + w[j-6] + w[j-15],
   241   //     s0(w[j-13]) + w[j-5] + w[j-14],
   242   //     s0(w[j-12]) + w[j-4] + w[j-13]
   243   vadduwm    (vt2, vt2, w0);
   245   // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
   246   //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
   247   //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
   248   //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
   249   vadduwm    (vt4, vt2, vt3);
   251   // At this point, e[0] and e[1] are the correct values to be stored at w[j]
   252   // and w[j+1].
   253   // e[2] and e[3] are not considered.
    254   // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
   255   vshasigmaw (vt1, vt4, 0,  0xf);
    257   // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
   258 #if defined(VM_LITTLE_ENDIAN)
   259   xxmrgld    (vst3, vsrt1, vst3);
   260 #else
   261   xxmrghd    (vst3, vst3, vsrt1);
   262 #endif
   264   // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
   265   //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
   266   //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
    267   //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
   268   vadduwm    (vt2, vt2, vt3);
   270   // Updating w0 to w3 to hold the new previous 16 values from w.
   271   vmr        (w0,  w1);
   272   vmr        (w1,  w2);
   273   vmr        (w2,  w3);
   274   vmr        (w3,  vt2);
    276   // store k + w to kpw0..kpw3 (4 values at once)
   277 #if defined(VM_LITTLE_ENDIAN)
   278   vadduwm    (kpw0, vt2, vt0);
   280   vsldoi     (kpw1, kpw0, kpw0, 12);
   281   vsldoi     (kpw2, kpw0, kpw0, 8);
   282   vsldoi     (kpw3, kpw0, kpw0, 4);
   283 #else
   284   vadduwm    (kpw3, vt2, vt0);
   286   vsldoi     (kpw2, kpw3, kpw3, 12);
   287   vsldoi     (kpw1, kpw3, kpw3, 8);
   288   vsldoi     (kpw0, kpw3, kpw3, 4);
   289 #endif
   290 }
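// Documentation-only sketch (same hypothetical guard as above) of what one
// call to sha256_calc_4w() produces: the next four message-schedule words
// and their pre-added K values.  Here j is a word index (16, 20, ... 60);
// in the assembler j is a byte offset into the constant table.  The vector
// code derives w[j] and w[j+1] first and feeds them back through vshasigmaw
// to obtain w[j+2] and w[j+3], hence the lanes marked UNDEFINED above.
#ifdef SHA2_PPC_REFERENCE_MODEL
static void sha256_ref_calc_4w(uint32_t w[64], const uint32_t k[64],
                               uint32_t kpw[4], int j) {
  for (int t = j; t < j + 4; t++) {
    w[t] = s1_32(w[t - 2]) + w[t - 7] + s0_32(w[t - 15]) + w[t - 16];
    kpw[t - j] = k[t] + w[t];
  }
}
#endif // SHA2_PPC_REFERENCE_MODEL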
   292 void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
   293                                              const VectorRegister b_,
   294                                              const VectorRegister c,
   295                                              const VectorRegister d,
   296                                              const VectorRegister e,
   297                                              const VectorRegister f,
   298                                              const VectorRegister g,
   299                                              const VectorRegister h,
   300                                              const Register hptr) {
   301   // temporaries
   302   VectorRegister vt0  = VR0;
   303   VectorRegister vt1  = VR1;
   304   VectorRegister vt2  = VR2;
   305   VectorRegister vt3  = VR3;
   306   VectorRegister vt4  = VR4;
   307   VectorRegister vt5  = VR5;
   308   VectorRegister vaux = VR6;
   309   VectorRegister vRb  = VR6;
   310   Register tmp        = R8;
   311   Register of16       = R8;
   312   Register of32       = R9;
   313   Label state_load_aligned;
   315   // Load hptr
   316   andi_   (tmp, hptr, 0xf);
   317   li      (of16, 16);
   318   lvx     (vt0, hptr);
   319   lvx     (vt5, of16, hptr);
   320   beq     (CCR0, state_load_aligned);
   322   // handle unaligned accesses
   323   li      (of32, 32);
   324   load_perm(vRb, hptr);
   326   vec_perm(vt0, vt5,  vRb);        // vt0 = hptr[0]..hptr[3]
   328   lvx     (vt1, hptr, of32);
   329   vec_perm(vt5, vt1,  vRb);        // vt5 = hptr[4]..hptr[7]
   331   // aligned accesses
   332   bind(state_load_aligned);
   334 #if defined(VM_LITTLE_ENDIAN)
   335   vmrglw  (vt1, b_, a);            // vt1 = {a, b, ?, ?}
   336   vmrglw  (vt2, d, c);             // vt2 = {c, d, ?, ?}
   337   vmrglw  (vt3, f, e);             // vt3 = {e, f, ?, ?}
   338   vmrglw  (vt4, h, g);             // vt4 = {g, h, ?, ?}
   339   xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
   340   xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
   341   vadduwm (a,   vt0, vt1);         // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
   342   vadduwm (e,   vt5, vt3);         // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
   344   // Save hptr back, works for any alignment
   345   xxswapd (vt0->to_vsr(), a->to_vsr());
   346   stxvd2x (vt0->to_vsr(), hptr);
   347   xxswapd (vt5->to_vsr(), e->to_vsr());
   348   stxvd2x (vt5->to_vsr(), of16, hptr);
   349 #else
   350   vmrglw  (vt1, a, b_);            // vt1 = {a, b, ?, ?}
   351   vmrglw  (vt2, c, d);             // vt2 = {c, d, ?, ?}
   352   vmrglw  (vt3, e, f);             // vt3 = {e, f, ?, ?}
   353   vmrglw  (vt4, g, h);             // vt4 = {g, h, ?, ?}
   354   xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
   355   xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
   356   vadduwm (d,   vt0, vt1);         // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
   357   vadduwm (h,   vt5, vt3);         // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
   359   // Save hptr back, works for any alignment
   360   stxvd2x (d->to_vsr(), hptr);
   361   stxvd2x (h->to_vsr(), of16, hptr);
   362 #endif
   363 }
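// Documentation-only sketch (same hypothetical guard) of the update above:
// the eight working variables are packed back into two vectors, added to
// the previous hash value and stored back, i.e. conceptually:
#ifdef SHA2_PPC_REFERENCE_MODEL
static void sha256_ref_update_state(uint32_t state[8], const uint32_t work[8]) {
  for (int i = 0; i < 8; i++) {
    state[i] += work[i];   // a..h after 64 rounds added to the previous H0..H7
  }
}
#endif // SHA2_PPC_REFERENCE_MODEL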
   365 static const uint32_t sha256_round_table[64] __attribute((aligned(16))) = {
   366   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
   367   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
   368   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
   369   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
   370   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
   371   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
   372   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
   373   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
   374   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
   375   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
   376   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
   377   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
   378   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
   379   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
   380   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
   381   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
   382 };
   383 static const uint32_t *sha256_round_consts = sha256_round_table;
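// The 64 values above are the SHA-256 round constants K[0..63] (the first 32
// bits of the fractional parts of the cube roots of the first 64 primes).
// sha256_round_consts is a pointer so that, on AIX, sha256() below can swap
// in a 16-byte aligned heap copy when the static table is not aligned.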
   385 //   R3_ARG1   - byte[]  Input string with padding but in Big Endian
   386 //   R4_ARG2   - int[]   SHA.state (at first, the root of primes)
   387 //   R5_ARG3   - int     offset
   388 //   R6_ARG4   - int     limit
   389 //
   390 //   Internal Register usage:
   391 //   R7        - k
   392 //   R8        - tmp | j | of16
   393 //   R9        - of32
   394 //   VR0-VR8   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
   395 //   VR9-VR16  - a-h
   396 //   VR17-VR20 - w0-w3
   397 //   VR21-VR23 - vRb | vaux0-vaux2
   398 //   VR24-VR27 - kpw0-kpw3
   399 void MacroAssembler::sha256(bool multi_block) {
   400   static const ssize_t buf_size = 64;
   401   static const uint8_t w_size = sizeof(sha256_round_table)/sizeof(uint32_t);
   402 #ifdef AIX
   403   // malloc provides 16 byte alignment
   404   if (((uintptr_t)sha256_round_consts & 0xF) != 0) {
   405     uint32_t *new_round_consts = (uint32_t*)malloc(sizeof(sha256_round_table));
   406     guarantee(new_round_consts, "oom");
   407     memcpy(new_round_consts, sha256_round_consts, sizeof(sha256_round_table));
   408     sha256_round_consts = (const uint32_t*)new_round_consts;
   409   }
   410 #endif
   412   Register buf_in = R3_ARG1;
   413   Register state  = R4_ARG2;
   414   Register ofs    = R5_ARG3;
   415   Register limit  = R6_ARG4;
   417   Label sha_loop, core_loop;
   419   // Save non-volatile vector registers in the red zone
   420   static const VectorRegister nv[] = {
   421     VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
   422   };
   423   static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
   425   for (int c = 0; c < nv_size; c++) {
   426     Register tmp = R8;
   427     li  (tmp, (c - (nv_size)) * 16);
   428     stvx(nv[c], tmp, R1);
   429   }
   431   // Load hash state to registers
   432   VectorRegister a = VR9;
   433   VectorRegister b = VR10;
   434   VectorRegister c = VR11;
   435   VectorRegister d = VR12;
   436   VectorRegister e = VR13;
   437   VectorRegister f = VR14;
   438   VectorRegister g = VR15;
   439   VectorRegister h = VR16;
   440   static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
   441   static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
   442   // counter for cycling through hs vector to avoid register moves between iterations
   443   int h_cnt = 0;
   445   // Load a-h registers from the memory pointed by state
   446 #if defined(VM_LITTLE_ENDIAN)
   447   sha256_load_h_vec(a, e, state);
   448 #else
   449   sha256_load_h_vec(d, h, state);
   450 #endif
   452   // keep k loaded also during MultiBlock loops
   453   Register k = R7;
   454   assert(((uintptr_t)sha256_round_consts & 0xF) == 0, "k alignment");
   455   load_const_optimized(k, (address)sha256_round_consts, R0);
   457   // Avoiding redundant loads
   458   if (multi_block) {
   459     align(OptoLoopAlignment);
   460   }
   461   bind(sha_loop);
   462 #if defined(VM_LITTLE_ENDIAN)
   463   sha256_deque(a, b, c, d);
   464   sha256_deque(e, f, g, h);
   465 #else
   466   sha256_deque(d, c, b, a);
   467   sha256_deque(h, g, f, e);
   468 #endif
   470   // Load 16 elements from w out of the loop.
    471   // Order of the int values is Endianness specific.
   472   VectorRegister w0 = VR17;
   473   VectorRegister w1 = VR18;
   474   VectorRegister w2 = VR19;
   475   VectorRegister w3 = VR20;
   476   static const VectorRegister ws[] = {w0, w1, w2, w3};
   477   static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
   479   VectorRegister kpw0 = VR24;
   480   VectorRegister kpw1 = VR25;
   481   VectorRegister kpw2 = VR26;
   482   VectorRegister kpw3 = VR27;
   483   static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
   484   static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);
   486   sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);
   488   // Cycle through the first 16 elements
   489   assert(total_ws == total_kpws, "Redesign the loop below");
   490   for (int n = 0; n < total_ws; n++) {
   491     VectorRegister vaux0 = VR21;
   492     VectorRegister vaux1 = VR22;
   493     VectorRegister vaux2 = VR23;
   495     sha256_deque(kpws[n], vaux0, vaux1, vaux2);
   497 #if defined(VM_LITTLE_ENDIAN)
   498     sha256_round(hs, total_hs, h_cnt, kpws[n]);
   499     sha256_round(hs, total_hs, h_cnt, vaux0);
   500     sha256_round(hs, total_hs, h_cnt, vaux1);
   501     sha256_round(hs, total_hs, h_cnt, vaux2);
   502 #else
   503     sha256_round(hs, total_hs, h_cnt, vaux2);
   504     sha256_round(hs, total_hs, h_cnt, vaux1);
   505     sha256_round(hs, total_hs, h_cnt, vaux0);
   506     sha256_round(hs, total_hs, h_cnt, kpws[n]);
   507 #endif
   508   }
   510   Register tmp = R8;
   511   // loop the 16th to the 64th iteration by 8 steps
   512   li   (tmp, (w_size - 16) / total_hs);
   513   mtctr(tmp);
   515   // j will be aligned to 4 for loading words.
   516   // Whenever read, advance the pointer (e.g: when j is used in a function)
   517   Register j = R8;
   518   li   (j, 16*4);
   520   align(OptoLoopAlignment);
   521   bind(core_loop);
   523   // due to VectorRegister rotate, always iterate in multiples of total_hs
   524   for (int n = 0; n < total_hs/4; n++) {
   525     sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
   526     sha256_round(hs, total_hs, h_cnt, kpw0);
   527     sha256_round(hs, total_hs, h_cnt, kpw1);
   528     sha256_round(hs, total_hs, h_cnt, kpw2);
   529     sha256_round(hs, total_hs, h_cnt, kpw3);
   530   }
   532   bdnz   (core_loop);
   534   // Update hash state
   535   sha256_update_sha_state(a, b, c, d, e, f, g, h, state);
   537   if (multi_block) {
   538     addi(buf_in, buf_in, buf_size);
   539     addi(ofs, ofs, buf_size);
   540     cmplw(CCR0, ofs, limit);
   541     ble(CCR0, sha_loop);
   543     // return ofs
   544     mr(R3_RET, ofs);
   545   }
   547   // Restore non-volatile registers
   548   for (int c = 0; c < nv_size; c++) {
   549     Register tmp = R8;
   550     li  (tmp, (c - (nv_size)) * 16);
   551     lvx(nv[c], tmp, R1);
   552   }
   553 }
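// Documentation-only sketch (same hypothetical guard) of the offset
// bookkeeping generated above for the multi_block case, which sha512()
// below mirrors with a 128-byte block: input blocks are consumed while
// ofs <= limit and the updated offset is returned in R3_RET.
#ifdef SHA2_PPC_REFERENCE_MODEL
static int sha2_ref_multi_block_ofs(int ofs, int limit, int block_size) {
  do {
    // one block_size chunk of buf_in is compressed into the hash state here
    ofs += block_size;
  } while (ofs <= limit);
  return ofs;   // without multi_block a single block is processed and
                // ofs is left untouched
}
#endif // SHA2_PPC_REFERENCE_MODEL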
   556 /**********************************************************************
   557  * SHA 512
   558  *********************************************************************/
   560 void MacroAssembler::sha512_load_w_vec(const Register buf_in,
   561                                        const VectorRegister* ws,
   562                                        const int total_ws) {
   563   Register tmp       = R8;
   564   VectorRegister vRb = VR8;
   565   VectorRegister aux = VR9;
   566   Label is_aligned, after_alignment;
   568   andi_  (tmp, buf_in, 0xF);
   569   beq    (CCR0, is_aligned); // address ends with 0x0, not 0x8
   571   // deal with unaligned addresses
   572   lvx    (ws[0], buf_in);
   573   load_perm(vRb, buf_in);
   575   for (int n = 1; n < total_ws; n++) {
   576     VectorRegister w_cur = ws[n];
   577     VectorRegister w_prev = ws[n-1];
   578     addi (tmp, buf_in, n * 16);
   579     lvx  (w_cur, tmp);
   580     vec_perm(w_prev, w_cur, vRb);
   581   }
   582   addi   (tmp, buf_in, total_ws * 16);
   583   lvx    (aux, tmp);
   584   vec_perm(ws[total_ws-1], aux, vRb);
   585   b      (after_alignment);
   587   bind(is_aligned);
   588   lvx  (ws[0], buf_in);
   589   for (int n = 1; n < total_ws; n++) {
   590     VectorRegister w = ws[n];
   591     addi (tmp, buf_in, n * 16);
   592     lvx  (w, tmp);
   593   }
   595   bind(after_alignment);
   596 }
   598 // Update hash state
   599 void MacroAssembler::sha512_update_sha_state(const Register state,
   600                                              const VectorRegister* hs,
   601                                              const int total_hs) {
   603 #if defined(VM_LITTLE_ENDIAN)
   604   int start_idx = 0;
   605 #else
   606   int start_idx = 1;
   607 #endif
   609   // load initial hash from the memory pointed by state
   610   VectorRegister ini_a = VR10;
   611   VectorRegister ini_c = VR12;
   612   VectorRegister ini_e = VR14;
   613   VectorRegister ini_g = VR16;
   614   static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
   615   static const int total_inis = sizeof(inis)/sizeof(VectorRegister);
   617   Label state_save_aligned, after_state_save_aligned;
   619   Register addr      = R7;
   620   Register tmp       = R8;
   621   VectorRegister vRb = VR8;
   622   VectorRegister aux = VR9;
   624   andi_(tmp, state, 0xf);
   625   beq(CCR0, state_save_aligned);
   626   // deal with unaligned addresses
   628   {
   629     VectorRegister a = hs[0];
   630     VectorRegister b_ = hs[1];
   631     VectorRegister c = hs[2];
   632     VectorRegister d = hs[3];
   633     VectorRegister e = hs[4];
   634     VectorRegister f = hs[5];
   635     VectorRegister g = hs[6];
   636     VectorRegister h = hs[7];
   637     load_perm(vRb, state);
   638     lvx    (ini_a, state);
   639     addi   (addr, state, 16);
   641     lvx    (ini_c, addr);
   642     addi   (addr, state, 32);
   643     vec_perm(ini_a, ini_c, vRb);
   645     lvx    (ini_e, addr);
   646     addi   (addr, state, 48);
   647     vec_perm(ini_c, ini_e, vRb);
   649     lvx    (ini_g, addr);
   650     addi   (addr, state, 64);
   651     vec_perm(ini_e, ini_g, vRb);
   653     lvx    (aux, addr);
   654     vec_perm(ini_g, aux, vRb);
   656 #if defined(VM_LITTLE_ENDIAN)
   657     xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
   658     xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
   659     xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
   660     xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
   661 #else
   662     xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
   663     xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
   664     xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
   665     xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
   666 #endif
   668     for (int n = start_idx; n < total_hs; n += 2) {
   669       VectorRegister h_cur = hs[n];
   670       VectorRegister ini_cur = inis[n/2];
   672       vaddudm(h_cur, ini_cur, h_cur);
   673     }
   675     for (int n = start_idx; n < total_hs; n += 2) {
   676       VectorRegister h_cur = hs[n];
   678       mfvrd  (tmp, h_cur);
   679 #if defined(VM_LITTLE_ENDIAN)
   680       std    (tmp, 8*n + 8, state);
   681 #else
   682       std    (tmp, 8*n - 8, state);
   683 #endif
   684       vsldoi (aux, h_cur, h_cur, 8);
   685       mfvrd  (tmp, aux);
   686       std    (tmp, 8*n + 0, state);
   687     }
   689     b      (after_state_save_aligned);
   690   }
   692   bind(state_save_aligned);
   693   {
   694     for (int n = 0; n < total_hs; n += 2) {
   695 #if defined(VM_LITTLE_ENDIAN)
   696       VectorRegister h_cur = hs[n];
   697       VectorRegister h_next = hs[n+1];
   698 #else
   699       VectorRegister h_cur = hs[n+1];
   700       VectorRegister h_next = hs[n];
   701 #endif
   702       VectorRegister ini_cur = inis[n/2];
   704       if (n/2 == 0) {
   705         lvx(ini_cur, state);
   706       } else {
   707         addi(addr, state, (n/2) * 16);
   708         lvx(ini_cur, addr);
   709       }
   710       xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
   711     }
   713     for (int n = start_idx; n < total_hs; n += 2) {
   714       VectorRegister h_cur = hs[n];
   715       VectorRegister ini_cur = inis[n/2];
   717       vaddudm(h_cur, ini_cur, h_cur);
   718     }
   720     for (int n = start_idx; n < total_hs; n += 2) {
   721       VectorRegister h_cur = hs[n];
   723       if (n/2 == 0) {
   724         stvx(h_cur, state);
   725       } else {
   726         addi(addr, state, (n/2) * 16);
   727         stvx(h_cur, addr);
   728       }
   729     }
   730   }
   732   bind(after_state_save_aligned);
   733 }
   735 // Use h_cnt to cycle through hs elements but also increment it at the end
   736 void MacroAssembler::sha512_round(const VectorRegister* hs,
   737                                   const int total_hs, int& h_cnt,
   738                                   const VectorRegister kpw) {
   740   // convenience registers: cycle from 0-7 downwards
   741   const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
   742   const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
   743   const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
   744   const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
   745   const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
   746   const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
   747   const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
   748   const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
   749   // temporaries
   750   const VectorRegister Ch   = VR20;
   751   const VectorRegister Maj  = VR21;
   752   const VectorRegister bsa  = VR22;
   753   const VectorRegister bse  = VR23;
   754   const VectorRegister tmp1 = VR24;
   755   const VectorRegister tmp2 = VR25;
   757   vsel      (Ch,   g,    f,   e);
   758   vxor      (Maj,  a,    b);
   759   vshasigmad(bse,  e,    1,   0xf);
   760   vaddudm   (tmp2, Ch,   kpw);
   761   vaddudm   (tmp1, h,    bse);
   762   vsel      (Maj,  b,    c,   Maj);
   763   vaddudm   (tmp1, tmp1, tmp2);
   764   vshasigmad(bsa,  a,    1,   0);
   765   vaddudm   (tmp2, bsa,  Maj);
   766   vaddudm   (d,    d,    tmp1);
   767   vaddudm   (h,    tmp1, tmp2);
   769   // advance vector pointer to the next iteration
   770   h_cnt++;
   771 }
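// Scalar sketch of one SHA-512 round as performed by sha512_round() above
// (documentation only, same hypothetical guard and helpers as the SHA-256
// sketch earlier in this file); the structure is identical to SHA-256 but
// on 64-bit words, using the vshasigmad flavour of the sigma instructions.
#ifdef SHA2_PPC_REFERENCE_MODEL
static void sha512_ref_round(uint64_t st[8], uint64_t kpw) {
  uint64_t a = st[0], b = st[1], c = st[2], d = st[3];
  uint64_t e = st[4], f = st[5], g = st[6], h = st[7];
  uint64_t ch  = (e & f) ^ (~e & g);            // vsel(Ch, g, f, e)
  uint64_t maj = (a & b) ^ (a & c) ^ (b & c);   // vxor + vsel above
  uint64_t t1  = h + S1_64(e) + ch + kpw;       // tmp1 above
  uint64_t t2  = S0_64(a) + maj;                // tmp2 above
  st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
  st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + t2;
}
#endif // SHA2_PPC_REFERENCE_MODEL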
   773 void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
   774                                     const VectorRegister w1,
   775                                     const VectorRegister w2,
   776                                     const VectorRegister w3,
   777                                     const VectorRegister w4,
   778                                     const VectorRegister w5,
   779                                     const VectorRegister w6,
   780                                     const VectorRegister w7,
   781                                     const VectorRegister kpw0,
   782                                     const VectorRegister kpw1,
   783                                     const Register j,
   784                                     const VectorRegister vRb,
   785                                     const Register k) {
   786   // Temporaries
   787   const VectorRegister VR_a = VR20;
   788   const VectorRegister VR_b = VR21;
   789   const VectorRegister VR_c = VR22;
   790   const VectorRegister VR_d = VR23;
   792   // load to k[j]
   793   lvx        (VR_a, j,    k);
   794   // advance j
   795   addi       (j,    j,    16); // 16 bytes were read
   797 #if defined(VM_LITTLE_ENDIAN)
    798   // VR_b = w[j-15], w[j-14]
   799   vperm      (VR_b, w1,   w0,  vRb);
    800   // VR_c = w[j-7], w[j-6]
   801   vperm      (VR_c, w5,   w4,  vRb);
   802 #else
    803   // VR_b = w[j-15], w[j-14]
   804   vperm      (VR_b, w0,   w1,  vRb);
    805   // VR_c = w[j-7], w[j-6]
   806   vperm      (VR_c, w4,   w5,  vRb);
   807 #endif
    809   // VR_b = s0(w[j-15]) , s0(w[j-14])
   810   vshasigmad (VR_b, VR_b,    0,   0);
    811   // VR_d = s1(w[j-2]) , s1(w[j-1])
   812   vshasigmad (VR_d, w7,      0,   0xf);
    813   // VR_b = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
   814   vaddudm    (VR_b, VR_b, VR_c);
    815   // VR_d = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
   816   vaddudm    (VR_d, VR_d, w0);
    817   // VR_c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
    818   //        s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
   819   vaddudm    (VR_c, VR_d, VR_b);
   820   // Updating w0 to w7 to hold the new previous 16 values from w.
   821   vmr        (w0,   w1);
   822   vmr        (w1,   w2);
   823   vmr        (w2,   w3);
   824   vmr        (w3,   w4);
   825   vmr        (w4,   w5);
   826   vmr        (w5,   w6);
   827   vmr        (w6,   w7);
   828   vmr        (w7,   VR_c);
   830 #if defined(VM_LITTLE_ENDIAN)
   831   // store k + w to kpw0 (2 values at once)
   832   vaddudm    (kpw0, VR_c, VR_a);
   833   // kpw1 holds (k + w)[1]
   834   vsldoi     (kpw1, kpw0, kpw0, 8);
   835 #else
   836   // store k + w to kpw0 (2 values at once)
   837   vaddudm    (kpw1, VR_c, VR_a);
   838   // kpw1 holds (k + w)[1]
   839   vsldoi     (kpw0, kpw1, kpw1, 8);
   840 #endif
   841 }
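// Documentation-only sketch (same hypothetical guard) of what one call to
// sha512_calc_2w() produces: the next two message-schedule words and their
// pre-added K values.  As in the SHA-256 sketch, j is a word index here
// (16, 18, ... 78) while the assembler uses a byte offset.
#ifdef SHA2_PPC_REFERENCE_MODEL
static void sha512_ref_calc_2w(uint64_t w[80], const uint64_t k[80],
                               uint64_t kpw[2], int j) {
  for (int t = j; t < j + 2; t++) {
    w[t] = s1_64(w[t - 2]) + w[t - 7] + s0_64(w[t - 15]) + w[t - 16];
    kpw[t - j] = k[t] + w[t];
  }
}
#endif // SHA2_PPC_REFERENCE_MODEL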
   843 void MacroAssembler::sha512_load_h_vec(const Register state,
   844                                        const VectorRegister* hs,
   845                                        const int total_hs) {
   846 #if defined(VM_LITTLE_ENDIAN)
   847   VectorRegister a   = hs[0];
   848   VectorRegister g   = hs[6];
   849   int start_idx = 0;
   850 #else
   851   VectorRegister a   = hs[1];
   852   VectorRegister g   = hs[7];
   853   int start_idx = 1;
   854 #endif
   856   Register addr      = R7;
   857   VectorRegister vRb = VR8;
   858   Register tmp       = R8;
   859   Label state_aligned, after_state_aligned;
   861   andi_(tmp, state, 0xf);
   862   beq(CCR0, state_aligned);
   864   // deal with unaligned addresses
   865   VectorRegister aux = VR9;
   867   lvx(hs[start_idx], state);
   868   load_perm(vRb, state);
   870   for (int n = start_idx + 2; n < total_hs; n += 2) {
   871     VectorRegister h_cur   = hs[n];
   872     VectorRegister h_prev2 = hs[n - 2];
   873     addi(addr, state, (n/2) * 16);
   874     lvx(h_cur, addr);
   875     vec_perm(h_prev2, h_cur, vRb);
   876   }
   877   addi(addr, state, (total_hs/2) * 16);
   878   lvx    (aux, addr);
   879   vec_perm(hs[total_hs - 2 + start_idx], aux, vRb);
   880   b      (after_state_aligned);
   882   bind(state_aligned);
   884   // deal with aligned addresses
   885   lvx(hs[start_idx], state);
   887   for (int n = start_idx + 2; n < total_hs; n += 2) {
   888     VectorRegister h_cur = hs[n];
   889     addi(addr, state, (n/2) * 16);
   890     lvx(h_cur, addr);
   891   }
   893   bind(after_state_aligned);
   894 }
   896 static const uint64_t sha512_round_table[80] __attribute((aligned(16))) = {
   897   0x428a2f98d728ae22, 0x7137449123ef65cd,
   898   0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
   899   0x3956c25bf348b538, 0x59f111f1b605d019,
   900   0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
   901   0xd807aa98a3030242, 0x12835b0145706fbe,
   902   0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
   903   0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
   904   0x9bdc06a725c71235, 0xc19bf174cf692694,
   905   0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
   906   0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
   907   0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
   908   0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
   909   0x983e5152ee66dfab, 0xa831c66d2db43210,
   910   0xb00327c898fb213f, 0xbf597fc7beef0ee4,
   911   0xc6e00bf33da88fc2, 0xd5a79147930aa725,
   912   0x06ca6351e003826f, 0x142929670a0e6e70,
   913   0x27b70a8546d22ffc, 0x2e1b21385c26c926,
   914   0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
   915   0x650a73548baf63de, 0x766a0abb3c77b2a8,
   916   0x81c2c92e47edaee6, 0x92722c851482353b,
   917   0xa2bfe8a14cf10364, 0xa81a664bbc423001,
   918   0xc24b8b70d0f89791, 0xc76c51a30654be30,
   919   0xd192e819d6ef5218, 0xd69906245565a910,
   920   0xf40e35855771202a, 0x106aa07032bbd1b8,
   921   0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
   922   0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
   923   0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
   924   0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
   925   0x748f82ee5defb2fc, 0x78a5636f43172f60,
   926   0x84c87814a1f0ab72, 0x8cc702081a6439ec,
   927   0x90befffa23631e28, 0xa4506cebde82bde9,
   928   0xbef9a3f7b2c67915, 0xc67178f2e372532b,
   929   0xca273eceea26619c, 0xd186b8c721c0c207,
   930   0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
   931   0x06f067aa72176fba, 0x0a637dc5a2c898a6,
   932   0x113f9804bef90dae, 0x1b710b35131c471b,
   933   0x28db77f523047d84, 0x32caab7b40c72493,
   934   0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
   935   0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
   936   0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
   937 };
   938 static const uint64_t *sha512_round_consts = sha512_round_table;
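// The 80 values above are the SHA-512 round constants K[0..79] (the first 64
// bits of the fractional parts of the cube roots of the first 80 primes).
// As with SHA-256, the sha512_round_consts pointer lets sha512() below fall
// back to an aligned heap copy of the table on AIX.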
   940 //   R3_ARG1   - byte[]  Input string with padding but in Big Endian
    941 //   R4_ARG2   - long[]  SHA.state (at first, the root of primes)
   942 //   R5_ARG3   - int     offset
   943 //   R6_ARG4   - int     limit
   944 //
   945 //   Internal Register usage:
   946 //   R7 R8 R9  - volatile temporaries
   947 //   VR0-VR7   - a-h
   948 //   VR8       - vRb
   949 //   VR9       - aux (highly volatile, use with care)
   950 //   VR10-VR17 - w0-w7 | ini_a-ini_h
   951 //   VR18      - vsp16 | kplusw0
   952 //   VR19      - vsp32 | kplusw1
   953 //   VR20-VR25 - sha512_calc_2w and sha512_round temporaries
   954 void MacroAssembler::sha512(bool multi_block) {
   955   static const ssize_t buf_size = 128;
   956   static const uint8_t w_size = sizeof(sha512_round_table)/sizeof(uint64_t);
   957 #ifdef AIX
   958   // malloc provides 16 byte alignment
   959   if (((uintptr_t)sha512_round_consts & 0xF) != 0) {
   960     uint64_t *new_round_consts = (uint64_t*)malloc(sizeof(sha512_round_table));
   961     guarantee(new_round_consts, "oom");
   962     memcpy(new_round_consts, sha512_round_consts, sizeof(sha512_round_table));
   963     sha512_round_consts = (const uint64_t*)new_round_consts;
   964   }
   965 #endif
   967   Register buf_in = R3_ARG1;
   968   Register state  = R4_ARG2;
   969   Register ofs    = R5_ARG3;
   970   Register limit  = R6_ARG4;
   972   Label sha_loop, core_loop;
   974   // Save non-volatile vector registers in the red zone
   975   static const VectorRegister nv[] = {
   976     VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
   977   };
   978   static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
   980   for (int c = 0; c < nv_size; c++) {
   981     Register idx = R7;
   982     li  (idx, (c - (nv_size)) * 16);
   983     stvx(nv[c], idx, R1);
   984   }
   986   // Load hash state to registers
   987   VectorRegister a = VR0;
   988   VectorRegister b = VR1;
   989   VectorRegister c = VR2;
   990   VectorRegister d = VR3;
   991   VectorRegister e = VR4;
   992   VectorRegister f = VR5;
   993   VectorRegister g = VR6;
   994   VectorRegister h = VR7;
   995   static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
   996   static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
   997   // counter for cycling through hs vector to avoid register moves between iterations
   998   int h_cnt = 0;
  1000   // Load a-h registers from the memory pointed by state
  1001   sha512_load_h_vec(state, hs, total_hs);
  1003   Register k = R9;
  1004   assert(((uintptr_t)sha512_round_consts & 0xF) == 0, "k alignment");
  1005   load_const_optimized(k, (address)sha512_round_consts, R0);
  1007   if (multi_block) {
   1008     align(OptoLoopAlignment);
   1009   }
  1010   bind(sha_loop);
  1012   for (int n = 0; n < total_hs; n += 2) {
  1013 #if defined(VM_LITTLE_ENDIAN)
  1014     VectorRegister h_cur = hs[n];
  1015     VectorRegister h_next = hs[n + 1];
  1016 #else
  1017     VectorRegister h_cur = hs[n + 1];
  1018     VectorRegister h_next = hs[n];
  1019 #endif
   1020     vsldoi (h_next, h_cur, h_cur, 8);
   1021   }
  1023   // Load 16 elements from w out of the loop.
   1024   // Order of the long values is Endianness specific.
  1025   VectorRegister w0 = VR10;
  1026   VectorRegister w1 = VR11;
  1027   VectorRegister w2 = VR12;
  1028   VectorRegister w3 = VR13;
  1029   VectorRegister w4 = VR14;
  1030   VectorRegister w5 = VR15;
  1031   VectorRegister w6 = VR16;
  1032   VectorRegister w7 = VR17;
  1033   static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  1034   static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
  1036   // Load 16 w into vectors and setup vsl for vperm
  1037   sha512_load_w_vec(buf_in, ws, total_ws);
  1039 #if defined(VM_LITTLE_ENDIAN)
  1040   VectorRegister vsp16 = VR18;
  1041   VectorRegister vsp32 = VR19;
  1042   VectorRegister shiftarg = VR9;
  1044   vspltisw(vsp16,    8);
  1045   vspltisw(shiftarg, 1);
  1046   vsl     (vsp16,    vsp16, shiftarg);
  1047   vsl     (vsp32,    vsp16, shiftarg);
  1049   VectorRegister vsp8 = VR9;
  1050   vspltish(vsp8,     8);
  1052   // Convert input from Big Endian to Little Endian
  1053   for (int c = 0; c < total_ws; c++) {
  1054     VectorRegister w = ws[c];
   1055     vrlh  (w, w, vsp8);
   1056   }
  1057   for (int c = 0; c < total_ws; c++) {
  1058     VectorRegister w = ws[c];
   1059     vrlw  (w, w, vsp16);
   1060   }
  1061   for (int c = 0; c < total_ws; c++) {
  1062     VectorRegister w = ws[c];
   1063     vrld  (w, w, vsp32);
   1064   }
  1065 #endif
  1067   Register Rb        = R10;
  1068   VectorRegister vRb = VR8;
  1069   li      (Rb, 8);
  1070   load_perm(vRb, Rb);
  1072   VectorRegister kplusw0 = VR18;
  1073   VectorRegister kplusw1 = VR19;
  1075   Register addr      = R7;
  1077   for (int n = 0; n < total_ws; n++) {
  1078     VectorRegister w = ws[n];
  1080     if (n == 0) {
  1081       lvx  (kplusw0, k);
  1082     } else {
  1083       addi (addr, k, n * 16);
   1084       lvx  (kplusw0, addr);
   1085     }
  1086 #if defined(VM_LITTLE_ENDIAN)
  1087     vaddudm(kplusw0, kplusw0, w);
  1088     vsldoi (kplusw1, kplusw0, kplusw0, 8);
  1089 #else
  1090     vaddudm(kplusw1, kplusw0, w);
  1091     vsldoi (kplusw0, kplusw1, kplusw1, 8);
  1092 #endif
  1094     sha512_round(hs, total_hs, h_cnt, kplusw0);
   1095     sha512_round(hs, total_hs, h_cnt, kplusw1);
   1096   }
  1098   Register tmp       = R8;
  1099   li    (tmp, (w_size-16)/total_hs);
  1100   mtctr (tmp);
  1101   // j will be aligned to 4 for loading words.
  1102   // Whenever read, advance the pointer (e.g: when j is used in a function)
  1103   Register j = tmp;
  1104   li     (j, 8*16);
  1106   align(OptoLoopAlignment);
  1107   bind(core_loop);
  1109   // due to VectorRegister rotate, always iterate in multiples of total_hs
  1110   for (int n = 0; n < total_hs/2; n++) {
  1111     sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
  1112     sha512_round(hs, total_hs, h_cnt, kplusw0);
   1113     sha512_round(hs, total_hs, h_cnt, kplusw1);
   1114   }
  1116   bdnz   (core_loop);
  1118   sha512_update_sha_state(state, hs, total_hs);
  1120   if (multi_block) {
  1121     addi(buf_in, buf_in, buf_size);
  1122     addi(ofs, ofs, buf_size);
  1123     cmplw(CCR0, ofs, limit);
  1124     ble(CCR0, sha_loop);
  1126     // return ofs
   1127     mr(R3_RET, ofs);
   1128   }
  1130   // Restore non-volatile registers
  1131   for (int c = 0; c < nv_size; c++) {
  1132     Register idx = R7;
  1133     li  (idx, (c - (nv_size)) * 16);
   1134     lvx(nv[c], idx, R1);
   1135   }
   1136 }
