src/cpu/ppc/vm/sharedRuntime_ppc.cpp

changeset 8903
9575483cce09
parent 8182
e9e252c83b2b
child 9013
18366fa39fe0
     1.1 --- a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Tue Oct 03 18:40:36 2017 -0700
     1.2 +++ b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Thu Oct 12 16:29:52 2017 +0100
     1.3 @@ -42,6 +42,8 @@
     1.4  #include "opto/runtime.hpp"
     1.5  #endif
     1.6  
     1.7 +#include <alloca.h>
     1.8 +
     1.9  #define __ masm->
    1.10  
    1.11  #ifdef PRODUCT
    1.12 @@ -3268,3 +3270,245 @@
    1.13    return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_bytes/wordSize,
    1.14                                         oop_maps, true);
    1.15  }
    1.16 +
    1.17 +
    1.18 +//------------------------------Montgomery multiplication------------------------
    1.19 +//
    1.20 +
    1.21 +// Subtract 0:b from carry:a. Return carry.
    1.22 +static unsigned long
    1.23 +sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
    1.24 +  long i = 0;
    1.25 +  unsigned long tmp, tmp2;
    1.26 +  __asm__ __volatile__ (
    1.27 +    "subfc  %[tmp], %[tmp], %[tmp]   \n" // pre-set CA
    1.28 +    "mtctr  %[len]                   \n"
    1.29 +    "0:                              \n"
    1.30 +    "ldx    %[tmp], %[i], %[a]       \n"
    1.31 +    "ldx    %[tmp2], %[i], %[b]      \n"
    1.32 +    "subfe  %[tmp], %[tmp2], %[tmp]  \n" // subtract extended
    1.33 +    "stdx   %[tmp], %[i], %[a]       \n"
    1.34 +    "addi   %[i], %[i], 8            \n"
    1.35 +    "bdnz   0b                       \n"
    1.36 +    "addme  %[tmp], %[carry]         \n" // carry + CA - 1
    1.37 +    : [i]"+b"(i), [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2)
    1.38 +    : [a]"r"(a), [b]"r"(b), [carry]"r"(carry), [len]"r"(len)
    1.39 +    : "ctr", "xer", "memory"
    1.40 +  );
    1.41 +  return tmp;
    1.42 +}
    1.43 +
    1.44 +// Multiply (unsigned) Long A by Long B, accumulating the double-
    1.45 +// length result into the accumulator formed of T0, T1, and T2.
    1.46 +inline void MACC(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
    1.47 +  unsigned long hi, lo;
    1.48 +  __asm__ __volatile__ (
    1.49 +    "mulld  %[lo], %[A], %[B]    \n"
    1.50 +    "mulhdu %[hi], %[A], %[B]    \n"
    1.51 +    "addc   %[T0], %[T0], %[lo]  \n"
    1.52 +    "adde   %[T1], %[T1], %[hi]  \n"
    1.53 +    "addze  %[T2], %[T2]         \n"
    1.54 +    : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
    1.55 +    : [A]"r"(A), [B]"r"(B)
    1.56 +    : "xer"
    1.57 +  );
    1.58 +}
    1.59 +
    1.60 +// As above, but add twice the double-length result into the
    1.61 +// accumulator.
    1.62 +inline void MACC2(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
    1.63 +  unsigned long hi, lo;
    1.64 +  __asm__ __volatile__ (
    1.65 +    "mulld  %[lo], %[A], %[B]    \n"
    1.66 +    "mulhdu %[hi], %[A], %[B]    \n"
    1.67 +    "addc   %[T0], %[T0], %[lo]  \n"
    1.68 +    "adde   %[T1], %[T1], %[hi]  \n"
    1.69 +    "addze  %[T2], %[T2]         \n"
    1.70 +    "addc   %[T0], %[T0], %[lo]  \n"
    1.71 +    "adde   %[T1], %[T1], %[hi]  \n"
    1.72 +    "addze  %[T2], %[T2]         \n"
    1.73 +    : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
    1.74 +    : [A]"r"(A), [B]"r"(B)
    1.75 +    : "xer"
    1.76 +  );
    1.77 +}
    1.78 +
    1.79 +// Fast Montgomery multiplication. The derivation of the algorithm is
    1.80 +// in "A Cryptographic Library for the Motorola DSP56000,
    1.81 +// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237".
    1.82 +static void
    1.83 +montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
    1.84 +                    unsigned long m[], unsigned long inv, int len) {
    1.85 +  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    1.86 +  int i;
    1.87 +
    1.88 +  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
    1.89 +
    1.90 +  for (i = 0; i < len; i++) {
    1.91 +    int j;
    1.92 +    for (j = 0; j < i; j++) {
    1.93 +      MACC(a[j], b[i-j], t0, t1, t2);
    1.94 +      MACC(m[j], n[i-j], t0, t1, t2);
    1.95 +    }
    1.96 +    MACC(a[i], b[0], t0, t1, t2);
    1.97 +    m[i] = t0 * inv;
    1.98 +    MACC(m[i], n[0], t0, t1, t2);
    1.99 +
   1.100 +    assert(t0 == 0, "broken Montgomery multiply");
   1.101 +
   1.102 +    t0 = t1; t1 = t2; t2 = 0;
   1.103 +  }
   1.104 +
   1.105 +  for (i = len; i < 2*len; i++) {
   1.106 +    int j;
   1.107 +    for (j = i-len+1; j < len; j++) {
   1.108 +      MACC(a[j], b[i-j], t0, t1, t2);
   1.109 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.110 +    }
   1.111 +    m[i-len] = t0;
   1.112 +    t0 = t1; t1 = t2; t2 = 0;
   1.113 +  }
   1.114 +
   1.115 +  while (t0) {
   1.116 +    t0 = sub(m, n, t0, len);
   1.117 +  }
   1.118 +}
   1.119 +
   1.120 +// Fast Montgomery squaring. This uses asymptotically 25% fewer
   1.121 +// multiplies so it should be up to 25% faster than Montgomery
   1.122 +// multiplication. However, its loop control is more complex and it
   1.123 +// may actually run slower on some machines.
   1.124 +static void
   1.125 +montgomery_square(unsigned long a[], unsigned long n[],
   1.126 +                  unsigned long m[], unsigned long inv, int len) {
   1.127 +  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
   1.128 +  int i;
   1.129 +
   1.130 +  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
   1.131 +
   1.132 +  for (i = 0; i < len; i++) {
   1.133 +    int j;
   1.134 +    int end = (i+1)/2;
   1.135 +    for (j = 0; j < end; j++) {
   1.136 +      MACC2(a[j], a[i-j], t0, t1, t2);
   1.137 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.138 +    }
   1.139 +    if ((i & 1) == 0) {
   1.140 +      MACC(a[j], a[j], t0, t1, t2);
   1.141 +    }
   1.142 +    for (; j < i; j++) {
   1.143 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.144 +    }
   1.145 +    m[i] = t0 * inv;
   1.146 +    MACC(m[i], n[0], t0, t1, t2);
   1.147 +
   1.148 +    assert(t0 == 0, "broken Montgomery square");
   1.149 +
   1.150 +    t0 = t1; t1 = t2; t2 = 0;
   1.151 +  }
   1.152 +
   1.153 +  for (i = len; i < 2*len; i++) {
   1.154 +    int start = i-len+1;
   1.155 +    int end = start + (len - start)/2;
   1.156 +    int j;
   1.157 +    for (j = start; j < end; j++) {
   1.158 +      MACC2(a[j], a[i-j], t0, t1, t2);
   1.159 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.160 +    }
   1.161 +    if ((i & 1) == 0) {
   1.162 +      MACC(a[j], a[j], t0, t1, t2);
   1.163 +    }
   1.164 +    for (; j < len; j++) {
   1.165 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.166 +    }
   1.167 +    m[i-len] = t0;
   1.168 +    t0 = t1; t1 = t2; t2 = 0;
   1.169 +  }
   1.170 +
   1.171 +  while (t0) {
   1.172 +    t0 = sub(m, n, t0, len);
   1.173 +  }
   1.174 +}
   1.175 +
   1.176 +// The threshold at which squaring is advantageous was determined
   1.177 +// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
   1.178 +// Doesn't seem to be relevant for Power8 so we use the same value.
   1.179 +#define MONTGOMERY_SQUARING_THRESHOLD 64
   1.180 +
   1.181 +// Copy len longwords from s to d, word-swapping as we go. The
   1.182 +// destination array is reversed.
   1.183 +static void reverse_words(unsigned long *s, unsigned long *d, int len) {
   1.184 +  d += len;
   1.185 +  while(len-- > 0) {
   1.186 +    d--;
   1.187 +    unsigned long s_val = *s;
   1.188 +    // Swap words in a longword on little endian machines.
   1.189 +#ifdef VM_LITTLE_ENDIAN
   1.190 +     s_val = (s_val << 32) | (s_val >> 32);
   1.191 +#endif
   1.192 +    *d = s_val;
   1.193 +    s++;
   1.194 +  }
   1.195 +}
   1.196 +
   1.197 +void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
   1.198 +                                        jint len, jlong inv,
   1.199 +                                        jint *m_ints) {
   1.200 +  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
   1.201 +  int longwords = len/2;
   1.202 +  assert(longwords > 0, "unsupported");
   1.203 +
   1.204 +  // Make very sure we don't use so much space that the stack might
   1.205 +  // overflow. 512 jints corresponds to an 16384-bit integer and
   1.206 +  // will use here a total of 8k bytes of stack space.
   1.207 +  int total_allocation = longwords * sizeof (unsigned long) * 4;
   1.208 +  guarantee(total_allocation <= 8192, "must be");
   1.209 +  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
   1.210 +
   1.211 +  // Local scratch arrays
   1.212 +  unsigned long
   1.213 +    *a = scratch + 0 * longwords,
   1.214 +    *b = scratch + 1 * longwords,
   1.215 +    *n = scratch + 2 * longwords,
   1.216 +    *m = scratch + 3 * longwords;
   1.217 +
   1.218 +  reverse_words((unsigned long *)a_ints, a, longwords);
   1.219 +  reverse_words((unsigned long *)b_ints, b, longwords);
   1.220 +  reverse_words((unsigned long *)n_ints, n, longwords);
   1.221 +
   1.222 +  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
   1.223 +
   1.224 +  reverse_words(m, (unsigned long *)m_ints, longwords);
   1.225 +}
   1.226 +
   1.227 +void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
   1.228 +                                      jint len, jlong inv,
   1.229 +                                      jint *m_ints) {
   1.230 +  assert(len % 2 == 0, "array length in montgomery_square must be even");
   1.231 +  int longwords = len/2;
   1.232 +  assert(longwords > 0, "unsupported");
   1.233 +
   1.234 +  // Make very sure we don't use so much space that the stack might
   1.235 +  // overflow. 512 jints corresponds to an 16384-bit integer and
   1.236 +  // will use here a total of 6k bytes of stack space.
   1.237 +  int total_allocation = longwords * sizeof (unsigned long) * 3;
   1.238 +  guarantee(total_allocation <= 8192, "must be");
   1.239 +  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
   1.240 +
   1.241 +  // Local scratch arrays
   1.242 +  unsigned long
   1.243 +    *a = scratch + 0 * longwords,
   1.244 +    *n = scratch + 1 * longwords,
   1.245 +    *m = scratch + 2 * longwords;
   1.246 +
   1.247 +  reverse_words((unsigned long *)a_ints, a, longwords);
   1.248 +  reverse_words((unsigned long *)n_ints, n, longwords);
   1.249 +
   1.250 +  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
   1.251 +    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
   1.252 +  } else {
   1.253 +    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
   1.254 +  }
   1.255 +
   1.256 +  reverse_words(m, (unsigned long *)m_ints, longwords);
   1.257 +}

mercurial