#9516 [Backport of #9435] Implement BigInteger.montgomeryMultiply intrinsic.

author     wanghaomin
date       Tue, 09 Jul 2019 15:15:56 +0800
changeset  9639:38d2ba487173
parent     9638:b2109cbbbddb
child      9640:017b3a2aca37

#9516 [Backport of #9435] Implement BigInteger.montgomeryMultiply intrinsic.
Reviewed-by: aoqi

src/cpu/mips/vm/sharedRuntime_mips_64.cpp
src/cpu/mips/vm/stubGenerator_mips_64.cpp
src/cpu/mips/vm/vm_version_mips.cpp
     1.1 --- a/src/cpu/mips/vm/sharedRuntime_mips_64.cpp	Thu Jul 04 18:11:24 2019 +0800
     1.2 +++ b/src/cpu/mips/vm/sharedRuntime_mips_64.cpp	Tue Jul 09 15:15:56 2019 +0800
     1.3 @@ -42,6 +42,8 @@
     1.4  #include "opto/runtime.hpp"
     1.5  #endif
     1.6  
     1.7 +#include <alloca.h>
     1.8 +
     1.9  #define __ masm->
    1.10  
    1.11  const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
    1.12 @@ -3718,3 +3720,264 @@
    1.13  }
    1.14  
    1.15  extern "C" int SpinPause() {return 0;}
    1.16 +
    1.17 +
    1.18 +//------------------------------Montgomery multiplication------------------------
    1.19 +//
    1.20 +
    1.21 +// Subtract 0:b from carry:a.  Return carry.
    1.22 +static unsigned long
    1.23 +sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
    1.24 +  long borrow = 0, t = 0;
    1.25 +  unsigned long tmp0, tmp1;
    1.26 +  __asm__ __volatile__ (
    1.27 +    "0:                                            \n"
    1.28 +    "ld      %[tmp0],     0(%[a])                  \n"
    1.29 +    "ld      %[tmp1],     0(%[b])                  \n"
    1.30 +    "sltu    %[t],        %[tmp0],     %[borrow]   \n"
    1.31 +    "dsubu   %[tmp0],     %[tmp0],     %[borrow]   \n"
    1.32 +    "sltu    %[borrow],   %[tmp0],     %[tmp1]     \n"
    1.33 +    "or      %[borrow],   %[borrow],   %[t]        \n"
    1.34 +    "dsubu   %[tmp0],     %[tmp0],     %[tmp1]     \n"
    1.35 +    "sd      %[tmp0],     0(%[a])                  \n"
    1.36 +    "daddiu  %[a],        %[a],         8          \n"
    1.37 +    "daddiu  %[b],        %[b],         8          \n"
    1.38 +    "daddiu  %[len],      %[len],      -1          \n"
    1.39 +    "bgtz    %[len],      0b                       \n"
    1.40 +    "dsubu   %[tmp0],     %[carry],    %[borrow]   \n"
    1.41 +    : [len]"+r"(len), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [borrow]"+r"(borrow), [a]"+r"(a), [b]"+r"(b), [t]"+r"(t)
    1.42 +    : [carry]"r"(carry)
    1.43 +    : "memory"
    1.44 +  );
    1.45 +  return tmp0;
    1.46 +}
    1.47 +
    1.48 +// Multiply (unsigned) Long A by Long B, accumulating the double-
    1.49 +// length result into the accumulator formed of t0, t1, and t2.
    1.50 +inline void MACC(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
    1.51 +  unsigned long hi, lo, carry = 0, t = 0;
    1.52 +  __asm__ __volatile__(
    1.53 +    "dmultu  %[A],        %[B]                     \n"
    1.54 +    "mfhi    %[hi]                                 \n"
    1.55 +    "mflo    %[lo]                                 \n"
    1.56 +    "daddu   %[t0],       %[t0],       %[lo]       \n"
    1.57 +    "sltu    %[carry],    %[t0],       %[lo]       \n"
    1.58 +    "daddu   %[t1],       %[t1],       %[carry]    \n"
    1.59 +    "sltu    %[t],        %[t1],       %[carry]    \n"
    1.60 +    "daddu   %[t1],       %[t1],       %[hi]       \n"
    1.61 +    "sltu    %[carry],    %[t1],       %[hi]       \n"
    1.62 +    "or      %[carry],    %[carry],    %[t]        \n"
    1.63 +    "daddu   %[t2],       %[t2],       %[carry]    \n"
    1.64 +    : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
    1.65 +    : [A]"r"(A), [B]"r"(B)
    1.66 +    :
    1.67 +  );
    1.68 +}
    1.69 +
    1.70 +// As above, but add twice the double-length result into the
    1.71 +// accumulator.
    1.72 +inline void MACC2(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
    1.73 +  unsigned long hi, lo, carry = 0, t = 0;
    1.74 +  __asm__ __volatile__(
    1.75 +    "dmultu  %[A],        %[B]                     \n"
    1.76 +    "mfhi    %[hi]                                 \n"
    1.77 +    "mflo    %[lo]                                 \n"
    1.78 +    "daddu   %[t0],       %[t0],       %[lo]       \n"
    1.79 +    "sltu    %[carry],    %[t0],       %[lo]       \n"
    1.80 +    "daddu   %[t1],       %[t1],       %[carry]    \n"
    1.81 +    "sltu    %[t],        %[t1],       %[carry]    \n"
    1.82 +    "daddu   %[t1],       %[t1],       %[hi]       \n"
    1.83 +    "sltu    %[carry],    %[t1],       %[hi]       \n"
    1.84 +    "or      %[carry],    %[carry],    %[t]        \n"
    1.85 +    "daddu   %[t2],       %[t2],       %[carry]    \n"
    1.86 +    "daddu   %[t0],       %[t0],       %[lo]       \n"
    1.87 +    "sltu    %[carry],    %[t0],       %[lo]       \n"
    1.88 +    "daddu   %[t1],       %[t1],       %[carry]    \n"
    1.89 +    "sltu    %[t],        %[t1],       %[carry]    \n"
    1.90 +    "daddu   %[t1],       %[t1],       %[hi]       \n"
    1.91 +    "sltu    %[carry],    %[t1],       %[hi]       \n"
    1.92 +    "or      %[carry],    %[carry],    %[t]        \n"
    1.93 +    "daddu   %[t2],       %[t2],       %[carry]    \n"
    1.94 +    : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
    1.95 +    : [A]"r"(A), [B]"r"(B)
    1.96 +    :
    1.97 +  );
    1.98 +}
    1.99 +
   1.100 +// Fast Montgomery multiplication.  The derivation of the algorithm is
   1.101 +// in  A Cryptographic Library for the Motorola DSP56000,
   1.102 +// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
   1.103 +
   1.104 +static void __attribute__((noinline))
   1.105 +montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
   1.106 +                    unsigned long m[], unsigned long inv, int len) {
   1.107 +  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
   1.108 +  int i;
   1.109 +
   1.110 +  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
   1.111 +
   1.112 +  for (i = 0; i < len; i++) {
   1.113 +    int j;
   1.114 +    for (j = 0; j < i; j++) {
   1.115 +      MACC(a[j], b[i-j], t0, t1, t2);
   1.116 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.117 +    }
   1.118 +    MACC(a[i], b[0], t0, t1, t2);
   1.119 +    m[i] = t0 * inv;
   1.120 +    MACC(m[i], n[0], t0, t1, t2);
   1.121 +
   1.122 +    assert(t0 == 0, "broken Montgomery multiply");
   1.123 +
   1.124 +    t0 = t1; t1 = t2; t2 = 0;
   1.125 +  }
   1.126 +
   1.127 +  for (i = len; i < 2*len; i++) {
   1.128 +    int j;
   1.129 +    for (j = i-len+1; j < len; j++) {
   1.130 +      MACC(a[j], b[i-j], t0, t1, t2);
   1.131 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.132 +    }
   1.133 +    m[i-len] = t0;
   1.134 +    t0 = t1; t1 = t2; t2 = 0;
   1.135 +  }
   1.136 +
   1.137 +  while (t0)
   1.138 +    t0 = sub(m, n, t0, len);
   1.139 +}
   1.140 +
   1.141 +// Fast Montgomery squaring.  This uses asymptotically 25% fewer
   1.142 +// multiplies so it should be up to 25% faster than Montgomery
   1.143 +// multiplication.  However, its loop control is more complex and it
   1.144 +// may actually run slower on some machines.
   1.145 +
   1.146 +static void __attribute__((noinline))
   1.147 +montgomery_square(unsigned long a[], unsigned long n[],
   1.148 +                  unsigned long m[], unsigned long inv, int len) {
   1.149 +  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
   1.150 +  int i;
   1.151 +
   1.152 +  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
   1.153 +
   1.154 +  for (i = 0; i < len; i++) {
   1.155 +    int j;
   1.156 +    int end = (i+1)/2;
   1.157 +    for (j = 0; j < end; j++) {
   1.158 +      MACC2(a[j], a[i-j], t0, t1, t2);
   1.159 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.160 +    }
   1.161 +    if ((i & 1) == 0) {
   1.162 +      MACC(a[j], a[j], t0, t1, t2);
   1.163 +    }
   1.164 +    for (; j < i; j++) {
   1.165 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.166 +    }
   1.167 +    m[i] = t0 * inv;
   1.168 +    MACC(m[i], n[0], t0, t1, t2);
   1.169 +
   1.170 +    assert(t0 == 0, "broken Montgomery square");
   1.171 +
   1.172 +    t0 = t1; t1 = t2; t2 = 0;
   1.173 +  }
   1.174 +
   1.175 +  for (i = len; i < 2*len; i++) {
   1.176 +    int start = i-len+1;
   1.177 +    int end = start + (len - start)/2;
   1.178 +    int j;
   1.179 +    for (j = start; j < end; j++) {
   1.180 +      MACC2(a[j], a[i-j], t0, t1, t2);
   1.181 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.182 +    }
   1.183 +    if ((i & 1) == 0) {
   1.184 +      MACC(a[j], a[j], t0, t1, t2);
   1.185 +    }
   1.186 +    for (; j < len; j++) {
   1.187 +      MACC(m[j], n[i-j], t0, t1, t2);
   1.188 +    }
   1.189 +    m[i-len] = t0;
   1.190 +    t0 = t1; t1 = t2; t2 = 0;
   1.191 +  }
   1.192 +
   1.193 +  while (t0)
   1.194 +    t0 = sub(m, n, t0, len);
   1.195 +}
   1.196 +
   1.197 +// Swap words in a longword.
   1.198 +static unsigned long swap(unsigned long x) {
   1.199 +  return (x << 32) | (x >> 32);
   1.200 +}
   1.201 +
   1.202 +// Copy len longwords from s to d, word-swapping as we go.  The
   1.203 +// destination array is reversed.
   1.204 +static void reverse_words(unsigned long *s, unsigned long *d, int len) {
   1.205 +  d += len;
   1.206 +  while(len-- > 0) {
   1.207 +    d--;
   1.208 +    *d = swap(*s);
   1.209 +    s++;
   1.210 +  }
   1.211 +}
   1.212 +
   1.213 +// The threshold at which squaring is advantageous was determined
   1.214 +// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
   1.215 +// Doesn't seem to be relevant for MIPS64 so we use the same value.
   1.216 +#define MONTGOMERY_SQUARING_THRESHOLD 64
   1.217 +
   1.218 +void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
   1.219 +                                        jint len, jlong inv,
   1.220 +                                        jint *m_ints) {
   1.221 +  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
   1.222 +  int longwords = len/2;
   1.223 +
   1.224 +  // Make very sure we don't use so much space that the stack might
    1.225 +  // overflow.  512 jints corresponds to a 16384-bit integer and
   1.226 +  // will use here a total of 8k bytes of stack space.
   1.227 +  int total_allocation = longwords * sizeof (unsigned long) * 4;
   1.228 +  guarantee(total_allocation <= 8192, "must be");
   1.229 +  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
   1.230 +
   1.231 +  // Local scratch arrays
   1.232 +  unsigned long
   1.233 +    *a = scratch + 0 * longwords,
   1.234 +    *b = scratch + 1 * longwords,
   1.235 +    *n = scratch + 2 * longwords,
   1.236 +    *m = scratch + 3 * longwords;
   1.237 +
   1.238 +  reverse_words((unsigned long *)a_ints, a, longwords);
   1.239 +  reverse_words((unsigned long *)b_ints, b, longwords);
   1.240 +  reverse_words((unsigned long *)n_ints, n, longwords);
   1.241 +
   1.242 +  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
   1.243 +
   1.244 +  reverse_words(m, (unsigned long *)m_ints, longwords);
   1.245 +}
   1.246 +
   1.247 +void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
   1.248 +                                      jint len, jlong inv,
   1.249 +                                      jint *m_ints) {
   1.250 +  assert(len % 2 == 0, "array length in montgomery_square must be even");
   1.251 +  int longwords = len/2;
   1.252 +
   1.253 +  // Make very sure we don't use so much space that the stack might
    1.254 +  // overflow.  512 jints corresponds to a 16384-bit integer and
   1.255 +  // will use here a total of 6k bytes of stack space.
   1.256 +  int total_allocation = longwords * sizeof (unsigned long) * 3;
   1.257 +  guarantee(total_allocation <= 8192, "must be");
   1.258 +  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
   1.259 +
   1.260 +  // Local scratch arrays
   1.261 +  unsigned long
   1.262 +    *a = scratch + 0 * longwords,
   1.263 +    *n = scratch + 1 * longwords,
   1.264 +    *m = scratch + 2 * longwords;
   1.265 +
   1.266 +  reverse_words((unsigned long *)a_ints, a, longwords);
   1.267 +  reverse_words((unsigned long *)n_ints, n, longwords);
   1.268 +
   1.269 +  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
   1.270 +    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
   1.271 +  } else {
   1.272 +    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
   1.273 +  }
   1.274 +
   1.275 +  reverse_words(m, (unsigned long *)m_ints, longwords);
   1.276 +}
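
For readers puzzling over the carry handling in the MACC/MACC2 helpers added above, here is a minimal portable C++ model of what those inline-assembly blocks compute. It is a reference sketch only, not part of the patch; the names macc_model and macc2_model are invented for illustration, and it assumes a GCC/Clang-style unsigned __int128.

  #include <cassert>
  #include <cstdint>

  // Portable model of MACC: add the 128-bit product A*B into the 192-bit
  // accumulator (t2:t1:t0). This is what the dmultu/daddu/sltu sequence in
  // the patch does, with the carries tracked explicitly because MIPS64 has
  // no flags register or add-with-carry instruction.
  static inline void macc_model(uint64_t A, uint64_t B,
                                uint64_t &t0, uint64_t &t1, uint64_t &t2) {
    unsigned __int128 p = (unsigned __int128)A * B;
    uint64_t lo = (uint64_t)p, hi = (uint64_t)(p >> 64);
    unsigned __int128 s = (unsigned __int128)t0 + lo;
    t0 = (uint64_t)s;
    s = (unsigned __int128)t1 + hi + (uint64_t)(s >> 64);
    t1 = (uint64_t)s;
    t2 += (uint64_t)(s >> 64);
  }

  // Portable model of MACC2: the same double-length product accumulated twice.
  static inline void macc2_model(uint64_t A, uint64_t B,
                                 uint64_t &t0, uint64_t &t1, uint64_t &t2) {
    macc_model(A, B, t0, t1, t2);
    macc_model(A, B, t0, t1, t2);
  }

  int main() {
    uint64_t t0 = 0, t1 = 0, t2 = 0;
    macc_model(~0ULL, ~0ULL, t0, t1, t2);   // accumulate (2^64-1)^2
    assert(t0 == 1 && t1 == ~1ULL && t2 == 0);
    macc2_model(~0ULL, ~0ULL, t0, t1, t2);  // accumulator now holds 3*(2^64-1)^2
    assert(t0 == 3 && t1 == 0xFFFFFFFFFFFFFFFAULL && t2 == 2);
    return 0;
  }

The assembly in the patch performs the same two-step carry propagation with sltu/or pairs; the or suffices because either the carry-in add or the high-word add can overflow on a given step, but never both.
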
     2.1 --- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp	Thu Jul 04 18:11:24 2019 +0800
     2.2 +++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp	Tue Jul 09 15:15:56 2019 +0800
     2.3 @@ -2130,6 +2130,15 @@
     2.4      generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
     2.5                                                         &StubRoutines::_safefetchN_fault_pc,
     2.6                                                         &StubRoutines::_safefetchN_continuation_pc);
     2.7 +
     2.8 +    if (UseMontgomeryMultiplyIntrinsic) {
     2.9 +      StubRoutines::_montgomeryMultiply
    2.10 +        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
    2.11 +    }
    2.12 +    if (UseMontgomerySquareIntrinsic) {
    2.13 +      StubRoutines::_montgomerySquare
    2.14 +        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
    2.15 +    }
    2.16    }
    2.17  
    2.18   public:
     3.1 --- a/src/cpu/mips/vm/vm_version_mips.cpp	Thu Jul 04 18:11:24 2019 +0800
     3.2 +++ b/src/cpu/mips/vm/vm_version_mips.cpp	Tue Jul 09 15:15:56 2019 +0800
     3.3 @@ -178,6 +178,13 @@
     3.4      FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
     3.5    }
     3.6  
     3.7 +  if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
     3.8 +    UseMontgomeryMultiplyIntrinsic = true;
     3.9 +  }
    3.10 +  if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
    3.11 +    UseMontgomerySquareIntrinsic = true;
    3.12 +  }
    3.13 +
    3.14    NOT_PRODUCT( if (PrintMiscellaneous && Verbose) print_features(); );
    3.15  }
    3.16  
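
With UseMontgomeryMultiplyIntrinsic and UseMontgomerySquareIntrinsic on by default, the C2 intrinsics for BigInteger.montgomeryMultiply/montgomerySquare call through the stub entries registered in stubGenerator_mips_64.cpp above, which land in the new SharedRuntime helpers. As a standalone sanity check of the arithmetic those helpers rely on — r = a*b*2^-64 mod n, with inv chosen so that inv*n == -1 (mod 2^64), the relation the "broken inverse" asserts test — a single-word C++ sketch might look like the following. It is illustrative only, not part of the changeset; it assumes unsigned __int128 and keeps the modulus below 2^63 so the intermediate sum fits in 128 bits.

  #include <cassert>
  #include <cstdint>

  // Compute inv = -n^{-1} mod 2^64 for odd n by Newton's iteration; this is
  // the value the asserts `inv * n[0] == -1UL` in the patch expect.
  static uint64_t mont_inverse(uint64_t n) {
    uint64_t x = n;              // correct to 3 bits for any odd n
    for (int i = 0; i < 5; i++)  // each step doubles the number of correct bits
      x *= 2 - n * x;            // after 5 steps x == n^{-1} mod 2^64
    return 0 - x;                // negate: inv == -n^{-1} mod 2^64
  }

  // Single-word Montgomery multiplication: returns a*b*2^{-64} mod n,
  // for odd n < 2^63 and a, b < n.
  static uint64_t mont_mul_1(uint64_t a, uint64_t b, uint64_t n, uint64_t inv) {
    unsigned __int128 t = (unsigned __int128)a * b;
    uint64_t m = (uint64_t)t * inv;    // chosen so the low word of t + m*n is zero
    t += (unsigned __int128)m * n;     // now an exact multiple of 2^64
    uint64_t r = (uint64_t)(t >> 64);  // divide by 2^64
    return r >= n ? r - n : r;         // at most one final subtraction needed
  }

  int main() {
    const uint64_t n = 0x7FFFFFFFFFFFFFFFULL;     // odd modulus (2^63 - 1)
    const uint64_t inv = mont_inverse(n);
    assert(inv * n == (uint64_t)-1);              // same check as the patch's asserts
    const uint64_t a = 0x0123456789ABCDEFULL;
    const uint64_t b = 0x7EDCBA9876543210ULL;
    const uint64_t r = mont_mul_1(a, b, n, inv);  // r == a*b*2^{-64} mod n
    (void)r;
    return 0;
  }

The multi-word routines in the patch perform the same reduction word by word, using the triple-precision (t2:t1:t0) accumulator in place of a 128-bit type.
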
