Tue, 09 Jul 2019 15:15:56 +0800
#9516 [Backport of #9435] Implement BigInteger.montgomeryMultiply intrinsic.
Reviewed-by: aoqi
#include "opto/runtime.hpp"
#endif

// alloca() supplies the stack-based scratch buffers used by the
// Montgomery-multiply entry points at the bottom of this file.
#include <alloca.h>

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

extern "C" int SpinPause() {return 0;}


//------------------------------Montgomery multiplication------------------------
//

// Subtract 0:b from carry:a.  Return carry.
// a[] and b[] are arrays of len 64-bit words; the difference is written
// back into a[] in place.  The returned value is carry minus the final
// borrow out of the multi-word subtraction.
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  long borrow = 0, t = 0;
  unsigned long tmp0, tmp1;
  __asm__ __volatile__ (
    "0: \n"
    "ld %[tmp0], 0(%[a]) \n"
    "ld %[tmp1], 0(%[b]) \n"
    // t = borrow generated by subtracting the incoming borrow from a[i].
    "sltu %[t], %[tmp0], %[borrow] \n"
    "dsubu %[tmp0], %[tmp0], %[borrow] \n"
    // Subtract b[i]; OR together the two possible borrow sources to form
    // the borrow that propagates into the next word.
    "sltu %[borrow], %[tmp0], %[tmp1] \n"
    "or %[borrow], %[borrow], %[t] \n"
    "dsubu %[tmp0], %[tmp0], %[tmp1] \n"
    "sd %[tmp0], 0(%[a]) \n"
    "daddiu %[a], %[a], 8 \n"
    "daddiu %[b], %[b], 8 \n"
    "daddiu %[len], %[len], -1 \n"
    "bgtz %[len], 0b \n"
    // Final dsubu computes the return value carry - borrow into tmp0.
    // If the assembler places it in the branch delay slot it also runs on
    // every iteration, but tmp0 is reloaded at the top of the loop, so
    // only the value computed when the loop exits survives.
    "dsubu %[tmp0], %[carry], %[borrow] \n"
    : [len]"+r"(len), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [borrow]"+r"(borrow), [a]"+r"(a), [b]"+r"(b), [t]"+r"(t)
    : [carry]"r"(carry)
    : "memory"
  );
  return tmp0;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of t0, t1, and t2.
inline void MACC(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  // (t2:t1:t0) += A * B, with full carry propagation:
  //   t0 += lo(A*B); carry into t1; t1 += hi(A*B); carry into t2.
  unsigned long hi, lo, carry = 0, t = 0;
  __asm__ __volatile__(
    // 128-bit product of A and B lands in hi:lo.
    "dmultu %[A], %[B] \n"
    "mfhi %[hi] \n"
    "mflo %[lo] \n"
    // t0 += lo, capturing the carry out.
    "daddu %[t0], %[t0], %[lo] \n"
    "sltu %[carry], %[t0], %[lo] \n"
    // t1 += carry + hi; the two sltu results are OR'ed because at most
    // one of the additions can overflow for each source.
    "daddu %[t1], %[t1], %[carry] \n"
    "sltu %[t], %[t1], %[carry] \n"
    "daddu %[t1], %[t1], %[hi] \n"
    "sltu %[carry], %[t1], %[hi] \n"
    "or %[carry], %[carry], %[t] \n"
    // t2 absorbs the final carry.
    "daddu %[t2], %[t2], %[carry] \n"
    : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
    : [A]"r"(A), [B]"r"(B)
    :
  );
}

// As above, but add twice the double-length result into the
// accumulator.  Used by montgomery_square, where each off-diagonal
// partial product a[j]*a[i-j] occurs twice.
inline void MACC2(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  // (t2:t1:t0) += 2 * A * B: one dmultu, then the hi:lo accumulation
  // sequence from MACC performed twice.
  unsigned long hi, lo, carry = 0, t = 0;
  __asm__ __volatile__(
    "dmultu %[A], %[B] \n"
    "mfhi %[hi] \n"
    "mflo %[lo] \n"
    // First accumulation of hi:lo.
    "daddu %[t0], %[t0], %[lo] \n"
    "sltu %[carry], %[t0], %[lo] \n"
    "daddu %[t1], %[t1], %[carry] \n"
    "sltu %[t], %[t1], %[carry] \n"
    "daddu %[t1], %[t1], %[hi] \n"
    "sltu %[carry], %[t1], %[hi] \n"
    "or %[carry], %[carry], %[t] \n"
    "daddu %[t2], %[t2], %[carry] \n"
    // Second accumulation of the same hi:lo.
    "daddu %[t0], %[t0], %[lo] \n"
    "sltu %[carry], %[t0], %[lo] \n"
    "daddu %[t1], %[t1], %[carry] \n"
    "sltu %[t], %[t1], %[carry] \n"
    "daddu %[t1], %[t1], %[hi] \n"
    "sltu %[carry], %[t1], %[hi] \n"
    "or %[carry], %[carry], %[t] \n"
    "daddu %[t2], %[t2], %[carry] \n"
    : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
    : [A]"r"(A), [B]"r"(B)
    :
  );
}
// Fast Montgomery multiplication.  The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
//
// a[], b[] and n[] are len-longword values; the Montgomery product of
// a and b mod n is produced in m[] (which also serves as scratch for
// the reduction words).  inv is -n[0]^-1 mod 2^64, as checked by the
// assert below.

static void __attribute__((noinline))
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
                    unsigned long m[], unsigned long inv, int len) {
  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");

  // Columns 0 .. len-1: accumulate each column of a*b together with the
  // reduction terms m*n; m[i] is chosen so the column sums to 0 mod 2^64.
  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;                 // makes the low word of this column vanish
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    // Shift the accumulator right one word for the next column.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Columns len .. 2*len-1: no further reduction words are generated;
  // each column value becomes a word of the result, reusing m[].
  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Final reduction: while a carry remains, subtract n from m.
  while (t0)
    t0 = sub(m, n, t0, len);
}

// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
static void __attribute__((noinline))
montgomery_square(unsigned long a[], unsigned long n[],
                  unsigned long m[], unsigned long inv, int len) {
  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");

  // Columns 0 .. len-1.  In a square every off-diagonal product
  // a[j]*a[i-j] occurs twice, so MACC2 adds it doubled; the diagonal
  // term a[i/2]^2 (present only when i is even) is added once via MACC.
  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);   // diagonal term (j == i/2 here)
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2); // remaining reduction terms
    }
    m[i] = t0 * inv;                  // zero the low word of this column
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    // Shift the accumulator right one word for the next column.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Columns len .. 2*len-1: emit the result words into m[].
  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);   // diagonal term
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Final reduction, as in montgomery_multiply.
  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap the two 32-bit halves within a 64-bit longword.
static unsigned long swap(unsigned long x) {
  return (x << 32) | (x >> 32);
}

// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
1.204 +static void reverse_words(unsigned long *s, unsigned long *d, int len) { 1.205 + d += len; 1.206 + while(len-- > 0) { 1.207 + d--; 1.208 + *d = swap(*s); 1.209 + s++; 1.210 + } 1.211 +} 1.212 + 1.213 +// The threshold at which squaring is advantageous was determined 1.214 +// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. 1.215 +// Doesn't seem to be relevant for MIPS64 so we use the same value. 1.216 +#define MONTGOMERY_SQUARING_THRESHOLD 64 1.217 + 1.218 +void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, 1.219 + jint len, jlong inv, 1.220 + jint *m_ints) { 1.221 + assert(len % 2 == 0, "array length in montgomery_multiply must be even"); 1.222 + int longwords = len/2; 1.223 + 1.224 + // Make very sure we don't use so much space that the stack might 1.225 + // overflow. 512 jints corresponds to an 16384-bit integer and 1.226 + // will use here a total of 8k bytes of stack space. 1.227 + int total_allocation = longwords * sizeof (unsigned long) * 4; 1.228 + guarantee(total_allocation <= 8192, "must be"); 1.229 + unsigned long *scratch = (unsigned long *)alloca(total_allocation); 1.230 + 1.231 + // Local scratch arrays 1.232 + unsigned long 1.233 + *a = scratch + 0 * longwords, 1.234 + *b = scratch + 1 * longwords, 1.235 + *n = scratch + 2 * longwords, 1.236 + *m = scratch + 3 * longwords; 1.237 + 1.238 + reverse_words((unsigned long *)a_ints, a, longwords); 1.239 + reverse_words((unsigned long *)b_ints, b, longwords); 1.240 + reverse_words((unsigned long *)n_ints, n, longwords); 1.241 + 1.242 + ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords); 1.243 + 1.244 + reverse_words(m, (unsigned long *)m_ints, longwords); 1.245 +} 1.246 + 1.247 +void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, 1.248 + jint len, jlong inv, 1.249 + jint *m_ints) { 1.250 + assert(len % 2 == 0, "array length in montgomery_square must be even"); 1.251 + int longwords = len/2; 1.252 + 1.253 + // Make very sure we 
don't use so much space that the stack might 1.254 + // overflow. 512 jints corresponds to an 16384-bit integer and 1.255 + // will use here a total of 6k bytes of stack space. 1.256 + int total_allocation = longwords * sizeof (unsigned long) * 3; 1.257 + guarantee(total_allocation <= 8192, "must be"); 1.258 + unsigned long *scratch = (unsigned long *)alloca(total_allocation); 1.259 + 1.260 + // Local scratch arrays 1.261 + unsigned long 1.262 + *a = scratch + 0 * longwords, 1.263 + *n = scratch + 1 * longwords, 1.264 + *m = scratch + 2 * longwords; 1.265 + 1.266 + reverse_words((unsigned long *)a_ints, a, longwords); 1.267 + reverse_words((unsigned long *)n_ints, n, longwords); 1.268 + 1.269 + if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 1.270 + ::montgomery_square(a, n, m, (unsigned long)inv, longwords); 1.271 + } else { 1.272 + ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords); 1.273 + } 1.274 + 1.275 + reverse_words(m, (unsigned long *)m_ints, longwords); 1.276 +}
2.1 --- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp Thu Jul 04 18:11:24 2019 +0800 2.2 +++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp Tue Jul 09 15:15:56 2019 +0800 2.3 @@ -2130,6 +2130,15 @@ 2.4 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 2.5 &StubRoutines::_safefetchN_fault_pc, 2.6 &StubRoutines::_safefetchN_continuation_pc); 2.7 + 2.8 + if (UseMontgomeryMultiplyIntrinsic) { 2.9 + StubRoutines::_montgomeryMultiply 2.10 + = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply); 2.11 + } 2.12 + if (UseMontgomerySquareIntrinsic) { 2.13 + StubRoutines::_montgomerySquare 2.14 + = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); 2.15 + } 2.16 } 2.17 2.18 public:
3.1 --- a/src/cpu/mips/vm/vm_version_mips.cpp Thu Jul 04 18:11:24 2019 +0800 3.2 +++ b/src/cpu/mips/vm/vm_version_mips.cpp Tue Jul 09 15:15:56 2019 +0800 3.3 @@ -178,6 +178,13 @@ 3.4 FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); 3.5 } 3.6 3.7 + if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) { 3.8 + UseMontgomeryMultiplyIntrinsic = true; 3.9 + } 3.10 + if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) { 3.11 + UseMontgomerySquareIntrinsic = true; 3.12 + } 3.13 + 3.14 NOT_PRODUCT( if (PrintMiscellaneous && Verbose) print_features(); ); 3.15 } 3.16