1.1 --- a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp Tue Oct 03 18:40:36 2017 -0700 1.2 +++ b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp Thu Oct 12 16:29:52 2017 +0100 1.3 @@ -42,6 +42,8 @@ 1.4 #include "opto/runtime.hpp" 1.5 #endif 1.6 1.7 +#include <alloca.h> 1.8 + 1.9 #define __ masm-> 1.10 1.11 #ifdef PRODUCT 1.12 @@ -3268,3 +3270,245 @@ 1.13 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_bytes/wordSize, 1.14 oop_maps, true); 1.15 } 1.16 + 1.17 + 1.18 +//------------------------------Montgomery multiplication------------------------ 1.19 +// 1.20 + 1.21 +// Subtract 0:b from carry:a. Return carry. 1.22 +static unsigned long 1.23 +sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) { 1.24 + long i = 0; 1.25 + unsigned long tmp, tmp2; 1.26 + __asm__ __volatile__ ( 1.27 + "subfc %[tmp], %[tmp], %[tmp] \n" // pre-set CA 1.28 + "mtctr %[len] \n" 1.29 + "0: \n" 1.30 + "ldx %[tmp], %[i], %[a] \n" 1.31 + "ldx %[tmp2], %[i], %[b] \n" 1.32 + "subfe %[tmp], %[tmp2], %[tmp] \n" // subtract extended 1.33 + "stdx %[tmp], %[i], %[a] \n" 1.34 + "addi %[i], %[i], 8 \n" 1.35 + "bdnz 0b \n" 1.36 + "addme %[tmp], %[carry] \n" // carry + CA - 1 1.37 + : [i]"+b"(i), [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2) 1.38 + : [a]"r"(a), [b]"r"(b), [carry]"r"(carry), [len]"r"(len) 1.39 + : "ctr", "xer", "memory" 1.40 + ); 1.41 + return tmp; 1.42 +} 1.43 + 1.44 +// Multiply (unsigned) Long A by Long B, accumulating the double- 1.45 +// length result into the accumulator formed of T0, T1, and T2. 1.46 +inline void MACC(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) { 1.47 + unsigned long hi, lo; 1.48 + __asm__ __volatile__ ( 1.49 + "mulld %[lo], %[A], %[B] \n" 1.50 + "mulhdu %[hi], %[A], %[B] \n" 1.51 + "addc %[T0], %[T0], %[lo] \n" 1.52 + "adde %[T1], %[T1], %[hi] \n" 1.53 + "addze %[T2], %[T2] \n" 1.54 + : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2) 1.55 + : [A]"r"(A), [B]"r"(B) 1.56 + : "xer" 1.57 + ); 1.58 +} 1.59 + 1.60 +// As above, but add twice the double-length result into the 1.61 +// accumulator. 1.62 +inline void MACC2(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) { 1.63 + unsigned long hi, lo; 1.64 + __asm__ __volatile__ ( 1.65 + "mulld %[lo], %[A], %[B] \n" 1.66 + "mulhdu %[hi], %[A], %[B] \n" 1.67 + "addc %[T0], %[T0], %[lo] \n" 1.68 + "adde %[T1], %[T1], %[hi] \n" 1.69 + "addze %[T2], %[T2] \n" 1.70 + "addc %[T0], %[T0], %[lo] \n" 1.71 + "adde %[T1], %[T1], %[hi] \n" 1.72 + "addze %[T2], %[T2] \n" 1.73 + : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2) 1.74 + : [A]"r"(A), [B]"r"(B) 1.75 + : "xer" 1.76 + ); 1.77 +} 1.78 + 1.79 +// Fast Montgomery multiplication. The derivation of the algorithm is 1.80 +// in "A Cryptographic Library for the Motorola DSP56000, 1.81 +// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237". 1.82 +static void 1.83 +montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[], 1.84 + unsigned long m[], unsigned long inv, int len) { 1.85 + unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 1.86 + int i; 1.87 + 1.88 + assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 1.89 + 1.90 + for (i = 0; i < len; i++) { 1.91 + int j; 1.92 + for (j = 0; j < i; j++) { 1.93 + MACC(a[j], b[i-j], t0, t1, t2); 1.94 + MACC(m[j], n[i-j], t0, t1, t2); 1.95 + } 1.96 + MACC(a[i], b[0], t0, t1, t2); 1.97 + m[i] = t0 * inv; 1.98 + MACC(m[i], n[0], t0, t1, t2); 1.99 + 1.100 + assert(t0 == 0, "broken Montgomery multiply"); 1.101 + 1.102 + t0 = t1; t1 = t2; t2 = 0; 1.103 + } 1.104 + 1.105 + for (i = len; i < 2*len; i++) { 1.106 + int j; 1.107 + for (j = i-len+1; j < len; j++) { 1.108 + MACC(a[j], b[i-j], t0, t1, t2); 1.109 + MACC(m[j], n[i-j], t0, t1, t2); 1.110 + } 1.111 + m[i-len] = t0; 1.112 + t0 = t1; t1 = t2; t2 = 0; 1.113 + } 1.114 + 1.115 + while (t0) { 1.116 + t0 = sub(m, n, t0, len); 1.117 + } 1.118 +} 1.119 + 1.120 +// Fast Montgomery squaring. This uses asymptotically 25% fewer 1.121 +// multiplies so it should be up to 25% faster than Montgomery 1.122 +// multiplication. However, its loop control is more complex and it 1.123 +// may actually run slower on some machines. 1.124 +static void 1.125 +montgomery_square(unsigned long a[], unsigned long n[], 1.126 + unsigned long m[], unsigned long inv, int len) { 1.127 + unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 1.128 + int i; 1.129 + 1.130 + assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 1.131 + 1.132 + for (i = 0; i < len; i++) { 1.133 + int j; 1.134 + int end = (i+1)/2; 1.135 + for (j = 0; j < end; j++) { 1.136 + MACC2(a[j], a[i-j], t0, t1, t2); 1.137 + MACC(m[j], n[i-j], t0, t1, t2); 1.138 + } 1.139 + if ((i & 1) == 0) { 1.140 + MACC(a[j], a[j], t0, t1, t2); 1.141 + } 1.142 + for (; j < i; j++) { 1.143 + MACC(m[j], n[i-j], t0, t1, t2); 1.144 + } 1.145 + m[i] = t0 * inv; 1.146 + MACC(m[i], n[0], t0, t1, t2); 1.147 + 1.148 + assert(t0 == 0, "broken Montgomery square"); 1.149 + 1.150 + t0 = t1; t1 = t2; t2 = 0; 1.151 + } 1.152 + 1.153 + for (i = len; i < 2*len; i++) { 1.154 + int start = i-len+1; 1.155 + int end = start + (len - start)/2; 1.156 + int j; 1.157 + for (j = start; j < end; j++) { 1.158 + MACC2(a[j], a[i-j], t0, t1, t2); 1.159 + MACC(m[j], n[i-j], t0, t1, t2); 1.160 + } 1.161 + if ((i & 1) == 0) { 1.162 + MACC(a[j], a[j], t0, t1, t2); 1.163 + } 1.164 + for (; j < len; j++) { 1.165 + MACC(m[j], n[i-j], t0, t1, t2); 1.166 + } 1.167 + m[i-len] = t0; 1.168 + t0 = t1; t1 = t2; t2 = 0; 1.169 + } 1.170 + 1.171 + while (t0) { 1.172 + t0 = sub(m, n, t0, len); 1.173 + } 1.174 +} 1.175 + 1.176 +// The threshold at which squaring is advantageous was determined 1.177 +// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. 1.178 +// Doesn't seem to be relevant for Power8 so we use the same value. 1.179 +#define MONTGOMERY_SQUARING_THRESHOLD 64 1.180 + 1.181 +// Copy len longwords from s to d, word-swapping as we go. The 1.182 +// destination array is reversed. 1.183 +static void reverse_words(unsigned long *s, unsigned long *d, int len) { 1.184 + d += len; 1.185 + while(len-- > 0) { 1.186 + d--; 1.187 + unsigned long s_val = *s; 1.188 + // Swap words in a longword on little endian machines. 1.189 +#ifdef VM_LITTLE_ENDIAN 1.190 + s_val = (s_val << 32) | (s_val >> 32); 1.191 +#endif 1.192 + *d = s_val; 1.193 + s++; 1.194 + } 1.195 +} 1.196 + 1.197 +void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, 1.198 + jint len, jlong inv, 1.199 + jint *m_ints) { 1.200 + assert(len % 2 == 0, "array length in montgomery_multiply must be even"); 1.201 + int longwords = len/2; 1.202 + assert(longwords > 0, "unsupported"); 1.203 + 1.204 + // Make very sure we don't use so much space that the stack might 1.205 + // overflow. 512 jints corresponds to an 16384-bit integer and 1.206 + // will use here a total of 8k bytes of stack space. 1.207 + int total_allocation = longwords * sizeof (unsigned long) * 4; 1.208 + guarantee(total_allocation <= 8192, "must be"); 1.209 + unsigned long *scratch = (unsigned long *)alloca(total_allocation); 1.210 + 1.211 + // Local scratch arrays 1.212 + unsigned long 1.213 + *a = scratch + 0 * longwords, 1.214 + *b = scratch + 1 * longwords, 1.215 + *n = scratch + 2 * longwords, 1.216 + *m = scratch + 3 * longwords; 1.217 + 1.218 + reverse_words((unsigned long *)a_ints, a, longwords); 1.219 + reverse_words((unsigned long *)b_ints, b, longwords); 1.220 + reverse_words((unsigned long *)n_ints, n, longwords); 1.221 + 1.222 + ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords); 1.223 + 1.224 + reverse_words(m, (unsigned long *)m_ints, longwords); 1.225 +} 1.226 + 1.227 +void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, 1.228 + jint len, jlong inv, 1.229 + jint *m_ints) { 1.230 + assert(len % 2 == 0, "array length in montgomery_square must be even"); 1.231 + int longwords = len/2; 1.232 + assert(longwords > 0, "unsupported"); 1.233 + 1.234 + // Make very sure we don't use so much space that the stack might 1.235 + // overflow. 512 jints corresponds to an 16384-bit integer and 1.236 + // will use here a total of 6k bytes of stack space. 1.237 + int total_allocation = longwords * sizeof (unsigned long) * 3; 1.238 + guarantee(total_allocation <= 8192, "must be"); 1.239 + unsigned long *scratch = (unsigned long *)alloca(total_allocation); 1.240 + 1.241 + // Local scratch arrays 1.242 + unsigned long 1.243 + *a = scratch + 0 * longwords, 1.244 + *n = scratch + 1 * longwords, 1.245 + *m = scratch + 2 * longwords; 1.246 + 1.247 + reverse_words((unsigned long *)a_ints, a, longwords); 1.248 + reverse_words((unsigned long *)n_ints, n, longwords); 1.249 + 1.250 + if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 1.251 + ::montgomery_square(a, n, m, (unsigned long)inv, longwords); 1.252 + } else { 1.253 + ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords); 1.254 + } 1.255 + 1.256 + reverse_words(m, (unsigned long *)m_ints, longwords); 1.257 +}