Thu, 22 Sep 2016 12:17:24 +0200
8164920: ppc: enhancement of CRC32 intrinsic
Reviewed-by: goetz, mdoerr
Contributed-by: Hiroshi H Horii <horii@jp.ibm.com>
1.1 --- a/src/cpu/ppc/vm/assembler_ppc.hpp Mon Sep 24 17:18:38 2018 -0400 1.2 +++ b/src/cpu/ppc/vm/assembler_ppc.hpp Thu Sep 22 12:17:24 2016 +0200 1.3 @@ -468,6 +468,10 @@ 1.4 LVSL_OPCODE = (31u << OPCODE_SHIFT | 6u << 1), 1.5 LVSR_OPCODE = (31u << OPCODE_SHIFT | 38u << 1), 1.6 1.7 + // Vector-Scalar (VSX) instruction support. 1.8 + MTVSRD_OPCODE = (31u << OPCODE_SHIFT | 179u << 1), 1.9 + MFVSRD_OPCODE = (31u << OPCODE_SHIFT | 51u << 1), 1.10 + 1.11 // Vector Permute and Formatting 1.12 VPKPX_OPCODE = (4u << OPCODE_SHIFT | 782u ), 1.13 VPKSHSS_OPCODE = (4u << OPCODE_SHIFT | 398u ), 1.14 @@ -1938,6 +1942,10 @@ 1.15 inline void mtvscr( VectorRegister b); 1.16 inline void mfvscr( VectorRegister d); 1.17 1.18 + // Vector-Scalar (VSX) instructions. 1.19 + inline void mtvrd( VectorRegister d, Register a); 1.20 + inline void mfvrd( Register a, VectorRegister d); 1.21 + 1.22 // AES (introduced with Power 8) 1.23 inline void vcipher( VectorRegister d, VectorRegister a, VectorRegister b); 1.24 inline void vcipherlast( VectorRegister d, VectorRegister a, VectorRegister b);
2.1 --- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp Mon Sep 24 17:18:38 2018 -0400 2.2 +++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp Thu Sep 22 12:17:24 2016 +0200 2.3 @@ -623,6 +623,10 @@ 2.4 inline void Assembler::lvsl( VectorRegister d, Register s1, Register s2) { emit_int32( LVSL_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); } 2.5 inline void Assembler::lvsr( VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); } 2.6 2.7 +// Vector-Scalar (VSX) instructions. 2.8 +inline void Assembler::mtvrd( VectorRegister d, Register a) { emit_int32( MTVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec). 2.9 +inline void Assembler::mfvrd( Register a, VectorRegister d) { emit_int32( MFVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec). 2.10 + 2.11 inline void Assembler::vpkpx( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKPX_OPCODE | vrt(d) | vra(a) | vrb(b)); } 2.12 inline void Assembler::vpkshss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSHSS_OPCODE | vrt(d) | vra(a) | vrb(b)); } 2.13 inline void Assembler::vpkswss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSWSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
3.1 --- a/src/cpu/ppc/vm/macroAssembler_ppc.cpp Mon Sep 24 17:18:38 2018 -0400 3.2 +++ b/src/cpu/ppc/vm/macroAssembler_ppc.cpp Thu Sep 22 12:17:24 2016 +0200 3.3 @@ -3423,6 +3423,565 @@ 3.4 BLOCK_COMMENT("} kernel_crc32_1byte"); 3.5 } 3.6 3.7 +/** 3.8 + * @param crc register containing existing CRC (32-bit) 3.9 + * @param buf register pointing to input byte buffer (byte*) 3.10 + * @param len register containing number of bytes 3.11 + * @param table register pointing to CRC table 3.12 + * @param constants register pointing to CRC table for 128-bit aligned memory 3.13 + * @param barretConstants register pointing to table for barrett reduction 3.14 + * @param t0 volatile register 3.15 + * @param t1 volatile register 3.16 + * @param t2 volatile register 3.17 + * @param t3 volatile register 3.18 + */ 3.19 +void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table, 3.20 + Register constants, Register barretConstants, 3.21 + Register t0, Register t1, Register t2, Register t3, Register t4) { 3.22 + assert_different_registers(crc, buf, len, table); 3.23 + 3.24 + Label L_alignedHead, L_tail, L_alignTail, L_start, L_end; 3.25 + 3.26 + Register prealign = t0; 3.27 + Register postalign = t0; 3.28 + 3.29 + BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {"); 3.30 + 3.31 + // 1. use kernel_crc32_1word for shorter than 384bit 3.32 + clrldi(len, len, 32); 3.33 + cmpdi(CCR0, len, 384); 3.34 + bge(CCR0, L_start); 3.35 + 3.36 + Register tc0 = t4; 3.37 + Register tc1 = constants; 3.38 + Register tc2 = barretConstants; 3.39 + kernel_crc32_1word(crc, buf, len, table,t0, t1, t2, t3, tc0, tc1, tc2, table); 3.40 + b(L_end); 3.41 + 3.42 + BIND(L_start); 3.43 + 3.44 + // 2. ~c 3.45 + nand(crc, crc, crc); 3.46 + 3.47 + // 3. 
calculate from 0 to first 128bit-aligned address 3.48 + clrldi_(prealign, buf, 57); 3.49 + beq(CCR0, L_alignedHead); 3.50 + 3.51 + subfic(prealign, prealign, 128); 3.52 + 3.53 + subf(len, prealign, len); 3.54 + update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false); 3.55 + 3.56 + // 4. calculate from first 128bit-aligned address to last 128bit-aligned address 3.57 + BIND(L_alignedHead); 3.58 + 3.59 + clrldi(postalign, len, 57); 3.60 + subf(len, postalign, len); 3.61 + 3.62 + // len must be more than 256bit 3.63 + kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3); 3.64 + 3.65 + // 5. calculate remaining 3.66 + cmpdi(CCR0, postalign, 0); 3.67 + beq(CCR0, L_tail); 3.68 + 3.69 + update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false); 3.70 + 3.71 + BIND(L_tail); 3.72 + 3.73 + // 6. ~c 3.74 + nand(crc, crc, crc); 3.75 + 3.76 + BIND(L_end); 3.77 + 3.78 + BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb"); 3.79 +} 3.80 + 3.81 +/** 3.82 + * @param crc register containing existing CRC (32-bit) 3.83 + * @param buf register pointing to input byte buffer (byte*) 3.84 + * @param len register containing number of bytes 3.85 + * @param constants register pointing to CRC table for 128-bit aligned memory 3.86 + * @param barretConstants register pointing to table for barrett reduction 3.87 + * @param t0 volatile register 3.88 + * @param t1 volatile register 3.89 + * @param t2 volatile register 3.90 + */ 3.91 +void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len, 3.92 + Register constants, Register barretConstants, Register t0, Register t1, Register t2) { 3.93 + Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test; 3.94 + Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15; 3.95 + Label L_1, L_2, L_3, L_4; 3.96 + 3.97 + Register rLoaded = 
t0; 3.98 + Register rTmp1 = t1; 3.99 + Register rTmp2 = t2; 3.100 + Register off16 = R22; 3.101 + Register off32 = R23; 3.102 + Register off48 = R24; 3.103 + Register off64 = R25; 3.104 + Register off80 = R26; 3.105 + Register off96 = R27; 3.106 + Register off112 = R28; 3.107 + Register rIdx = R29; 3.108 + Register rMax = R30; 3.109 + Register constantsPos = R31; 3.110 + 3.111 + VectorRegister mask_32bit = VR24; 3.112 + VectorRegister mask_64bit = VR25; 3.113 + VectorRegister zeroes = VR26; 3.114 + VectorRegister const1 = VR27; 3.115 + VectorRegister const2 = VR28; 3.116 + 3.117 + // Save non-volatile vector registers (frameless). 3.118 + Register offset = t1; int offsetInt = 0; 3.119 + offsetInt -= 16; li(offset, -16); stvx(VR20, offset, R1_SP); 3.120 + offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP); 3.121 + offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP); 3.122 + offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP); 3.123 + offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP); 3.124 + offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP); 3.125 + offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP); 3.126 + offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP); 3.127 + offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP); 3.128 + offsetInt -= 8; std(R22, offsetInt, R1_SP); 3.129 + offsetInt -= 8; std(R23, offsetInt, R1_SP); 3.130 + offsetInt -= 8; std(R24, offsetInt, R1_SP); 3.131 + offsetInt -= 8; std(R25, offsetInt, R1_SP); 3.132 + offsetInt -= 8; std(R26, offsetInt, R1_SP); 3.133 + offsetInt -= 8; std(R27, offsetInt, R1_SP); 3.134 + offsetInt -= 8; std(R28, offsetInt, R1_SP); 3.135 + offsetInt -= 8; std(R29, offsetInt, R1_SP); 3.136 + offsetInt -= 8; std(R30, offsetInt, R1_SP); 3.137 + offsetInt -= 8; std(R31, offsetInt, R1_SP); 3.138 + 3.139 + // Set constants 3.140 + li(off16, 16); 3.141 + li(off32, 32); 3.142 + 
li(off48, 48); 3.143 + li(off64, 64); 3.144 + li(off80, 80); 3.145 + li(off96, 96); 3.146 + li(off112, 112); 3.147 + 3.148 + clrldi(crc, crc, 32); 3.149 + 3.150 + vxor(zeroes, zeroes, zeroes); 3.151 + vspltisw(VR0, -1); 3.152 + 3.153 + vsldoi(mask_32bit, zeroes, VR0, 4); 3.154 + vsldoi(mask_64bit, zeroes, VR0, -8); 3.155 + 3.156 + // Get the initial value into v8 3.157 + vxor(VR8, VR8, VR8); 3.158 + mtvrd(VR8, crc); 3.159 + vsldoi(VR8, zeroes, VR8, -8); // shift into bottom 32 bits 3.160 + 3.161 + li (rLoaded, 0); 3.162 + 3.163 + rldicr(rIdx, len, 0, 56); 3.164 + 3.165 + { 3.166 + BIND(L_1); 3.167 + // Checksum in blocks of MAX_SIZE (32768) 3.168 + lis(rMax, 0); 3.169 + ori(rMax, rMax, 32768); 3.170 + mr(rTmp2, rMax); 3.171 + cmpd(CCR0, rIdx, rMax); 3.172 + bgt(CCR0, L_2); 3.173 + mr(rMax, rIdx); 3.174 + 3.175 + BIND(L_2); 3.176 + subf(rIdx, rMax, rIdx); 3.177 + 3.178 + // our main loop does 128 bytes at a time 3.179 + srdi(rMax, rMax, 7); 3.180 + 3.181 + /* 3.182 + * Work out the offset into the constants table to start at. Each 3.183 + * constant is 16 bytes, and it is used against 128 bytes of input 3.184 + * data - 128 / 16 = 8 3.185 + */ 3.186 + sldi(rTmp1, rMax, 4); 3.187 + srdi(rTmp2, rTmp2, 3); 3.188 + subf(rTmp1, rTmp1, rTmp2); 3.189 + 3.190 + // We reduce our final 128 bytes in a separate step 3.191 + addi(rMax, rMax, -1); 3.192 + mtctr(rMax); 3.193 + 3.194 + // Find the start of our constants 3.195 + add(constantsPos, constants, rTmp1); 3.196 + 3.197 + // zero VR0-v7 which will contain our checksums 3.198 + vxor(VR0, VR0, VR0); 3.199 + vxor(VR1, VR1, VR1); 3.200 + vxor(VR2, VR2, VR2); 3.201 + vxor(VR3, VR3, VR3); 3.202 + vxor(VR4, VR4, VR4); 3.203 + vxor(VR5, VR5, VR5); 3.204 + vxor(VR6, VR6, VR6); 3.205 + vxor(VR7, VR7, VR7); 3.206 + 3.207 + lvx(const1, constantsPos); 3.208 + 3.209 + /* 3.210 + * If we are looping back to consume more data we use the values 3.211 + * already in VR16-v23. 
3.212 + */ 3.213 + cmpdi(CCR0, rLoaded, 1); 3.214 + beq(CCR0, L_3); 3.215 + { 3.216 + 3.217 + // First warm up pass 3.218 + lvx(VR16, buf); 3.219 + lvx(VR17, off16, buf); 3.220 + lvx(VR18, off32, buf); 3.221 + lvx(VR19, off48, buf); 3.222 + lvx(VR20, off64, buf); 3.223 + lvx(VR21, off80, buf); 3.224 + lvx(VR22, off96, buf); 3.225 + lvx(VR23, off112, buf); 3.226 + addi(buf, buf, 8*16); 3.227 + 3.228 + // xor in initial value 3.229 + vxor(VR16, VR16, VR8); 3.230 + } 3.231 + 3.232 + BIND(L_3); 3.233 + bdz(L_first_warm_up_done); 3.234 + 3.235 + addi(constantsPos, constantsPos, 16); 3.236 + lvx(const2, constantsPos); 3.237 + 3.238 + // Second warm up pass 3.239 + vpmsumd(VR8, VR16, const1); 3.240 + lvx(VR16, buf); 3.241 + 3.242 + vpmsumd(VR9, VR17, const1); 3.243 + lvx(VR17, off16, buf); 3.244 + 3.245 + vpmsumd(VR10, VR18, const1); 3.246 + lvx(VR18, off32, buf); 3.247 + 3.248 + vpmsumd(VR11, VR19, const1); 3.249 + lvx(VR19, off48, buf); 3.250 + 3.251 + vpmsumd(VR12, VR20, const1); 3.252 + lvx(VR20, off64, buf); 3.253 + 3.254 + vpmsumd(VR13, VR21, const1); 3.255 + lvx(VR21, off80, buf); 3.256 + 3.257 + vpmsumd(VR14, VR22, const1); 3.258 + lvx(VR22, off96, buf); 3.259 + 3.260 + vpmsumd(VR15, VR23, const1); 3.261 + lvx(VR23, off112, buf); 3.262 + 3.263 + addi(buf, buf, 8 * 16); 3.264 + 3.265 + bdz(L_first_cool_down); 3.266 + 3.267 + /* 3.268 + * main loop. We modulo schedule it such that it takes three iterations 3.269 + * to complete - first iteration load, second iteration vpmsum, third 3.270 + * iteration xor. 
3.271 + */ 3.272 + { 3.273 + BIND(L_4); 3.274 + lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16); 3.275 + 3.276 + vxor(VR0, VR0, VR8); 3.277 + vpmsumd(VR8, VR16, const2); 3.278 + lvx(VR16, buf); 3.279 + 3.280 + vxor(VR1, VR1, VR9); 3.281 + vpmsumd(VR9, VR17, const2); 3.282 + lvx(VR17, off16, buf); 3.283 + 3.284 + vxor(VR2, VR2, VR10); 3.285 + vpmsumd(VR10, VR18, const2); 3.286 + lvx(VR18, off32, buf); 3.287 + 3.288 + vxor(VR3, VR3, VR11); 3.289 + vpmsumd(VR11, VR19, const2); 3.290 + lvx(VR19, off48, buf); 3.291 + lvx(const2, constantsPos); 3.292 + 3.293 + vxor(VR4, VR4, VR12); 3.294 + vpmsumd(VR12, VR20, const1); 3.295 + lvx(VR20, off64, buf); 3.296 + 3.297 + vxor(VR5, VR5, VR13); 3.298 + vpmsumd(VR13, VR21, const1); 3.299 + lvx(VR21, off80, buf); 3.300 + 3.301 + vxor(VR6, VR6, VR14); 3.302 + vpmsumd(VR14, VR22, const1); 3.303 + lvx(VR22, off96, buf); 3.304 + 3.305 + vxor(VR7, VR7, VR15); 3.306 + vpmsumd(VR15, VR23, const1); 3.307 + lvx(VR23, off112, buf); 3.308 + 3.309 + addi(buf, buf, 8 * 16); 3.310 + 3.311 + bdnz(L_4); 3.312 + } 3.313 + 3.314 + BIND(L_first_cool_down); 3.315 + 3.316 + // First cool down pass 3.317 + lvx(const1, constantsPos); 3.318 + addi(constantsPos, constantsPos, 16); 3.319 + 3.320 + vxor(VR0, VR0, VR8); 3.321 + vpmsumd(VR8, VR16, const1); 3.322 + 3.323 + vxor(VR1, VR1, VR9); 3.324 + vpmsumd(VR9, VR17, const1); 3.325 + 3.326 + vxor(VR2, VR2, VR10); 3.327 + vpmsumd(VR10, VR18, const1); 3.328 + 3.329 + vxor(VR3, VR3, VR11); 3.330 + vpmsumd(VR11, VR19, const1); 3.331 + 3.332 + vxor(VR4, VR4, VR12); 3.333 + vpmsumd(VR12, VR20, const1); 3.334 + 3.335 + vxor(VR5, VR5, VR13); 3.336 + vpmsumd(VR13, VR21, const1); 3.337 + 3.338 + vxor(VR6, VR6, VR14); 3.339 + vpmsumd(VR14, VR22, const1); 3.340 + 3.341 + vxor(VR7, VR7, VR15); 3.342 + vpmsumd(VR15, VR23, const1); 3.343 + 3.344 + BIND(L_second_cool_down); 3.345 + // Second cool down pass 3.346 + vxor(VR0, VR0, VR8); 3.347 + vxor(VR1, VR1, VR9); 3.348 + vxor(VR2, VR2, VR10); 3.349 + 
vxor(VR3, VR3, VR11); 3.350 + vxor(VR4, VR4, VR12); 3.351 + vxor(VR5, VR5, VR13); 3.352 + vxor(VR6, VR6, VR14); 3.353 + vxor(VR7, VR7, VR15); 3.354 + 3.355 + /* 3.356 + * vpmsumd produces a 96 bit result in the least significant bits 3.357 + * of the register. Since we are bit reflected we have to shift it 3.358 + * left 32 bits so it occupies the least significant bits in the 3.359 + * bit reflected domain. 3.360 + */ 3.361 + vsldoi(VR0, VR0, zeroes, 4); 3.362 + vsldoi(VR1, VR1, zeroes, 4); 3.363 + vsldoi(VR2, VR2, zeroes, 4); 3.364 + vsldoi(VR3, VR3, zeroes, 4); 3.365 + vsldoi(VR4, VR4, zeroes, 4); 3.366 + vsldoi(VR5, VR5, zeroes, 4); 3.367 + vsldoi(VR6, VR6, zeroes, 4); 3.368 + vsldoi(VR7, VR7, zeroes, 4); 3.369 + 3.370 + // xor with last 1024 bits 3.371 + lvx(VR8, buf); 3.372 + lvx(VR9, off16, buf); 3.373 + lvx(VR10, off32, buf); 3.374 + lvx(VR11, off48, buf); 3.375 + lvx(VR12, off64, buf); 3.376 + lvx(VR13, off80, buf); 3.377 + lvx(VR14, off96, buf); 3.378 + lvx(VR15, off112, buf); 3.379 + addi(buf, buf, 8 * 16); 3.380 + 3.381 + vxor(VR16, VR0, VR8); 3.382 + vxor(VR17, VR1, VR9); 3.383 + vxor(VR18, VR2, VR10); 3.384 + vxor(VR19, VR3, VR11); 3.385 + vxor(VR20, VR4, VR12); 3.386 + vxor(VR21, VR5, VR13); 3.387 + vxor(VR22, VR6, VR14); 3.388 + vxor(VR23, VR7, VR15); 3.389 + 3.390 + li(rLoaded, 1); 3.391 + cmpdi(CCR0, rIdx, 0); 3.392 + addi(rIdx, rIdx, 128); 3.393 + bne(CCR0, L_1); 3.394 + } 3.395 + 3.396 + // Work out how many bytes we have left 3.397 + andi_(len, len, 127); 3.398 + 3.399 + // Calculate where in the constant table we need to start 3.400 + subfic(rTmp1, len, 128); 3.401 + add(constantsPos, constantsPos, rTmp1); 3.402 + 3.403 + // How many 16 byte chunks are in the tail 3.404 + srdi(rIdx, len, 4); 3.405 + mtctr(rIdx); 3.406 + 3.407 + /* 3.408 + * Reduce the previously calculated 1024 bits to 64 bits, shifting 3.409 + * 32 bits to include the trailing 32 bits of zeros 3.410 + */ 3.411 + lvx(VR0, constantsPos); 3.412 + lvx(VR1, off16, constantsPos); 
3.413 + lvx(VR2, off32, constantsPos); 3.414 + lvx(VR3, off48, constantsPos); 3.415 + lvx(VR4, off64, constantsPos); 3.416 + lvx(VR5, off80, constantsPos); 3.417 + lvx(VR6, off96, constantsPos); 3.418 + lvx(VR7, off112, constantsPos); 3.419 + addi(constantsPos, constantsPos, 8 * 16); 3.420 + 3.421 + vpmsumw(VR0, VR16, VR0); 3.422 + vpmsumw(VR1, VR17, VR1); 3.423 + vpmsumw(VR2, VR18, VR2); 3.424 + vpmsumw(VR3, VR19, VR3); 3.425 + vpmsumw(VR4, VR20, VR4); 3.426 + vpmsumw(VR5, VR21, VR5); 3.427 + vpmsumw(VR6, VR22, VR6); 3.428 + vpmsumw(VR7, VR23, VR7); 3.429 + 3.430 + // Now reduce the tail (0 - 112 bytes) 3.431 + cmpdi(CCR0, rIdx, 0); 3.432 + beq(CCR0, L_XOR); 3.433 + 3.434 + lvx(VR16, buf); addi(buf, buf, 16); 3.435 + lvx(VR17, constantsPos); 3.436 + vpmsumw(VR16, VR16, VR17); 3.437 + vxor(VR0, VR0, VR16); 3.438 + beq(CCR0, L_XOR); 3.439 + 3.440 + lvx(VR16, buf); addi(buf, buf, 16); 3.441 + lvx(VR17, off16, constantsPos); 3.442 + vpmsumw(VR16, VR16, VR17); 3.443 + vxor(VR0, VR0, VR16); 3.444 + beq(CCR0, L_XOR); 3.445 + 3.446 + lvx(VR16, buf); addi(buf, buf, 16); 3.447 + lvx(VR17, off32, constantsPos); 3.448 + vpmsumw(VR16, VR16, VR17); 3.449 + vxor(VR0, VR0, VR16); 3.450 + beq(CCR0, L_XOR); 3.451 + 3.452 + lvx(VR16, buf); addi(buf, buf, 16); 3.453 + lvx(VR17, off48,constantsPos); 3.454 + vpmsumw(VR16, VR16, VR17); 3.455 + vxor(VR0, VR0, VR16); 3.456 + beq(CCR0, L_XOR); 3.457 + 3.458 + lvx(VR16, buf); addi(buf, buf, 16); 3.459 + lvx(VR17, off64, constantsPos); 3.460 + vpmsumw(VR16, VR16, VR17); 3.461 + vxor(VR0, VR0, VR16); 3.462 + beq(CCR0, L_XOR); 3.463 + 3.464 + lvx(VR16, buf); addi(buf, buf, 16); 3.465 + lvx(VR17, off80, constantsPos); 3.466 + vpmsumw(VR16, VR16, VR17); 3.467 + vxor(VR0, VR0, VR16); 3.468 + beq(CCR0, L_XOR); 3.469 + 3.470 + lvx(VR16, buf); addi(buf, buf, 16); 3.471 + lvx(VR17, off96, constantsPos); 3.472 + vpmsumw(VR16, VR16, VR17); 3.473 + vxor(VR0, VR0, VR16); 3.474 + 3.475 + // Now xor all the parallel chunks together 3.476 + BIND(L_XOR); 
3.477 + vxor(VR0, VR0, VR1); 3.478 + vxor(VR2, VR2, VR3); 3.479 + vxor(VR4, VR4, VR5); 3.480 + vxor(VR6, VR6, VR7); 3.481 + 3.482 + vxor(VR0, VR0, VR2); 3.483 + vxor(VR4, VR4, VR6); 3.484 + 3.485 + vxor(VR0, VR0, VR4); 3.486 + 3.487 + b(L_barrett_reduction); 3.488 + 3.489 + BIND(L_first_warm_up_done); 3.490 + lvx(const1, constantsPos); 3.491 + addi(constantsPos, constantsPos, 16); 3.492 + vpmsumd(VR8, VR16, const1); 3.493 + vpmsumd(VR9, VR17, const1); 3.494 + vpmsumd(VR10, VR18, const1); 3.495 + vpmsumd(VR11, VR19, const1); 3.496 + vpmsumd(VR12, VR20, const1); 3.497 + vpmsumd(VR13, VR21, const1); 3.498 + vpmsumd(VR14, VR22, const1); 3.499 + vpmsumd(VR15, VR23, const1); 3.500 + b(L_second_cool_down); 3.501 + 3.502 + BIND(L_barrett_reduction); 3.503 + 3.504 + lvx(const1, barretConstants); 3.505 + addi(barretConstants, barretConstants, 16); 3.506 + lvx(const2, barretConstants); 3.507 + 3.508 + vsldoi(VR1, VR0, VR0, -8); 3.509 + vxor(VR0, VR0, VR1); // xor two 64 bit results together 3.510 + 3.511 + // shift left one bit 3.512 + vspltisb(VR1, 1); 3.513 + vsl(VR0, VR0, VR1); 3.514 + 3.515 + vand(VR0, VR0, mask_64bit); 3.516 + 3.517 + /* 3.518 + * The reflected version of Barrett reduction. Instead of bit 3.519 + * reflecting our data (which is expensive to do), we bit reflect our 3.520 + * constants and our algorithm, which means the intermediate data in 3.521 + * our vector registers goes from 0-63 instead of 63-0. We can reflect 3.522 + * the algorithm because we don't carry in mod 2 arithmetic. 3.523 + */ 3.524 + vand(VR1, VR0, mask_32bit); // bottom 32 bits of a 3.525 + vpmsumd(VR1, VR1, const1); // ma 3.526 + vand(VR1, VR1, mask_32bit); // bottom 32bits of ma 3.527 + vpmsumd(VR1, VR1, const2); // qn */ 3.528 + vxor(VR0, VR0, VR1); // a - qn, subtraction is xor in GF(2) 3.529 + 3.530 + /* 3.531 + * Since we are bit reflected, the result (ie the low 32 bits) is in 3.532 + * the high 32 bits. 
We just need to shift it left 4 bytes 3.533 + * V0 [ 0 1 X 3 ] 3.534 + * V0 [ 0 X 2 3 ] 3.535 + */ 3.536 + vsldoi(VR0, VR0, zeroes, 4); // shift result into top 64 bits of 3.537 + 3.538 + // Get it into r3 3.539 + mfvrd(crc, VR0); 3.540 + 3.541 + BIND(L_end); 3.542 + 3.543 + offsetInt = 0; 3.544 + // Restore non-volatile Vector registers (frameless). 3.545 + offsetInt -= 16; li(offset, -16); lvx(VR20, offset, R1_SP); 3.546 + offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP); 3.547 + offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP); 3.548 + offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP); 3.549 + offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP); 3.550 + offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP); 3.551 + offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP); 3.552 + offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP); 3.553 + offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP); 3.554 + offsetInt -= 8; ld(R22, offsetInt, R1_SP); 3.555 + offsetInt -= 8; ld(R23, offsetInt, R1_SP); 3.556 + offsetInt -= 8; ld(R24, offsetInt, R1_SP); 3.557 + offsetInt -= 8; ld(R25, offsetInt, R1_SP); 3.558 + offsetInt -= 8; ld(R26, offsetInt, R1_SP); 3.559 + offsetInt -= 8; ld(R27, offsetInt, R1_SP); 3.560 + offsetInt -= 8; ld(R28, offsetInt, R1_SP); 3.561 + offsetInt -= 8; ld(R29, offsetInt, R1_SP); 3.562 + offsetInt -= 8; ld(R30, offsetInt, R1_SP); 3.563 + offsetInt -= 8; ld(R31, offsetInt, R1_SP); 3.564 +} 3.565 + 3.566 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) { 3.567 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp); 3.568
4.1 --- a/src/cpu/ppc/vm/macroAssembler_ppc.hpp Mon Sep 24 17:18:38 2018 -0400 4.2 +++ b/src/cpu/ppc/vm/macroAssembler_ppc.hpp Thu Sep 22 12:17:24 2016 +0200 4.3 @@ -656,6 +656,13 @@ 4.4 Register tc0, Register tc1, Register tc2, Register tc3); 4.5 void kernel_crc32_1byte(Register crc, Register buf, Register len, Register table, 4.6 Register t0, Register t1, Register t2, Register t3); 4.7 + void kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table, 4.8 + Register constants, Register barretConstants, 4.9 + Register t0, Register t1, Register t2, Register t3, Register t4); 4.10 + void kernel_crc32_1word_aligned(Register crc, Register buf, Register len, 4.11 + Register constants, Register barretConstants, 4.12 + Register t0, Register t1, Register t2); 4.13 + 4.14 void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp); 4.15 4.16 //
5.1 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp Mon Sep 24 17:18:38 2018 -0400 5.2 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp Thu Sep 22 12:17:24 2016 +0200 5.3 @@ -2482,9 +2482,7 @@ 5.4 * R5_ARG3 - int length (of buffer) 5.5 * 5.6 * scratch: 5.7 - * R6_ARG4 - crc table address 5.8 - * R7_ARG5 - tmp1 5.9 - * R8_ARG6 - tmp2 5.10 + * R2, R6-R12 5.11 * 5.12 * Ouput: 5.13 * R3_RET - int crc result 5.14 @@ -2496,28 +2494,62 @@ 5.15 address start = __ function_entry(); // Remember stub start address (is rtn value). 5.16 5.17 // arguments to kernel_crc32: 5.18 - Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call. 5.19 - Register data = R4_ARG2; // source byte array 5.20 - Register dataLen = R5_ARG3; // #bytes to process 5.21 - Register table = R6_ARG4; // crc table address 5.22 - 5.23 - Register t0 = R9; // work reg for kernel* emitters 5.24 - Register t1 = R10; // work reg for kernel* emitters 5.25 - Register t2 = R11; // work reg for kernel* emitters 5.26 - Register t3 = R12; // work reg for kernel* emitters 5.27 - 5.28 - BLOCK_COMMENT("Stub body {"); 5.29 - assert_different_registers(crc, data, dataLen, table); 5.30 - 5.31 - StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table); 5.32 - 5.33 - __ kernel_crc32_1byte(crc, data, dataLen, table, t0, t1, t2, t3); 5.34 - 5.35 - BLOCK_COMMENT("return"); 5.36 - __ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET). 5.37 - __ blr(); 5.38 - 5.39 - BLOCK_COMMENT("} Stub body"); 5.40 + const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call. 
5.41 + const Register data = R4_ARG2; // source byte array 5.42 + const Register dataLen = R5_ARG3; // #bytes to process 5.43 + 5.44 + const Register table = R6; // crc table address 5.45 + 5.46 +#ifdef VM_LITTLE_ENDIAN 5.47 + if (VM_Version::has_vpmsumb()) { 5.48 + const Register constants = R2; // constants address 5.49 + const Register bconstants = R8; // barret table address 5.50 + 5.51 + const Register t0 = R9; 5.52 + const Register t1 = R10; 5.53 + const Register t2 = R11; 5.54 + const Register t3 = R12; 5.55 + const Register t4 = R7; 5.56 + 5.57 + BLOCK_COMMENT("Stub body {"); 5.58 + assert_different_registers(crc, data, dataLen, table); 5.59 + 5.60 + StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table); 5.61 + StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants); 5.62 + StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants); 5.63 + 5.64 + __ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4); 5.65 + 5.66 + BLOCK_COMMENT("return"); 5.67 + __ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET). 5.68 + __ blr(); 5.69 + 5.70 + BLOCK_COMMENT("} Stub body"); 5.71 + } else 5.72 +#endif 5.73 + { 5.74 + const Register t0 = R2; 5.75 + const Register t1 = R7; 5.76 + const Register t2 = R8; 5.77 + const Register t3 = R9; 5.78 + const Register tc0 = R10; 5.79 + const Register tc1 = R11; 5.80 + const Register tc2 = R12; 5.81 + 5.82 + BLOCK_COMMENT("Stub body {"); 5.83 + assert_different_registers(crc, data, dataLen, table); 5.84 + 5.85 + StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table); 5.86 + 5.87 + __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table); 5.88 + 5.89 + BLOCK_COMMENT("return"); 5.90 + __ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET). 
5.91 + __ blr(); 5.92 + 5.93 + BLOCK_COMMENT("} Stub body"); 5.94 + } 5.95 + 5.96 return start; 5.97 } 5.98
6.1 --- a/src/cpu/ppc/vm/stubRoutines_ppc_64.cpp Mon Sep 24 17:18:38 2018 -0400 6.2 +++ b/src/cpu/ppc/vm/stubRoutines_ppc_64.cpp Thu Sep 22 12:17:24 2016 +0200 6.3 @@ -37,6 +37,311 @@ 6.4 __ load_const(table, StubRoutines::_crc_table_adr); 6.5 } 6.6 6.7 +void StubRoutines::ppc64::generate_load_crc_constants_addr(MacroAssembler* masm, Register table) { 6.8 + __ load_const_optimized(table, (address)StubRoutines::ppc64::_constants, R0); 6.9 +} 6.10 + 6.11 +void StubRoutines::ppc64::generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table) { 6.12 + __ load_const_optimized(table, (address)StubRoutines::ppc64::_barret_constants, R0); 6.13 +} 6.14 + 6.15 +juint* StubRoutines::ppc64::generate_crc_constants() { 6.16 + juint constants[CRC32_CONSTANTS_SIZE] = { 6.17 + // Reduce 262144 kbits to 1024 bits 6.18 + 0x99ea94a8UL, 0x00000000UL, 0x651797d2UL, 0x00000001UL, // x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 6.19 + 0x945a8420UL, 0x00000000UL, 0x21e0d56cUL, 0x00000000UL, // x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 6.20 + 0x30762706UL, 0x00000000UL, 0x0f95ecaaUL, 0x00000000UL, // x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 6.21 + 0xa52fc582UL, 0x00000001UL, 0xebd224acUL, 0x00000001UL, // x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 6.22 + 0xa4a7167aUL, 0x00000001UL, 0x0ccb97caUL, 0x00000000UL, // x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 6.23 + 0x0c18249aUL, 0x00000000UL, 0x006ec8a8UL, 0x00000001UL, // x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 6.24 + 0xa924ae7cUL, 0x00000000UL, 0x4f58f196UL, 0x00000001UL, // x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 6.25 + 0xe12ccc12UL, 0x00000001UL, 0xa7192ca6UL, 0x00000001UL, // x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 6.26 + 0xa0b9d4acUL, 0x00000000UL, 0x9a64bab2UL, 0x00000001UL, // x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 6.27 + 0x95e8ddfeUL, 0x00000000UL, 0x14f4ed2eUL, 0x00000000UL, // x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 6.28 + 0x233fddc4UL, 
0x00000000UL, 0x1092b6a2UL, 0x00000001UL, // x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 6.29 + 0xb4529b62UL, 0x00000001UL, 0xc8a1629cUL, 0x00000000UL, // x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 6.30 + 0xa7fa0e64UL, 0x00000001UL, 0x7bf32e8eUL, 0x00000001UL, // x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 6.31 + 0xb5334592UL, 0x00000001UL, 0xf8cc6582UL, 0x00000001UL, // x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 6.32 + 0x1f8ee1b4UL, 0x00000001UL, 0x8631ddf0UL, 0x00000000UL, // x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 6.33 + 0x6252e632UL, 0x00000000UL, 0x7e5a76d0UL, 0x00000000UL, // x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 6.34 + 0xab973e84UL, 0x00000000UL, 0x2b09b31cUL, 0x00000000UL, // x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 6.35 + 0x7734f5ecUL, 0x00000000UL, 0xb2df1f84UL, 0x00000001UL, // x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 6.36 + 0x7c547798UL, 0x00000000UL, 0xd6f56afcUL, 0x00000001UL, // x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 6.37 + 0x7ec40210UL, 0x00000000UL, 0xb9b5e70cUL, 0x00000001UL, // x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 6.38 + 0xab1695a8UL, 0x00000001UL, 0x34b626d2UL, 0x00000000UL, // x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 6.39 + 0x90494bbaUL, 0x00000000UL, 0x4c53479aUL, 0x00000001UL, // x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 6.40 + 0x123fb816UL, 0x00000001UL, 0xa6d179a4UL, 0x00000001UL, // x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 6.41 + 0xe188c74cUL, 0x00000001UL, 0x5abd16b4UL, 0x00000001UL, // x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 6.42 + 0xc2d3451cUL, 0x00000001UL, 0x018f9852UL, 0x00000000UL, // x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 6.43 + 0xf55cf1caUL, 0x00000000UL, 0x1fb3084aUL, 0x00000000UL, // x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 6.44 + 0xa0531540UL, 0x00000001UL, 0xc53dfb04UL, 0x00000000UL, // x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 6.45 + 0x32cd7ebcUL, 0x00000001UL, 0xe10c9ad6UL, 0x00000000UL, // 
x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 6.46 + 0x73ab7f36UL, 0x00000000UL, 0x25aa994aUL, 0x00000000UL, // x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 6.47 + 0x41aed1c2UL, 0x00000000UL, 0xfa3a74c4UL, 0x00000000UL, // x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 6.48 + 0x36c53800UL, 0x00000001UL, 0x33eb3f40UL, 0x00000000UL, // x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 6.49 + 0x26835a30UL, 0x00000001UL, 0x7193f296UL, 0x00000001UL, // x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 6.50 + 0x6241b502UL, 0x00000000UL, 0x43f6c86aUL, 0x00000000UL, // x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 6.51 + 0xd5196ad4UL, 0x00000000UL, 0x6b513ec6UL, 0x00000001UL, // x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 6.52 + 0x9cfa769aUL, 0x00000000UL, 0xc8f25b4eUL, 0x00000000UL, // x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 6.53 + 0x920e5df4UL, 0x00000000UL, 0xa45048ecUL, 0x00000001UL, // x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 6.54 + 0x69dc310eUL, 0x00000001UL, 0x0c441004UL, 0x00000000UL, // x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 6.55 + 0x09fc331cUL, 0x00000000UL, 0x0e17cad6UL, 0x00000000UL, // x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 6.56 + 0x0d94a81eUL, 0x00000001UL, 0x253ae964UL, 0x00000001UL, // x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 6.57 + 0x27a20ab2UL, 0x00000000UL, 0xd7c88ebcUL, 0x00000001UL, // x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 6.58 + 0x14f87504UL, 0x00000001UL, 0xe7ca913aUL, 0x00000001UL, // x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 6.59 + 0x4b076d96UL, 0x00000000UL, 0x33ed078aUL, 0x00000000UL, // x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 6.60 + 0xda4d1e74UL, 0x00000000UL, 0xe1839c78UL, 0x00000000UL, // x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 6.61 + 0x1b81f672UL, 0x00000000UL, 0x322b267eUL, 0x00000001UL, // x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 6.62 + 0x9367c988UL, 0x00000000UL, 0x638231b6UL, 0x00000000UL, // x^216064 mod p(x)` << 1, x^216128 mod p(x)` 
<< 1 6.63 + 0x717214caUL, 0x00000001UL, 0xee7f16f4UL, 0x00000001UL, // x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 6.64 + 0x9f47d820UL, 0x00000000UL, 0x17d9924aUL, 0x00000001UL, // x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 6.65 + 0x0d9a47d2UL, 0x00000001UL, 0xe1a9e0c4UL, 0x00000000UL, // x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 6.66 + 0xa696c58cUL, 0x00000000UL, 0x403731dcUL, 0x00000001UL, // x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 6.67 + 0x2aa28ec6UL, 0x00000000UL, 0xa5ea9682UL, 0x00000001UL, // x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 6.68 + 0xfe18fd9aUL, 0x00000001UL, 0x01c5c578UL, 0x00000001UL, // x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 6.69 + 0x9d4fc1aeUL, 0x00000001UL, 0xdddf6494UL, 0x00000000UL, // x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 6.70 + 0xba0e3deaUL, 0x00000001UL, 0xf1c3db28UL, 0x00000000UL, // x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 6.71 + 0x74b59a5eUL, 0x00000000UL, 0x3112fb9cUL, 0x00000001UL, // x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 6.72 + 0xf2b5ea98UL, 0x00000000UL, 0xb680b906UL, 0x00000000UL, // x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 6.73 + 0x87132676UL, 0x00000001UL, 0x1a282932UL, 0x00000000UL, // x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 6.74 + 0x0a8c6ad4UL, 0x00000001UL, 0x89406e7eUL, 0x00000000UL, // x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 6.75 + 0xe21dfe70UL, 0x00000001UL, 0xdef6be8cUL, 0x00000001UL, // x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 6.76 + 0xda0050e4UL, 0x00000001UL, 0x75258728UL, 0x00000000UL, // x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 6.77 + 0x772172aeUL, 0x00000000UL, 0x9536090aUL, 0x00000001UL, // x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 6.78 + 0xe47724aaUL, 0x00000000UL, 0xf2455bfcUL, 0x00000000UL, // x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 6.79 + 0x3cd63ac4UL, 0x00000000UL, 0x8c40baf4UL, 0x00000001UL, // x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 6.80 + 0xbf47d352UL, 0x00000001UL, 
0x4cd390d4UL, 0x00000000UL, // x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 6.81 + 0x8dc1d708UL, 0x00000001UL, 0xe4ece95aUL, 0x00000001UL, // x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 6.82 + 0x2d4620a4UL, 0x00000000UL, 0x1a3ee918UL, 0x00000000UL, // x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 6.83 + 0x58fd1740UL, 0x00000000UL, 0x7c652fb8UL, 0x00000000UL, // x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 6.84 + 0xdadd9bfcUL, 0x00000000UL, 0x1c67842cUL, 0x00000001UL, // x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 6.85 + 0xea2140beUL, 0x00000001UL, 0x254f759cUL, 0x00000000UL, // x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 6.86 + 0x9de128baUL, 0x00000000UL, 0x7ece94caUL, 0x00000000UL, // x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 6.87 + 0x3ac3aa8eUL, 0x00000001UL, 0x38f258c2UL, 0x00000000UL, // x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 6.88 + 0x99980562UL, 0x00000000UL, 0xcdf17b00UL, 0x00000001UL, // x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 6.89 + 0xc1579c86UL, 0x00000001UL, 0x1f882c16UL, 0x00000001UL, // x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 6.90 + 0x68dbbf94UL, 0x00000000UL, 0x00093fc8UL, 0x00000001UL, // x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 6.91 + 0x4509fb04UL, 0x00000000UL, 0xcd684f16UL, 0x00000001UL, // x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 6.92 + 0x202f6398UL, 0x00000001UL, 0x4bc6a70aUL, 0x00000000UL, // x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 6.93 + 0x3aea243eUL, 0x00000001UL, 0x4fc7e8e4UL, 0x00000000UL, // x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 6.94 + 0xb4052ae6UL, 0x00000001UL, 0x30103f1cUL, 0x00000001UL, // x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 6.95 + 0xcd2a0ae8UL, 0x00000001UL, 0x11b0024cUL, 0x00000001UL, // x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 6.96 + 0xfe4aa8b4UL, 0x00000001UL, 0x0b3079daUL, 0x00000001UL, // x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 6.97 + 0xd1559a42UL, 0x00000001UL, 0x0192bcc2UL, 0x00000001UL, // x^180224 mod 
p(x)` << 1, x^180288 mod p(x)` << 1 6.98 + 0xf3e05eccUL, 0x00000001UL, 0x74838d50UL, 0x00000000UL, // x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 6.99 + 0x04ddd2ccUL, 0x00000001UL, 0x1b20f520UL, 0x00000000UL, // x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 6.100 + 0x5393153cUL, 0x00000001UL, 0x50c3590aUL, 0x00000000UL, // x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 6.101 + 0x57e942c6UL, 0x00000000UL, 0xb41cac8eUL, 0x00000000UL, // x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 6.102 + 0x2c633850UL, 0x00000001UL, 0x0c72cc78UL, 0x00000000UL, // x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 6.103 + 0xebcaae4cUL, 0x00000000UL, 0x30cdb032UL, 0x00000000UL, // x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 6.104 + 0x3ee532a6UL, 0x00000001UL, 0x3e09fc32UL, 0x00000001UL, // x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 6.105 + 0xbf0cbc7eUL, 0x00000001UL, 0x1ed624d2UL, 0x00000000UL, // x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 6.106 + 0xd50b7a5aUL, 0x00000000UL, 0x781aee1aUL, 0x00000000UL, // x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 6.107 + 0x02fca6e8UL, 0x00000000UL, 0xc4d8348cUL, 0x00000001UL, // x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 6.108 + 0x7af40044UL, 0x00000000UL, 0x57a40336UL, 0x00000000UL, // x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 6.109 + 0x16178744UL, 0x00000000UL, 0x85544940UL, 0x00000000UL, // x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 6.110 + 0x4c177458UL, 0x00000001UL, 0x9cd21e80UL, 0x00000001UL, // x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 6.111 + 0x1b6ddf04UL, 0x00000001UL, 0x3eb95bc0UL, 0x00000001UL, // x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 6.112 + 0xf3e29cccUL, 0x00000001UL, 0xdfc9fdfcUL, 0x00000001UL, // x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 6.113 + 0x35ae7562UL, 0x00000001UL, 0xcd028bc2UL, 0x00000000UL, // x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 6.114 + 0x90ef812cUL, 0x00000001UL, 0x90db8c44UL, 0x00000000UL, // x^162816 mod p(x)` << 1, x^162880 mod 
p(x)` << 1 6.115 + 0x67a2c786UL, 0x00000000UL, 0x0010a4ceUL, 0x00000001UL, // x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 6.116 + 0x48b9496cUL, 0x00000000UL, 0xc8f4c72cUL, 0x00000001UL, // x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 6.117 + 0x5a422de6UL, 0x00000001UL, 0x1c26170cUL, 0x00000000UL, // x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 6.118 + 0xef0e3640UL, 0x00000001UL, 0xe3fccf68UL, 0x00000000UL, // x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 6.119 + 0x006d2d26UL, 0x00000001UL, 0xd513ed24UL, 0x00000000UL, // x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 6.120 + 0x170d56d6UL, 0x00000001UL, 0x141beadaUL, 0x00000000UL, // x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 6.121 + 0xa5fb613cUL, 0x00000000UL, 0x1071aea0UL, 0x00000001UL, // x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 6.122 + 0x40bbf7fcUL, 0x00000000UL, 0x2e19080aUL, 0x00000001UL, // x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 6.123 + 0x6ac3a5b2UL, 0x00000001UL, 0x00ecf826UL, 0x00000001UL, // x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 6.124 + 0xabf16230UL, 0x00000000UL, 0x69b09412UL, 0x00000000UL, // x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 6.125 + 0xebe23facUL, 0x00000001UL, 0x22297bacUL, 0x00000001UL, // x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 6.126 + 0x8b6a0894UL, 0x00000000UL, 0xe9e4b068UL, 0x00000000UL, // x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 6.127 + 0x288ea478UL, 0x00000001UL, 0x4b38651aUL, 0x00000000UL, // x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 6.128 + 0x6619c442UL, 0x00000001UL, 0x468360e2UL, 0x00000001UL, // x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 6.129 + 0x86230038UL, 0x00000000UL, 0x121c2408UL, 0x00000000UL, // x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 6.130 + 0x7746a756UL, 0x00000001UL, 0xda7e7d08UL, 0x00000000UL, // x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 6.131 + 0x91b8f8f8UL, 0x00000001UL, 0x058d7652UL, 0x00000001UL, // x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 6.132 + 
0x8e167708UL, 0x00000000UL, 0x4a098a90UL, 0x00000001UL, // x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 6.133 + 0x48b22d54UL, 0x00000001UL, 0x20dbe72eUL, 0x00000000UL, // x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 6.134 + 0x44ba2c3cUL, 0x00000000UL, 0x1e7323e8UL, 0x00000001UL, // x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 6.135 + 0xb54d2b52UL, 0x00000000UL, 0xd5d4bf94UL, 0x00000000UL, // x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 6.136 + 0x05a4fd8aUL, 0x00000000UL, 0x99d8746cUL, 0x00000001UL, // x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 6.137 + 0x39f9fc46UL, 0x00000001UL, 0xce9ca8a0UL, 0x00000000UL, // x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 6.138 + 0x5a1fa824UL, 0x00000001UL, 0x136edeceUL, 0x00000000UL, // x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 6.139 + 0x0a61ae4cUL, 0x00000000UL, 0x9b92a068UL, 0x00000001UL, // x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 6.140 + 0x45e9113eUL, 0x00000001UL, 0x71d62206UL, 0x00000000UL, // x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 6.141 + 0x6a348448UL, 0x00000000UL, 0xdfc50158UL, 0x00000000UL, // x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 6.142 + 0x4d80a08cUL, 0x00000000UL, 0x517626bcUL, 0x00000001UL, // x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 6.143 + 0x4b6837a0UL, 0x00000001UL, 0x48d1e4faUL, 0x00000001UL, // x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 6.144 + 0x6896a7fcUL, 0x00000001UL, 0x94d8266eUL, 0x00000000UL, // x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 6.145 + 0x4f187140UL, 0x00000001UL, 0x606c5e34UL, 0x00000000UL, // x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 6.146 + 0x9581b9daUL, 0x00000001UL, 0x9766beaaUL, 0x00000001UL, // x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 6.147 + 0x091bc984UL, 0x00000001UL, 0xd80c506cUL, 0x00000001UL, // x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 6.148 + 0x1067223cUL, 0x00000000UL, 0x1e73837cUL, 0x00000000UL, // x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 6.149 + 0xab16ea02UL, 0x00000001UL, 
0x64d587deUL, 0x00000000UL, // x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 6.150 + 0x3c4598a8UL, 0x00000001UL, 0xf4a507b0UL, 0x00000000UL, // x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 6.151 + 0xb3735430UL, 0x00000000UL, 0x40e342fcUL, 0x00000000UL, // x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 6.152 + 0xbb3fc0c0UL, 0x00000001UL, 0xd5ad9c3aUL, 0x00000001UL, // x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 6.153 + 0x570ae19cUL, 0x00000001UL, 0x94a691a4UL, 0x00000000UL, // x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 6.154 + 0xea910712UL, 0x00000001UL, 0x271ecdfaUL, 0x00000001UL, // x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 6.155 + 0x67127128UL, 0x00000001UL, 0x9e54475aUL, 0x00000000UL, // x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 6.156 + 0x19e790a2UL, 0x00000000UL, 0xc9c099eeUL, 0x00000000UL, // x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 6.157 + 0x3788f710UL, 0x00000000UL, 0x9a2f736cUL, 0x00000000UL, // x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 6.158 + 0x682a160eUL, 0x00000001UL, 0xbb9f4996UL, 0x00000000UL, // x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 6.159 + 0x7f0ebd2eUL, 0x00000000UL, 0xdb688050UL, 0x00000001UL, // x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 6.160 + 0x2b032080UL, 0x00000000UL, 0xe9b10af4UL, 0x00000000UL, // x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 6.161 + 0xcfd1664aUL, 0x00000000UL, 0x2d4545e4UL, 0x00000001UL, // x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 6.162 + 0xaa1181c2UL, 0x00000000UL, 0x0361139cUL, 0x00000000UL, // x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 6.163 + 0xddd08002UL, 0x00000000UL, 0xa5a1a3a8UL, 0x00000001UL, // x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 6.164 + 0xe8dd0446UL, 0x00000000UL, 0x6844e0b0UL, 0x00000000UL, // x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 6.165 + 0xbbd94a00UL, 0x00000001UL, 0xc3762f28UL, 0x00000000UL, // x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 6.166 + 0xab6cd180UL, 0x00000000UL, 0xd26287a2UL, 0x00000001UL, 
// x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 6.167 + 0x31803ce2UL, 0x00000000UL, 0xf6f0bba8UL, 0x00000001UL, // x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 6.168 + 0x24f40b0cUL, 0x00000000UL, 0x2ffabd62UL, 0x00000000UL, // x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 6.169 + 0xba1d9834UL, 0x00000001UL, 0xfb4516b8UL, 0x00000000UL, // x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 6.170 + 0x04de61aaUL, 0x00000001UL, 0x8cfa961cUL, 0x00000001UL, // x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 6.171 + 0x13e40d46UL, 0x00000001UL, 0x9e588d52UL, 0x00000001UL, // x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 6.172 + 0x415598a0UL, 0x00000001UL, 0x180f0bbcUL, 0x00000001UL, // x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 6.173 + 0xbf6c8c90UL, 0x00000000UL, 0xe1d9177aUL, 0x00000000UL, // x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 6.174 + 0x788b0504UL, 0x00000001UL, 0x05abc27cUL, 0x00000001UL, // x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 6.175 + 0x38385d02UL, 0x00000000UL, 0x972e4a58UL, 0x00000000UL, // x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 6.176 + 0xb6c83844UL, 0x00000001UL, 0x83499a5eUL, 0x00000001UL, // x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 6.177 + 0x51061a8aUL, 0x00000000UL, 0xc96a8ccaUL, 0x00000001UL, // x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 6.178 + 0x7351388aUL, 0x00000001UL, 0xa1a5b60cUL, 0x00000001UL, // x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 6.179 + 0x32928f92UL, 0x00000001UL, 0xe4b6ac9cUL, 0x00000000UL, // x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 6.180 + 0xe6b4f48aUL, 0x00000000UL, 0x807e7f5aUL, 0x00000001UL, // x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 6.181 + 0x39d15e90UL, 0x00000000UL, 0x7a7e3bc8UL, 0x00000001UL, // x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 6.182 + 0x312d6074UL, 0x00000000UL, 0xd73975daUL, 0x00000000UL, // x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 6.183 + 0x7bbb2cc4UL, 0x00000001UL, 0x7375d038UL, 0x00000001UL, // x^92160 mod p(x)` << 1, x^92224 mod 
p(x)` << 1 6.184 + 0x6ded3e18UL, 0x00000001UL, 0x193680bcUL, 0x00000000UL, // x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 6.185 + 0xf1638b16UL, 0x00000000UL, 0x999b06f6UL, 0x00000000UL, // x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 6.186 + 0xd38b9eccUL, 0x00000001UL, 0xf685d2b8UL, 0x00000001UL, // x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 6.187 + 0x8b8d09dcUL, 0x00000001UL, 0xf4ecbed2UL, 0x00000001UL, // x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 6.188 + 0xe7bc27d2UL, 0x00000000UL, 0xba16f1a0UL, 0x00000000UL, // x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 6.189 + 0x275e1e96UL, 0x00000000UL, 0x15aceac4UL, 0x00000001UL, // x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 6.190 + 0xe2e3031eUL, 0x00000000UL, 0xaeff6292UL, 0x00000001UL, // x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 6.191 + 0x041c84d8UL, 0x00000001UL, 0x9640124cUL, 0x00000000UL, // x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 6.192 + 0x706ce672UL, 0x00000000UL, 0x14f41f02UL, 0x00000001UL, // x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 6.193 + 0x5d5070daUL, 0x00000001UL, 0x9c5f3586UL, 0x00000000UL, // x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 6.194 + 0x38f9493aUL, 0x00000000UL, 0x878275faUL, 0x00000001UL, // x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 6.195 + 0xa3348a76UL, 0x00000000UL, 0xddc42ce8UL, 0x00000000UL, // x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 6.196 + 0xad0aab92UL, 0x00000001UL, 0x81d2c73aUL, 0x00000001UL, // x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 6.197 + 0x9e85f712UL, 0x00000001UL, 0x41c9320aUL, 0x00000001UL, // x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 6.198 + 0x5a871e76UL, 0x00000000UL, 0x5235719aUL, 0x00000001UL, // x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 6.199 + 0x7249c662UL, 0x00000001UL, 0xbe27d804UL, 0x00000000UL, // x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 6.200 + 0x3a084712UL, 0x00000000UL, 0x6242d45aUL, 0x00000000UL, // x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 6.201 + 0xed438478UL, 0x00000000UL, 0x9a53638eUL, 
0x00000000UL, // x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 6.202 + 0xabac34ccUL, 0x00000000UL, 0x001ecfb6UL, 0x00000001UL, // x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 6.203 + 0x5f35ef3eUL, 0x00000000UL, 0x6d7c2d64UL, 0x00000001UL, // x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 6.204 + 0x47d6608cUL, 0x00000000UL, 0xd0ce46c0UL, 0x00000001UL, // x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 6.205 + 0x2d01470eUL, 0x00000000UL, 0x24c907b4UL, 0x00000001UL, // x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 6.206 + 0x58bbc7b0UL, 0x00000001UL, 0x18a555caUL, 0x00000000UL, // x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 6.207 + 0xc0a23e8eUL, 0x00000000UL, 0x6b0980bcUL, 0x00000000UL, // x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 6.208 + 0xebd85c88UL, 0x00000001UL, 0x8bbba964UL, 0x00000000UL, // x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 6.209 + 0x9ee20bb2UL, 0x00000001UL, 0x070a5a1eUL, 0x00000001UL, // x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 6.210 + 0xacabf2d6UL, 0x00000001UL, 0x2204322aUL, 0x00000000UL, // x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 6.211 + 0xb7963d56UL, 0x00000001UL, 0xa27524d0UL, 0x00000000UL, // x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 6.212 + 0x7bffa1feUL, 0x00000001UL, 0x20b1e4baUL, 0x00000000UL, // x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 6.213 + 0x1f15333eUL, 0x00000000UL, 0x32cc27fcUL, 0x00000000UL, // x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 6.214 + 0x8593129eUL, 0x00000001UL, 0x44dd22b8UL, 0x00000000UL, // x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 6.215 + 0x9cb32602UL, 0x00000001UL, 0xdffc9e0aUL, 0x00000000UL, // x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 6.216 + 0x42b05cc8UL, 0x00000001UL, 0xb7a0ed14UL, 0x00000001UL, // x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 6.217 + 0xbe49e7a4UL, 0x00000001UL, 0xc7842488UL, 0x00000000UL, // x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 6.218 + 0x08f69d6cUL, 0x00000001UL, 0xc02a4feeUL, 0x00000001UL, // x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 
1 6.219 + 0x6c0971f0UL, 0x00000000UL, 0x3c273778UL, 0x00000000UL, // x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 6.220 + 0x5b16467aUL, 0x00000000UL, 0xd63f8894UL, 0x00000001UL, // x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 6.221 + 0x551a628eUL, 0x00000001UL, 0x6be557d6UL, 0x00000000UL, // x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 6.222 + 0x9e42ea92UL, 0x00000001UL, 0x6a7806eaUL, 0x00000000UL, // x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 6.223 + 0x2fa83ff2UL, 0x00000001UL, 0x6155aa0cUL, 0x00000001UL, // x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 6.224 + 0x1ca9cde0UL, 0x00000001UL, 0x908650acUL, 0x00000000UL, // x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 6.225 + 0xc8e5cd74UL, 0x00000000UL, 0xaa5a8084UL, 0x00000000UL, // x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 6.226 + 0x96c27f0cUL, 0x00000000UL, 0x91bb500aUL, 0x00000001UL, // x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 6.227 + 0x2baed926UL, 0x00000000UL, 0x64e9bed0UL, 0x00000000UL, // x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 6.228 + 0x7c8de8d2UL, 0x00000001UL, 0x9444f302UL, 0x00000000UL, // x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 6.229 + 0xd43d6068UL, 0x00000000UL, 0x9db07d3cUL, 0x00000001UL, // x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 6.230 + 0xcb2c4b26UL, 0x00000000UL, 0x359e3e6eUL, 0x00000001UL, // x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 6.231 + 0x45b8da26UL, 0x00000001UL, 0xe4f10dd2UL, 0x00000001UL, // x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 6.232 + 0x8fff4b08UL, 0x00000001UL, 0x24f5735eUL, 0x00000001UL, // x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 6.233 + 0x50b58ed0UL, 0x00000001UL, 0x24760a4cUL, 0x00000001UL, // x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 6.234 + 0x549f39bcUL, 0x00000001UL, 0x0f1fc186UL, 0x00000000UL, // x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 6.235 + 0xef4d2f42UL, 0x00000000UL, 0x150e4cc4UL, 0x00000000UL, // x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 6.236 + 0xb1468572UL, 0x00000001UL, 0x2a6204e8UL, 
0x00000000UL, // x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 6.237 + 0x3d7403b2UL, 0x00000001UL, 0xbeb1d432UL, 0x00000000UL, // x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 6.238 + 0xa4681842UL, 0x00000001UL, 0x35f3f1f0UL, 0x00000001UL, // x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 6.239 + 0x67714492UL, 0x00000001UL, 0x74fe2232UL, 0x00000000UL, // x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 6.240 + 0xe599099aUL, 0x00000001UL, 0x1ac6e2baUL, 0x00000000UL, // x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 6.241 + 0xfe128194UL, 0x00000000UL, 0x13fca91eUL, 0x00000000UL, // x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 6.242 + 0x77e8b990UL, 0x00000000UL, 0x83f4931eUL, 0x00000001UL, // x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 6.243 + 0xa267f63aUL, 0x00000001UL, 0xb6d9b4e4UL, 0x00000000UL, // x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 6.244 + 0x945c245aUL, 0x00000001UL, 0xb5188656UL, 0x00000000UL, // x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 6.245 + 0x49002e76UL, 0x00000001UL, 0x27a81a84UL, 0x00000000UL, // x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 6.246 + 0xbb8310a4UL, 0x00000001UL, 0x25699258UL, 0x00000001UL, // x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 6.247 + 0x9ec60bccUL, 0x00000001UL, 0xb23de796UL, 0x00000001UL, // x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 6.248 + 0x2d8590aeUL, 0x00000001UL, 0xfe4365dcUL, 0x00000000UL, // x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 6.249 + 0x65b00684UL, 0x00000000UL, 0xc68f497aUL, 0x00000000UL, // x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 6.250 + 0x5e5aeadcUL, 0x00000001UL, 0xfbf521eeUL, 0x00000000UL, // x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 6.251 + 0xb77ff2b0UL, 0x00000000UL, 0x5eac3378UL, 0x00000001UL, // x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 6.252 + 0x88da2ff6UL, 0x00000001UL, 0x34914b90UL, 0x00000001UL, // x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 6.253 + 0x63da929aUL, 0x00000000UL, 0x16335cfeUL, 0x00000000UL, // x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 
1 6.254 + 0x389caa80UL, 0x00000001UL, 0x0372d10cUL, 0x00000001UL, // x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 6.255 + 0x3db599d2UL, 0x00000001UL, 0x5097b908UL, 0x00000001UL, // x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 6.256 + 0x22505a86UL, 0x00000001UL, 0x227a7572UL, 0x00000001UL, // x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 6.257 + 0x6bd72746UL, 0x00000001UL, 0x9a8f75c0UL, 0x00000000UL, // x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 6.258 + 0xc3faf1d4UL, 0x00000001UL, 0x682c77a2UL, 0x00000000UL, // x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 6.259 + 0x111c826cUL, 0x00000001UL, 0x231f091cUL, 0x00000000UL, // x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 6.260 + 0x153e9fb2UL, 0x00000000UL, 0x7d4439f2UL, 0x00000000UL, // x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 6.261 + 0x2b1f7b60UL, 0x00000000UL, 0x7e221efcUL, 0x00000001UL, // x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 6.262 + 0xb1dba570UL, 0x00000000UL, 0x67457c38UL, 0x00000001UL, // x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 6.263 + 0xf6397b76UL, 0x00000001UL, 0xbdf081c4UL, 0x00000000UL, // x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 6.264 + 0x56335214UL, 0x00000001UL, 0x6286d6b0UL, 0x00000001UL, // x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 6.265 + 0xd70e3986UL, 0x00000001UL, 0xc84f001cUL, 0x00000000UL, // x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 6.266 + 0x3701a774UL, 0x00000000UL, 0x64efe7c0UL, 0x00000000UL, // x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 6.267 + 0xac81ef72UL, 0x00000000UL, 0x0ac2d904UL, 0x00000000UL, // x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 6.268 + 0x33212464UL, 0x00000001UL, 0xfd226d14UL, 0x00000000UL, // x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 6.269 + 0xe4e45610UL, 0x00000000UL, 0x1cfd42e0UL, 0x00000001UL, // x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 6.270 + 0x0c1bd370UL, 0x00000000UL, 0x6e5a5678UL, 0x00000001UL, // x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 6.271 + 0xa7b9e7a6UL, 0x00000001UL, 0xd888fe22UL, 0x00000001UL, // x^2048 
mod p(x)` << 1, x^2112 mod p(x)` << 1 6.272 + 0x7d657a10UL, 0x00000000UL, 0xaf77fcd4UL, 0x00000001UL, // x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 6.273 + 6.274 + // Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros 6.275 + 0xec447f11UL, 0x99168a18UL, 0x13e8221eUL, 0xed837b26UL, // x^2048 mod p(x)`, x^2016 mod p(x)`, x^1984 mod p(x)`, x^1952 mod p(x)` 6.276 + 0x8fd2cd3cUL, 0xe23e954eUL, 0x47b9ce5aUL, 0xc8acdd81UL, // x^1920 mod p(x)`, x^1888 mod p(x)`, x^1856 mod p(x)`, x^1824 mod p(x)` 6.277 + 0x6b1d2b53UL, 0x92f8befeUL, 0xd4277e25UL, 0xd9ad6d87UL, // x^1792 mod p(x)`, x^1760 mod p(x)`, x^1728 mod p(x)`, x^1696 mod p(x)` 6.278 + 0x291ea462UL, 0xf38a3556UL, 0x33fbca3bUL, 0xc10ec5e0UL, // x^1664 mod p(x)`, x^1632 mod p(x)`, x^1600 mod p(x)`, x^1568 mod p(x)` 6.279 + 0x62b6ca4bUL, 0x974ac562UL, 0x82e02e2fUL, 0xc0b55b0eUL, // x^1536 mod p(x)`, x^1504 mod p(x)`, x^1472 mod p(x)`, x^1440 mod p(x)` 6.280 + 0x784d2a56UL, 0x855712b3UL, 0xe172334dUL, 0x71aa1df0UL, // x^1408 mod p(x)`, x^1376 mod p(x)`, x^1344 mod p(x)`, x^1312 mod p(x)` 6.281 + 0x0eaee722UL, 0xa5abe9f8UL, 0x3969324dUL, 0xfee3053eUL, // x^1280 mod p(x)`, x^1248 mod p(x)`, x^1216 mod p(x)`, x^1184 mod p(x)` 6.282 + 0xdb54814cUL, 0x1fa0943dUL, 0x3eb2bd08UL, 0xf44779b9UL, // x^1152 mod p(x)`, x^1120 mod p(x)`, x^1088 mod p(x)`, x^1056 mod p(x)` 6.283 + 0xd7bbfe6aUL, 0xa53ff440UL, 0x00cc3374UL, 0xf5449b3fUL, // x^1024 mod p(x)`, x^992 mod p(x)`, x^960 mod p(x)`, x^928 mod p(x)` 6.284 + 0x6325605cUL, 0xebe7e356UL, 0xd777606eUL, 0x6f8346e1UL, // x^896 mod p(x)`, x^864 mod p(x)`, x^832 mod p(x)`, x^800 mod p(x)` 6.285 + 0xe5b592b8UL, 0xc65a272cUL, 0xc0b95347UL, 0xe3ab4f2aUL, // x^768 mod p(x)`, x^736 mod p(x)`, x^704 mod p(x)`, x^672 mod p(x)` 6.286 + 0x4721589fUL, 0x5705a9caUL, 0x329ecc11UL, 0xaa2215eaUL, // x^640 mod p(x)`, x^608 mod p(x)`, x^576 mod p(x)`, x^544 mod p(x)` 6.287 + 0x88d14467UL, 0xe3720acbUL, 0xd95efd26UL, 0x1ed8f66eUL, // x^512 mod p(x)`, x^480 
mod p(x)`, x^448 mod p(x)`, x^416 mod p(x)` 6.288 + 0x15141c31UL, 0xba1aca03UL, 0xa700e96aUL, 0x78ed02d5UL, // x^384 mod p(x)`, x^352 mod p(x)`, x^320 mod p(x)`, x^288 mod p(x)` 6.289 + 0xed627daeUL, 0xad2a31b3UL, 0x32b39da3UL, 0xba8ccbe8UL, // x^256 mod p(x)`, x^224 mod p(x)`, x^192 mod p(x)`, x^160 mod p(x)` 6.290 + 0xa06a2517UL, 0x6655004fUL, 0xb1e6b092UL, 0xedb88320UL // x^128 mod p(x)`, x^96 mod p(x)`, x^64 mod p(x)`, x^32 mod p(x)` 6.291 + }; 6.292 + 6.293 + juint* ptr = (juint*) malloc(sizeof(juint) * CRC32_CONSTANTS_SIZE); 6.294 + guarantee(ptr != NULL, "allocation error of a crc table"); 6.295 + guarantee(((intptr_t)ptr & 0xF) == 0, "16-byte alignment needed"); 6.296 + memcpy((void*)ptr, constants, sizeof(juint) * CRC32_CONSTANTS_SIZE); 6.297 + return ptr; 6.298 +} 6.299 + 6.300 +juint* StubRoutines::ppc64::generate_crc_barret_constants() { 6.301 + juint barret_constants[CRC32_BARRET_CONSTANTS] = { 6.302 + 0xf7011641UL, 0x00000001UL, 0x00000000UL, 0x00000000UL, 6.303 + 0xdb710641UL, 0x00000001UL, 0x00000000UL, 0x00000000UL 6.304 + }; 6.305 + juint* ptr = (juint*) malloc(sizeof(juint) * CRC32_BARRET_CONSTANTS); 6.306 + guarantee(ptr != NULL, "allocation error of a crc table"); 6.307 + guarantee(((intptr_t)ptr & 0xF) == 0, "16-byte alignment needed"); 6.308 + memcpy((void*) ptr, barret_constants, sizeof(juint) * CRC32_BARRET_CONSTANTS); 6.309 + return ptr; 6.310 +} 6.311 + 6.312 // CRC32 Intrinsics. 6.313 /** 6.314 * crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.h 6.315 @@ -477,3 +782,7 @@ 6.316 #endif 6.317 } 6.318 }; 6.319 + 6.320 +juint* StubRoutines::ppc64::_constants = StubRoutines::ppc64::generate_crc_constants(); 6.321 + 6.322 +juint* StubRoutines::ppc64::_barret_constants = StubRoutines::ppc64::generate_crc_barret_constants();
7.1 --- a/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp Mon Sep 24 17:18:38 2018 -0400 7.2 +++ b/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp Thu Sep 22 12:17:24 2016 +0200 7.3 @@ -45,6 +45,8 @@ 7.4 #else 7.5 #define CRC32_TABLES 1 7.6 #endif 7.7 +#define CRC32_CONSTANTS_SIZE 1084 7.8 +#define CRC32_BARRET_CONSTANTS 10 7.9 7.10 class ppc64 { 7.11 friend class StubGenerator; 7.12 @@ -53,11 +55,17 @@ 7.13 7.14 // CRC32 Intrinsics. 7.15 static juint _crc_table[CRC32_TABLES][CRC32_COLUMN_SIZE]; 7.16 + static juint* _constants; 7.17 + static juint* _barret_constants; 7.18 7.19 public: 7.20 7.21 // CRC32 Intrinsics. 7.22 static void generate_load_crc_table_addr(MacroAssembler* masm, Register table); 7.23 + static void generate_load_crc_constants_addr(MacroAssembler* masm, Register table); 7.24 + static void generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table); 7.25 + static juint* generate_crc_constants(); 7.26 + static juint* generate_crc_barret_constants(); 7.27 7.28 }; 7.29
8.1 --- a/src/cpu/ppc/vm/vm_version_ppc.cpp Mon Sep 24 17:18:38 2018 -0400 8.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp Thu Sep 22 12:17:24 2016 +0200 8.3 @@ -102,7 +102,7 @@ 8.4 // Create and print feature-string. 8.5 char buf[(num_features+1) * 16]; // Max 16 chars per feature. 8.6 jio_snprintf(buf, sizeof(buf), 8.7 - "ppc64%s%s%s%s%s%s%s%s%s", 8.8 + "ppc64%s%s%s%s%s%s%s%s%s%s", 8.9 (has_fsqrt() ? " fsqrt" : ""), 8.10 (has_isel() ? " isel" : ""), 8.11 (has_lxarxeh() ? " lxarxeh" : ""), 8.12 @@ -112,7 +112,8 @@ 8.13 (has_popcntw() ? " popcntw" : ""), 8.14 (has_fcfids() ? " fcfids" : ""), 8.15 (has_vand() ? " vand" : ""), 8.16 - (has_vcipher() ? " aes" : "") 8.17 + (has_vcipher() ? " aes" : ""), 8.18 + (has_vpmsumb() ? " vpmsumb" : "") 8.19 // Make sure number of %s matches num_features! 8.20 ); 8.21 _features_str = strdup(buf); 8.22 @@ -485,6 +486,7 @@ 8.23 a->fcfids(F3, F4); // code[8] -> fcfids 8.24 a->vand(VR0, VR0, VR0); // code[9] -> vand 8.25 a->vcipher(VR0, VR1, VR2); // code[10] -> vcipher 8.26 + a->vpmsumb(VR0, VR1, VR2); // code[11] -> vpmsumb 8.27 a->blr(); 8.28 8.29 // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it. 8.30 @@ -529,6 +531,7 @@ 8.31 if (code[feature_cntr++]) features |= fcfids_m; 8.32 if (code[feature_cntr++]) features |= vand_m; 8.33 if (code[feature_cntr++]) features |= vcipher_m; 8.34 + if (code[feature_cntr++]) features |= vpmsumb_m; 8.35 8.36 // Print the detection code. 8.37 if (PrintAssembly) {
9.1 --- a/src/cpu/ppc/vm/vm_version_ppc.hpp Mon Sep 24 17:18:38 2018 -0400 9.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.hpp Thu Sep 22 12:17:24 2016 +0200 9.3 @@ -43,6 +43,7 @@ 9.4 vand, 9.5 dcba, 9.6 vcipher, 9.7 + vpmsumb, 9.8 num_features // last entry to count features 9.9 }; 9.10 enum Feature_Flag_Set { 9.11 @@ -58,6 +59,7 @@ 9.12 vand_m = (1 << vand ), 9.13 dcba_m = (1 << dcba ), 9.14 vcipher_m = (1 << vcipher), 9.15 + vpmsumb_m = (1 << vpmsumb), 9.16 all_features_m = -1 9.17 }; 9.18 static int _features; 9.19 @@ -86,6 +88,7 @@ 9.20 static bool has_vand() { return (_features & vand_m) != 0; } 9.21 static bool has_dcba() { return (_features & dcba_m) != 0; } 9.22 static bool has_vcipher() { return (_features & vcipher_m) != 0; } 9.23 + static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; } 9.24 9.25 static const char* cpu_features() { return _features_str; } 9.26