8164920: ppc: enhancement of CRC32 intrinsic

Thu, 22 Sep 2016 12:17:24 +0200

author
mdoerr
date
Thu, 22 Sep 2016 12:17:24 +0200
changeset 9497
f892c3b6b651
parent 9496
bcccbecdde63
child 9498
7a436f5d4b0c

8164920: ppc: enhancement of CRC32 intrinsic
Reviewed-by: goetz, mdoerr
Contributed-by: Hiroshi H Horii <horii@jp.ibm.com>

src/cpu/ppc/vm/assembler_ppc.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/assembler_ppc.inline.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/macroAssembler_ppc.cpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/macroAssembler_ppc.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/stubGenerator_ppc.cpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/stubRoutines_ppc_64.cpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/stubRoutines_ppc_64.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/vm_version_ppc.cpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/vm_version_ppc.hpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/cpu/ppc/vm/assembler_ppc.hpp	Mon Sep 24 17:18:38 2018 -0400
     1.2 +++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Thu Sep 22 12:17:24 2016 +0200
     1.3 @@ -468,6 +468,10 @@
     1.4      LVSL_OPCODE    = (31u << OPCODE_SHIFT |    6u << 1),
     1.5      LVSR_OPCODE    = (31u << OPCODE_SHIFT |   38u << 1),
     1.6  
     1.7 +    // Vector-Scalar (VSX) instruction support.
     1.8 +    MTVSRD_OPCODE  = (31u << OPCODE_SHIFT |  179u << 1),
     1.9 +    MFVSRD_OPCODE  = (31u << OPCODE_SHIFT |   51u << 1),
    1.10 +
    1.11      // Vector Permute and Formatting
    1.12      VPKPX_OPCODE   = (4u  << OPCODE_SHIFT |  782u     ),
    1.13      VPKSHSS_OPCODE = (4u  << OPCODE_SHIFT |  398u     ),
    1.14 @@ -1938,6 +1942,10 @@
    1.15    inline void mtvscr(   VectorRegister b);
    1.16    inline void mfvscr(   VectorRegister d);
    1.17  
    1.18 +  // Vector-Scalar (VSX) instructions.
    1.19 +  inline void mtvrd(    VectorRegister  d, Register a);
    1.20 +  inline void mfvrd(    Register        a, VectorRegister d);
    1.21 +
    1.22    // AES (introduced with Power 8)
    1.23    inline void vcipher(     VectorRegister d, VectorRegister a, VectorRegister b);
    1.24    inline void vcipherlast( VectorRegister d, VectorRegister a, VectorRegister b);
     2.1 --- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Mon Sep 24 17:18:38 2018 -0400
     2.2 +++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Thu Sep 22 12:17:24 2016 +0200
     2.3 @@ -623,6 +623,10 @@
     2.4  inline void Assembler::lvsl(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSL_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }
     2.5  inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }
     2.6  
     2.7 +// Vector-Scalar (VSX) instructions.
     2.8 +inline void Assembler::mtvrd(  VectorRegister  d, Register a)               { emit_int32( MTVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
     2.9 +inline void Assembler::mfvrd(  Register        a, VectorRegister d)         { emit_int32( MFVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
    2.10 +
    2.11  inline void Assembler::vpkpx(   VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKPX_OPCODE   | vrt(d) | vra(a) | vrb(b)); }
    2.12  inline void Assembler::vpkshss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSHSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
    2.13  inline void Assembler::vpkswss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSWSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
     3.1 --- a/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Mon Sep 24 17:18:38 2018 -0400
     3.2 +++ b/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Thu Sep 22 12:17:24 2016 +0200
     3.3 @@ -3423,6 +3423,565 @@
     3.4    BLOCK_COMMENT("} kernel_crc32_1byte");
     3.5  }
     3.6  
     3.7 +/**
     3.8 + * @param crc             register containing existing CRC (32-bit)
     3.9 + * @param buf             register pointing to input byte buffer (byte*)
    3.10 + * @param len             register containing number of bytes
    3.11 + * @param table           register pointing to CRC table
    3.12 + * @param constants       register pointing to CRC table for 128-bit aligned memory
    3.13 + * @param barretConstants register pointing to table for barrett reduction
    3.14 + * @param t0              volatile register
    3.15 + * @param t1              volatile register
    3.16 + * @param t2              volatile register
    3.17 + * @param t3,t4           volatile registers
    3.18 + */
    3.19 +void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
    3.20 +                        Register constants,  Register barretConstants,
    3.21 +                        Register t0,  Register t1, Register t2, Register t3, Register t4) {
    3.22 +  assert_different_registers(crc, buf, len, table);
    3.23 +
    3.24 +  Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
    3.25 +
    3.26 +  Register  prealign     = t0;
    3.27 +  Register  postalign    = t0;
    3.28 +
    3.29 +  BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
    3.30 +
    3.31 +  // 1. use kernel_crc32_1word for inputs shorter than 384 bytes
    3.32 +  clrldi(len, len, 32);
    3.33 +  cmpdi(CCR0, len, 384);
    3.34 +  bge(CCR0, L_start);
    3.35 +
    3.36 +    Register tc0 = t4;
    3.37 +    Register tc1 = constants;
    3.38 +    Register tc2 = barretConstants;
    3.39 +    kernel_crc32_1word(crc, buf, len, table,t0, t1, t2, t3, tc0, tc1, tc2, table);
    3.40 +    b(L_end);
    3.41 +
    3.42 +  BIND(L_start);
    3.43 +
    3.44 +    // 2. ~c
    3.45 +    nand(crc, crc, crc);
    3.46 +
    3.47 +    // 3. calculate from 0 to first 128-byte-aligned address
    3.48 +    clrldi_(prealign, buf, 57);
    3.49 +    beq(CCR0, L_alignedHead);
    3.50 +
    3.51 +    subfic(prealign, prealign, 128);
    3.52 +
    3.53 +    subf(len, prealign, len);
    3.54 +    update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);
    3.55 +
    3.56 +    // 4. calculate from first 128-byte-aligned address to last 128-byte-aligned address
    3.57 +    BIND(L_alignedHead);
    3.58 +
    3.59 +    clrldi(postalign, len, 57);
    3.60 +    subf(len, postalign, len);
    3.61 +
    3.62 +    // len must be at least 256 bytes
    3.63 +    kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
    3.64 +
    3.65 +    // 5. calculate remaining
    3.66 +    cmpdi(CCR0, postalign, 0);
    3.67 +    beq(CCR0, L_tail);
    3.68 +
    3.69 +    update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);
    3.70 +
    3.71 +    BIND(L_tail);
    3.72 +
    3.73 +    // 6. ~c
    3.74 +    nand(crc, crc, crc);
    3.75 +
    3.76 +  BIND(L_end);
    3.77 +
    3.78 +  BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
    3.79 +}
    3.80 +
    3.81 +/**
    3.82 + * @param crc             register containing existing CRC (32-bit)
    3.83 + * @param buf             register pointing to input byte buffer (byte*)
    3.84 + * @param len             register containing number of bytes
    3.85 + * @param constants       register pointing to CRC table for 128-bit aligned memory
    3.86 + * @param barretConstants register pointing to table for barrett reduction
    3.87 + * @param t0              volatile register
    3.88 + * @param t1              volatile register
    3.89 + * @param t2              volatile register
    3.90 + */
    3.91 +void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
    3.92 +    Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
    3.93 +  Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
    3.94 +  Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
    3.95 +  Label L_1, L_2, L_3, L_4;
    3.96 +
    3.97 +  Register  rLoaded      = t0;
    3.98 +  Register  rTmp1        = t1;
    3.99 +  Register  rTmp2        = t2;
   3.100 +  Register  off16        = R22;
   3.101 +  Register  off32        = R23;
   3.102 +  Register  off48        = R24;
   3.103 +  Register  off64        = R25;
   3.104 +  Register  off80        = R26;
   3.105 +  Register  off96        = R27;
   3.106 +  Register  off112       = R28;
   3.107 +  Register  rIdx         = R29;
   3.108 +  Register  rMax         = R30;
   3.109 +  Register  constantsPos = R31;
   3.110 +
   3.111 +  VectorRegister mask_32bit = VR24;
   3.112 +  VectorRegister mask_64bit = VR25;
   3.113 +  VectorRegister zeroes     = VR26;
   3.114 +  VectorRegister const1     = VR27;
   3.115 +  VectorRegister const2     = VR28;
   3.116 +
   3.117 +  // Save the non-volatile vector registers and the GPRs R22-R31 used below (frameless).
   3.118 +  Register offset = t1;   int offsetInt = 0;
   3.119 +  offsetInt -= 16; li(offset, -16);           stvx(VR20, offset, R1_SP);
   3.120 +  offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
   3.121 +  offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
   3.122 +  offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
   3.123 +  offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
   3.124 +  offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
   3.125 +  offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
   3.126 +  offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
   3.127 +  offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
   3.128 +  offsetInt -= 8; std(R22, offsetInt, R1_SP);
   3.129 +  offsetInt -= 8; std(R23, offsetInt, R1_SP);
   3.130 +  offsetInt -= 8; std(R24, offsetInt, R1_SP);
   3.131 +  offsetInt -= 8; std(R25, offsetInt, R1_SP);
   3.132 +  offsetInt -= 8; std(R26, offsetInt, R1_SP);
   3.133 +  offsetInt -= 8; std(R27, offsetInt, R1_SP);
   3.134 +  offsetInt -= 8; std(R28, offsetInt, R1_SP);
   3.135 +  offsetInt -= 8; std(R29, offsetInt, R1_SP);
   3.136 +  offsetInt -= 8; std(R30, offsetInt, R1_SP);
   3.137 +  offsetInt -= 8; std(R31, offsetInt, R1_SP);
   3.138 +
   3.139 +  // Set constants
   3.140 +  li(off16, 16);
   3.141 +  li(off32, 32);
   3.142 +  li(off48, 48);
   3.143 +  li(off64, 64);
   3.144 +  li(off80, 80);
   3.145 +  li(off96, 96);
   3.146 +  li(off112, 112);
   3.147 +
   3.148 +  clrldi(crc, crc, 32);
   3.149 +
   3.150 +  vxor(zeroes, zeroes, zeroes);
   3.151 +  vspltisw(VR0, -1);
   3.152 +
   3.153 +  vsldoi(mask_32bit, zeroes, VR0, 4);
   3.154 +  vsldoi(mask_64bit, zeroes, VR0, -8);
   3.155 +
   3.156 +  // Get the initial value into VR8
   3.157 +  vxor(VR8, VR8, VR8);
   3.158 +  mtvrd(VR8, crc);
   3.159 +  vsldoi(VR8, zeroes, VR8, -8); // shift into bottom 32 bits
   3.160 +
   3.161 +  li (rLoaded, 0);
   3.162 +
   3.163 +  rldicr(rIdx, len, 0, 56);
   3.164 +
   3.165 +  {
   3.166 +    BIND(L_1);
   3.167 +    // Checksum in blocks of MAX_SIZE (32768)
   3.168 +    lis(rMax, 0);
   3.169 +    ori(rMax, rMax, 32768);
   3.170 +    mr(rTmp2, rMax);
   3.171 +    cmpd(CCR0, rIdx, rMax);
   3.172 +    bgt(CCR0, L_2);
   3.173 +    mr(rMax, rIdx);
   3.174 +
   3.175 +    BIND(L_2);
   3.176 +    subf(rIdx, rMax, rIdx);
   3.177 +
   3.178 +    // our main loop does 128 bytes at a time
   3.179 +    srdi(rMax, rMax, 7);
   3.180 +
   3.181 +    /*
   3.182 +     * Work out the offset into the constants table to start at. Each
   3.183 +     * constant is 16 bytes, and it is used against 128 bytes of input
   3.184 +     * data - 128 / 16 = 8
   3.185 +     */
   3.186 +    sldi(rTmp1, rMax, 4);
   3.187 +    srdi(rTmp2, rTmp2, 3);
   3.188 +    subf(rTmp1, rTmp1, rTmp2);
   3.189 +
   3.190 +    // We reduce our final 128 bytes in a separate step
   3.191 +    addi(rMax, rMax, -1);
   3.192 +    mtctr(rMax);
   3.193 +
   3.194 +    // Find the start of our constants
   3.195 +    add(constantsPos, constants, rTmp1);
   3.196 +
   3.197 +    // zero VR0-VR7 which will contain our checksums
   3.198 +    vxor(VR0, VR0, VR0);
   3.199 +    vxor(VR1, VR1, VR1);
   3.200 +    vxor(VR2, VR2, VR2);
   3.201 +    vxor(VR3, VR3, VR3);
   3.202 +    vxor(VR4, VR4, VR4);
   3.203 +    vxor(VR5, VR5, VR5);
   3.204 +    vxor(VR6, VR6, VR6);
   3.205 +    vxor(VR7, VR7, VR7);
   3.206 +
   3.207 +    lvx(const1, constantsPos);
   3.208 +
   3.209 +    /*
   3.210 +     * If we are looping back to consume more data we use the values
   3.211 +     * already in VR16-VR23.
   3.212 +     */
   3.213 +    cmpdi(CCR0, rLoaded, 1);
   3.214 +    beq(CCR0, L_3);
   3.215 +    {
   3.216 +
   3.217 +      // First warm up pass
   3.218 +      lvx(VR16, buf);
   3.219 +      lvx(VR17, off16, buf);
   3.220 +      lvx(VR18, off32, buf);
   3.221 +      lvx(VR19, off48, buf);
   3.222 +      lvx(VR20, off64, buf);
   3.223 +      lvx(VR21, off80, buf);
   3.224 +      lvx(VR22, off96, buf);
   3.225 +      lvx(VR23, off112, buf);
   3.226 +      addi(buf, buf, 8*16);
   3.227 +
   3.228 +      // xor in initial value
   3.229 +      vxor(VR16, VR16, VR8);
   3.230 +    }
   3.231 +
   3.232 +    BIND(L_3);
   3.233 +    bdz(L_first_warm_up_done);
   3.234 +
   3.235 +    addi(constantsPos, constantsPos, 16);
   3.236 +    lvx(const2, constantsPos);
   3.237 +
   3.238 +    // Second warm up pass
   3.239 +    vpmsumd(VR8, VR16, const1);
   3.240 +    lvx(VR16, buf);
   3.241 +
   3.242 +    vpmsumd(VR9, VR17, const1);
   3.243 +    lvx(VR17, off16, buf);
   3.244 +
   3.245 +    vpmsumd(VR10, VR18, const1);
   3.246 +    lvx(VR18, off32, buf);
   3.247 +
   3.248 +    vpmsumd(VR11, VR19, const1);
   3.249 +    lvx(VR19, off48, buf);
   3.250 +
   3.251 +    vpmsumd(VR12, VR20, const1);
   3.252 +    lvx(VR20, off64, buf);
   3.253 +
   3.254 +    vpmsumd(VR13, VR21, const1);
   3.255 +    lvx(VR21, off80, buf);
   3.256 +
   3.257 +    vpmsumd(VR14, VR22, const1);
   3.258 +    lvx(VR22, off96, buf);
   3.259 +
   3.260 +    vpmsumd(VR15, VR23, const1);
   3.261 +    lvx(VR23, off112, buf);
   3.262 +
   3.263 +    addi(buf, buf, 8 * 16);
   3.264 +
   3.265 +    bdz(L_first_cool_down);
   3.266 +
   3.267 +    /*
   3.268 +     * main loop. We modulo schedule it such that it takes three iterations
   3.269 +     * to complete - first iteration load, second iteration vpmsum, third
   3.270 +     * iteration xor.
   3.271 +     */
   3.272 +    {
   3.273 +      BIND(L_4);
   3.274 +      lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
   3.275 +
   3.276 +      vxor(VR0, VR0, VR8);
   3.277 +      vpmsumd(VR8, VR16, const2);
   3.278 +      lvx(VR16, buf);
   3.279 +
   3.280 +      vxor(VR1, VR1, VR9);
   3.281 +      vpmsumd(VR9, VR17, const2);
   3.282 +      lvx(VR17, off16, buf);
   3.283 +
   3.284 +      vxor(VR2, VR2, VR10);
   3.285 +      vpmsumd(VR10, VR18, const2);
   3.286 +      lvx(VR18, off32, buf);
   3.287 +
   3.288 +      vxor(VR3, VR3, VR11);
   3.289 +      vpmsumd(VR11, VR19, const2);
   3.290 +      lvx(VR19, off48, buf);
   3.291 +      lvx(const2, constantsPos);
   3.292 +
   3.293 +      vxor(VR4, VR4, VR12);
   3.294 +      vpmsumd(VR12, VR20, const1);
   3.295 +      lvx(VR20, off64, buf);
   3.296 +
   3.297 +      vxor(VR5, VR5, VR13);
   3.298 +      vpmsumd(VR13, VR21, const1);
   3.299 +      lvx(VR21, off80, buf);
   3.300 +
   3.301 +      vxor(VR6, VR6, VR14);
   3.302 +      vpmsumd(VR14, VR22, const1);
   3.303 +      lvx(VR22, off96, buf);
   3.304 +
   3.305 +      vxor(VR7, VR7, VR15);
   3.306 +      vpmsumd(VR15, VR23, const1);
   3.307 +      lvx(VR23, off112, buf);
   3.308 +
   3.309 +      addi(buf, buf, 8 * 16);
   3.310 +
   3.311 +      bdnz(L_4);
   3.312 +    }
   3.313 +
   3.314 +    BIND(L_first_cool_down);
   3.315 +
   3.316 +    // First cool down pass
   3.317 +    lvx(const1, constantsPos);
   3.318 +    addi(constantsPos, constantsPos, 16);
   3.319 +
   3.320 +    vxor(VR0, VR0, VR8);
   3.321 +    vpmsumd(VR8, VR16, const1);
   3.322 +
   3.323 +    vxor(VR1, VR1, VR9);
   3.324 +    vpmsumd(VR9, VR17, const1);
   3.325 +
   3.326 +    vxor(VR2, VR2, VR10);
   3.327 +    vpmsumd(VR10, VR18, const1);
   3.328 +
   3.329 +    vxor(VR3, VR3, VR11);
   3.330 +    vpmsumd(VR11, VR19, const1);
   3.331 +
   3.332 +    vxor(VR4, VR4, VR12);
   3.333 +    vpmsumd(VR12, VR20, const1);
   3.334 +
   3.335 +    vxor(VR5, VR5, VR13);
   3.336 +    vpmsumd(VR13, VR21, const1);
   3.337 +
   3.338 +    vxor(VR6, VR6, VR14);
   3.339 +    vpmsumd(VR14, VR22, const1);
   3.340 +
   3.341 +    vxor(VR7, VR7, VR15);
   3.342 +    vpmsumd(VR15, VR23, const1);
   3.343 +
   3.344 +    BIND(L_second_cool_down);
   3.345 +    // Second cool down pass
   3.346 +    vxor(VR0, VR0, VR8);
   3.347 +    vxor(VR1, VR1, VR9);
   3.348 +    vxor(VR2, VR2, VR10);
   3.349 +    vxor(VR3, VR3, VR11);
   3.350 +    vxor(VR4, VR4, VR12);
   3.351 +    vxor(VR5, VR5, VR13);
   3.352 +    vxor(VR6, VR6, VR14);
   3.353 +    vxor(VR7, VR7, VR15);
   3.354 +
   3.355 +    /*
   3.356 +     * vpmsumd produces a 96 bit result in the least significant bits
   3.357 +     * of the register. Since we are bit reflected we have to shift it
   3.358 +     * left 32 bits so it occupies the least significant bits in the
   3.359 +     * bit reflected domain.
   3.360 +     */
   3.361 +    vsldoi(VR0, VR0, zeroes, 4);
   3.362 +    vsldoi(VR1, VR1, zeroes, 4);
   3.363 +    vsldoi(VR2, VR2, zeroes, 4);
   3.364 +    vsldoi(VR3, VR3, zeroes, 4);
   3.365 +    vsldoi(VR4, VR4, zeroes, 4);
   3.366 +    vsldoi(VR5, VR5, zeroes, 4);
   3.367 +    vsldoi(VR6, VR6, zeroes, 4);
   3.368 +    vsldoi(VR7, VR7, zeroes, 4);
   3.369 +
   3.370 +    // xor with last 1024 bits
   3.371 +    lvx(VR8, buf);
   3.372 +    lvx(VR9, off16, buf);
   3.373 +    lvx(VR10, off32, buf);
   3.374 +    lvx(VR11, off48, buf);
   3.375 +    lvx(VR12, off64, buf);
   3.376 +    lvx(VR13, off80, buf);
   3.377 +    lvx(VR14, off96, buf);
   3.378 +    lvx(VR15, off112, buf);
   3.379 +    addi(buf, buf, 8 * 16);
   3.380 +
   3.381 +    vxor(VR16, VR0, VR8);
   3.382 +    vxor(VR17, VR1, VR9);
   3.383 +    vxor(VR18, VR2, VR10);
   3.384 +    vxor(VR19, VR3, VR11);
   3.385 +    vxor(VR20, VR4, VR12);
   3.386 +    vxor(VR21, VR5, VR13);
   3.387 +    vxor(VR22, VR6, VR14);
   3.388 +    vxor(VR23, VR7, VR15);
   3.389 +
   3.390 +    li(rLoaded, 1);
   3.391 +    cmpdi(CCR0, rIdx, 0);
   3.392 +    addi(rIdx, rIdx, 128);
   3.393 +    bne(CCR0, L_1);
   3.394 +  }
   3.395 +
   3.396 +  // Work out how many bytes we have left
   3.397 +  andi_(len, len, 127);
   3.398 +
   3.399 +  // Calculate where in the constant table we need to start
   3.400 +  subfic(rTmp1, len, 128);
   3.401 +  add(constantsPos, constantsPos, rTmp1);
   3.402 +
   3.403 +  // How many 16 byte chunks are in the tail
   3.404 +  srdi(rIdx, len, 4);
   3.405 +  mtctr(rIdx);
   3.406 +
   3.407 +  /*
   3.408 +   * Reduce the previously calculated 1024 bits to 64 bits, shifting
   3.409 +   * 32 bits to include the trailing 32 bits of zeros
   3.410 +   */
   3.411 +  lvx(VR0, constantsPos);
   3.412 +  lvx(VR1, off16, constantsPos);
   3.413 +  lvx(VR2, off32, constantsPos);
   3.414 +  lvx(VR3, off48, constantsPos);
   3.415 +  lvx(VR4, off64, constantsPos);
   3.416 +  lvx(VR5, off80, constantsPos);
   3.417 +  lvx(VR6, off96, constantsPos);
   3.418 +  lvx(VR7, off112, constantsPos);
   3.419 +  addi(constantsPos, constantsPos, 8 * 16);
   3.420 +
   3.421 +  vpmsumw(VR0, VR16, VR0);
   3.422 +  vpmsumw(VR1, VR17, VR1);
   3.423 +  vpmsumw(VR2, VR18, VR2);
   3.424 +  vpmsumw(VR3, VR19, VR3);
   3.425 +  vpmsumw(VR4, VR20, VR4);
   3.426 +  vpmsumw(VR5, VR21, VR5);
   3.427 +  vpmsumw(VR6, VR22, VR6);
   3.428 +  vpmsumw(VR7, VR23, VR7);
   3.429 +
   3.430 +  // Now reduce the tail (0 - 112 bytes)
   3.431 +  cmpdi(CCR0, rIdx, 0);
   3.432 +  beq(CCR0, L_XOR);
   3.433 +
   3.434 +  lvx(VR16, buf); addi(buf, buf, 16);
   3.435 +  lvx(VR17, constantsPos);
   3.436 +  vpmsumw(VR16, VR16, VR17);
   3.437 +  vxor(VR0, VR0, VR16);
   3.438 +  beq(CCR0, L_XOR);
   3.439 +
   3.440 +  lvx(VR16, buf); addi(buf, buf, 16);
   3.441 +  lvx(VR17, off16, constantsPos);
   3.442 +  vpmsumw(VR16, VR16, VR17);
   3.443 +  vxor(VR0, VR0, VR16);
   3.444 +  beq(CCR0, L_XOR);
   3.445 +
   3.446 +  lvx(VR16, buf); addi(buf, buf, 16);
   3.447 +  lvx(VR17, off32, constantsPos);
   3.448 +  vpmsumw(VR16, VR16, VR17);
   3.449 +  vxor(VR0, VR0, VR16);
   3.450 +  beq(CCR0, L_XOR);
   3.451 +
   3.452 +  lvx(VR16, buf); addi(buf, buf, 16);
   3.453 +  lvx(VR17, off48,constantsPos);
   3.454 +  vpmsumw(VR16, VR16, VR17);
   3.455 +  vxor(VR0, VR0, VR16);
   3.456 +  beq(CCR0, L_XOR);
   3.457 +
   3.458 +  lvx(VR16, buf); addi(buf, buf, 16);
   3.459 +  lvx(VR17, off64, constantsPos);
   3.460 +  vpmsumw(VR16, VR16, VR17);
   3.461 +  vxor(VR0, VR0, VR16);
   3.462 +  beq(CCR0, L_XOR);
   3.463 +
   3.464 +  lvx(VR16, buf); addi(buf, buf, 16);
   3.465 +  lvx(VR17, off80, constantsPos);
   3.466 +  vpmsumw(VR16, VR16, VR17);
   3.467 +  vxor(VR0, VR0, VR16);
   3.468 +  beq(CCR0, L_XOR);
   3.469 +
   3.470 +  lvx(VR16, buf); addi(buf, buf, 16);
   3.471 +  lvx(VR17, off96, constantsPos);
   3.472 +  vpmsumw(VR16, VR16, VR17);
   3.473 +  vxor(VR0, VR0, VR16);
   3.474 +
   3.475 +  // Now xor all the parallel chunks together
   3.476 +  BIND(L_XOR);
   3.477 +  vxor(VR0, VR0, VR1);
   3.478 +  vxor(VR2, VR2, VR3);
   3.479 +  vxor(VR4, VR4, VR5);
   3.480 +  vxor(VR6, VR6, VR7);
   3.481 +
   3.482 +  vxor(VR0, VR0, VR2);
   3.483 +  vxor(VR4, VR4, VR6);
   3.484 +
   3.485 +  vxor(VR0, VR0, VR4);
   3.486 +
   3.487 +  b(L_barrett_reduction);
   3.488 +
   3.489 +  BIND(L_first_warm_up_done);
   3.490 +  lvx(const1, constantsPos);
   3.491 +  addi(constantsPos, constantsPos, 16);
   3.492 +  vpmsumd(VR8,  VR16, const1);
   3.493 +  vpmsumd(VR9,  VR17, const1);
   3.494 +  vpmsumd(VR10, VR18, const1);
   3.495 +  vpmsumd(VR11, VR19, const1);
   3.496 +  vpmsumd(VR12, VR20, const1);
   3.497 +  vpmsumd(VR13, VR21, const1);
   3.498 +  vpmsumd(VR14, VR22, const1);
   3.499 +  vpmsumd(VR15, VR23, const1);
   3.500 +  b(L_second_cool_down);
   3.501 +
   3.502 +  BIND(L_barrett_reduction);
   3.503 +
   3.504 +  lvx(const1, barretConstants);
   3.505 +  addi(barretConstants, barretConstants, 16);
   3.506 +  lvx(const2, barretConstants);
   3.507 +
   3.508 +  vsldoi(VR1, VR0, VR0, -8);
   3.509 +  vxor(VR0, VR0, VR1);    // xor two 64 bit results together
   3.510 +
   3.511 +  // shift left one bit
   3.512 +  vspltisb(VR1, 1);
   3.513 +  vsl(VR0, VR0, VR1);
   3.514 +
   3.515 +  vand(VR0, VR0, mask_64bit);
   3.516 +
   3.517 +  /*
   3.518 +   * The reflected version of Barrett reduction. Instead of bit
   3.519 +   * reflecting our data (which is expensive to do), we bit reflect our
   3.520 +   * constants and our algorithm, which means the intermediate data in
   3.521 +   * our vector registers goes from 0-63 instead of 63-0. We can reflect
   3.522 +   * the algorithm because we don't carry in mod 2 arithmetic.
   3.523 +   */
   3.524 +  vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
   3.525 +  vpmsumd(VR1, VR1, const1);   // ma
   3.526 +  vand(VR1, VR1, mask_32bit);  // bottom 32bits of ma
   3.527 +  vpmsumd(VR1, VR1, const2);   // qn
   3.528 +  vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)
   3.529 +
   3.530 +  /*
   3.531 +   * Since we are bit reflected, the result (ie the low 32 bits) is in
   3.532 +   * the high 32 bits. We just need to shift it left 4 bytes
   3.533 +   * V0 [ 0 1 X 3 ]
   3.534 +   * V0 [ 0 X 2 3 ]
   3.535 +   */
   3.536 +  vsldoi(VR0, VR0, zeroes, 4);    // shift result into top 64 bits of VR0
   3.537 +
   3.538 +  // Get it into the crc register
   3.539 +  mfvrd(crc, VR0);
   3.540 +
   3.541 +  BIND(L_end);
   3.542 +
   3.543 +  offsetInt = 0;
   3.544 +  // Restore the non-volatile vector registers and GPRs R22-R31 (frameless).
   3.545 +  offsetInt -= 16; li(offset, -16);           lvx(VR20, offset, R1_SP);
   3.546 +  offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
   3.547 +  offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
   3.548 +  offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
   3.549 +  offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
   3.550 +  offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
   3.551 +  offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
   3.552 +  offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
   3.553 +  offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
   3.554 +  offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
   3.555 +  offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
   3.556 +  offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
   3.557 +  offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
   3.558 +  offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
   3.559 +  offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
   3.560 +  offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
   3.561 +  offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
   3.562 +  offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
   3.563 +  offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
   3.564 +}
   3.565 +
   3.566  void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
   3.567    assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
   3.568  
     4.1 --- a/src/cpu/ppc/vm/macroAssembler_ppc.hpp	Mon Sep 24 17:18:38 2018 -0400
     4.2 +++ b/src/cpu/ppc/vm/macroAssembler_ppc.hpp	Thu Sep 22 12:17:24 2016 +0200
     4.3 @@ -656,6 +656,13 @@
     4.4                            Register tc0, Register tc1, Register tc2, Register tc3);
     4.5    void kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
     4.6                            Register t0,  Register t1,  Register t2,  Register t3);
     4.7 +  void kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
     4.8 +                          Register constants, Register barretConstants,
     4.9 +                          Register t0,  Register t1, Register t2, Register t3, Register t4);
    4.10 +  void kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
    4.11 +                          Register constants, Register barretConstants,
    4.12 +                          Register t0, Register t1, Register t2);
    4.13 +
    4.14    void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp);
    4.15  
    4.16    //
     5.1 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon Sep 24 17:18:38 2018 -0400
     5.2 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Thu Sep 22 12:17:24 2016 +0200
     5.3 @@ -2482,9 +2482,7 @@
     5.4     *   R5_ARG3    - int   length (of buffer)
     5.5     *
     5.6     * scratch:
     5.7 -   *   R6_ARG4    - crc table address
     5.8 -   *   R7_ARG5    - tmp1
     5.9 -   *   R8_ARG6    - tmp2
    5.10 +   *   R2, R6-R12
    5.11     *
    5.12     * Ouput:
    5.13     *   R3_RET     - int   crc result
    5.14 @@ -2496,28 +2494,62 @@
    5.15      address start = __ function_entry();  // Remember stub start address (is rtn value).
    5.16  
    5.17      // arguments to kernel_crc32:
    5.18 -    Register       crc     = R3_ARG1;  // Current checksum, preset by caller or result from previous call.
    5.19 -    Register       data    = R4_ARG2;  // source byte array
    5.20 -    Register       dataLen = R5_ARG3;  // #bytes to process
    5.21 -    Register       table   = R6_ARG4;  // crc table address
    5.22 -
    5.23 -    Register       t0      = R9;       // work reg for kernel* emitters
    5.24 -    Register       t1      = R10;      // work reg for kernel* emitters
    5.25 -    Register       t2      = R11;      // work reg for kernel* emitters
    5.26 -    Register       t3      = R12;      // work reg for kernel* emitters
    5.27 -
    5.28 -    BLOCK_COMMENT("Stub body {");
    5.29 -    assert_different_registers(crc, data, dataLen, table);
    5.30 -
    5.31 -    StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
    5.32 -
    5.33 -    __ kernel_crc32_1byte(crc, data, dataLen, table, t0, t1, t2, t3);
    5.34 -
    5.35 -    BLOCK_COMMENT("return");
    5.36 -    __ mr_if_needed(R3_RET, crc);      // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
    5.37 -    __ blr();
    5.38 -
    5.39 -    BLOCK_COMMENT("} Stub body");
    5.40 +    const Register crc     = R3_ARG1;  // Current checksum, preset by caller or result from previous call.
    5.41 +    const Register data    = R4_ARG2;  // source byte array
    5.42 +    const Register dataLen = R5_ARG3;  // #bytes to process
    5.43 +
    5.44 +    const Register table   = R6;       // crc table address
    5.45 +
    5.46 +#ifdef VM_LITTLE_ENDIAN
    5.47 +    if (VM_Version::has_vpmsumb()) {
    5.48 +      const Register constants    = R2;  // constants address
    5.49 +      const Register bconstants   = R8;  // barret table address
    5.50 +
    5.51 +      const Register t0      = R9;
    5.52 +      const Register t1      = R10;
    5.53 +      const Register t2      = R11;
    5.54 +      const Register t3      = R12;
    5.55 +      const Register t4      = R7;
    5.56 +
    5.57 +      BLOCK_COMMENT("Stub body {");
    5.58 +      assert_different_registers(crc, data, dataLen, table);
    5.59 +
    5.60 +      StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
    5.61 +      StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
    5.62 +      StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
    5.63 +
    5.64 +      __ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4);
    5.65 +
    5.66 +      BLOCK_COMMENT("return");
    5.67 +      __ mr_if_needed(R3_RET, crc);      // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
    5.68 +      __ blr();
    5.69 +
    5.70 +      BLOCK_COMMENT("} Stub body");
    5.71 +    } else
    5.72 +#endif
    5.73 +    {
    5.74 +      const Register t0      = R2;
    5.75 +      const Register t1      = R7;
    5.76 +      const Register t2      = R8;
    5.77 +      const Register t3      = R9;
    5.78 +      const Register tc0     = R10;
    5.79 +      const Register tc1     = R11;
    5.80 +      const Register tc2     = R12;
    5.81 +
    5.82 +      BLOCK_COMMENT("Stub body {");
    5.83 +      assert_different_registers(crc, data, dataLen, table);
    5.84 +
    5.85 +      StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
    5.86 +
    5.87 +      __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
    5.88 +
    5.89 +      BLOCK_COMMENT("return");
    5.90 +      __ mr_if_needed(R3_RET, crc);      // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
    5.91 +      __ blr();
    5.92 +
    5.93 +      BLOCK_COMMENT("} Stub body");
    5.94 +    }
    5.95 +
    5.96      return start;
    5.97    }
    5.98  
     6.1 --- a/src/cpu/ppc/vm/stubRoutines_ppc_64.cpp	Mon Sep 24 17:18:38 2018 -0400
     6.2 +++ b/src/cpu/ppc/vm/stubRoutines_ppc_64.cpp	Thu Sep 22 12:17:24 2016 +0200
     6.3 @@ -37,6 +37,311 @@
     6.4    __ load_const(table, StubRoutines::_crc_table_adr);
     6.5  }
     6.6  
     6.7 +void StubRoutines::ppc64::generate_load_crc_constants_addr(MacroAssembler* masm, Register table) {  // Hands the address of StubRoutines::ppc64::_constants to the macro assembler together with 'table'.
     6.8 +  __ load_const_optimized(table, (address)StubRoutines::ppc64::_constants, R0);  // Presumably materializes the address in 'table' with R0 as a scratch register -- confirm against MacroAssembler::load_const_optimized.
     6.9 +}
    6.10 +
    6.11 +void StubRoutines::ppc64::generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table) {  // Hands the address of StubRoutines::ppc64::_barret_constants to the macro assembler together with 'table'.
    6.12 +  __ load_const_optimized(table, (address)StubRoutines::ppc64::_barret_constants, R0);  // Presumably materializes the address in 'table' with R0 as a scratch register -- confirm against MacroAssembler::load_const_optimized.
    6.13 +}
    6.14 +
    6.15 +juint* StubRoutines::ppc64::generate_crc_constants() {
    6.16 +  juint constants[CRC32_CONSTANTS_SIZE] = {
    6.17 +      // Reduce 262144 kbits to 1024 bits
    6.18 +      0x99ea94a8UL, 0x00000000UL, 0x651797d2UL, 0x00000001UL,       // x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1
    6.19 +      0x945a8420UL, 0x00000000UL, 0x21e0d56cUL, 0x00000000UL,       // x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1
    6.20 +      0x30762706UL, 0x00000000UL, 0x0f95ecaaUL, 0x00000000UL,       // x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1
    6.21 +      0xa52fc582UL, 0x00000001UL, 0xebd224acUL, 0x00000001UL,       // x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1
    6.22 +      0xa4a7167aUL, 0x00000001UL, 0x0ccb97caUL, 0x00000000UL,       // x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1
    6.23 +      0x0c18249aUL, 0x00000000UL, 0x006ec8a8UL, 0x00000001UL,       // x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1
    6.24 +      0xa924ae7cUL, 0x00000000UL, 0x4f58f196UL, 0x00000001UL,       // x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1
    6.25 +      0xe12ccc12UL, 0x00000001UL, 0xa7192ca6UL, 0x00000001UL,       // x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1
    6.26 +      0xa0b9d4acUL, 0x00000000UL, 0x9a64bab2UL, 0x00000001UL,       // x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1
    6.27 +      0x95e8ddfeUL, 0x00000000UL, 0x14f4ed2eUL, 0x00000000UL,       // x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1
    6.28 +      0x233fddc4UL, 0x00000000UL, 0x1092b6a2UL, 0x00000001UL,       // x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1
    6.29 +      0xb4529b62UL, 0x00000001UL, 0xc8a1629cUL, 0x00000000UL,       // x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1
    6.30 +      0xa7fa0e64UL, 0x00000001UL, 0x7bf32e8eUL, 0x00000001UL,       // x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1
    6.31 +      0xb5334592UL, 0x00000001UL, 0xf8cc6582UL, 0x00000001UL,       // x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1
    6.32 +      0x1f8ee1b4UL, 0x00000001UL, 0x8631ddf0UL, 0x00000000UL,       // x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1
    6.33 +      0x6252e632UL, 0x00000000UL, 0x7e5a76d0UL, 0x00000000UL,       // x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1
    6.34 +      0xab973e84UL, 0x00000000UL, 0x2b09b31cUL, 0x00000000UL,       // x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1
    6.35 +      0x7734f5ecUL, 0x00000000UL, 0xb2df1f84UL, 0x00000001UL,       // x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1
    6.36 +      0x7c547798UL, 0x00000000UL, 0xd6f56afcUL, 0x00000001UL,       // x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1
    6.37 +      0x7ec40210UL, 0x00000000UL, 0xb9b5e70cUL, 0x00000001UL,       // x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1
    6.38 +      0xab1695a8UL, 0x00000001UL, 0x34b626d2UL, 0x00000000UL,       // x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1
    6.39 +      0x90494bbaUL, 0x00000000UL, 0x4c53479aUL, 0x00000001UL,       // x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1
    6.40 +      0x123fb816UL, 0x00000001UL, 0xa6d179a4UL, 0x00000001UL,       // x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1
    6.41 +      0xe188c74cUL, 0x00000001UL, 0x5abd16b4UL, 0x00000001UL,       // x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1
    6.42 +      0xc2d3451cUL, 0x00000001UL, 0x018f9852UL, 0x00000000UL,       // x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1
    6.43 +      0xf55cf1caUL, 0x00000000UL, 0x1fb3084aUL, 0x00000000UL,       // x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1
    6.44 +      0xa0531540UL, 0x00000001UL, 0xc53dfb04UL, 0x00000000UL,       // x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1
    6.45 +      0x32cd7ebcUL, 0x00000001UL, 0xe10c9ad6UL, 0x00000000UL,       // x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1
    6.46 +      0x73ab7f36UL, 0x00000000UL, 0x25aa994aUL, 0x00000000UL,       // x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1
    6.47 +      0x41aed1c2UL, 0x00000000UL, 0xfa3a74c4UL, 0x00000000UL,       // x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1
    6.48 +      0x36c53800UL, 0x00000001UL, 0x33eb3f40UL, 0x00000000UL,       // x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1
    6.49 +      0x26835a30UL, 0x00000001UL, 0x7193f296UL, 0x00000001UL,       // x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1
    6.50 +      0x6241b502UL, 0x00000000UL, 0x43f6c86aUL, 0x00000000UL,       // x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1
    6.51 +      0xd5196ad4UL, 0x00000000UL, 0x6b513ec6UL, 0x00000001UL,       // x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1
    6.52 +      0x9cfa769aUL, 0x00000000UL, 0xc8f25b4eUL, 0x00000000UL,       // x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1
    6.53 +      0x920e5df4UL, 0x00000000UL, 0xa45048ecUL, 0x00000001UL,       // x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1
    6.54 +      0x69dc310eUL, 0x00000001UL, 0x0c441004UL, 0x00000000UL,       // x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1
    6.55 +      0x09fc331cUL, 0x00000000UL, 0x0e17cad6UL, 0x00000000UL,       // x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1
    6.56 +      0x0d94a81eUL, 0x00000001UL, 0x253ae964UL, 0x00000001UL,       // x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1
    6.57 +      0x27a20ab2UL, 0x00000000UL, 0xd7c88ebcUL, 0x00000001UL,       // x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1
    6.58 +      0x14f87504UL, 0x00000001UL, 0xe7ca913aUL, 0x00000001UL,       // x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1
    6.59 +      0x4b076d96UL, 0x00000000UL, 0x33ed078aUL, 0x00000000UL,       // x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1
    6.60 +      0xda4d1e74UL, 0x00000000UL, 0xe1839c78UL, 0x00000000UL,       // x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1
    6.61 +      0x1b81f672UL, 0x00000000UL, 0x322b267eUL, 0x00000001UL,       // x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1
    6.62 +      0x9367c988UL, 0x00000000UL, 0x638231b6UL, 0x00000000UL,       // x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1
    6.63 +      0x717214caUL, 0x00000001UL, 0xee7f16f4UL, 0x00000001UL,       // x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1
    6.64 +      0x9f47d820UL, 0x00000000UL, 0x17d9924aUL, 0x00000001UL,       // x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1
    6.65 +      0x0d9a47d2UL, 0x00000001UL, 0xe1a9e0c4UL, 0x00000000UL,       // x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1
    6.66 +      0xa696c58cUL, 0x00000000UL, 0x403731dcUL, 0x00000001UL,       // x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1
    6.67 +      0x2aa28ec6UL, 0x00000000UL, 0xa5ea9682UL, 0x00000001UL,       // x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1
    6.68 +      0xfe18fd9aUL, 0x00000001UL, 0x01c5c578UL, 0x00000001UL,       // x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1
    6.69 +      0x9d4fc1aeUL, 0x00000001UL, 0xdddf6494UL, 0x00000000UL,       // x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1
    6.70 +      0xba0e3deaUL, 0x00000001UL, 0xf1c3db28UL, 0x00000000UL,       // x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1
    6.71 +      0x74b59a5eUL, 0x00000000UL, 0x3112fb9cUL, 0x00000001UL,       // x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1
    6.72 +      0xf2b5ea98UL, 0x00000000UL, 0xb680b906UL, 0x00000000UL,       // x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1
    6.73 +      0x87132676UL, 0x00000001UL, 0x1a282932UL, 0x00000000UL,       // x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1
    6.74 +      0x0a8c6ad4UL, 0x00000001UL, 0x89406e7eUL, 0x00000000UL,       // x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1
    6.75 +      0xe21dfe70UL, 0x00000001UL, 0xdef6be8cUL, 0x00000001UL,       // x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1
    6.76 +      0xda0050e4UL, 0x00000001UL, 0x75258728UL, 0x00000000UL,       // x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1
    6.77 +      0x772172aeUL, 0x00000000UL, 0x9536090aUL, 0x00000001UL,       // x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1
    6.78 +      0xe47724aaUL, 0x00000000UL, 0xf2455bfcUL, 0x00000000UL,       // x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1
    6.79 +      0x3cd63ac4UL, 0x00000000UL, 0x8c40baf4UL, 0x00000001UL,       // x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1
    6.80 +      0xbf47d352UL, 0x00000001UL, 0x4cd390d4UL, 0x00000000UL,       // x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1
    6.81 +      0x8dc1d708UL, 0x00000001UL, 0xe4ece95aUL, 0x00000001UL,       // x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1
    6.82 +      0x2d4620a4UL, 0x00000000UL, 0x1a3ee918UL, 0x00000000UL,       // x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1
    6.83 +      0x58fd1740UL, 0x00000000UL, 0x7c652fb8UL, 0x00000000UL,       // x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1
    6.84 +      0xdadd9bfcUL, 0x00000000UL, 0x1c67842cUL, 0x00000001UL,       // x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1
    6.85 +      0xea2140beUL, 0x00000001UL, 0x254f759cUL, 0x00000000UL,       // x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1
    6.86 +      0x9de128baUL, 0x00000000UL, 0x7ece94caUL, 0x00000000UL,       // x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1
    6.87 +      0x3ac3aa8eUL, 0x00000001UL, 0x38f258c2UL, 0x00000000UL,       // x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1
    6.88 +      0x99980562UL, 0x00000000UL, 0xcdf17b00UL, 0x00000001UL,       // x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1
    6.89 +      0xc1579c86UL, 0x00000001UL, 0x1f882c16UL, 0x00000001UL,       // x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1
    6.90 +      0x68dbbf94UL, 0x00000000UL, 0x00093fc8UL, 0x00000001UL,       // x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1
    6.91 +      0x4509fb04UL, 0x00000000UL, 0xcd684f16UL, 0x00000001UL,       // x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1
    6.92 +      0x202f6398UL, 0x00000001UL, 0x4bc6a70aUL, 0x00000000UL,       // x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1
    6.93 +      0x3aea243eUL, 0x00000001UL, 0x4fc7e8e4UL, 0x00000000UL,       // x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1
    6.94 +      0xb4052ae6UL, 0x00000001UL, 0x30103f1cUL, 0x00000001UL,       // x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1
    6.95 +      0xcd2a0ae8UL, 0x00000001UL, 0x11b0024cUL, 0x00000001UL,       // x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1
    6.96 +      0xfe4aa8b4UL, 0x00000001UL, 0x0b3079daUL, 0x00000001UL,       // x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1
    6.97 +      0xd1559a42UL, 0x00000001UL, 0x0192bcc2UL, 0x00000001UL,       // x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1
    6.98 +      0xf3e05eccUL, 0x00000001UL, 0x74838d50UL, 0x00000000UL,       // x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1
    6.99 +      0x04ddd2ccUL, 0x00000001UL, 0x1b20f520UL, 0x00000000UL,       // x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1
   6.100 +      0x5393153cUL, 0x00000001UL, 0x50c3590aUL, 0x00000000UL,       // x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1
   6.101 +      0x57e942c6UL, 0x00000000UL, 0xb41cac8eUL, 0x00000000UL,       // x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1
   6.102 +      0x2c633850UL, 0x00000001UL, 0x0c72cc78UL, 0x00000000UL,       // x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1
   6.103 +      0xebcaae4cUL, 0x00000000UL, 0x30cdb032UL, 0x00000000UL,       // x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1
   6.104 +      0x3ee532a6UL, 0x00000001UL, 0x3e09fc32UL, 0x00000001UL,       // x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1
   6.105 +      0xbf0cbc7eUL, 0x00000001UL, 0x1ed624d2UL, 0x00000000UL,       // x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1
   6.106 +      0xd50b7a5aUL, 0x00000000UL, 0x781aee1aUL, 0x00000000UL,       // x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1
   6.107 +      0x02fca6e8UL, 0x00000000UL, 0xc4d8348cUL, 0x00000001UL,       // x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1
   6.108 +      0x7af40044UL, 0x00000000UL, 0x57a40336UL, 0x00000000UL,       // x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1
   6.109 +      0x16178744UL, 0x00000000UL, 0x85544940UL, 0x00000000UL,       // x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1
   6.110 +      0x4c177458UL, 0x00000001UL, 0x9cd21e80UL, 0x00000001UL,       // x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1
   6.111 +      0x1b6ddf04UL, 0x00000001UL, 0x3eb95bc0UL, 0x00000001UL,       // x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1
   6.112 +      0xf3e29cccUL, 0x00000001UL, 0xdfc9fdfcUL, 0x00000001UL,       // x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1
   6.113 +      0x35ae7562UL, 0x00000001UL, 0xcd028bc2UL, 0x00000000UL,       // x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1
   6.114 +      0x90ef812cUL, 0x00000001UL, 0x90db8c44UL, 0x00000000UL,       // x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1
   6.115 +      0x67a2c786UL, 0x00000000UL, 0x0010a4ceUL, 0x00000001UL,       // x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1
   6.116 +      0x48b9496cUL, 0x00000000UL, 0xc8f4c72cUL, 0x00000001UL,       // x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1
   6.117 +      0x5a422de6UL, 0x00000001UL, 0x1c26170cUL, 0x00000000UL,       // x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1
   6.118 +      0xef0e3640UL, 0x00000001UL, 0xe3fccf68UL, 0x00000000UL,       // x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1
   6.119 +      0x006d2d26UL, 0x00000001UL, 0xd513ed24UL, 0x00000000UL,       // x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1
   6.120 +      0x170d56d6UL, 0x00000001UL, 0x141beadaUL, 0x00000000UL,       // x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1
   6.121 +      0xa5fb613cUL, 0x00000000UL, 0x1071aea0UL, 0x00000001UL,       // x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1
   6.122 +      0x40bbf7fcUL, 0x00000000UL, 0x2e19080aUL, 0x00000001UL,       // x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1
   6.123 +      0x6ac3a5b2UL, 0x00000001UL, 0x00ecf826UL, 0x00000001UL,       // x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1
   6.124 +      0xabf16230UL, 0x00000000UL, 0x69b09412UL, 0x00000000UL,       // x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1
   6.125 +      0xebe23facUL, 0x00000001UL, 0x22297bacUL, 0x00000001UL,       // x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1
   6.126 +      0x8b6a0894UL, 0x00000000UL, 0xe9e4b068UL, 0x00000000UL,       // x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1
   6.127 +      0x288ea478UL, 0x00000001UL, 0x4b38651aUL, 0x00000000UL,       // x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1
   6.128 +      0x6619c442UL, 0x00000001UL, 0x468360e2UL, 0x00000001UL,       // x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1
   6.129 +      0x86230038UL, 0x00000000UL, 0x121c2408UL, 0x00000000UL,       // x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1
   6.130 +      0x7746a756UL, 0x00000001UL, 0xda7e7d08UL, 0x00000000UL,       // x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1
   6.131 +      0x91b8f8f8UL, 0x00000001UL, 0x058d7652UL, 0x00000001UL,       // x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1
   6.132 +      0x8e167708UL, 0x00000000UL, 0x4a098a90UL, 0x00000001UL,       // x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1
   6.133 +      0x48b22d54UL, 0x00000001UL, 0x20dbe72eUL, 0x00000000UL,       // x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1
   6.134 +      0x44ba2c3cUL, 0x00000000UL, 0x1e7323e8UL, 0x00000001UL,       // x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1
   6.135 +      0xb54d2b52UL, 0x00000000UL, 0xd5d4bf94UL, 0x00000000UL,       // x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1
   6.136 +      0x05a4fd8aUL, 0x00000000UL, 0x99d8746cUL, 0x00000001UL,       // x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1
   6.137 +      0x39f9fc46UL, 0x00000001UL, 0xce9ca8a0UL, 0x00000000UL,       // x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1
   6.138 +      0x5a1fa824UL, 0x00000001UL, 0x136edeceUL, 0x00000000UL,       // x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1
   6.139 +      0x0a61ae4cUL, 0x00000000UL, 0x9b92a068UL, 0x00000001UL,       // x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1
   6.140 +      0x45e9113eUL, 0x00000001UL, 0x71d62206UL, 0x00000000UL,       // x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1
   6.141 +      0x6a348448UL, 0x00000000UL, 0xdfc50158UL, 0x00000000UL,       // x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1
   6.142 +      0x4d80a08cUL, 0x00000000UL, 0x517626bcUL, 0x00000001UL,       // x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1
   6.143 +      0x4b6837a0UL, 0x00000001UL, 0x48d1e4faUL, 0x00000001UL,       // x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1
   6.144 +      0x6896a7fcUL, 0x00000001UL, 0x94d8266eUL, 0x00000000UL,       // x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1
   6.145 +      0x4f187140UL, 0x00000001UL, 0x606c5e34UL, 0x00000000UL,       // x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1
   6.146 +      0x9581b9daUL, 0x00000001UL, 0x9766beaaUL, 0x00000001UL,       // x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1
   6.147 +      0x091bc984UL, 0x00000001UL, 0xd80c506cUL, 0x00000001UL,       // x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1
   6.148 +      0x1067223cUL, 0x00000000UL, 0x1e73837cUL, 0x00000000UL,       // x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1
   6.149 +      0xab16ea02UL, 0x00000001UL, 0x64d587deUL, 0x00000000UL,       // x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1
   6.150 +      0x3c4598a8UL, 0x00000001UL, 0xf4a507b0UL, 0x00000000UL,       // x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1
   6.151 +      0xb3735430UL, 0x00000000UL, 0x40e342fcUL, 0x00000000UL,       // x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1
   6.152 +      0xbb3fc0c0UL, 0x00000001UL, 0xd5ad9c3aUL, 0x00000001UL,       // x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1
   6.153 +      0x570ae19cUL, 0x00000001UL, 0x94a691a4UL, 0x00000000UL,       // x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1
   6.154 +      0xea910712UL, 0x00000001UL, 0x271ecdfaUL, 0x00000001UL,       // x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1
   6.155 +      0x67127128UL, 0x00000001UL, 0x9e54475aUL, 0x00000000UL,       // x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1
   6.156 +      0x19e790a2UL, 0x00000000UL, 0xc9c099eeUL, 0x00000000UL,       // x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1
   6.157 +      0x3788f710UL, 0x00000000UL, 0x9a2f736cUL, 0x00000000UL,       // x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1
   6.158 +      0x682a160eUL, 0x00000001UL, 0xbb9f4996UL, 0x00000000UL,       // x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1
   6.159 +      0x7f0ebd2eUL, 0x00000000UL, 0xdb688050UL, 0x00000001UL,       // x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1
   6.160 +      0x2b032080UL, 0x00000000UL, 0xe9b10af4UL, 0x00000000UL,       // x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1
   6.161 +      0xcfd1664aUL, 0x00000000UL, 0x2d4545e4UL, 0x00000001UL,       // x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1
   6.162 +      0xaa1181c2UL, 0x00000000UL, 0x0361139cUL, 0x00000000UL,       // x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1
   6.163 +      0xddd08002UL, 0x00000000UL, 0xa5a1a3a8UL, 0x00000001UL,       // x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1
   6.164 +      0xe8dd0446UL, 0x00000000UL, 0x6844e0b0UL, 0x00000000UL,       // x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1
   6.165 +      0xbbd94a00UL, 0x00000001UL, 0xc3762f28UL, 0x00000000UL,       // x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1
   6.166 +      0xab6cd180UL, 0x00000000UL, 0xd26287a2UL, 0x00000001UL,       // x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1
   6.167 +      0x31803ce2UL, 0x00000000UL, 0xf6f0bba8UL, 0x00000001UL,       // x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1
   6.168 +      0x24f40b0cUL, 0x00000000UL, 0x2ffabd62UL, 0x00000000UL,       // x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1
   6.169 +      0xba1d9834UL, 0x00000001UL, 0xfb4516b8UL, 0x00000000UL,       // x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1
   6.170 +      0x04de61aaUL, 0x00000001UL, 0x8cfa961cUL, 0x00000001UL,       // x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1
   6.171 +      0x13e40d46UL, 0x00000001UL, 0x9e588d52UL, 0x00000001UL,       // x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1
   6.172 +      0x415598a0UL, 0x00000001UL, 0x180f0bbcUL, 0x00000001UL,       // x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1
   6.173 +      0xbf6c8c90UL, 0x00000000UL, 0xe1d9177aUL, 0x00000000UL,       // x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1
   6.174 +      0x788b0504UL, 0x00000001UL, 0x05abc27cUL, 0x00000001UL,       // x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1
   6.175 +      0x38385d02UL, 0x00000000UL, 0x972e4a58UL, 0x00000000UL,       // x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1
   6.176 +      0xb6c83844UL, 0x00000001UL, 0x83499a5eUL, 0x00000001UL,       // x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1
   6.177 +      0x51061a8aUL, 0x00000000UL, 0xc96a8ccaUL, 0x00000001UL,       // x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1
   6.178 +      0x7351388aUL, 0x00000001UL, 0xa1a5b60cUL, 0x00000001UL,       // x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1
   6.179 +      0x32928f92UL, 0x00000001UL, 0xe4b6ac9cUL, 0x00000000UL,       // x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1
   6.180 +      0xe6b4f48aUL, 0x00000000UL, 0x807e7f5aUL, 0x00000001UL,       // x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1
   6.181 +      0x39d15e90UL, 0x00000000UL, 0x7a7e3bc8UL, 0x00000001UL,       // x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1
   6.182 +      0x312d6074UL, 0x00000000UL, 0xd73975daUL, 0x00000000UL,       // x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1
   6.183 +      0x7bbb2cc4UL, 0x00000001UL, 0x7375d038UL, 0x00000001UL,       // x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1
   6.184 +      0x6ded3e18UL, 0x00000001UL, 0x193680bcUL, 0x00000000UL,       // x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1
   6.185 +      0xf1638b16UL, 0x00000000UL, 0x999b06f6UL, 0x00000000UL,       // x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1
   6.186 +      0xd38b9eccUL, 0x00000001UL, 0xf685d2b8UL, 0x00000001UL,       // x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1
   6.187 +      0x8b8d09dcUL, 0x00000001UL, 0xf4ecbed2UL, 0x00000001UL,       // x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1
   6.188 +      0xe7bc27d2UL, 0x00000000UL, 0xba16f1a0UL, 0x00000000UL,       // x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1
   6.189 +      0x275e1e96UL, 0x00000000UL, 0x15aceac4UL, 0x00000001UL,       // x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1
   6.190 +      0xe2e3031eUL, 0x00000000UL, 0xaeff6292UL, 0x00000001UL,       // x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1
   6.191 +      0x041c84d8UL, 0x00000001UL, 0x9640124cUL, 0x00000000UL,       // x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1
   6.192 +      0x706ce672UL, 0x00000000UL, 0x14f41f02UL, 0x00000001UL,       // x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1
   6.193 +      0x5d5070daUL, 0x00000001UL, 0x9c5f3586UL, 0x00000000UL,       // x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1
   6.194 +      0x38f9493aUL, 0x00000000UL, 0x878275faUL, 0x00000001UL,       // x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1
   6.195 +      0xa3348a76UL, 0x00000000UL, 0xddc42ce8UL, 0x00000000UL,       // x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1
   6.196 +      0xad0aab92UL, 0x00000001UL, 0x81d2c73aUL, 0x00000001UL,       // x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1
   6.197 +      0x9e85f712UL, 0x00000001UL, 0x41c9320aUL, 0x00000001UL,       // x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1
   6.198 +      0x5a871e76UL, 0x00000000UL, 0x5235719aUL, 0x00000001UL,       // x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1
   6.199 +      0x7249c662UL, 0x00000001UL, 0xbe27d804UL, 0x00000000UL,       // x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1
   6.200 +      0x3a084712UL, 0x00000000UL, 0x6242d45aUL, 0x00000000UL,       // x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1
   6.201 +      0xed438478UL, 0x00000000UL, 0x9a53638eUL, 0x00000000UL,       // x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1
   6.202 +      0xabac34ccUL, 0x00000000UL, 0x001ecfb6UL, 0x00000001UL,       // x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1
   6.203 +      0x5f35ef3eUL, 0x00000000UL, 0x6d7c2d64UL, 0x00000001UL,       // x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1
   6.204 +      0x47d6608cUL, 0x00000000UL, 0xd0ce46c0UL, 0x00000001UL,       // x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1
   6.205 +      0x2d01470eUL, 0x00000000UL, 0x24c907b4UL, 0x00000001UL,       // x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1
   6.206 +      0x58bbc7b0UL, 0x00000001UL, 0x18a555caUL, 0x00000000UL,       // x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1
   6.207 +      0xc0a23e8eUL, 0x00000000UL, 0x6b0980bcUL, 0x00000000UL,       // x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1
   6.208 +      0xebd85c88UL, 0x00000001UL, 0x8bbba964UL, 0x00000000UL,       // x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1
   6.209 +      0x9ee20bb2UL, 0x00000001UL, 0x070a5a1eUL, 0x00000001UL,       // x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1
   6.210 +      0xacabf2d6UL, 0x00000001UL, 0x2204322aUL, 0x00000000UL,       // x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1
   6.211 +      0xb7963d56UL, 0x00000001UL, 0xa27524d0UL, 0x00000000UL,       // x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1
   6.212 +      0x7bffa1feUL, 0x00000001UL, 0x20b1e4baUL, 0x00000000UL,       // x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1
   6.213 +      0x1f15333eUL, 0x00000000UL, 0x32cc27fcUL, 0x00000000UL,       // x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1
   6.214 +      0x8593129eUL, 0x00000001UL, 0x44dd22b8UL, 0x00000000UL,       // x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1
   6.215 +      0x9cb32602UL, 0x00000001UL, 0xdffc9e0aUL, 0x00000000UL,       // x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1
   6.216 +      0x42b05cc8UL, 0x00000001UL, 0xb7a0ed14UL, 0x00000001UL,       // x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1
   6.217 +      0xbe49e7a4UL, 0x00000001UL, 0xc7842488UL, 0x00000000UL,       // x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1
   6.218 +      0x08f69d6cUL, 0x00000001UL, 0xc02a4feeUL, 0x00000001UL,       // x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1
   6.219 +      0x6c0971f0UL, 0x00000000UL, 0x3c273778UL, 0x00000000UL,       // x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1
   6.220 +      0x5b16467aUL, 0x00000000UL, 0xd63f8894UL, 0x00000001UL,       // x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1
   6.221 +      0x551a628eUL, 0x00000001UL, 0x6be557d6UL, 0x00000000UL,       // x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1
   6.222 +      0x9e42ea92UL, 0x00000001UL, 0x6a7806eaUL, 0x00000000UL,       // x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1
   6.223 +      0x2fa83ff2UL, 0x00000001UL, 0x6155aa0cUL, 0x00000001UL,       // x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1
   6.224 +      0x1ca9cde0UL, 0x00000001UL, 0x908650acUL, 0x00000000UL,       // x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1
   6.225 +      0xc8e5cd74UL, 0x00000000UL, 0xaa5a8084UL, 0x00000000UL,       // x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1
   6.226 +      0x96c27f0cUL, 0x00000000UL, 0x91bb500aUL, 0x00000001UL,       // x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1
   6.227 +      0x2baed926UL, 0x00000000UL, 0x64e9bed0UL, 0x00000000UL,       // x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1
   6.228 +      0x7c8de8d2UL, 0x00000001UL, 0x9444f302UL, 0x00000000UL,       // x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1
   6.229 +      0xd43d6068UL, 0x00000000UL, 0x9db07d3cUL, 0x00000001UL,       // x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1
   6.230 +      0xcb2c4b26UL, 0x00000000UL, 0x359e3e6eUL, 0x00000001UL,       // x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1
   6.231 +      0x45b8da26UL, 0x00000001UL, 0xe4f10dd2UL, 0x00000001UL,       // x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1
   6.232 +      0x8fff4b08UL, 0x00000001UL, 0x24f5735eUL, 0x00000001UL,       // x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1
   6.233 +      0x50b58ed0UL, 0x00000001UL, 0x24760a4cUL, 0x00000001UL,       // x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1
   6.234 +      0x549f39bcUL, 0x00000001UL, 0x0f1fc186UL, 0x00000000UL,       // x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1
   6.235 +      0xef4d2f42UL, 0x00000000UL, 0x150e4cc4UL, 0x00000000UL,       // x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1
   6.236 +      0xb1468572UL, 0x00000001UL, 0x2a6204e8UL, 0x00000000UL,       // x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1
   6.237 +      0x3d7403b2UL, 0x00000001UL, 0xbeb1d432UL, 0x00000000UL,       // x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1
   6.238 +      0xa4681842UL, 0x00000001UL, 0x35f3f1f0UL, 0x00000001UL,       // x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1
   6.239 +      0x67714492UL, 0x00000001UL, 0x74fe2232UL, 0x00000000UL,       // x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1
   6.240 +      0xe599099aUL, 0x00000001UL, 0x1ac6e2baUL, 0x00000000UL,       // x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1
   6.241 +      0xfe128194UL, 0x00000000UL, 0x13fca91eUL, 0x00000000UL,       // x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1
   6.242 +      0x77e8b990UL, 0x00000000UL, 0x83f4931eUL, 0x00000001UL,       // x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1
   6.243 +      0xa267f63aUL, 0x00000001UL, 0xb6d9b4e4UL, 0x00000000UL,       // x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1
   6.244 +      0x945c245aUL, 0x00000001UL, 0xb5188656UL, 0x00000000UL,       // x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1
   6.245 +      0x49002e76UL, 0x00000001UL, 0x27a81a84UL, 0x00000000UL,       // x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1
   6.246 +      0xbb8310a4UL, 0x00000001UL, 0x25699258UL, 0x00000001UL,       // x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1
   6.247 +      0x9ec60bccUL, 0x00000001UL, 0xb23de796UL, 0x00000001UL,       // x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1
   6.248 +      0x2d8590aeUL, 0x00000001UL, 0xfe4365dcUL, 0x00000000UL,       // x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1
   6.249 +      0x65b00684UL, 0x00000000UL, 0xc68f497aUL, 0x00000000UL,       // x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1
   6.250 +      0x5e5aeadcUL, 0x00000001UL, 0xfbf521eeUL, 0x00000000UL,       // x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1
   6.251 +      0xb77ff2b0UL, 0x00000000UL, 0x5eac3378UL, 0x00000001UL,       // x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1
   6.252 +      0x88da2ff6UL, 0x00000001UL, 0x34914b90UL, 0x00000001UL,       // x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1
   6.253 +      0x63da929aUL, 0x00000000UL, 0x16335cfeUL, 0x00000000UL,       // x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1
   6.254 +      0x389caa80UL, 0x00000001UL, 0x0372d10cUL, 0x00000001UL,       // x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1
   6.255 +      0x3db599d2UL, 0x00000001UL, 0x5097b908UL, 0x00000001UL,       // x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1
   6.256 +      0x22505a86UL, 0x00000001UL, 0x227a7572UL, 0x00000001UL,       // x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1
   6.257 +      0x6bd72746UL, 0x00000001UL, 0x9a8f75c0UL, 0x00000000UL,       // x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1
   6.258 +      0xc3faf1d4UL, 0x00000001UL, 0x682c77a2UL, 0x00000000UL,       // x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1
   6.259 +      0x111c826cUL, 0x00000001UL, 0x231f091cUL, 0x00000000UL,       // x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1
   6.260 +      0x153e9fb2UL, 0x00000000UL, 0x7d4439f2UL, 0x00000000UL,       // x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1
   6.261 +      0x2b1f7b60UL, 0x00000000UL, 0x7e221efcUL, 0x00000001UL,       // x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1
   6.262 +      0xb1dba570UL, 0x00000000UL, 0x67457c38UL, 0x00000001UL,       // x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1
   6.263 +      0xf6397b76UL, 0x00000001UL, 0xbdf081c4UL, 0x00000000UL,       // x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1
   6.264 +      0x56335214UL, 0x00000001UL, 0x6286d6b0UL, 0x00000001UL,       // x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1
   6.265 +      0xd70e3986UL, 0x00000001UL, 0xc84f001cUL, 0x00000000UL,       // x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1
   6.266 +      0x3701a774UL, 0x00000000UL, 0x64efe7c0UL, 0x00000000UL,       // x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1
   6.267 +      0xac81ef72UL, 0x00000000UL, 0x0ac2d904UL, 0x00000000UL,       // x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1
   6.268 +      0x33212464UL, 0x00000001UL, 0xfd226d14UL, 0x00000000UL,       // x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1
   6.269 +      0xe4e45610UL, 0x00000000UL, 0x1cfd42e0UL, 0x00000001UL,       // x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1
   6.270 +      0x0c1bd370UL, 0x00000000UL, 0x6e5a5678UL, 0x00000001UL,       // x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1
   6.271 +      0xa7b9e7a6UL, 0x00000001UL, 0xd888fe22UL, 0x00000001UL,       // x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1
   6.272 +      0x7d657a10UL, 0x00000000UL, 0xaf77fcd4UL, 0x00000001UL,       // x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1
   6.273 +
   6.274 +      // Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros
   6.275 +      0xec447f11UL, 0x99168a18UL, 0x13e8221eUL, 0xed837b26UL,       // x^2048 mod p(x)`, x^2016 mod p(x)`, x^1984 mod p(x)`, x^1952 mod p(x)`
   6.276 +      0x8fd2cd3cUL, 0xe23e954eUL, 0x47b9ce5aUL, 0xc8acdd81UL,       // x^1920 mod p(x)`, x^1888 mod p(x)`, x^1856 mod p(x)`, x^1824 mod p(x)`
   6.277 +      0x6b1d2b53UL, 0x92f8befeUL, 0xd4277e25UL, 0xd9ad6d87UL,       // x^1792 mod p(x)`, x^1760 mod p(x)`, x^1728 mod p(x)`, x^1696 mod p(x)`
   6.278 +      0x291ea462UL, 0xf38a3556UL, 0x33fbca3bUL, 0xc10ec5e0UL,       // x^1664 mod p(x)`, x^1632 mod p(x)`, x^1600 mod p(x)`, x^1568 mod p(x)`
   6.279 +      0x62b6ca4bUL, 0x974ac562UL, 0x82e02e2fUL, 0xc0b55b0eUL,       // x^1536 mod p(x)`, x^1504 mod p(x)`, x^1472 mod p(x)`, x^1440 mod p(x)`
   6.280 +      0x784d2a56UL, 0x855712b3UL, 0xe172334dUL, 0x71aa1df0UL,       // x^1408 mod p(x)`, x^1376 mod p(x)`, x^1344 mod p(x)`, x^1312 mod p(x)`
   6.281 +      0x0eaee722UL, 0xa5abe9f8UL, 0x3969324dUL, 0xfee3053eUL,       // x^1280 mod p(x)`, x^1248 mod p(x)`, x^1216 mod p(x)`, x^1184 mod p(x)`
   6.282 +      0xdb54814cUL, 0x1fa0943dUL, 0x3eb2bd08UL, 0xf44779b9UL,       // x^1152 mod p(x)`, x^1120 mod p(x)`, x^1088 mod p(x)`, x^1056 mod p(x)`
   6.283 +      0xd7bbfe6aUL, 0xa53ff440UL, 0x00cc3374UL, 0xf5449b3fUL,       // x^1024 mod p(x)`, x^992 mod p(x)`,  x^960 mod p(x)`,  x^928 mod p(x)`
   6.284 +      0x6325605cUL, 0xebe7e356UL, 0xd777606eUL, 0x6f8346e1UL,       // x^896 mod p(x)`,  x^864 mod p(x)`,  x^832 mod p(x)`,  x^800 mod p(x)`
   6.285 +      0xe5b592b8UL, 0xc65a272cUL, 0xc0b95347UL, 0xe3ab4f2aUL,       // x^768 mod p(x)`,  x^736 mod p(x)`,  x^704 mod p(x)`,  x^672 mod p(x)`
   6.286 +      0x4721589fUL, 0x5705a9caUL, 0x329ecc11UL, 0xaa2215eaUL,       // x^640 mod p(x)`,  x^608 mod p(x)`,  x^576 mod p(x)`,  x^544 mod p(x)`
   6.287 +      0x88d14467UL, 0xe3720acbUL, 0xd95efd26UL, 0x1ed8f66eUL,       // x^512 mod p(x)`,  x^480 mod p(x)`,  x^448 mod p(x)`,  x^416 mod p(x)`
   6.288 +      0x15141c31UL, 0xba1aca03UL, 0xa700e96aUL, 0x78ed02d5UL,       // x^384 mod p(x)`,  x^352 mod p(x)`,  x^320 mod p(x)`,  x^288 mod p(x)`
   6.289 +      0xed627daeUL, 0xad2a31b3UL, 0x32b39da3UL, 0xba8ccbe8UL,       // x^256 mod p(x)`,  x^224 mod p(x)`,  x^192 mod p(x)`,  x^160 mod p(x)`
   6.290 +      0xa06a2517UL, 0x6655004fUL, 0xb1e6b092UL, 0xedb88320UL        // x^128 mod p(x)`,  x^96 mod p(x)`,   x^64 mod p(x)`,   x^32 mod p(x)`
   6.291 +  };
   6.292 +
   6.293 +  juint* ptr = (juint*) malloc(sizeof(juint) * CRC32_CONSTANTS_SIZE);
   6.294 +  guarantee(((intptr_t)ptr & 0xF) == 0, "16-byte alignment needed");
   6.295 +  guarantee(ptr != NULL, "allocation error of a crc table");
   6.296 +  memcpy((void*)ptr, constants, sizeof(juint) * CRC32_CONSTANTS_SIZE);
   6.297 +  return ptr;
   6.298 +}
   6.299 +
   6.300 +juint* StubRoutines::ppc64::generate_crc_barret_constants() {  // Returns a malloc'ed copy of barret_constants; not freed here.
   6.301 +  juint barret_constants[CRC32_BARRET_CONSTANTS] = {             // Entries without an explicit initializer are zero-filled.
   6.302 +      0xf7011641UL, 0x00000001UL, 0x00000000UL, 0x00000000UL,
   6.303 +      0xdb710641UL, 0x00000001UL, 0x00000000UL, 0x00000000UL
   6.304 +  };
   6.305 +  juint* ptr = (juint*) malloc(sizeof(juint) * CRC32_BARRET_CONSTANTS);  // Sized by this table, not by CRC32_CONSTANTS_SIZE.
   6.306 +  guarantee(ptr != NULL, "allocation error of a crc table");             // Report allocation failure before inspecting the address.
   6.307 +  guarantee(((intptr_t)ptr & 0xF) == 0, "16-byte alignment needed");     // Callers require a 16-byte aligned table.
   6.308 +  memcpy((void*) ptr, barret_constants, sizeof(juint) * CRC32_BARRET_CONSTANTS);
   6.309 +  return ptr;
   6.310 +}
   6.311 +
   6.312  // CRC32 Intrinsics.
   6.313  /**
   6.314   *  crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.h
   6.315 @@ -477,3 +782,7 @@
   6.316  #endif
   6.317    }
   6.318  };
   6.319 +// Each pointer below is set from its generator call, which runs when the static member is initialized.
   6.320 +juint* StubRoutines::ppc64::_constants = StubRoutines::ppc64::generate_crc_constants();
   6.321 +
   6.322 +juint* StubRoutines::ppc64::_barret_constants = StubRoutines::ppc64::generate_crc_barret_constants();
     7.1 --- a/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp	Mon Sep 24 17:18:38 2018 -0400
     7.2 +++ b/src/cpu/ppc/vm/stubRoutines_ppc_64.hpp	Thu Sep 22 12:17:24 2016 +0200
     7.3 @@ -45,6 +45,8 @@
     7.4  #else
     7.5    #define CRC32_TABLES 1
     7.6  #endif
     7.7 +#define CRC32_CONSTANTS_SIZE 1084  // Number of juint entries allocated and copied by generate_crc_constants().
     7.8 +#define CRC32_BARRET_CONSTANTS 10  // Declared length (in juints) of barret_constants; 8 entries are explicitly initialized.
     7.9  
    7.10  class ppc64 {
    7.11   friend class StubGenerator;
    7.12 @@ -53,11 +55,17 @@
    7.13  
    7.14    // CRC32 Intrinsics.
    7.15    static juint _crc_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
    7.16 +  static juint* _constants;
    7.17 +  static juint* _barret_constants;
    7.18  
    7.19   public:
    7.20  
    7.21    // CRC32 Intrinsics.
    7.22    static void generate_load_crc_table_addr(MacroAssembler* masm, Register table);
    7.23 +  static void generate_load_crc_constants_addr(MacroAssembler* masm, Register table);
    7.24 +  static void generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table);
    7.25 +  static juint* generate_crc_constants();
    7.26 +  static juint* generate_crc_barret_constants();
    7.27  
    7.28  };
    7.29  
     8.1 --- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Sep 24 17:18:38 2018 -0400
     8.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Sep 22 12:17:24 2016 +0200
     8.3 @@ -102,7 +102,7 @@
     8.4    // Create and print feature-string.
     8.5    char buf[(num_features+1) * 16]; // Max 16 chars per feature.
     8.6    jio_snprintf(buf, sizeof(buf),
     8.7 -               "ppc64%s%s%s%s%s%s%s%s%s",
     8.8 +               "ppc64%s%s%s%s%s%s%s%s%s%s",
     8.9                 (has_fsqrt()   ? " fsqrt"   : ""),
    8.10                 (has_isel()    ? " isel"    : ""),
    8.11                 (has_lxarxeh() ? " lxarxeh" : ""),
    8.12 @@ -112,7 +112,8 @@
    8.13                 (has_popcntw() ? " popcntw" : ""),
    8.14                 (has_fcfids()  ? " fcfids"  : ""),
    8.15                 (has_vand()    ? " vand"    : ""),
    8.16 -               (has_vcipher() ? " aes"     : "")
    8.17 +               (has_vcipher() ? " aes"     : ""),
    8.18 +               (has_vpmsumb() ? " vpmsumb" : "")
    8.19                 // Make sure number of %s matches num_features!
    8.20                );
    8.21    _features_str = strdup(buf);
    8.22 @@ -485,6 +486,7 @@
    8.23    a->fcfids(F3, F4);                           // code[8] -> fcfids
    8.24    a->vand(VR0, VR0, VR0);                      // code[9] -> vand
    8.25    a->vcipher(VR0, VR1, VR2);                   // code[10] -> vcipher
    8.26 +  a->vpmsumb(VR0, VR1, VR2);                   // code[11] -> vpmsumb
    8.27    a->blr();
    8.28  
    8.29    // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
    8.30 @@ -529,6 +531,7 @@
    8.31    if (code[feature_cntr++]) features |= fcfids_m;
    8.32    if (code[feature_cntr++]) features |= vand_m;
    8.33    if (code[feature_cntr++]) features |= vcipher_m;
    8.34 +  if (code[feature_cntr++]) features |= vpmsumb_m;
    8.35  
    8.36    // Print the detection code.
    8.37    if (PrintAssembly) {
     9.1 --- a/src/cpu/ppc/vm/vm_version_ppc.hpp	Mon Sep 24 17:18:38 2018 -0400
     9.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.hpp	Thu Sep 22 12:17:24 2016 +0200
     9.3 @@ -43,6 +43,7 @@
     9.4      vand,
     9.5      dcba,
     9.6      vcipher,
     9.7 +    vpmsumb,
     9.8      num_features // last entry to count features
     9.9    };
    9.10    enum Feature_Flag_Set {
    9.11 @@ -58,6 +59,7 @@
    9.12      vand_m                = (1 << vand   ),
    9.13      dcba_m                = (1 << dcba   ),
    9.14      vcipher_m             = (1 << vcipher),
    9.15 +    vpmsumb_m             = (1 << vpmsumb),
    9.16      all_features_m        = -1
    9.17    };
    9.18    static int  _features;
    9.19 @@ -86,6 +88,7 @@
    9.20    static bool has_vand()    { return (_features & vand_m) != 0; }
    9.21    static bool has_dcba()    { return (_features & dcba_m) != 0; }
    9.22    static bool has_vcipher() { return (_features & vcipher_m) != 0; }
    9.23 +  static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
    9.24  
    9.25    static const char* cpu_features() { return _features_str; }
    9.26  

mercurial