author:      kvn
date:        Tue, 05 Aug 2014 15:02:10 -0700
changeset:   7025:b1bc1af04c6e
parent:      7024:bfba6779654b
child:       7026:922c87c9aed4

8052081: Optimize code generated by C2 for Intel's Atom processor
Summary: Allow vectorization and the CRC32 intrinsic optimization to be used on Atom. Enable UseFPUForSpilling by default on x86.
Reviewed-by: roland

src/cpu/x86/vm/assembler_x86.cpp
src/cpu/x86/vm/assembler_x86.hpp
src/cpu/x86/vm/macroAssembler_x86.cpp
src/cpu/x86/vm/macroAssembler_x86.hpp
src/cpu/x86/vm/vm_version_x86.cpp
src/share/vm/opto/lcm.cpp
src/share/vm/opto/superword.cpp
     1.1 --- a/src/cpu/x86/vm/assembler_x86.cpp	Tue Aug 12 15:17:46 2014 +0000
     1.2 +++ b/src/cpu/x86/vm/assembler_x86.cpp	Tue Aug 05 15:02:10 2014 -0700
     1.3 @@ -3854,6 +3854,15 @@
     1.4  }
     1.5  
     1.6  // Carry-Less Multiplication Quadword
     1.7 +void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
     1.8 +  assert(VM_Version::supports_clmul(), "");
     1.9 +  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
    1.10 +  emit_int8(0x44);
    1.11 +  emit_int8((unsigned char)(0xC0 | encode));
    1.12 +  emit_int8((unsigned char)mask);
    1.13 +}
    1.14 +
    1.15 +// Carry-Less Multiplication Quadword
    1.16  void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
    1.17    assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
    1.18    bool vector256 = false;
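
Note on the hunk above: the new two-operand pclmulqdq() lets the CRC32 stub use carry-less multiplication without the AVX-encoded vpclmulqdq, which matters on CPUs such as Silvermont that report CLMUL but not AVX. As a reminder of what the instruction computes, here is a small software model of the 64x64 -> 128-bit carry-less multiply; it is an illustrative sketch only, not part of the patch. The imm8 operand of pclmulqdq()/vpclmulqdq() merely selects which quadwords of the two registers feed this multiply (0x00 = both low halves, 0x11 = both high halves, as the MacroAssembler wrappers below use).

    #include <cstdint>

    // Sketch only: software model of the carry-less multiply performed by
    // PCLMULQDQ on the quadwords selected by imm8.
    static void clmul64(uint64_t a, uint64_t b, uint64_t* lo, uint64_t* hi) {
      uint64_t l = 0, h = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          l ^= a << i;                   // XOR (GF(2) add) of a shifted by i
          if (i != 0) {
            h ^= a >> (64 - i);          // bits carried past bit 63
          }
        }
      }
      *lo = l;   // bits  63:0  of the 127-bit product
      *hi = h;   // bits 127:64 of the product (bit 127 is always 0)
    }
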
     2.1 --- a/src/cpu/x86/vm/assembler_x86.hpp	Tue Aug 12 15:17:46 2014 +0000
     2.2 +++ b/src/cpu/x86/vm/assembler_x86.hpp	Tue Aug 05 15:02:10 2014 -0700
     2.3 @@ -1837,6 +1837,7 @@
     2.4    void vpbroadcastd(XMMRegister dst, XMMRegister src);
     2.5  
     2.6    // Carry-Less Multiplication Quadword
     2.7 +  void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
     2.8    void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
     2.9  
    2.10    // AVX instruction which is used to clear upper 128 bits of YMM registers and
     3.1 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Aug 12 15:17:46 2014 +0000
     3.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Aug 05 15:02:10 2014 -0700
     3.3 @@ -7316,17 +7316,34 @@
     3.4   * Fold 128-bit data chunk
     3.5   */
     3.6  void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
     3.7 -  vpclmulhdq(xtmp, xK, xcrc); // [123:64]
     3.8 -  vpclmulldq(xcrc, xK, xcrc); // [63:0]
     3.9 -  vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
    3.10 -  pxor(xcrc, xtmp);
    3.11 +  if (UseAVX > 0) {
    3.12 +    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
    3.13 +    vpclmulldq(xcrc, xK, xcrc); // [63:0]
    3.14 +    vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
    3.15 +    pxor(xcrc, xtmp);
    3.16 +  } else {
    3.17 +    movdqa(xtmp, xcrc);
    3.18 +    pclmulhdq(xtmp, xK);   // [123:64]
    3.19 +    pclmulldq(xcrc, xK);   // [63:0]
    3.20 +    pxor(xcrc, xtmp);
    3.21 +    movdqu(xtmp, Address(buf, offset));
    3.22 +    pxor(xcrc, xtmp);
    3.23 +  }
    3.24  }
    3.25  
    3.26  void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
    3.27 -  vpclmulhdq(xtmp, xK, xcrc);
    3.28 -  vpclmulldq(xcrc, xK, xcrc);
    3.29 -  pxor(xcrc, xbuf);
    3.30 -  pxor(xcrc, xtmp);
    3.31 +  if (UseAVX > 0) {
    3.32 +    vpclmulhdq(xtmp, xK, xcrc);
    3.33 +    vpclmulldq(xcrc, xK, xcrc);
    3.34 +    pxor(xcrc, xbuf);
    3.35 +    pxor(xcrc, xtmp);
    3.36 +  } else {
    3.37 +    movdqa(xtmp, xcrc);
    3.38 +    pclmulhdq(xtmp, xK);
    3.39 +    pclmulldq(xcrc, xK);
    3.40 +    pxor(xcrc, xbuf);
    3.41 +    pxor(xcrc, xtmp);
    3.42 +  }
    3.43  }
    3.44  
    3.45  /**
    3.46 @@ -7444,9 +7461,17 @@
    3.47    // Fold 128 bits in xmm1 down into 32 bits in crc register.
    3.48    BIND(L_fold_128b);
    3.49    movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
    3.50 -  vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
    3.51 -  vpand(xmm3, xmm0, xmm2, false /* vector256 */);
    3.52 -  vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
    3.53 +  if (UseAVX > 0) {
    3.54 +    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
    3.55 +    vpand(xmm3, xmm0, xmm2, false /* vector256 */);
    3.56 +    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
    3.57 +  } else {
    3.58 +    movdqa(xmm2, xmm0);
    3.59 +    pclmulqdq(xmm2, xmm1, 0x1);
    3.60 +    movdqa(xmm3, xmm0);
    3.61 +    pand(xmm3, xmm2);
    3.62 +    pclmulqdq(xmm0, xmm3, 0x1);
    3.63 +  }
    3.64    psrldq(xmm1, 8);
    3.65    psrldq(xmm2, 4);
    3.66    pxor(xmm0, xmm1);
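
Note on the macroAssembler hunks above: both fold_128bit_crc32 routines gain a non-AVX branch. The legacy SSE forms of PCLMULQDQ and PXOR overwrite their first operand, so the fallback copies xcrc into xtmp with movdqa before multiplying, whereas the AVX path writes the two products into separate registers directly. The following scalar model of one fold step is a sketch only, under the assumption that clmul() behaves like PCLMULQDQ on the selected quadwords; the u128 type and helper names are hypothetical.

    #include <cstdint>

    // Sketch only: one 128-bit fold step of the CRC32 kernel in scalar form.
    // xK holds the two folding constants (k.hi, k.lo), xcrc the running
    // value, and 'chunk' the next 16 bytes of input.
    struct u128 { uint64_t lo, hi; };

    static u128 fold_step(u128 crc, u128 k, u128 chunk,
                          u128 (*clmul)(uint64_t, uint64_t)) {
      u128 p_hi = clmul(crc.hi, k.hi);      // pclmulhdq / vpclmulhdq (imm8 0x11)
      u128 p_lo = clmul(crc.lo, k.lo);      // pclmulldq / vpclmulldq (imm8 0x00)
      u128 r;
      r.lo = p_lo.lo ^ p_hi.lo ^ chunk.lo;  // the pxor / vpxor steps
      r.hi = p_lo.hi ^ p_hi.hi ^ chunk.hi;
      return r;
    }
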
     4.1 --- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Aug 12 15:17:46 2014 +0000
     4.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Aug 05 15:02:10 2014 -0700
     4.3 @@ -966,6 +966,16 @@
     4.4    void mulss(XMMRegister dst, Address src)        { Assembler::mulss(dst, src); }
     4.5    void mulss(XMMRegister dst, AddressLiteral src);
     4.6  
     4.7 +  // Carry-Less Multiplication Quadword
     4.8 +  void pclmulldq(XMMRegister dst, XMMRegister src) {
     4.9 +    // 0x00 - multiply lower 64 bits [0:63]
    4.10 +    Assembler::pclmulqdq(dst, src, 0x00);
    4.11 +  }
    4.12 +  void pclmulhdq(XMMRegister dst, XMMRegister src) {
    4.13 +    // 0x11 - multiply upper 64 bits [64:127]
    4.14 +    Assembler::pclmulqdq(dst, src, 0x11);
    4.15 +  }
    4.16 +
    4.17    void sqrtsd(XMMRegister dst, XMMRegister src)    { Assembler::sqrtsd(dst, src); }
    4.18    void sqrtsd(XMMRegister dst, Address src)        { Assembler::sqrtsd(dst, src); }
    4.19    void sqrtsd(XMMRegister dst, AddressLiteral src);
     5.1 --- a/src/cpu/x86/vm/vm_version_x86.cpp	Tue Aug 12 15:17:46 2014 +0000
     5.2 +++ b/src/cpu/x86/vm/vm_version_x86.cpp	Tue Aug 05 15:02:10 2014 -0700
     5.3 @@ -568,7 +568,7 @@
     5.4      FLAG_SET_DEFAULT(UseCLMUL, false);
     5.5    }
     5.6  
     5.7 -  if (UseCLMUL && (UseAVX > 0) && (UseSSE > 2)) {
     5.8 +  if (UseCLMUL && (UseSSE > 2)) {
     5.9      if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
    5.10        UseCRC32Intrinsics = true;
    5.11      }
    5.12 @@ -803,6 +803,21 @@
    5.13          }
    5.14        }
    5.15      }
    5.16 +    if ((cpu_family() == 0x06) &&
    5.17 +        ((extended_cpu_model() == 0x36) || // Centerton
    5.18 +         (extended_cpu_model() == 0x37) || // Silvermont
    5.19 +         (extended_cpu_model() == 0x4D))) {
    5.20 +#ifdef COMPILER2
    5.21 +      if (FLAG_IS_DEFAULT(OptoScheduling)) {
    5.22 +        OptoScheduling = true;
    5.23 +      }
    5.24 +#endif
    5.25 +      if (supports_sse4_2()) { // Silvermont
    5.26 +        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
    5.27 +          UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
    5.28 +        }
    5.29 +      }
    5.30 +    }
    5.31    }
    5.32  
    5.33    // Use count leading zeros count instruction if available.
    5.34 @@ -890,23 +905,25 @@
    5.35    AllocatePrefetchDistance = allocate_prefetch_distance();
    5.36    AllocatePrefetchStyle    = allocate_prefetch_style();
    5.37  
    5.38 -  if( is_intel() && cpu_family() == 6 && supports_sse3() ) {
    5.39 -    if( AllocatePrefetchStyle == 2 ) { // watermark prefetching on Core
    5.40 +  if (is_intel() && cpu_family() == 6 && supports_sse3()) {
    5.41 +    if (AllocatePrefetchStyle == 2) { // watermark prefetching on Core
    5.42  #ifdef _LP64
    5.43        AllocatePrefetchDistance = 384;
    5.44  #else
    5.45        AllocatePrefetchDistance = 320;
    5.46  #endif
    5.47      }
    5.48 -    if( supports_sse4_2() && supports_ht() ) { // Nehalem based cpus
    5.49 +    if (supports_sse4_2() && supports_ht()) { // Nehalem based cpus
    5.50        AllocatePrefetchDistance = 192;
    5.51        AllocatePrefetchLines = 4;
    5.52 +    }
    5.53  #ifdef COMPILER2
    5.54 -      if (AggressiveOpts && FLAG_IS_DEFAULT(UseFPUForSpilling)) {
    5.55 +    if (supports_sse4_2()) {
    5.56 +      if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
    5.57          FLAG_SET_DEFAULT(UseFPUForSpilling, true);
    5.58        }
    5.59 +    }
    5.60  #endif
    5.61 -    }
    5.62    }
    5.63    assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value");
    5.64  
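
Note on the vm_version hunks above: the Atom/Silvermont tuning is keyed off CPUID family 0x06 with extended models 0x36 (Centerton), 0x37 (Silvermont) and 0x4D. The sketch below shows how such DisplayFamily/DisplayModel values are derived from CPUID leaf 1 per the Intel SDM; HotSpot has its own cpuid plumbing in vm_version_x86, and __get_cpuid here is the GCC/Clang intrinsic, so this is only an illustrative stand-in, not the patched code.

    #include <cpuid.h>   // GCC/Clang builtin wrapper; assumed toolchain

    // Sketch only: compute the values tested above as cpu_family() == 0x06
    // and extended_cpu_model() == 0x36/0x37/0x4D.
    static void display_family_model(unsigned* family, unsigned* model) {
      unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
      __get_cpuid(1, &eax, &ebx, &ecx, &edx);
      unsigned base_family = (eax >> 8)  & 0xF;
      unsigned base_model  = (eax >> 4)  & 0xF;
      unsigned ext_family  = (eax >> 20) & 0xFF;
      unsigned ext_model   = (eax >> 16) & 0xF;
      *family = (base_family == 0xF) ? base_family + ext_family : base_family;
      *model  = (base_family == 0x6 || base_family == 0xF)
                    ? (ext_model << 4) + base_model
                    : base_model;
    }

The resulting defaults (OptoScheduling, UseUnalignedLoadStores, UseFPUForSpilling) can be confirmed at runtime with -XX:+PrintFlagsFinal.
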
     6.1 --- a/src/share/vm/opto/lcm.cpp	Tue Aug 12 15:17:46 2014 +0000
     6.2 +++ b/src/share/vm/opto/lcm.cpp	Tue Aug 05 15:02:10 2014 -0700
     6.3 @@ -484,7 +484,9 @@
     6.4          iop == Op_CreateEx ||   // Create-exception must start block
     6.5          iop == Op_CheckCastPP
     6.6          ) {
     6.7 -      worklist.map(i,worklist.pop());
     6.8 +      // select the node n
     6.9 +      // remove n from worklist and retain the order of remaining nodes
    6.10 +      worklist.remove((uint)i);
    6.11        return n;
    6.12      }
    6.13  
    6.14 @@ -570,7 +572,9 @@
    6.15    assert(idx >= 0, "index should be set");
    6.16    Node *n = worklist[(uint)idx];      // Get the winner
    6.17  
    6.18 -  worklist.map((uint)idx, worklist.pop());     // Compress worklist
    6.19 +  // select the node n
    6.20 +  // remove n from worklist and retain the order of remaining nodes
    6.21 +  worklist.remove((uint)idx);
    6.22    return n;
    6.23  }
    6.24  
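
Note on the lcm.cpp hunks above: both call sites used to compress the worklist with worklist.map(i, worklist.pop()), which drops the last element into slot i; worklist.remove(i) keeps the remaining nodes in their original order, as the new comments state. A minimal sketch of the difference, with std::vector standing in for HotSpot's Node_List:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Sketch only: the two removal idioms.
    template <typename T>
    void swap_with_last(std::vector<T>& v, std::size_t i) {  // old: map(i, pop())
      v[i] = v.back();   // last element takes slot i ...
      v.pop_back();      // ... so the relative order of the tail changes
    }

    template <typename T>
    void remove_keep_order(std::vector<T>& v, std::size_t i) {  // new: remove(i)
      v.erase(v.begin() + i);  // shift the tail down, preserving order
    }

    int main() {
      std::vector<int> a = {10, 20, 30, 40};
      std::vector<int> b = a;
      swap_with_last(a, 1);
      remove_keep_order(b, 1);
      assert((a == std::vector<int>{10, 40, 30}));
      assert((b == std::vector<int>{10, 30, 40}));
      return 0;
    }
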
     7.1 --- a/src/share/vm/opto/superword.cpp	Tue Aug 12 15:17:46 2014 +0000
     7.2 +++ b/src/share/vm/opto/superword.cpp	Tue Aug 05 15:02:10 2014 -0700
     7.3 @@ -1374,6 +1374,20 @@
     7.4        if (n->is_Load()) {
     7.5          Node* ctl = n->in(MemNode::Control);
     7.6          Node* mem = first->in(MemNode::Memory);
     7.7 +        SWPointer p1(n->as_Mem(), this);
     7.8 +        // Identify the memory dependency for the new loadVector node by
     7.9 +        // walking up through memory chain.
    7.10 +        // This is done to give flexibility to the new loadVector node so that
    7.11 +        // it can move above independent storeVector nodes.
    7.12 +        while (mem->is_StoreVector()) {
    7.13 +          SWPointer p2(mem->as_Mem(), this);
    7.14 +          int cmp = p1.cmp(p2);
    7.15 +          if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
    7.16 +            mem = mem->in(MemNode::Memory);
    7.17 +          } else {
    7.18 +            break; // dependent memory
    7.19 +          }
    7.20 +        }
    7.21          Node* adr = low_adr->in(MemNode::Address);
    7.22          const TypePtr* atyp = n->adr_type();
    7.23          vn = LoadVectorNode::make(C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
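
Note on the superword.cpp hunk above: the added loop starts the new LoadVector's memory chain above StoreVector nodes whose SWPointer compares as not equal to the load's address (or as not comparable), stopping at the first dependent store. The sketch below shows only the shape of that walk; MemNode and the dependent() predicate are hypothetical stand-ins for HotSpot's Node and the SWPointer::cmp / not_equal / comparable test.

    // Sketch only, not the HotSpot API.
    struct MemNode {
      bool     is_store_vector;   // node is a StoreVector
      MemNode* memory_in;         // its MemNode::Memory input
    };

    static MemNode* walk_up_memory(MemNode* mem, const MemNode* load,
                                   bool (*dependent)(const MemNode* load,
                                                     const MemNode* store)) {
      // Skip StoreVectors the load does not depend on, so the new LoadVector
      // can be scheduled above them; stop at the first dependent store.
      while (mem->is_store_vector && !dependent(load, mem)) {
        mem = mem->memory_in;
      }
      return mem;
    }
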
