Tue, 05 Aug 2014 15:02:10 -0700
8052081: Optimize C2-generated code for Intel's Atom processor
Summary: Allow vectorization and the CRC32 intrinsic to be used on Atom. Enable UseFPUForSpilling by default on x86.
Reviewed-by: roland
--- a/src/cpu/x86/vm/assembler_x86.cpp	Tue Aug 12 15:17:46 2014 +0000
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Tue Aug 05 15:02:10 2014 -0700
@@ -3854,6 +3854,15 @@
 }
 
 // Carry-Less Multiplication Quadword
+void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
+  assert(VM_Version::supports_clmul(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
+  emit_int8(0x44);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8((unsigned char)mask);
+}
+
+// Carry-Less Multiplication Quadword
 void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
   assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
   bool vector256 = false;
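For reference, the new two-operand method emits the legacy-SSE encoding of PCLMULQDQ (66 0F 3A 44 /r ib); the immediate selects which 64-bit halves of the two operands are carry-less multiplied. A minimal standalone sketch of the same selector semantics, written with the C intrinsic rather than the HotSpot assembler (illustrative only; compile with -mpclmul):

    #include <wmmintrin.h>   // _mm_clmulepi64_si128 (PCLMULQDQ)

    // imm8 = 0x00: a[63:0]   * b[63:0]   -> 128-bit carry-less product
    // imm8 = 0x11: a[127:64] * b[127:64] -> 128-bit carry-less product
    static inline __m128i clmul_lo(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x00); }
    static inline __m128i clmul_hi(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x11); }

These 0x00/0x11 selectors are the same values the pclmulldq/pclmulhdq wrappers added to macroAssembler_x86.hpp below pass through.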
--- a/src/cpu/x86/vm/assembler_x86.hpp	Tue Aug 12 15:17:46 2014 +0000
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Tue Aug 05 15:02:10 2014 -0700
@@ -1837,6 +1837,7 @@
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
 
   // Carry-Less Multiplication Quadword
+  void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
   void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
 
   // AVX instruction which is used to clear upper 128 bits of YMM registers and
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Aug 12 15:17:46 2014 +0000
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Aug 05 15:02:10 2014 -0700
@@ -7316,17 +7316,34 @@
  * Fold 128-bit data chunk
  */
 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
-  vpclmulhdq(xtmp, xK, xcrc); // [123:64]
-  vpclmulldq(xcrc, xK, xcrc); // [63:0]
-  vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
-  pxor(xcrc, xtmp);
+  if (UseAVX > 0) {
+    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
+    vpclmulldq(xcrc, xK, xcrc); // [63:0]
+    vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
+    pxor(xcrc, xtmp);
+  } else {
+    movdqa(xtmp, xcrc);
+    pclmulhdq(xtmp, xK); // [123:64]
+    pclmulldq(xcrc, xK); // [63:0]
+    pxor(xcrc, xtmp);
+    movdqu(xtmp, Address(buf, offset));
+    pxor(xcrc, xtmp);
+  }
 }
 
 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
-  vpclmulhdq(xtmp, xK, xcrc);
-  vpclmulldq(xcrc, xK, xcrc);
-  pxor(xcrc, xbuf);
-  pxor(xcrc, xtmp);
+  if (UseAVX > 0) {
+    vpclmulhdq(xtmp, xK, xcrc);
+    vpclmulldq(xcrc, xK, xcrc);
+    pxor(xcrc, xbuf);
+    pxor(xcrc, xtmp);
+  } else {
+    movdqa(xtmp, xcrc);
+    pclmulhdq(xtmp, xK);
+    pclmulldq(xcrc, xK);
+    pxor(xcrc, xbuf);
+    pxor(xcrc, xtmp);
+  }
 }
 
 /**
@@ -7444,9 +7461,17 @@
   // Fold 128 bits in xmm1 down into 32 bits in crc register.
   BIND(L_fold_128b);
   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
-  vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
-  vpand(xmm3, xmm0, xmm2, false /* vector256 */);
-  vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
+  if (UseAVX > 0) {
+    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
+    vpand(xmm3, xmm0, xmm2, false /* vector256 */);
+    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
+  } else {
+    movdqa(xmm2, xmm0);
+    pclmulqdq(xmm2, xmm1, 0x1);
+    movdqa(xmm3, xmm0);
+    pand(xmm3, xmm2);
+    pclmulqdq(xmm0, xmm3, 0x1);
+  }
   psrldq(xmm1, 8);
   psrldq(xmm2, 4);
   pxor(xmm0, xmm1);
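The else branches above are needed because the legacy-SSE form of PCLMULQDQ is destructive (the destination is also the first source), so xcrc must be copied into xtmp with movdqa before multiplying; the VEX three-operand form on the UseAVX > 0 path avoids that extra move. A rough, self-contained sketch of one 128-bit folding step using intrinsics (assuming the xK register packs the two folding constants, as in the code above; illustrative only):

    #include <emmintrin.h>
    #include <wmmintrin.h>

    // Fold the running CRC state over the next 16-byte chunk of the buffer.
    static inline __m128i fold_128bit_step(__m128i crc, __m128i k, const void* buf) {
      __m128i hi   = _mm_clmulepi64_si128(crc, k, 0x11);    // crc[127:64] * k[127:64]
      __m128i lo   = _mm_clmulepi64_si128(crc, k, 0x00);    // crc[63:0]   * k[63:0]
      __m128i data = _mm_loadu_si128((const __m128i*)buf);  // next input chunk
      return _mm_xor_si128(_mm_xor_si128(hi, lo), data);    // xor-fold everything together
    }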
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Aug 12 15:17:46 2014 +0000
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Aug 05 15:02:10 2014 -0700
@@ -966,6 +966,16 @@
   void mulss(XMMRegister dst, Address src) { Assembler::mulss(dst, src); }
   void mulss(XMMRegister dst, AddressLiteral src);
 
+  // Carry-Less Multiplication Quadword
+  void pclmulldq(XMMRegister dst, XMMRegister src) {
+    // 0x00 - multiply lower 64 bits [0:63]
+    Assembler::pclmulqdq(dst, src, 0x00);
+  }
+  void pclmulhdq(XMMRegister dst, XMMRegister src) {
+    // 0x11 - multiply upper 64 bits [64:127]
+    Assembler::pclmulqdq(dst, src, 0x11);
+  }
+
   void sqrtsd(XMMRegister dst, XMMRegister src) { Assembler::sqrtsd(dst, src); }
   void sqrtsd(XMMRegister dst, Address src) { Assembler::sqrtsd(dst, src); }
   void sqrtsd(XMMRegister dst, AddressLiteral src);
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Tue Aug 12 15:17:46 2014 +0000
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Tue Aug 05 15:02:10 2014 -0700
@@ -568,7 +568,7 @@
     FLAG_SET_DEFAULT(UseCLMUL, false);
   }
 
-  if (UseCLMUL && (UseAVX > 0) && (UseSSE > 2)) {
+  if (UseCLMUL && (UseSSE > 2)) {
     if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
       UseCRC32Intrinsics = true;
     }
@@ -803,6 +803,21 @@
       }
     }
   }
+    if ((cpu_family() == 0x06) &&
+        ((extended_cpu_model() == 0x36) || // Centerton
+         (extended_cpu_model() == 0x37) || // Silvermont
+         (extended_cpu_model() == 0x4D))) {
+#ifdef COMPILER2
+      if (FLAG_IS_DEFAULT(OptoScheduling)) {
+        OptoScheduling = true;
+      }
+#endif
+      if (supports_sse4_2()) { // Silvermont
+        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
+          UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
+        }
+      }
+    }
   }
 
   // Use count leading zeros count instruction if available.
@@ -890,23 +905,25 @@
   AllocatePrefetchDistance = allocate_prefetch_distance();
   AllocatePrefetchStyle = allocate_prefetch_style();
 
-  if( is_intel() && cpu_family() == 6 && supports_sse3() ) {
-    if( AllocatePrefetchStyle == 2 ) { // watermark prefetching on Core
+  if (is_intel() && cpu_family() == 6 && supports_sse3()) {
+    if (AllocatePrefetchStyle == 2) { // watermark prefetching on Core
 #ifdef _LP64
       AllocatePrefetchDistance = 384;
 #else
       AllocatePrefetchDistance = 320;
 #endif
     }
-    if( supports_sse4_2() && supports_ht() ) { // Nehalem based cpus
+    if (supports_sse4_2() && supports_ht()) { // Nehalem based cpus
       AllocatePrefetchDistance = 192;
       AllocatePrefetchLines = 4;
+    }
 #ifdef COMPILER2
-      if (AggressiveOpts && FLAG_IS_DEFAULT(UseFPUForSpilling)) {
+    if (supports_sse4_2()) {
+      if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
         FLAG_SET_DEFAULT(UseFPUForSpilling, true);
       }
+    }
 #endif
-    }
   }
   assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value");
 
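The new model check keys off CPUID family 0x06 with display models 0x36, 0x37 and 0x4D; for family 6 the display model combines the extended-model and model fields of CPUID leaf 1, which is roughly the value the extended_cpu_model() calls above compare against. A standalone sketch of that decoding outside HotSpot, using GCC/Clang's <cpuid.h> (illustrative only):

    #include <cpuid.h>
    #include <cstdio>

    int main() {
      unsigned eax, ebx, ecx, edx;
      if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;
      unsigned family    = (eax >> 8)  & 0xF;   // base family
      unsigned model     = (eax >> 4)  & 0xF;   // base model
      unsigned ext_model = (eax >> 16) & 0xF;   // extended model
      // For family 0x06 the display model is (ext_model << 4) + model,
      // e.g. 0x37 or 0x4D on Silvermont-based Atoms.
      unsigned display_model = (family == 0x6 || family == 0xF) ? (ext_model << 4) + model : model;
      std::printf("family 0x%02x model 0x%02x\n", family, display_model);
      return 0;
    }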
--- a/src/share/vm/opto/lcm.cpp	Tue Aug 12 15:17:46 2014 +0000
+++ b/src/share/vm/opto/lcm.cpp	Tue Aug 05 15:02:10 2014 -0700
@@ -484,7 +484,9 @@
         iop == Op_CreateEx ||   // Create-exception must start block
         iop == Op_CheckCastPP
         ) {
-      worklist.map(i,worklist.pop());
+      // select the node n
+      // remove n from worklist and retain the order of remaining nodes
+      worklist.remove((uint)i);
       return n;
     }
 
@@ -570,7 +572,9 @@
   assert(idx >= 0, "index should be set");
   Node *n = worklist[(uint)idx];      // Get the winner
 
-  worklist.map((uint)idx, worklist.pop());     // Compress worklist
+  // select the node n
+  // remove n from worklist and retain the order of remaining nodes
+  worklist.remove((uint)idx);
   return n;
 }
 
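The mechanical difference here: worklist.map(i, worklist.pop()) removes the selected node in O(1) by overwriting slot i with the last element, which reorders the remaining candidates, while worklist.remove(i) shifts the tail down and keeps the remaining nodes in their original order, as the added comments state. A tiny illustration of the two removal strategies on a plain vector (a minimal sketch, not the Node_List API):

    #include <cstddef>
    #include <vector>

    // Swap-with-last removal: O(1), but elements after 'i' change relative order.
    template <typename T>
    void remove_unordered(std::vector<T>& v, std::size_t i) {
      v[i] = v.back();
      v.pop_back();
    }

    // Order-preserving removal: O(n), remaining elements keep their original order.
    template <typename T>
    void remove_ordered(std::vector<T>& v, std::size_t i) {
      v.erase(v.begin() + static_cast<std::ptrdiff_t>(i));
    }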
--- a/src/share/vm/opto/superword.cpp	Tue Aug 12 15:17:46 2014 +0000
+++ b/src/share/vm/opto/superword.cpp	Tue Aug 05 15:02:10 2014 -0700
@@ -1374,6 +1374,20 @@
     if (n->is_Load()) {
       Node* ctl = n->in(MemNode::Control);
       Node* mem = first->in(MemNode::Memory);
+      SWPointer p1(n->as_Mem(), this);
+      // Identify the memory dependency for the new loadVector node by
+      // walking up through memory chain.
+      // This is done to give flexibility to the new loadVector node so that
+      // it can move above independent storeVector nodes.
+      while (mem->is_StoreVector()) {
+        SWPointer p2(mem->as_Mem(), this);
+        int cmp = p1.cmp(p2);
+        if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
+          mem = mem->in(MemNode::Memory);
+        } else {
+          break; // dependent memory
+        }
+      }
       Node* adr = low_adr->in(MemNode::Address);
       const TypePtr* atyp = n->adr_type();
       vn = LoadVectorNode::make(C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
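In effect, the new loop starts from the pack's memory input and climbs the memory chain as long as that input is a StoreVector whose SWPointer is not equal to (or not comparable with) the load's address form, stopping at the first store treated as a real dependency; that lets the generated LoadVector float above the independent StoreVector nodes. A simplified, self-contained sketch of the walk, with a hypothetical MemState type and may_alias predicate standing in for HotSpot's memory graph and SWPointer comparison:

    // Hypothetical stand-in for a node in the memory chain.
    struct MemState {
      bool      is_store_vector;
      MemState* prior;            // next memory state up the chain
    };

    // Placeholder aliasing oracle; the real code compares SWPointer address forms.
    static bool may_alias(const MemState&, const MemState&) { return true; }

    // Skip stores proven independent of 'load'; return the first state it may depend on.
    static MemState* skip_independent_stores(MemState* mem, const MemState& load) {
      while (mem->is_store_vector && !may_alias(*mem, load)) {
        mem = mem->prior;         // provably independent: keep climbing
      }
      return mem;
    }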