Wed, 07 Apr 2010 09:37:47 -0700
6940701: Don't align loops in stubs for Niagara sparc
Summary: Don't align loops in stubs for Niagara SPARC, since the NOPs inserted as alignment padding are expensive.
Reviewed-by: twisti, never
1.1 --- a/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp Tue Apr 06 15:18:10 2010 -0700 1.2 +++ b/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp Wed Apr 07 09:37:47 2010 -0700 1.3 @@ -2849,7 +2849,7 @@ 1.4 1.5 1.6 void LIR_Assembler::align_backward_branch_target() { 1.7 - __ align(16); 1.8 + __ align(OptoLoopAlignment); 1.9 } 1.10 1.11
2.1 --- a/src/cpu/sparc/vm/c2_globals_sparc.hpp Tue Apr 06 15:18:10 2010 -0700 2.2 +++ b/src/cpu/sparc/vm/c2_globals_sparc.hpp Wed Apr 07 09:37:47 2010 -0700 2.3 @@ -60,9 +60,6 @@ 2.4 define_pd_global(intx, INTPRESSURE, 48); // large register set 2.5 define_pd_global(intx, InteriorEntryAlignment, 16); // = CodeEntryAlignment 2.6 define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K)); 2.7 -// The default setting 16/16 seems to work best. 2.8 -// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.) 2.9 -define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize 2.10 define_pd_global(intx, RegisterCostAreaRatio, 12000); 2.11 define_pd_global(bool, UseTLAB, true); 2.12 define_pd_global(bool, ResizeTLAB, true);
3.1 --- a/src/cpu/sparc/vm/globals_sparc.hpp Tue Apr 06 15:18:10 2010 -0700 3.2 +++ b/src/cpu/sparc/vm/globals_sparc.hpp Wed Apr 07 09:37:47 2010 -0700 3.3 @@ -40,6 +40,9 @@ 3.4 define_pd_global(bool, UncommonNullCast, true); // Uncommon-trap NULLs past to check cast 3.5 3.6 define_pd_global(intx, CodeEntryAlignment, 32); 3.7 +// The default setting 16/16 seems to work best. 3.8 +// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.) 3.9 +define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize 3.10 define_pd_global(intx, InlineFrequencyCount, 50); // we can use more inlining on the SPARC 3.11 define_pd_global(intx, InlineSmallCode, 1500); 3.12 #ifdef _LP64
4.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Tue Apr 06 15:18:10 2010 -0700 4.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Wed Apr 07 09:37:47 2010 -0700 4.3 @@ -1148,7 +1148,7 @@ 4.4 __ andn(from, 7, from); // Align address 4.5 __ ldx(from, 0, O3); 4.6 __ inc(from, 8); 4.7 - __ align(16); 4.8 + __ align(OptoLoopAlignment); 4.9 __ BIND(L_loop); 4.10 __ ldx(from, 0, O4); 4.11 __ deccc(count, count_dec); // Can we do next iteration after this one? 4.12 @@ -1220,7 +1220,7 @@ 4.13 // 4.14 __ andn(end_from, 7, end_from); // Align address 4.15 __ ldx(end_from, 0, O3); 4.16 - __ align(16); 4.17 + __ align(OptoLoopAlignment); 4.18 __ BIND(L_loop); 4.19 __ ldx(end_from, -8, O4); 4.20 __ deccc(count, count_dec); // Can we do next iteration after this one? 4.21 @@ -1349,7 +1349,7 @@ 4.22 __ BIND(L_copy_byte); 4.23 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); 4.24 __ delayed()->nop(); 4.25 - __ align(16); 4.26 + __ align(OptoLoopAlignment); 4.27 __ BIND(L_copy_byte_loop); 4.28 __ ldub(from, offset, O3); 4.29 __ deccc(count); 4.30 @@ -1445,7 +1445,7 @@ 4.31 L_aligned_copy, L_copy_byte); 4.32 } 4.33 // copy 4 elements (16 bytes) at a time 4.34 - __ align(16); 4.35 + __ align(OptoLoopAlignment); 4.36 __ BIND(L_aligned_copy); 4.37 __ dec(end_from, 16); 4.38 __ ldx(end_from, 8, O3); 4.39 @@ -1461,7 +1461,7 @@ 4.40 __ BIND(L_copy_byte); 4.41 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); 4.42 __ delayed()->nop(); 4.43 - __ align(16); 4.44 + __ align(OptoLoopAlignment); 4.45 __ BIND(L_copy_byte_loop); 4.46 __ dec(end_from); 4.47 __ dec(end_to); 4.48 @@ -1577,7 +1577,7 @@ 4.49 __ BIND(L_copy_2_bytes); 4.50 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); 4.51 __ delayed()->nop(); 4.52 - __ align(16); 4.53 + __ align(OptoLoopAlignment); 4.54 __ BIND(L_copy_2_bytes_loop); 4.55 __ lduh(from, offset, O3); 4.56 __ deccc(count); 4.57 @@ -1684,7 +1684,7 @@ 4.58 L_aligned_copy, L_copy_2_bytes); 4.59 } 4.60 // copy 4 elements 
(16 bytes) at a time 4.61 - __ align(16); 4.62 + __ align(OptoLoopAlignment); 4.63 __ BIND(L_aligned_copy); 4.64 __ dec(end_from, 16); 4.65 __ ldx(end_from, 8, O3); 4.66 @@ -1781,7 +1781,7 @@ 4.67 // copy with shift 4 elements (16 bytes) at a time 4.68 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 4.69 4.70 - __ align(16); 4.71 + __ align(OptoLoopAlignment); 4.72 __ BIND(L_copy_16_bytes); 4.73 __ ldx(from, 4, O4); 4.74 __ deccc(count, 4); // Can we do next iteration after this one? 4.75 @@ -1907,7 +1907,7 @@ 4.76 // to form 2 aligned 8-bytes chunks to store. 4.77 // 4.78 __ ldx(end_from, -4, O3); 4.79 - __ align(16); 4.80 + __ align(OptoLoopAlignment); 4.81 __ BIND(L_copy_16_bytes); 4.82 __ ldx(end_from, -12, O4); 4.83 __ deccc(count, 4); 4.84 @@ -1929,7 +1929,7 @@ 4.85 __ delayed()->inc(count, 4); 4.86 4.87 // copy 4 elements (16 bytes) at a time 4.88 - __ align(16); 4.89 + __ align(OptoLoopAlignment); 4.90 __ BIND(L_aligned_copy); 4.91 __ dec(end_from, 16); 4.92 __ ldx(end_from, 8, O3); 4.93 @@ -2045,7 +2045,7 @@ 4.94 __ mov(O3, count); 4.95 __ mov(from, from64); 4.96 4.97 - __ align(16); 4.98 + __ align(OptoLoopAlignment); 4.99 __ BIND(L_copy_64_bytes); 4.100 for( int off = 0; off < 64; off += 16 ) { 4.101 __ ldx(from64, off+0, O4); 4.102 @@ -2065,7 +2065,7 @@ 4.103 __ delayed()->add(offset0, 8, offset8); 4.104 4.105 // Copy by 16 bytes chunks 4.106 - __ align(16); 4.107 + __ align(OptoLoopAlignment); 4.108 __ BIND(L_copy_16_bytes); 4.109 __ ldx(from, offset0, O3); 4.110 __ ldx(from, offset8, G3); 4.111 @@ -2139,7 +2139,7 @@ 4.112 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); 4.113 __ delayed()->sllx(count, LogBytesPerLong, offset8); 4.114 __ sub(offset8, 8, offset0); 4.115 - __ align(16); 4.116 + __ align(OptoLoopAlignment); 4.117 __ BIND(L_copy_16_bytes); 4.118 __ ldx(from, offset8, O2); 4.119 __ ldx(from, offset0, O3); 4.120 @@ -2405,7 +2405,7 @@ 4.121 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays 
4.122 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* 4.123 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super 4.124 - __ align(16); 4.125 + __ align(OptoLoopAlignment); 4.126 4.127 __ BIND(store_element); 4.128 __ deccc(G1_remain); // decrement the count
5.1 --- a/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Apr 06 15:18:10 2010 -0700 5.2 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp Wed Apr 07 09:37:47 2010 -0700 5.3 @@ -86,14 +86,14 @@ 5.4 if (FLAG_IS_DEFAULT(InteriorEntryAlignment)) { 5.5 FLAG_SET_DEFAULT(InteriorEntryAlignment, 4); 5.6 } 5.7 - if (FLAG_IS_DEFAULT(OptoLoopAlignment)) { 5.8 - FLAG_SET_DEFAULT(OptoLoopAlignment, 4); 5.9 - } 5.10 if (is_niagara1_plus() && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { 5.11 // Use smaller prefetch distance on N2 5.12 FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256); 5.13 } 5.14 #endif 5.15 + if (FLAG_IS_DEFAULT(OptoLoopAlignment)) { 5.16 + FLAG_SET_DEFAULT(OptoLoopAlignment, 4); 5.17 + } 5.18 } 5.19 5.20 // Use hardware population count instruction if available.
6.1 --- a/src/cpu/x86/vm/c2_globals_x86.hpp Tue Apr 06 15:18:10 2010 -0700 6.2 +++ b/src/cpu/x86/vm/c2_globals_x86.hpp Wed Apr 07 09:37:47 2010 -0700 6.3 @@ -80,7 +80,6 @@ 6.4 // Ergonomics related flags 6.5 define_pd_global(uint64_t,MaxRAM, 4ULL*G); 6.6 #endif // AMD64 6.7 -define_pd_global(intx, OptoLoopAlignment, 16); 6.8 define_pd_global(intx, RegisterCostAreaRatio, 16000); 6.9 6.10 // Peephole and CISC spilling both break the graph, and so makes the
7.1 --- a/src/cpu/x86/vm/globals_x86.hpp Tue Apr 06 15:18:10 2010 -0700 7.2 +++ b/src/cpu/x86/vm/globals_x86.hpp Wed Apr 07 09:37:47 2010 -0700 7.3 @@ -45,6 +45,7 @@ 7.4 #else 7.5 define_pd_global(intx, CodeEntryAlignment, 16); 7.6 #endif // COMPILER2 7.7 +define_pd_global(intx, OptoLoopAlignment, 16); 7.8 define_pd_global(intx, InlineFrequencyCount, 100); 7.9 define_pd_global(intx, InlineSmallCode, 1000); 7.10
8.1 --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Apr 06 15:18:10 2010 -0700 8.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Apr 07 09:37:47 2010 -0700 8.3 @@ -812,7 +812,7 @@ 8.4 Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; 8.5 // Copy 64-byte chunks 8.6 __ jmpb(L_copy_64_bytes); 8.7 - __ align(16); 8.8 + __ align(OptoLoopAlignment); 8.9 __ BIND(L_copy_64_bytes_loop); 8.10 8.11 if(UseUnalignedLoadStores) { 8.12 @@ -874,7 +874,7 @@ 8.13 Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; 8.14 // Copy 64-byte chunks 8.15 __ jmpb(L_copy_64_bytes); 8.16 - __ align(16); 8.17 + __ align(OptoLoopAlignment); 8.18 __ BIND(L_copy_64_bytes_loop); 8.19 __ movq(mmx0, Address(from, 0)); 8.20 __ movq(mmx1, Address(from, 8)); 8.21 @@ -1144,7 +1144,7 @@ 8.22 __ movl(Address(to, count, sf, 0), rdx); 8.23 __ jmpb(L_copy_8_bytes); 8.24 8.25 - __ align(16); 8.26 + __ align(OptoLoopAlignment); 8.27 // Move 8 bytes 8.28 __ BIND(L_copy_8_bytes_loop); 8.29 if (UseXMMForArrayCopy) { 8.30 @@ -1235,7 +1235,7 @@ 8.31 } 8.32 } else { 8.33 __ jmpb(L_copy_8_bytes); 8.34 - __ align(16); 8.35 + __ align(OptoLoopAlignment); 8.36 __ BIND(L_copy_8_bytes_loop); 8.37 __ fild_d(Address(from, 0)); 8.38 __ fistp_d(Address(from, to_from, Address::times_1)); 8.39 @@ -1282,7 +1282,7 @@ 8.40 8.41 __ jmpb(L_copy_8_bytes); 8.42 8.43 - __ align(16); 8.44 + __ align(OptoLoopAlignment); 8.45 __ BIND(L_copy_8_bytes_loop); 8.46 if (VM_Version::supports_mmx()) { 8.47 if (UseXMMForArrayCopy) { 8.48 @@ -1454,7 +1454,7 @@ 8.49 // Loop control: 8.50 // for (count = -count; count != 0; count++) 8.51 // Base pointers src, dst are biased by 8*count,to last element. 8.52 - __ align(16); 8.53 + __ align(OptoLoopAlignment); 8.54 8.55 __ BIND(L_store_element); 8.56 __ movptr(to_element_addr, elem); // store the oop
9.1 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Apr 06 15:18:10 2010 -0700 9.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Apr 07 09:37:47 2010 -0700 9.3 @@ -871,9 +871,8 @@ 9.4 } 9.5 9.6 address generate_fp_mask(const char *stub_name, int64_t mask) { 9.7 + __ align(CodeEntryAlignment); 9.8 StubCodeMark mark(this, "StubRoutines", stub_name); 9.9 - 9.10 - __ align(16); 9.11 address start = __ pc(); 9.12 9.13 __ emit_data64( mask, relocInfo::none ); 9.14 @@ -1268,7 +1267,7 @@ 9.15 Label& L_copy_32_bytes, Label& L_copy_8_bytes) { 9.16 DEBUG_ONLY(__ stop("enter at entry label, not here")); 9.17 Label L_loop; 9.18 - __ align(16); 9.19 + __ align(OptoLoopAlignment); 9.20 __ BIND(L_loop); 9.21 if(UseUnalignedLoadStores) { 9.22 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); 9.23 @@ -1309,7 +1308,7 @@ 9.24 Label& L_copy_32_bytes, Label& L_copy_8_bytes) { 9.25 DEBUG_ONLY(__ stop("enter at entry label, not here")); 9.26 Label L_loop; 9.27 - __ align(16); 9.28 + __ align(OptoLoopAlignment); 9.29 __ BIND(L_loop); 9.30 if(UseUnalignedLoadStores) { 9.31 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); 9.32 @@ -2229,7 +2228,7 @@ 9.33 // Loop control: 9.34 // for (count = -count; count != 0; count++) 9.35 // Base pointers src, dst are biased by 8*(count-1),to last element. 9.36 - __ align(16); 9.37 + __ align(OptoLoopAlignment); 9.38 9.39 __ BIND(L_store_element); 9.40 __ store_heap_oop(to_element_addr, rax_oop); // store the oop
10.1 --- a/src/share/vm/opto/c2_globals.hpp Tue Apr 06 15:18:10 2010 -0700 10.2 +++ b/src/share/vm/opto/c2_globals.hpp Wed Apr 07 09:37:47 2010 -0700 10.3 @@ -52,9 +52,6 @@ 10.4 "Code alignment for interior entry points " \ 10.5 "in generated code (in bytes)") \ 10.6 \ 10.7 - product_pd(intx, OptoLoopAlignment, \ 10.8 - "Align inner loops to zero relative to this modulus") \ 10.9 - \ 10.10 product(intx, MaxLoopPad, (OptoLoopAlignment-1), \ 10.11 "Align a loop if padding size in bytes is less or equal to this value") \ 10.12 \
11.1 --- a/src/share/vm/runtime/globals.hpp Tue Apr 06 15:18:10 2010 -0700 11.2 +++ b/src/share/vm/runtime/globals.hpp Wed Apr 07 09:37:47 2010 -0700 11.3 @@ -3110,6 +3110,9 @@ 11.4 develop_pd(intx, CodeEntryAlignment, \ 11.5 "Code entry alignment for generated code (in bytes)") \ 11.6 \ 11.7 + product_pd(intx, OptoLoopAlignment, \ 11.8 + "Align inner loops to zero relative to this modulus") \ 11.9 + \ 11.10 product_pd(uintx, InitialCodeCacheSize, \ 11.11 "Initial code cache size (in bytes)") \ 11.12 \