6940701: Don't align loops in stubs for Niagara sparc

Wed, 07 Apr 2010 09:37:47 -0700

author: kvn
date: Wed, 07 Apr 2010 09:37:47 -0700
changeset: 1800:6476042f815c
parent: 1799:0dc88ad3244e
child: 1801:b9d85fcdf743

6940701: Don't align loops in stubs for Niagara sparc
Summary: Don't align loops in stubs for Niagara sparc since NOPs are expensive.
Reviewed-by: twisti, never

src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp file | annotate | diff | comparison | revisions
src/cpu/sparc/vm/c2_globals_sparc.hpp file | annotate | diff | comparison | revisions
src/cpu/sparc/vm/globals_sparc.hpp file | annotate | diff | comparison | revisions
src/cpu/sparc/vm/stubGenerator_sparc.cpp file | annotate | diff | comparison | revisions
src/cpu/sparc/vm/vm_version_sparc.cpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/c2_globals_x86.hpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/globals_x86.hpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/stubGenerator_x86_32.cpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/stubGenerator_x86_64.cpp file | annotate | diff | comparison | revisions
src/share/vm/opto/c2_globals.hpp file | annotate | diff | comparison | revisions
src/share/vm/runtime/globals.hpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Tue Apr 06 15:18:10 2010 -0700
     1.2 +++ b/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Wed Apr 07 09:37:47 2010 -0700
     1.3 @@ -2849,7 +2849,7 @@
     1.4  
     1.5  
     1.6  void LIR_Assembler::align_backward_branch_target() {
     1.7 -  __ align(16);
     1.8 +  __ align(OptoLoopAlignment);
     1.9  }
    1.10  
    1.11  
     2.1 --- a/src/cpu/sparc/vm/c2_globals_sparc.hpp	Tue Apr 06 15:18:10 2010 -0700
     2.2 +++ b/src/cpu/sparc/vm/c2_globals_sparc.hpp	Wed Apr 07 09:37:47 2010 -0700
     2.3 @@ -60,9 +60,6 @@
     2.4  define_pd_global(intx, INTPRESSURE,                  48);  // large register set
     2.5  define_pd_global(intx, InteriorEntryAlignment,       16);  // = CodeEntryAlignment
     2.6  define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
     2.7 -// The default setting 16/16 seems to work best.
     2.8 -// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
     2.9 -define_pd_global(intx, OptoLoopAlignment,            16);  // = 4*wordSize
    2.10  define_pd_global(intx, RegisterCostAreaRatio,        12000);
    2.11  define_pd_global(bool, UseTLAB,                      true);
    2.12  define_pd_global(bool, ResizeTLAB,                   true);
     3.1 --- a/src/cpu/sparc/vm/globals_sparc.hpp	Tue Apr 06 15:18:10 2010 -0700
     3.2 +++ b/src/cpu/sparc/vm/globals_sparc.hpp	Wed Apr 07 09:37:47 2010 -0700
     3.3 @@ -40,6 +40,9 @@
     3.4  define_pd_global(bool, UncommonNullCast,            true);  // Uncommon-trap NULLs past to check cast
     3.5  
     3.6  define_pd_global(intx, CodeEntryAlignment,    32);
     3.7 +// The default setting 16/16 seems to work best.
     3.8 +// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
     3.9 +define_pd_global(intx, OptoLoopAlignment,     16);  // = 4*wordSize
    3.10  define_pd_global(intx, InlineFrequencyCount,  50);  // we can use more inlining on the SPARC
    3.11  define_pd_global(intx, InlineSmallCode,       1500);
    3.12  #ifdef _LP64
     4.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Tue Apr 06 15:18:10 2010 -0700
     4.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Wed Apr 07 09:37:47 2010 -0700
     4.3 @@ -1148,7 +1148,7 @@
     4.4        __ andn(from, 7, from);     // Align address
     4.5        __ ldx(from, 0, O3);
     4.6        __ inc(from, 8);
     4.7 -      __ align(16);
     4.8 +      __ align(OptoLoopAlignment);
     4.9      __ BIND(L_loop);
    4.10        __ ldx(from, 0, O4);
    4.11        __ deccc(count, count_dec); // Can we do next iteration after this one?
    4.12 @@ -1220,7 +1220,7 @@
    4.13      //
    4.14        __ andn(end_from, 7, end_from);     // Align address
    4.15        __ ldx(end_from, 0, O3);
    4.16 -      __ align(16);
    4.17 +      __ align(OptoLoopAlignment);
    4.18      __ BIND(L_loop);
    4.19        __ ldx(end_from, -8, O4);
    4.20        __ deccc(count, count_dec); // Can we do next iteration after this one?
    4.21 @@ -1349,7 +1349,7 @@
    4.22      __ BIND(L_copy_byte);
    4.23        __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
    4.24        __ delayed()->nop();
    4.25 -      __ align(16);
    4.26 +      __ align(OptoLoopAlignment);
    4.27      __ BIND(L_copy_byte_loop);
    4.28        __ ldub(from, offset, O3);
    4.29        __ deccc(count);
    4.30 @@ -1445,7 +1445,7 @@
    4.31                                          L_aligned_copy, L_copy_byte);
    4.32      }
    4.33      // copy 4 elements (16 bytes) at a time
    4.34 -      __ align(16);
    4.35 +      __ align(OptoLoopAlignment);
    4.36      __ BIND(L_aligned_copy);
    4.37        __ dec(end_from, 16);
    4.38        __ ldx(end_from, 8, O3);
    4.39 @@ -1461,7 +1461,7 @@
    4.40      __ BIND(L_copy_byte);
    4.41        __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
    4.42        __ delayed()->nop();
    4.43 -      __ align(16);
    4.44 +      __ align(OptoLoopAlignment);
    4.45      __ BIND(L_copy_byte_loop);
    4.46        __ dec(end_from);
    4.47        __ dec(end_to);
    4.48 @@ -1577,7 +1577,7 @@
    4.49      __ BIND(L_copy_2_bytes);
    4.50        __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
    4.51        __ delayed()->nop();
    4.52 -      __ align(16);
    4.53 +      __ align(OptoLoopAlignment);
    4.54      __ BIND(L_copy_2_bytes_loop);
    4.55        __ lduh(from, offset, O3);
    4.56        __ deccc(count);
    4.57 @@ -1684,7 +1684,7 @@
    4.58                                          L_aligned_copy, L_copy_2_bytes);
    4.59      }
    4.60      // copy 4 elements (16 bytes) at a time
    4.61 -      __ align(16);
    4.62 +      __ align(OptoLoopAlignment);
    4.63      __ BIND(L_aligned_copy);
    4.64        __ dec(end_from, 16);
    4.65        __ ldx(end_from, 8, O3);
    4.66 @@ -1781,7 +1781,7 @@
    4.67      // copy with shift 4 elements (16 bytes) at a time
    4.68        __ dec(count, 4);   // The cmp at the beginning guaranty count >= 4
    4.69  
    4.70 -      __ align(16);
    4.71 +      __ align(OptoLoopAlignment);
    4.72      __ BIND(L_copy_16_bytes);
    4.73        __ ldx(from, 4, O4);
    4.74        __ deccc(count, 4); // Can we do next iteration after this one?
    4.75 @@ -1907,7 +1907,7 @@
    4.76      // to form 2 aligned 8-bytes chunks to store.
    4.77      //
    4.78        __ ldx(end_from, -4, O3);
    4.79 -      __ align(16);
    4.80 +      __ align(OptoLoopAlignment);
    4.81      __ BIND(L_copy_16_bytes);
    4.82        __ ldx(end_from, -12, O4);
    4.83        __ deccc(count, 4);
    4.84 @@ -1929,7 +1929,7 @@
    4.85        __ delayed()->inc(count, 4);
    4.86  
    4.87      // copy 4 elements (16 bytes) at a time
    4.88 -      __ align(16);
    4.89 +      __ align(OptoLoopAlignment);
    4.90      __ BIND(L_aligned_copy);
    4.91        __ dec(end_from, 16);
    4.92        __ ldx(end_from, 8, O3);
    4.93 @@ -2045,7 +2045,7 @@
    4.94        __ mov(O3, count);
    4.95        __ mov(from, from64);
    4.96  
    4.97 -      __ align(16);
    4.98 +      __ align(OptoLoopAlignment);
    4.99      __ BIND(L_copy_64_bytes);
   4.100        for( int off = 0; off < 64; off += 16 ) {
   4.101          __ ldx(from64,  off+0, O4);
   4.102 @@ -2065,7 +2065,7 @@
   4.103        __ delayed()->add(offset0, 8, offset8);
   4.104  
   4.105        // Copy by 16 bytes chunks
   4.106 -      __ align(16);
   4.107 +      __ align(OptoLoopAlignment);
   4.108      __ BIND(L_copy_16_bytes);
   4.109        __ ldx(from, offset0, O3);
   4.110        __ ldx(from, offset8, G3);
   4.111 @@ -2139,7 +2139,7 @@
   4.112        __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
   4.113        __ delayed()->sllx(count, LogBytesPerLong, offset8);
   4.114        __ sub(offset8, 8, offset0);
   4.115 -      __ align(16);
   4.116 +      __ align(OptoLoopAlignment);
   4.117      __ BIND(L_copy_16_bytes);
   4.118        __ ldx(from, offset8, O2);
   4.119        __ ldx(from, offset0, O3);
   4.120 @@ -2405,7 +2405,7 @@
   4.121      //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
   4.122      //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
   4.123      //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
   4.124 -    __ align(16);
   4.125 +    __ align(OptoLoopAlignment);
   4.126  
   4.127      __ BIND(store_element);
   4.128      __ deccc(G1_remain);                // decrement the count
     5.1 --- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Tue Apr 06 15:18:10 2010 -0700
     5.2 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Wed Apr 07 09:37:47 2010 -0700
     5.3 @@ -86,14 +86,14 @@
     5.4      if (FLAG_IS_DEFAULT(InteriorEntryAlignment)) {
     5.5        FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
     5.6      }
     5.7 -    if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
     5.8 -      FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
     5.9 -    }
    5.10      if (is_niagara1_plus() && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
    5.11        // Use smaller prefetch distance on N2
    5.12        FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
    5.13      }
    5.14  #endif
    5.15 +    if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
    5.16 +      FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
    5.17 +    }
    5.18    }
    5.19  
    5.20    // Use hardware population count instruction if available.
     6.1 --- a/src/cpu/x86/vm/c2_globals_x86.hpp	Tue Apr 06 15:18:10 2010 -0700
     6.2 +++ b/src/cpu/x86/vm/c2_globals_x86.hpp	Wed Apr 07 09:37:47 2010 -0700
     6.3 @@ -80,7 +80,6 @@
     6.4  // Ergonomics related flags
     6.5  define_pd_global(uint64_t,MaxRAM,                    4ULL*G);
     6.6  #endif // AMD64
     6.7 -define_pd_global(intx, OptoLoopAlignment,            16);
     6.8  define_pd_global(intx, RegisterCostAreaRatio,        16000);
     6.9  
    6.10  // Peephole and CISC spilling both break the graph, and so makes the
     7.1 --- a/src/cpu/x86/vm/globals_x86.hpp	Tue Apr 06 15:18:10 2010 -0700
     7.2 +++ b/src/cpu/x86/vm/globals_x86.hpp	Wed Apr 07 09:37:47 2010 -0700
     7.3 @@ -45,6 +45,7 @@
     7.4  #else
     7.5  define_pd_global(intx, CodeEntryAlignment,       16);
     7.6  #endif // COMPILER2
     7.7 +define_pd_global(intx, OptoLoopAlignment,        16);
     7.8  define_pd_global(intx, InlineFrequencyCount,     100);
     7.9  define_pd_global(intx, InlineSmallCode,          1000);
    7.10  
     8.1 --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Apr 06 15:18:10 2010 -0700
     8.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Wed Apr 07 09:37:47 2010 -0700
     8.3 @@ -812,7 +812,7 @@
     8.4      Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     8.5      // Copy 64-byte chunks
     8.6      __ jmpb(L_copy_64_bytes);
     8.7 -    __ align(16);
     8.8 +    __ align(OptoLoopAlignment);
     8.9    __ BIND(L_copy_64_bytes_loop);
    8.10  
    8.11      if(UseUnalignedLoadStores) {
    8.12 @@ -874,7 +874,7 @@
    8.13      Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
    8.14      // Copy 64-byte chunks
    8.15      __ jmpb(L_copy_64_bytes);
    8.16 -    __ align(16);
    8.17 +    __ align(OptoLoopAlignment);
    8.18    __ BIND(L_copy_64_bytes_loop);
    8.19      __ movq(mmx0, Address(from, 0));
    8.20      __ movq(mmx1, Address(from, 8));
    8.21 @@ -1144,7 +1144,7 @@
    8.22        __ movl(Address(to, count, sf, 0), rdx);
    8.23        __ jmpb(L_copy_8_bytes);
    8.24  
    8.25 -      __ align(16);
    8.26 +      __ align(OptoLoopAlignment);
    8.27        // Move 8 bytes
    8.28      __ BIND(L_copy_8_bytes_loop);
    8.29        if (UseXMMForArrayCopy) {
    8.30 @@ -1235,7 +1235,7 @@
    8.31        }
    8.32      } else {
    8.33        __ jmpb(L_copy_8_bytes);
    8.34 -      __ align(16);
    8.35 +      __ align(OptoLoopAlignment);
    8.36      __ BIND(L_copy_8_bytes_loop);
    8.37        __ fild_d(Address(from, 0));
    8.38        __ fistp_d(Address(from, to_from, Address::times_1));
    8.39 @@ -1282,7 +1282,7 @@
    8.40  
    8.41      __ jmpb(L_copy_8_bytes);
    8.42  
    8.43 -    __ align(16);
    8.44 +    __ align(OptoLoopAlignment);
    8.45    __ BIND(L_copy_8_bytes_loop);
    8.46      if (VM_Version::supports_mmx()) {
    8.47        if (UseXMMForArrayCopy) {
    8.48 @@ -1454,7 +1454,7 @@
    8.49      // Loop control:
    8.50      //   for (count = -count; count != 0; count++)
    8.51      // Base pointers src, dst are biased by 8*count,to last element.
    8.52 -    __ align(16);
    8.53 +    __ align(OptoLoopAlignment);
    8.54  
    8.55      __ BIND(L_store_element);
    8.56      __ movptr(to_element_addr, elem);     // store the oop
     9.1 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Apr 06 15:18:10 2010 -0700
     9.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Wed Apr 07 09:37:47 2010 -0700
     9.3 @@ -871,9 +871,8 @@
     9.4    }
     9.5  
     9.6    address generate_fp_mask(const char *stub_name, int64_t mask) {
     9.7 +    __ align(CodeEntryAlignment);
     9.8      StubCodeMark mark(this, "StubRoutines", stub_name);
     9.9 -
    9.10 -    __ align(16);
    9.11      address start = __ pc();
    9.12  
    9.13      __ emit_data64( mask, relocInfo::none );
    9.14 @@ -1268,7 +1267,7 @@
    9.15                               Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
    9.16      DEBUG_ONLY(__ stop("enter at entry label, not here"));
    9.17      Label L_loop;
    9.18 -    __ align(16);
    9.19 +    __ align(OptoLoopAlignment);
    9.20    __ BIND(L_loop);
    9.21      if(UseUnalignedLoadStores) {
    9.22        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
    9.23 @@ -1309,7 +1308,7 @@
    9.24                                Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
    9.25      DEBUG_ONLY(__ stop("enter at entry label, not here"));
    9.26      Label L_loop;
    9.27 -    __ align(16);
    9.28 +    __ align(OptoLoopAlignment);
    9.29    __ BIND(L_loop);
    9.30      if(UseUnalignedLoadStores) {
    9.31        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
    9.32 @@ -2229,7 +2228,7 @@
    9.33      // Loop control:
    9.34      //   for (count = -count; count != 0; count++)
    9.35      // Base pointers src, dst are biased by 8*(count-1),to last element.
    9.36 -    __ align(16);
    9.37 +    __ align(OptoLoopAlignment);
    9.38  
    9.39      __ BIND(L_store_element);
    9.40      __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
    10.1 --- a/src/share/vm/opto/c2_globals.hpp	Tue Apr 06 15:18:10 2010 -0700
    10.2 +++ b/src/share/vm/opto/c2_globals.hpp	Wed Apr 07 09:37:47 2010 -0700
    10.3 @@ -52,9 +52,6 @@
    10.4            "Code alignment for interior entry points "                       \
    10.5            "in generated code (in bytes)")                                   \
    10.6                                                                              \
    10.7 -  product_pd(intx, OptoLoopAlignment,                                       \
    10.8 -          "Align inner loops to zero relative to this modulus")             \
    10.9 -                                                                            \
   10.10    product(intx, MaxLoopPad, (OptoLoopAlignment-1),                          \
   10.11            "Align a loop if padding size in bytes is less or equal to this value") \
   10.12                                                                              \
    11.1 --- a/src/share/vm/runtime/globals.hpp	Tue Apr 06 15:18:10 2010 -0700
    11.2 +++ b/src/share/vm/runtime/globals.hpp	Wed Apr 07 09:37:47 2010 -0700
    11.3 @@ -3110,6 +3110,9 @@
    11.4    develop_pd(intx, CodeEntryAlignment,                                      \
    11.5            "Code entry alignment for generated code (in bytes)")             \
    11.6                                                                              \
    11.7 +  product_pd(intx, OptoLoopAlignment,                                       \
    11.8 +          "Align inner loops to zero relative to this modulus")             \
    11.9 +                                                                            \
   11.10    product_pd(uintx, InitialCodeCacheSize,                                   \
   11.11            "Initial code cache size (in bytes)")                             \
   11.12                                                                              \

mercurial