Tue, 14 Oct 2008 15:10:26 -0700
6532536: Optimize arraycopy stubs for Intel CPUs
Summary: Use SSE2 movdqu in arraycopy stubs on the newest Intel CPUs
Reviewed-by: rasbold
1.1 --- a/src/cpu/x86/vm/assembler_x86.cpp Tue Oct 14 06:58:58 2008 -0700 1.2 +++ b/src/cpu/x86/vm/assembler_x86.cpp Tue Oct 14 15:10:26 2008 -0700 1.3 @@ -1575,6 +1575,35 @@ 1.4 emit_operand(src, dst); 1.5 } 1.6 1.7 +void Assembler::movdqu(XMMRegister dst, Address src) { 1.8 + NOT_LP64(assert(VM_Version::supports_sse2(), "")); 1.9 + InstructionMark im(this); 1.10 + emit_byte(0xF3); 1.11 + prefix(src, dst); 1.12 + emit_byte(0x0F); 1.13 + emit_byte(0x6F); 1.14 + emit_operand(dst, src); 1.15 +} 1.16 + 1.17 +void Assembler::movdqu(XMMRegister dst, XMMRegister src) { 1.18 + NOT_LP64(assert(VM_Version::supports_sse2(), "")); 1.19 + emit_byte(0xF3); 1.20 + int encode = prefixq_and_encode(dst->encoding(), src->encoding()); 1.21 + emit_byte(0x0F); 1.22 + emit_byte(0x6F); 1.23 + emit_byte(0xC0 | encode); 1.24 +} 1.25 + 1.26 +void Assembler::movdqu(Address dst, XMMRegister src) { 1.27 + NOT_LP64(assert(VM_Version::supports_sse2(), "")); 1.28 + InstructionMark im(this); 1.29 + emit_byte(0xF3); 1.30 + prefix(dst, src); 1.31 + emit_byte(0x0F); 1.32 + emit_byte(0x7F); 1.33 + emit_operand(src, dst); 1.34 +} 1.35 + 1.36 // Uses zero extension on 64bit 1.37 1.38 void Assembler::movl(Register dst, int32_t imm32) {
2.1 --- a/src/cpu/x86/vm/assembler_x86.hpp Tue Oct 14 06:58:58 2008 -0700 2.2 +++ b/src/cpu/x86/vm/assembler_x86.hpp Tue Oct 14 15:10:26 2008 -0700 2.3 @@ -1055,6 +1055,11 @@ 2.4 void movdqa(XMMRegister dst, Address src); 2.5 void movdqa(XMMRegister dst, XMMRegister src); 2.6 2.7 + // Move Unaligned Double Quadword 2.8 + void movdqu(Address dst, XMMRegister src); 2.9 + void movdqu(XMMRegister dst, Address src); 2.10 + void movdqu(XMMRegister dst, XMMRegister src); 2.11 + 2.12 void movl(Register dst, int32_t imm32); 2.13 void movl(Address dst, int32_t imm32); 2.14 void movl(Register dst, Register src);
3.1 --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Oct 14 06:58:58 2008 -0700 3.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Oct 14 15:10:26 2008 -0700 3.3 @@ -791,6 +791,69 @@ 3.4 } 3.5 } 3.6 3.7 + 3.8 + // Copy 64 bytes chunks 3.9 + // 3.10 + // Inputs: 3.11 + // from - source array address 3.12 + // to_from - destination array address - from 3.13 + // qword_count - 8-bytes element count, negative 3.14 + // 3.15 + void xmm_copy_forward(Register from, Register to_from, Register qword_count) { 3.16 + assert( UseSSE >= 2, "supported cpu only" ); 3.17 + Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; 3.18 + // Copy 64-byte chunks 3.19 + __ jmpb(L_copy_64_bytes); 3.20 + __ align(16); 3.21 + __ BIND(L_copy_64_bytes_loop); 3.22 + 3.23 + if(UseUnalignedLoadStores) { 3.24 + __ movdqu(xmm0, Address(from, 0)); 3.25 + __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0); 3.26 + __ movdqu(xmm1, Address(from, 16)); 3.27 + __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1); 3.28 + __ movdqu(xmm2, Address(from, 32)); 3.29 + __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2); 3.30 + __ movdqu(xmm3, Address(from, 48)); 3.31 + __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3); 3.32 + 3.33 + } else { 3.34 + __ movq(xmm0, Address(from, 0)); 3.35 + __ movq(Address(from, to_from, Address::times_1, 0), xmm0); 3.36 + __ movq(xmm1, Address(from, 8)); 3.37 + __ movq(Address(from, to_from, Address::times_1, 8), xmm1); 3.38 + __ movq(xmm2, Address(from, 16)); 3.39 + __ movq(Address(from, to_from, Address::times_1, 16), xmm2); 3.40 + __ movq(xmm3, Address(from, 24)); 3.41 + __ movq(Address(from, to_from, Address::times_1, 24), xmm3); 3.42 + __ movq(xmm4, Address(from, 32)); 3.43 + __ movq(Address(from, to_from, Address::times_1, 32), xmm4); 3.44 + __ movq(xmm5, Address(from, 40)); 3.45 + __ movq(Address(from, to_from, Address::times_1, 40), xmm5); 3.46 + __ movq(xmm6, Address(from, 48)); 3.47 + __ movq(Address(from, 
to_from, Address::times_1, 48), xmm6); 3.48 + __ movq(xmm7, Address(from, 56)); 3.49 + __ movq(Address(from, to_from, Address::times_1, 56), xmm7); 3.50 + } 3.51 + 3.52 + __ addl(from, 64); 3.53 + __ BIND(L_copy_64_bytes); 3.54 + __ subl(qword_count, 8); 3.55 + __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop); 3.56 + __ addl(qword_count, 8); 3.57 + __ jccb(Assembler::zero, L_exit); 3.58 + // 3.59 + // length is too short, just copy qwords 3.60 + // 3.61 + __ BIND(L_copy_8_bytes); 3.62 + __ movq(xmm0, Address(from, 0)); 3.63 + __ movq(Address(from, to_from, Address::times_1), xmm0); 3.64 + __ addl(from, 8); 3.65 + __ decrement(qword_count); 3.66 + __ jcc(Assembler::greater, L_copy_8_bytes); 3.67 + __ BIND(L_exit); 3.68 + } 3.69 + 3.70 // Copy 64 bytes chunks 3.71 // 3.72 // Inputs: 3.73 @@ -799,6 +862,7 @@ 3.74 // qword_count - 8-bytes element count, negative 3.75 // 3.76 void mmx_copy_forward(Register from, Register to_from, Register qword_count) { 3.77 + assert( VM_Version::supports_mmx(), "supported cpu only" ); 3.78 Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; 3.79 // Copy 64-byte chunks 3.80 __ jmpb(L_copy_64_bytes); 3.81 @@ -876,7 +940,7 @@ 3.82 __ subptr(to, from); // to --> to_from 3.83 __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element 3.84 __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp 3.85 - if (!aligned && (t == T_BYTE || t == T_SHORT)) { 3.86 + if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) { 3.87 // align source address at 4 bytes address boundary 3.88 if (t == T_BYTE) { 3.89 // One byte misalignment happens only for byte arrays 3.90 @@ -906,20 +970,26 @@ 3.91 __ mov(count, rax); // restore 'count' 3.92 __ jmpb(L_copy_2_bytes); // all dwords were copied 3.93 } else { 3.94 - // align to 8 bytes, we know we are 4 byte aligned to start 3.95 - __ testptr(from, 4); 3.96 - __ jccb(Assembler::zero, L_copy_64_bytes); 3.97 - __ movl(rax, Address(from, 0)); 3.98 - __ 
movl(Address(from, to_from, Address::times_1, 0), rax); 3.99 - __ addptr(from, 4); 3.100 - __ subl(count, 1<<shift); 3.101 + if (!UseUnalignedLoadStores) { 3.102 + // align to 8 bytes, we know we are 4 byte aligned to start 3.103 + __ testptr(from, 4); 3.104 + __ jccb(Assembler::zero, L_copy_64_bytes); 3.105 + __ movl(rax, Address(from, 0)); 3.106 + __ movl(Address(from, to_from, Address::times_1, 0), rax); 3.107 + __ addptr(from, 4); 3.108 + __ subl(count, 1<<shift); 3.109 + } 3.110 __ BIND(L_copy_64_bytes); 3.111 __ mov(rax, count); 3.112 __ shrl(rax, shift+1); // 8 bytes chunk count 3.113 // 3.114 // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop 3.115 // 3.116 - mmx_copy_forward(from, to_from, rax); 3.117 + if (UseXMMForArrayCopy) { 3.118 + xmm_copy_forward(from, to_from, rax); 3.119 + } else { 3.120 + mmx_copy_forward(from, to_from, rax); 3.121 + } 3.122 } 3.123 // copy tailing dword 3.124 __ BIND(L_copy_4_bytes); 3.125 @@ -1069,13 +1139,20 @@ 3.126 __ align(16); 3.127 // Move 8 bytes 3.128 __ BIND(L_copy_8_bytes_loop); 3.129 - __ movq(mmx0, Address(from, count, sf, 0)); 3.130 - __ movq(Address(to, count, sf, 0), mmx0); 3.131 + if (UseXMMForArrayCopy) { 3.132 + __ movq(xmm0, Address(from, count, sf, 0)); 3.133 + __ movq(Address(to, count, sf, 0), xmm0); 3.134 + } else { 3.135 + __ movq(mmx0, Address(from, count, sf, 0)); 3.136 + __ movq(Address(to, count, sf, 0), mmx0); 3.137 + } 3.138 __ BIND(L_copy_8_bytes); 3.139 __ subl(count, 2<<shift); 3.140 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); 3.141 __ addl(count, 2<<shift); 3.142 - __ emms(); 3.143 + if (!UseXMMForArrayCopy) { 3.144 + __ emms(); 3.145 + } 3.146 } 3.147 __ BIND(L_copy_4_bytes); 3.148 // copy prefix qword 3.149 @@ -1143,7 +1220,11 @@ 3.150 3.151 __ subptr(to, from); // to --> to_from 3.152 if (VM_Version::supports_mmx()) { 3.153 - mmx_copy_forward(from, to_from, count); 3.154 + if (UseXMMForArrayCopy) { 3.155 + xmm_copy_forward(from, to_from, count); 3.156 + } else 
{ 3.157 + mmx_copy_forward(from, to_from, count); 3.158 + } 3.159 } else { 3.160 __ jmpb(L_copy_8_bytes); 3.161 __ align(16); 3.162 @@ -1196,8 +1277,13 @@ 3.163 __ align(16); 3.164 __ BIND(L_copy_8_bytes_loop); 3.165 if (VM_Version::supports_mmx()) { 3.166 - __ movq(mmx0, Address(from, count, Address::times_8)); 3.167 - __ movq(Address(to, count, Address::times_8), mmx0); 3.168 + if (UseXMMForArrayCopy) { 3.169 + __ movq(xmm0, Address(from, count, Address::times_8)); 3.170 + __ movq(Address(to, count, Address::times_8), xmm0); 3.171 + } else { 3.172 + __ movq(mmx0, Address(from, count, Address::times_8)); 3.173 + __ movq(Address(to, count, Address::times_8), mmx0); 3.174 + } 3.175 } else { 3.176 __ fild_d(Address(from, count, Address::times_8)); 3.177 __ fistp_d(Address(to, count, Address::times_8)); 3.178 @@ -1206,7 +1292,7 @@ 3.179 __ decrement(count); 3.180 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); 3.181 3.182 - if (VM_Version::supports_mmx()) { 3.183 + if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) { 3.184 __ emms(); 3.185 } 3.186 inc_copy_counter_np(T_LONG);
4.1 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Oct 14 06:58:58 2008 -0700 4.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Oct 14 15:10:26 2008 -0700 4.3 @@ -1251,6 +1251,7 @@ 4.4 } 4.5 } 4.6 4.7 + 4.8 // Copy big chunks forward 4.9 // 4.10 // Inputs: 4.11 @@ -1268,14 +1269,22 @@ 4.12 Label L_loop; 4.13 __ align(16); 4.14 __ BIND(L_loop); 4.15 - __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); 4.16 - __ movq(Address(end_to, qword_count, Address::times_8, -24), to); 4.17 - __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); 4.18 - __ movq(Address(end_to, qword_count, Address::times_8, -16), to); 4.19 - __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); 4.20 - __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); 4.21 - __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); 4.22 - __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); 4.23 + if(UseUnalignedLoadStores) { 4.24 + __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); 4.25 + __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); 4.26 + __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8)); 4.27 + __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1); 4.28 + 4.29 + } else { 4.30 + __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); 4.31 + __ movq(Address(end_to, qword_count, Address::times_8, -24), to); 4.32 + __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); 4.33 + __ movq(Address(end_to, qword_count, Address::times_8, -16), to); 4.34 + __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); 4.35 + __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); 4.36 + __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); 4.37 + __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); 4.38 + } 4.39 __ BIND(L_copy_32_bytes); 4.40 __ addptr(qword_count, 4); 4.41 __ 
jcc(Assembler::lessEqual, L_loop); 4.42 @@ -1301,14 +1310,22 @@ 4.43 Label L_loop; 4.44 __ align(16); 4.45 __ BIND(L_loop); 4.46 - __ movq(to, Address(from, qword_count, Address::times_8, 24)); 4.47 - __ movq(Address(dest, qword_count, Address::times_8, 24), to); 4.48 - __ movq(to, Address(from, qword_count, Address::times_8, 16)); 4.49 - __ movq(Address(dest, qword_count, Address::times_8, 16), to); 4.50 - __ movq(to, Address(from, qword_count, Address::times_8, 8)); 4.51 - __ movq(Address(dest, qword_count, Address::times_8, 8), to); 4.52 - __ movq(to, Address(from, qword_count, Address::times_8, 0)); 4.53 - __ movq(Address(dest, qword_count, Address::times_8, 0), to); 4.54 + if(UseUnalignedLoadStores) { 4.55 + __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); 4.56 + __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0); 4.57 + __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); 4.58 + __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); 4.59 + 4.60 + } else { 4.61 + __ movq(to, Address(from, qword_count, Address::times_8, 24)); 4.62 + __ movq(Address(dest, qword_count, Address::times_8, 24), to); 4.63 + __ movq(to, Address(from, qword_count, Address::times_8, 16)); 4.64 + __ movq(Address(dest, qword_count, Address::times_8, 16), to); 4.65 + __ movq(to, Address(from, qword_count, Address::times_8, 8)); 4.66 + __ movq(Address(dest, qword_count, Address::times_8, 8), to); 4.67 + __ movq(to, Address(from, qword_count, Address::times_8, 0)); 4.68 + __ movq(Address(dest, qword_count, Address::times_8, 0), to); 4.69 + } 4.70 __ BIND(L_copy_32_bytes); 4.71 __ subptr(qword_count, 4); 4.72 __ jcc(Assembler::greaterEqual, L_loop);
5.1 --- a/src/cpu/x86/vm/vm_version_x86_32.cpp Tue Oct 14 06:58:58 2008 -0700 5.2 +++ b/src/cpu/x86/vm/vm_version_x86_32.cpp Tue Oct 14 15:10:26 2008 -0700 5.3 @@ -242,9 +242,11 @@ 5.4 _supports_cx8 = supports_cmpxchg8(); 5.5 // if the OS doesn't support SSE, we can't use this feature even if the HW does 5.6 if( !os::supports_sse()) 5.7 - _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4|CPU_SSE4A); 5.8 - if (UseSSE < 4) 5.9 - _cpuFeatures &= ~CPU_SSE4; 5.10 + _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4A|CPU_SSE4_1|CPU_SSE4_2); 5.11 + if (UseSSE < 4) { 5.12 + _cpuFeatures &= ~CPU_SSE4_1; 5.13 + _cpuFeatures &= ~CPU_SSE4_2; 5.14 + } 5.15 if (UseSSE < 3) { 5.16 _cpuFeatures &= ~CPU_SSE3; 5.17 _cpuFeatures &= ~CPU_SSSE3; 5.18 @@ -261,7 +263,7 @@ 5.19 } 5.20 5.21 char buf[256]; 5.22 - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", 5.23 + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", 5.24 cores_per_cpu(), threads_per_core(), 5.25 cpu_family(), _model, _stepping, 5.26 (supports_cmov() ? ", cmov" : ""), 5.27 @@ -272,7 +274,8 @@ 5.28 (supports_sse2() ? ", sse2" : ""), 5.29 (supports_sse3() ? ", sse3" : ""), 5.30 (supports_ssse3()? ", ssse3": ""), 5.31 - (supports_sse4() ? ", sse4" : ""), 5.32 + (supports_sse4_1() ? ", sse4.1" : ""), 5.33 + (supports_sse4_2() ? ", sse4.2" : ""), 5.34 (supports_mmx_ext() ? ", mmxext" : ""), 5.35 (supports_3dnow() ? ", 3dnow" : ""), 5.36 (supports_3dnow2() ? ", 3dnowext" : ""), 5.37 @@ -285,7 +288,7 @@ 5.38 // older Pentiums which do not support it. 
5.39 if( UseSSE > 4 ) UseSSE=4; 5.40 if( UseSSE < 0 ) UseSSE=0; 5.41 - if( !supports_sse4() ) // Drop to 3 if no SSE4 support 5.42 + if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support 5.43 UseSSE = MIN2((intx)3,UseSSE); 5.44 if( !supports_sse3() ) // Drop to 2 if no SSE3 support 5.45 UseSSE = MIN2((intx)2,UseSSE); 5.46 @@ -375,6 +378,14 @@ 5.47 MaxLoopPad = 11; 5.48 } 5.49 #endif // COMPILER2 5.50 + if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) { 5.51 + UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus 5.52 + } 5.53 + if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus 5.54 + if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) { 5.55 + UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus 5.56 + } 5.57 + } 5.58 } 5.59 } 5.60 5.61 @@ -413,7 +424,7 @@ 5.62 5.63 #ifndef PRODUCT 5.64 if (PrintMiscellaneous && Verbose) { 5.65 - tty->print_cr("Logical CPUs per package: %u", 5.66 + tty->print_cr("Logical CPUs per core: %u", 5.67 logical_processors_per_package()); 5.68 tty->print_cr("UseSSE=%d",UseSSE); 5.69 tty->print("Allocation: ");
6.1 --- a/src/cpu/x86/vm/vm_version_x86_32.hpp Tue Oct 14 06:58:58 2008 -0700 6.2 +++ b/src/cpu/x86/vm/vm_version_x86_32.hpp Tue Oct 14 15:10:26 2008 -0700 6.3 @@ -68,9 +68,9 @@ 6.4 cmpxchg16: 1, 6.5 : 4, 6.6 dca : 1, 6.7 - : 4, 6.8 - popcnt : 1, 6.9 - : 8; 6.10 + sse4_1 : 1, 6.11 + sse4_2 : 1, 6.12 + : 11; 6.13 } bits; 6.14 }; 6.15 6.16 @@ -177,8 +177,9 @@ 6.17 CPU_SSE2 = (1 << 7), 6.18 CPU_SSE3 = (1 << 8), // sse3 comes from cpuid 1 (ECX) 6.19 CPU_SSSE3= (1 << 9), 6.20 - CPU_SSE4 = (1 <<10), 6.21 - CPU_SSE4A= (1 <<11) 6.22 + CPU_SSE4A= (1 <<10), 6.23 + CPU_SSE4_1 = (1 << 11), 6.24 + CPU_SSE4_2 = (1 << 12) 6.25 } cpuFeatureFlags; 6.26 6.27 // cpuid information block. All info derived from executing cpuid with 6.28 @@ -240,22 +241,14 @@ 6.29 static CpuidInfo _cpuid_info; 6.30 6.31 // Extractors and predicates 6.32 - static bool is_extended_cpu_family() { 6.33 - const uint32_t Extended_Cpu_Family = 0xf; 6.34 - return _cpuid_info.std_cpuid1_rax.bits.family == Extended_Cpu_Family; 6.35 - } 6.36 static uint32_t extended_cpu_family() { 6.37 uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family; 6.38 - if (is_extended_cpu_family()) { 6.39 - result += _cpuid_info.std_cpuid1_rax.bits.ext_family; 6.40 - } 6.41 + result += _cpuid_info.std_cpuid1_rax.bits.ext_family; 6.42 return result; 6.43 } 6.44 static uint32_t extended_cpu_model() { 6.45 uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model; 6.46 - if (is_extended_cpu_family()) { 6.47 - result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4; 6.48 - } 6.49 + result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4; 6.50 return result; 6.51 } 6.52 static uint32_t cpu_stepping() { 6.53 @@ -293,6 +286,10 @@ 6.54 result |= CPU_SSSE3; 6.55 if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0) 6.56 result |= CPU_SSE4A; 6.57 + if (_cpuid_info.std_cpuid1_rcx.bits.sse4_1 != 0) 6.58 + result |= CPU_SSE4_1; 6.59 + if (_cpuid_info.std_cpuid1_rcx.bits.sse4_2 != 0) 6.60 + result |= CPU_SSE4_2; 6.61 return result; 6.62 } 
6.63 6.64 @@ -380,7 +377,8 @@ 6.65 static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } 6.66 static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } 6.67 static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } 6.68 - static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } 6.69 + static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; } 6.70 + static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; } 6.71 // 6.72 // AMD features 6.73 //
7.1 --- a/src/cpu/x86/vm/vm_version_x86_64.cpp Tue Oct 14 06:58:58 2008 -0700 7.2 +++ b/src/cpu/x86/vm/vm_version_x86_64.cpp Tue Oct 14 15:10:26 2008 -0700 7.3 @@ -186,8 +186,10 @@ 7.4 if (!VM_Version::supports_sse2()) { 7.5 vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported"); 7.6 } 7.7 - if (UseSSE < 4) 7.8 - _cpuFeatures &= ~CPU_SSE4; 7.9 + if (UseSSE < 4) { 7.10 + _cpuFeatures &= ~CPU_SSE4_1; 7.11 + _cpuFeatures &= ~CPU_SSE4_2; 7.12 + } 7.13 if (UseSSE < 3) { 7.14 _cpuFeatures &= ~CPU_SSE3; 7.15 _cpuFeatures &= ~CPU_SSSE3; 7.16 @@ -204,7 +206,7 @@ 7.17 } 7.18 7.19 char buf[256]; 7.20 - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", 7.21 + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", 7.22 cores_per_cpu(), threads_per_core(), 7.23 cpu_family(), _model, _stepping, 7.24 (supports_cmov() ? ", cmov" : ""), 7.25 @@ -215,7 +217,8 @@ 7.26 (supports_sse2() ? ", sse2" : ""), 7.27 (supports_sse3() ? ", sse3" : ""), 7.28 (supports_ssse3()? ", ssse3": ""), 7.29 - (supports_sse4() ? ", sse4" : ""), 7.30 + (supports_sse4_1() ? ", sse4.1" : ""), 7.31 + (supports_sse4_2() ? ", sse4.2" : ""), 7.32 (supports_mmx_ext() ? ", mmxext" : ""), 7.33 (supports_3dnow() ? ", 3dnow" : ""), 7.34 (supports_3dnow2() ? ", 3dnowext" : ""), 7.35 @@ -228,7 +231,7 @@ 7.36 // older Pentiums which do not support it. 
7.37 if( UseSSE > 4 ) UseSSE=4; 7.38 if( UseSSE < 0 ) UseSSE=0; 7.39 - if( !supports_sse4() ) // Drop to 3 if no SSE4 support 7.40 + if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support 7.41 UseSSE = MIN2((intx)3,UseSSE); 7.42 if( !supports_sse3() ) // Drop to 2 if no SSE3 support 7.43 UseSSE = MIN2((intx)2,UseSSE); 7.44 @@ -314,6 +317,14 @@ 7.45 MaxLoopPad = 11; 7.46 } 7.47 #endif // COMPILER2 7.48 + if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) { 7.49 + UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus 7.50 + } 7.51 + if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus 7.52 + if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) { 7.53 + UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus 7.54 + } 7.55 + } 7.56 } 7.57 } 7.58 7.59 @@ -355,7 +366,7 @@ 7.60 7.61 #ifndef PRODUCT 7.62 if (PrintMiscellaneous && Verbose) { 7.63 - tty->print_cr("Logical CPUs per package: %u", 7.64 + tty->print_cr("Logical CPUs per core: %u", 7.65 logical_processors_per_package()); 7.66 tty->print_cr("UseSSE=%d",UseSSE); 7.67 tty->print("Allocation: ");
8.1 --- a/src/cpu/x86/vm/vm_version_x86_64.hpp Tue Oct 14 06:58:58 2008 -0700 8.2 +++ b/src/cpu/x86/vm/vm_version_x86_64.hpp Tue Oct 14 15:10:26 2008 -0700 8.3 @@ -68,9 +68,9 @@ 8.4 cmpxchg16: 1, 8.5 : 4, 8.6 dca : 1, 8.7 - : 4, 8.8 - popcnt : 1, 8.9 - : 8; 8.10 + sse4_1 : 1, 8.11 + sse4_2 : 1, 8.12 + : 11; 8.13 } bits; 8.14 }; 8.15 8.16 @@ -177,8 +177,9 @@ 8.17 CPU_SSE2 = (1 << 7), 8.18 CPU_SSE3 = (1 << 8), 8.19 CPU_SSSE3= (1 << 9), 8.20 - CPU_SSE4 = (1 <<10), 8.21 - CPU_SSE4A= (1 <<11) 8.22 + CPU_SSE4A= (1 <<10), 8.23 + CPU_SSE4_1 = (1 << 11), 8.24 + CPU_SSE4_2 = (1 << 12) 8.25 } cpuFeatureFlags; 8.26 8.27 // cpuid information block. All info derived from executing cpuid with 8.28 @@ -240,22 +241,14 @@ 8.29 static CpuidInfo _cpuid_info; 8.30 8.31 // Extractors and predicates 8.32 - static bool is_extended_cpu_family() { 8.33 - const uint32_t Extended_Cpu_Family = 0xf; 8.34 - return _cpuid_info.std_cpuid1_eax.bits.family == Extended_Cpu_Family; 8.35 - } 8.36 static uint32_t extended_cpu_family() { 8.37 uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family; 8.38 - if (is_extended_cpu_family()) { 8.39 - result += _cpuid_info.std_cpuid1_eax.bits.ext_family; 8.40 - } 8.41 + result += _cpuid_info.std_cpuid1_eax.bits.ext_family; 8.42 return result; 8.43 } 8.44 static uint32_t extended_cpu_model() { 8.45 uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model; 8.46 - if (is_extended_cpu_family()) { 8.47 - result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4; 8.48 - } 8.49 + result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4; 8.50 return result; 8.51 } 8.52 static uint32_t cpu_stepping() { 8.53 @@ -293,6 +286,10 @@ 8.54 result |= CPU_SSSE3; 8.55 if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0) 8.56 result |= CPU_SSE4A; 8.57 + if (_cpuid_info.std_cpuid1_ecx.bits.sse4_1 != 0) 8.58 + result |= CPU_SSE4_1; 8.59 + if (_cpuid_info.std_cpuid1_ecx.bits.sse4_2 != 0) 8.60 + result |= CPU_SSE4_2; 8.61 return result; 8.62 } 8.63 8.64 @@ -380,7 +377,8 @@ 8.65 
static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } 8.66 static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } 8.67 static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } 8.68 - static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } 8.69 + static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; } 8.70 + static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; } 8.71 // 8.72 // AMD features 8.73 //
9.1 --- a/src/os/solaris/vm/os_solaris.cpp Tue Oct 14 06:58:58 2008 -0700 9.2 +++ b/src/os/solaris/vm/os_solaris.cpp Tue Oct 14 15:10:26 2008 -0700 9.3 @@ -3758,7 +3758,7 @@ 9.4 int maxClamped = MIN2(iaLimits.maxPrio, (int)iaInfo->ia_uprilim); 9.5 iaInfo->ia_upri = scale_to_lwp_priority(iaLimits.minPrio, maxClamped, newPrio); 9.6 iaInfo->ia_uprilim = IA_NOCHANGE; 9.7 - iaInfo->ia_nice = IA_NOCHANGE; 9.8 +// iaInfo->ia_nice = IA_NOCHANGE; 9.9 iaInfo->ia_mode = IA_NOCHANGE; 9.10 if (ThreadPriorityVerbose) { 9.11 tty->print_cr ("IA: [%d...%d] %d->%d\n",
10.1 --- a/src/share/vm/runtime/globals.hpp Tue Oct 14 06:58:58 2008 -0700 10.2 +++ b/src/share/vm/runtime/globals.hpp Tue Oct 14 15:10:26 2008 -0700 10.3 @@ -991,6 +991,12 @@ 10.4 product(bool, UseXmmI2F, false, \ 10.5 "Use SSE2 CVTDQ2PS instruction to convert Integer to Float") \ 10.6 \ 10.7 + product(bool, UseXMMForArrayCopy, false, \ 10.8 + "Use SSE2 MOVQ instruction for Arraycopy") \ 10.9 + \ 10.10 + product(bool, UseUnalignedLoadStores, false, \ 10.11 + "Use SSE2 MOVDQU instruction for Arraycopy") \ 10.12 + \ 10.13 product(intx, FieldsAllocationStyle, 1, \ 10.14 "0 - type based with oops first, 1 - with oops last") \ 10.15 \