Tue, 14 Oct 2008 15:10:26 -0700
6532536: Optimize arraycopy stubs for Intel CPUs
Summary: Use SSE2 movdqu in arraycopy stubs on the newest Intel CPUs
Reviewed-by: rasbold
1.1 --- a/src/cpu/x86/vm/assembler_x86.cpp Tue Oct 14 06:58:58 2008 -0700 1.2 +++ b/src/cpu/x86/vm/assembler_x86.cpp Tue Oct 14 15:10:26 2008 -0700 1.3 @@ -1575,6 +1575,35 @@ 1.4 emit_operand(src, dst); 1.5 } 1.6 1.7 +void Assembler::movdqu(XMMRegister dst, Address src) { 1.8 + NOT_LP64(assert(VM_Version::supports_sse2(), "")); 1.9 + InstructionMark im(this); 1.10 + emit_byte(0xF3); 1.11 + prefix(src, dst); 1.12 + emit_byte(0x0F); 1.13 + emit_byte(0x6F); 1.14 + emit_operand(dst, src); 1.15 +} 1.16 + 1.17 +void Assembler::movdqu(XMMRegister dst, XMMRegister src) { 1.18 + NOT_LP64(assert(VM_Version::supports_sse2(), "")); 1.19 + emit_byte(0xF3); 1.20 + int encode = prefixq_and_encode(dst->encoding(), src->encoding()); 1.21 + emit_byte(0x0F); 1.22 + emit_byte(0x6F); 1.23 + emit_byte(0xC0 | encode); 1.24 +} 1.25 + 1.26 +void Assembler::movdqu(Address dst, XMMRegister src) { 1.27 + NOT_LP64(assert(VM_Version::supports_sse2(), "")); 1.28 + InstructionMark im(this); 1.29 + emit_byte(0xF3); 1.30 + prefix(dst, src); 1.31 + emit_byte(0x0F); 1.32 + emit_byte(0x7F); 1.33 + emit_operand(src, dst); 1.34 +} 1.35 + 1.36 // Uses zero extension on 64bit 1.37 1.38 void Assembler::movl(Register dst, int32_t imm32) {
2.1 --- a/src/cpu/x86/vm/assembler_x86.hpp Tue Oct 14 06:58:58 2008 -0700 2.2 +++ b/src/cpu/x86/vm/assembler_x86.hpp Tue Oct 14 15:10:26 2008 -0700 2.3 @@ -1055,6 +1055,11 @@ 2.4 void movdqa(XMMRegister dst, Address src); 2.5 void movdqa(XMMRegister dst, XMMRegister src); 2.6 2.7 + // Move Unaligned Double Quadword 2.8 + void movdqu(Address dst, XMMRegister src); 2.9 + void movdqu(XMMRegister dst, Address src); 2.10 + void movdqu(XMMRegister dst, XMMRegister src); 2.11 + 2.12 void movl(Register dst, int32_t imm32); 2.13 void movl(Address dst, int32_t imm32); 2.14 void movl(Register dst, Register src);
3.1 --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Oct 14 06:58:58 2008 -0700 3.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Oct 14 15:10:26 2008 -0700 3.3 @@ -791,6 +791,69 @@ 3.4 } 3.5 } 3.6 3.7 + 3.8 + // Copy 64 bytes chunks 3.9 + // 3.10 + // Inputs: 3.11 + // from - source array address 3.12 + // to_from - destination array address - from 3.13 + // qword_count - 8-bytes element count, negative 3.14 + // 3.15 + void xmm_copy_forward(Register from, Register to_from, Register qword_count) { 3.16 + assert( UseSSE >= 2, "supported cpu only" ); 3.17 + Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; 3.18 + // Copy 64-byte chunks 3.19 + __ jmpb(L_copy_64_bytes); 3.20 + __ align(16); 3.21 + __ BIND(L_copy_64_bytes_loop); 3.22 + 3.23 + if(UseUnalignedLoadStores) { 3.24 + __ movdqu(xmm0, Address(from, 0)); 3.25 + __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0); 3.26 + __ movdqu(xmm1, Address(from, 16)); 3.27 + __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1); 3.28 + __ movdqu(xmm2, Address(from, 32)); 3.29 + __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2); 3.30 + __ movdqu(xmm3, Address(from, 48)); 3.31 + __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3); 3.32 + 3.33 + } else { 3.34 + __ movq(xmm0, Address(from, 0)); 3.35 + __ movq(Address(from, to_from, Address::times_1, 0), xmm0); 3.36 + __ movq(xmm1, Address(from, 8)); 3.37 + __ movq(Address(from, to_from, Address::times_1, 8), xmm1); 3.38 + __ movq(xmm2, Address(from, 16)); 3.39 + __ movq(Address(from, to_from, Address::times_1, 16), xmm2); 3.40 + __ movq(xmm3, Address(from, 24)); 3.41 + __ movq(Address(from, to_from, Address::times_1, 24), xmm3); 3.42 + __ movq(xmm4, Address(from, 32)); 3.43 + __ movq(Address(from, to_from, Address::times_1, 32), xmm4); 3.44 + __ movq(xmm5, Address(from, 40)); 3.45 + __ movq(Address(from, to_from, Address::times_1, 40), xmm5); 3.46 + __ movq(xmm6, Address(from, 48)); 3.47 + __ movq(Address(from, 
to_from, Address::times_1, 48), xmm6); 3.48 + __ movq(xmm7, Address(from, 56)); 3.49 + __ movq(Address(from, to_from, Address::times_1, 56), xmm7); 3.50 + } 3.51 + 3.52 + __ addl(from, 64); 3.53 + __ BIND(L_copy_64_bytes); 3.54 + __ subl(qword_count, 8); 3.55 + __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop); 3.56 + __ addl(qword_count, 8); 3.57 + __ jccb(Assembler::zero, L_exit); 3.58 + // 3.59 + // length is too short, just copy qwords 3.60 + // 3.61 + __ BIND(L_copy_8_bytes); 3.62 + __ movq(xmm0, Address(from, 0)); 3.63 + __ movq(Address(from, to_from, Address::times_1), xmm0); 3.64 + __ addl(from, 8); 3.65 + __ decrement(qword_count); 3.66 + __ jcc(Assembler::greater, L_copy_8_bytes); 3.67 + __ BIND(L_exit); 3.68 + } 3.69 + 3.70 // Copy 64 bytes chunks 3.71 // 3.72 // Inputs: 3.73 @@ -799,6 +862,7 @@ 3.74 // qword_count - 8-bytes element count, negative 3.75 // 3.76 void mmx_copy_forward(Register from, Register to_from, Register qword_count) { 3.77 + assert( VM_Version::supports_mmx(), "supported cpu only" ); 3.78 Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; 3.79 // Copy 64-byte chunks 3.80 __ jmpb(L_copy_64_bytes); 3.81 @@ -876,7 +940,7 @@ 3.82 __ subptr(to, from); // to --> to_from 3.83 __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element 3.84 __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp 3.85 - if (!aligned && (t == T_BYTE || t == T_SHORT)) { 3.86 + if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) { 3.87 // align source address at 4 bytes address boundary 3.88 if (t == T_BYTE) { 3.89 // One byte misalignment happens only for byte arrays 3.90 @@ -906,20 +970,26 @@ 3.91 __ mov(count, rax); // restore 'count' 3.92 __ jmpb(L_copy_2_bytes); // all dwords were copied 3.93 } else { 3.94 - // align to 8 bytes, we know we are 4 byte aligned to start 3.95 - __ testptr(from, 4); 3.96 - __ jccb(Assembler::zero, L_copy_64_bytes); 3.97 - __ movl(rax, Address(from, 0)); 3.98 - __ 
movl(Address(from, to_from, Address::times_1, 0), rax); 3.99 - __ addptr(from, 4); 3.100 - __ subl(count, 1<<shift); 3.101 + if (!UseUnalignedLoadStores) { 3.102 + // align to 8 bytes, we know we are 4 byte aligned to start 3.103 + __ testptr(from, 4); 3.104 + __ jccb(Assembler::zero, L_copy_64_bytes); 3.105 + __ movl(rax, Address(from, 0)); 3.106 + __ movl(Address(from, to_from, Address::times_1, 0), rax); 3.107 + __ addptr(from, 4); 3.108 + __ subl(count, 1<<shift); 3.109 + } 3.110 __ BIND(L_copy_64_bytes); 3.111 __ mov(rax, count); 3.112 __ shrl(rax, shift+1); // 8 bytes chunk count 3.113 // 3.114 // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop 3.115 // 3.116 - mmx_copy_forward(from, to_from, rax); 3.117 + if (UseXMMForArrayCopy) { 3.118 + xmm_copy_forward(from, to_from, rax); 3.119 + } else { 3.120 + mmx_copy_forward(from, to_from, rax); 3.121 + } 3.122 } 3.123 // copy tailing dword 3.124 __ BIND(L_copy_4_bytes); 3.125 @@ -1069,13 +1139,20 @@ 3.126 __ align(16); 3.127 // Move 8 bytes 3.128 __ BIND(L_copy_8_bytes_loop); 3.129 - __ movq(mmx0, Address(from, count, sf, 0)); 3.130 - __ movq(Address(to, count, sf, 0), mmx0); 3.131 + if (UseXMMForArrayCopy) { 3.132 + __ movq(xmm0, Address(from, count, sf, 0)); 3.133 + __ movq(Address(to, count, sf, 0), xmm0); 3.134 + } else { 3.135 + __ movq(mmx0, Address(from, count, sf, 0)); 3.136 + __ movq(Address(to, count, sf, 0), mmx0); 3.137 + } 3.138 __ BIND(L_copy_8_bytes); 3.139 __ subl(count, 2<<shift); 3.140 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); 3.141 __ addl(count, 2<<shift); 3.142 - __ emms(); 3.143 + if (!UseXMMForArrayCopy) { 3.144 + __ emms(); 3.145 + } 3.146 } 3.147 __ BIND(L_copy_4_bytes); 3.148 // copy prefix qword 3.149 @@ -1143,7 +1220,11 @@ 3.150 3.151 __ subptr(to, from); // to --> to_from 3.152 if (VM_Version::supports_mmx()) { 3.153 - mmx_copy_forward(from, to_from, count); 3.154 + if (UseXMMForArrayCopy) { 3.155 + xmm_copy_forward(from, to_from, count); 3.156 + } else 
{ 3.157 + mmx_copy_forward(from, to_from, count); 3.158 + } 3.159 } else { 3.160 __ jmpb(L_copy_8_bytes); 3.161 __ align(16); 3.162 @@ -1196,8 +1277,13 @@ 3.163 __ align(16); 3.164 __ BIND(L_copy_8_bytes_loop); 3.165 if (VM_Version::supports_mmx()) { 3.166 - __ movq(mmx0, Address(from, count, Address::times_8)); 3.167 - __ movq(Address(to, count, Address::times_8), mmx0); 3.168 + if (UseXMMForArrayCopy) { 3.169 + __ movq(xmm0, Address(from, count, Address::times_8)); 3.170 + __ movq(Address(to, count, Address::times_8), xmm0); 3.171 + } else { 3.172 + __ movq(mmx0, Address(from, count, Address::times_8)); 3.173 + __ movq(Address(to, count, Address::times_8), mmx0); 3.174 + } 3.175 } else { 3.176 __ fild_d(Address(from, count, Address::times_8)); 3.177 __ fistp_d(Address(to, count, Address::times_8)); 3.178 @@ -1206,7 +1292,7 @@ 3.179 __ decrement(count); 3.180 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); 3.181 3.182 - if (VM_Version::supports_mmx()) { 3.183 + if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) { 3.184 __ emms(); 3.185 } 3.186 inc_copy_counter_np(T_LONG);
4.1 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Oct 14 06:58:58 2008 -0700 4.2 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Oct 14 15:10:26 2008 -0700 4.3 @@ -1251,6 +1251,7 @@ 4.4 } 4.5 } 4.6 4.7 + 4.8 // Copy big chunks forward 4.9 // 4.10 // Inputs: 4.11 @@ -1268,14 +1269,22 @@ 4.12 Label L_loop; 4.13 __ align(16); 4.14 __ BIND(L_loop); 4.15 - __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); 4.16 - __ movq(Address(end_to, qword_count, Address::times_8, -24), to); 4.17 - __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); 4.18 - __ movq(Address(end_to, qword_count, Address::times_8, -16), to); 4.19 - __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); 4.20 - __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); 4.21 - __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); 4.22 - __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); 4.23 + if(UseUnalignedLoadStores) { 4.24 + __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); 4.25 + __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); 4.26 + __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8)); 4.27 + __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1); 4.28 + 4.29 + } else { 4.30 + __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); 4.31 + __ movq(Address(end_to, qword_count, Address::times_8, -24), to); 4.32 + __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); 4.33 + __ movq(Address(end_to, qword_count, Address::times_8, -16), to); 4.34 + __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); 4.35 + __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); 4.36 + __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); 4.37 + __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); 4.38 + } 4.39 __ BIND(L_copy_32_bytes); 4.40 __ addptr(qword_count, 4); 4.41 __ 
jcc(Assembler::lessEqual, L_loop); 4.42 @@ -1301,14 +1310,22 @@ 4.43 Label L_loop; 4.44 __ align(16); 4.45 __ BIND(L_loop); 4.46 - __ movq(to, Address(from, qword_count, Address::times_8, 24)); 4.47 - __ movq(Address(dest, qword_count, Address::times_8, 24), to); 4.48 - __ movq(to, Address(from, qword_count, Address::times_8, 16)); 4.49 - __ movq(Address(dest, qword_count, Address::times_8, 16), to); 4.50 - __ movq(to, Address(from, qword_count, Address::times_8, 8)); 4.51 - __ movq(Address(dest, qword_count, Address::times_8, 8), to); 4.52 - __ movq(to, Address(from, qword_count, Address::times_8, 0)); 4.53 - __ movq(Address(dest, qword_count, Address::times_8, 0), to); 4.54 + if(UseUnalignedLoadStores) { 4.55 + __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); 4.56 + __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0); 4.57 + __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); 4.58 + __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); 4.59 + 4.60 + } else { 4.61 + __ movq(to, Address(from, qword_count, Address::times_8, 24)); 4.62 + __ movq(Address(dest, qword_count, Address::times_8, 24), to); 4.63 + __ movq(to, Address(from, qword_count, Address::times_8, 16)); 4.64 + __ movq(Address(dest, qword_count, Address::times_8, 16), to); 4.65 + __ movq(to, Address(from, qword_count, Address::times_8, 8)); 4.66 + __ movq(Address(dest, qword_count, Address::times_8, 8), to); 4.67 + __ movq(to, Address(from, qword_count, Address::times_8, 0)); 4.68 + __ movq(Address(dest, qword_count, Address::times_8, 0), to); 4.69 + } 4.70 __ BIND(L_copy_32_bytes); 4.71 __ subptr(qword_count, 4); 4.72 __ jcc(Assembler::greaterEqual, L_loop);
5.1 --- a/src/cpu/x86/vm/vm_version_x86_32.cpp Tue Oct 14 06:58:58 2008 -0700 5.2 +++ b/src/cpu/x86/vm/vm_version_x86_32.cpp Tue Oct 14 15:10:26 2008 -0700 5.3 @@ -242,9 +242,11 @@ 5.4 _supports_cx8 = supports_cmpxchg8(); 5.5 // if the OS doesn't support SSE, we can't use this feature even if the HW does 5.6 if( !os::supports_sse()) 5.7 - _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4|CPU_SSE4A); 5.8 - if (UseSSE < 4) 5.9 - _cpuFeatures &= ~CPU_SSE4; 5.10 + _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4A|CPU_SSE4_1|CPU_SSE4_2); 5.11 + if (UseSSE < 4) { 5.12 + _cpuFeatures &= ~CPU_SSE4_1; 5.13 + _cpuFeatures &= ~CPU_SSE4_2; 5.14 + } 5.15 if (UseSSE < 3) { 5.16 _cpuFeatures &= ~CPU_SSE3; 5.17 _cpuFeatures &= ~CPU_SSSE3; 5.18 @@ -261,7 +263,7 @@ 5.19 } 5.20 5.21 char buf[256]; 5.22 - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", 5.23 + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", 5.24 cores_per_cpu(), threads_per_core(), 5.25 cpu_family(), _model, _stepping, 5.26 (supports_cmov() ? ", cmov" : ""), 5.27 @@ -272,7 +274,8 @@ 5.28 (supports_sse2() ? ", sse2" : ""), 5.29 (supports_sse3() ? ", sse3" : ""), 5.30 (supports_ssse3()? ", ssse3": ""), 5.31 - (supports_sse4() ? ", sse4" : ""), 5.32 + (supports_sse4_1() ? ", sse4.1" : ""), 5.33 + (supports_sse4_2() ? ", sse4.2" : ""), 5.34 (supports_mmx_ext() ? ", mmxext" : ""), 5.35 (supports_3dnow() ? ", 3dnow" : ""), 5.36 (supports_3dnow2() ? ", 3dnowext" : ""), 5.37 @@ -285,7 +288,7 @@ 5.38 // older Pentiums which do not support it. 
5.39 if( UseSSE > 4 ) UseSSE=4; 5.40 if( UseSSE < 0 ) UseSSE=0; 5.41 - if( !supports_sse4() ) // Drop to 3 if no SSE4 support 5.42 + if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support 5.43 UseSSE = MIN2((intx)3,UseSSE); 5.44 if( !supports_sse3() ) // Drop to 2 if no SSE3 support 5.45 UseSSE = MIN2((intx)2,UseSSE); 5.46 @@ -375,6 +378,14 @@ 5.47 MaxLoopPad = 11; 5.48 } 5.49 #endif // COMPILER2 5.50 + if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) { 5.51 + UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus 5.52 + } 5.53 + if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus 5.54 + if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) { 5.55 + UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus 5.56 + } 5.57 + } 5.58 } 5.59 } 5.60 5.61 @@ -413,7 +424,7 @@ 5.62 5.63 #ifndef PRODUCT 5.64 if (PrintMiscellaneous && Verbose) { 5.65 - tty->print_cr("Logical CPUs per package: %u", 5.66 + tty->print_cr("Logical CPUs per core: %u", 5.67 logical_processors_per_package()); 5.68 tty->print_cr("UseSSE=%d",UseSSE); 5.69 tty->print("Allocation: ");
6.1 --- a/src/cpu/x86/vm/vm_version_x86_32.hpp Tue Oct 14 06:58:58 2008 -0700 6.2 +++ b/src/cpu/x86/vm/vm_version_x86_32.hpp Tue Oct 14 15:10:26 2008 -0700 6.3 @@ -68,9 +68,9 @@ 6.4 cmpxchg16: 1, 6.5 : 4, 6.6 dca : 1, 6.7 - : 4, 6.8 - popcnt : 1, 6.9 - : 8; 6.10 + sse4_1 : 1, 6.11 + sse4_2 : 1, 6.12 + : 11; 6.13 } bits; 6.14 }; 6.15 6.16 @@ -177,8 +177,9 @@ 6.17 CPU_SSE2 = (1 << 7), 6.18 CPU_SSE3 = (1 << 8), // sse3 comes from cpuid 1 (ECX) 6.19 CPU_SSSE3= (1 << 9), 6.20 - CPU_SSE4 = (1 <<10), 6.21 - CPU_SSE4A= (1 <<11) 6.22 + CPU_SSE4A= (1 <<10), 6.23 + CPU_SSE4_1 = (1 << 11), 6.24 + CPU_SSE4_2 = (1 << 12) 6.25 } cpuFeatureFlags; 6.26 6.27 // cpuid information block. All info derived from executing cpuid with 6.28 @@ -240,22 +241,14 @@ 6.29 static CpuidInfo _cpuid_info; 6.30 6.31 // Extractors and predicates 6.32 - static bool is_extended_cpu_family() { 6.33 - const uint32_t Extended_Cpu_Family = 0xf; 6.34 - return _cpuid_info.std_cpuid1_rax.bits.family == Extended_Cpu_Family; 6.35 - } 6.36 static uint32_t extended_cpu_family() { 6.37 uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family; 6.38 - if (is_extended_cpu_family()) { 6.39 - result += _cpuid_info.std_cpuid1_rax.bits.ext_family; 6.40 - } 6.41 + result += _cpuid_info.std_cpuid1_rax.bits.ext_family; 6.42 return result; 6.43 } 6.44 static uint32_t extended_cpu_model() { 6.45 uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model; 6.46 - if (is_extended_cpu_family()) { 6.47 - result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4; 6.48 - } 6.49 + result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4; 6.50 return result; 6.51 } 6.52 static uint32_t cpu_stepping() { 6.53 @@ -293,6 +286,10 @@ 6.54 result |= CPU_SSSE3; 6.55 if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0) 6.56 result |= CPU_SSE4A; 6.57 + if (_cpuid_info.std_cpuid1_rcx.bits.sse4_1 != 0) 6.58 + result |= CPU_SSE4_1; 6.59 + if (_cpuid_info.std_cpuid1_rcx.bits.sse4_2 != 0) 6.60 + result |= CPU_SSE4_2; 6.61 return result; 6.62 } 
6.63 6.64 @@ -380,7 +377,8 @@ 6.65 static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } 6.66 static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } 6.67 static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } 6.68 - static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } 6.69 + static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; } 6.70 + static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; } 6.71 // 6.72 // AMD features 6.73 //
7.1 --- a/src/cpu/x86/vm/vm_version_x86_64.cpp Tue Oct 14 06:58:58 2008 -0700 7.2 +++ b/src/cpu/x86/vm/vm_version_x86_64.cpp Tue Oct 14 15:10:26 2008 -0700 7.3 @@ -186,8 +186,10 @@ 7.4 if (!VM_Version::supports_sse2()) { 7.5 vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported"); 7.6 } 7.7 - if (UseSSE < 4) 7.8 - _cpuFeatures &= ~CPU_SSE4; 7.9 + if (UseSSE < 4) { 7.10 + _cpuFeatures &= ~CPU_SSE4_1; 7.11 + _cpuFeatures &= ~CPU_SSE4_2; 7.12 + } 7.13 if (UseSSE < 3) { 7.14 _cpuFeatures &= ~CPU_SSE3; 7.15 _cpuFeatures &= ~CPU_SSSE3; 7.16 @@ -204,7 +206,7 @@ 7.17 } 7.18 7.19 char buf[256]; 7.20 - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", 7.21 + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", 7.22 cores_per_cpu(), threads_per_core(), 7.23 cpu_family(), _model, _stepping, 7.24 (supports_cmov() ? ", cmov" : ""), 7.25 @@ -215,7 +217,8 @@ 7.26 (supports_sse2() ? ", sse2" : ""), 7.27 (supports_sse3() ? ", sse3" : ""), 7.28 (supports_ssse3()? ", ssse3": ""), 7.29 - (supports_sse4() ? ", sse4" : ""), 7.30 + (supports_sse4_1() ? ", sse4.1" : ""), 7.31 + (supports_sse4_2() ? ", sse4.2" : ""), 7.32 (supports_mmx_ext() ? ", mmxext" : ""), 7.33 (supports_3dnow() ? ", 3dnow" : ""), 7.34 (supports_3dnow2() ? ", 3dnowext" : ""), 7.35 @@ -228,7 +231,7 @@ 7.36 // older Pentiums which do not support it. 
7.37 if( UseSSE > 4 ) UseSSE=4; 7.38 if( UseSSE < 0 ) UseSSE=0; 7.39 - if( !supports_sse4() ) // Drop to 3 if no SSE4 support 7.40 + if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support 7.41 UseSSE = MIN2((intx)3,UseSSE); 7.42 if( !supports_sse3() ) // Drop to 2 if no SSE3 support 7.43 UseSSE = MIN2((intx)2,UseSSE); 7.44 @@ -314,6 +317,14 @@ 7.45 MaxLoopPad = 11; 7.46 } 7.47 #endif // COMPILER2 7.48 + if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) { 7.49 + UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus 7.50 + } 7.51 + if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus 7.52 + if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) { 7.53 + UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus 7.54 + } 7.55 + } 7.56 } 7.57 } 7.58 7.59 @@ -355,7 +366,7 @@ 7.60 7.61 #ifndef PRODUCT 7.62 if (PrintMiscellaneous && Verbose) { 7.63 - tty->print_cr("Logical CPUs per package: %u", 7.64 + tty->print_cr("Logical CPUs per core: %u", 7.65 logical_processors_per_package()); 7.66 tty->print_cr("UseSSE=%d",UseSSE); 7.67 tty->print("Allocation: ");
8.1 --- a/src/cpu/x86/vm/vm_version_x86_64.hpp Tue Oct 14 06:58:58 2008 -0700 8.2 +++ b/src/cpu/x86/vm/vm_version_x86_64.hpp Tue Oct 14 15:10:26 2008 -0700 8.3 @@ -68,9 +68,9 @@ 8.4 cmpxchg16: 1, 8.5 : 4, 8.6 dca : 1, 8.7 - : 4, 8.8 - popcnt : 1, 8.9 - : 8; 8.10 + sse4_1 : 1, 8.11 + sse4_2 : 1, 8.12 + : 11; 8.13 } bits; 8.14 }; 8.15 8.16 @@ -177,8 +177,9 @@ 8.17 CPU_SSE2 = (1 << 7), 8.18 CPU_SSE3 = (1 << 8), 8.19 CPU_SSSE3= (1 << 9), 8.20 - CPU_SSE4 = (1 <<10), 8.21 - CPU_SSE4A= (1 <<11) 8.22 + CPU_SSE4A= (1 <<10), 8.23 + CPU_SSE4_1 = (1 << 11), 8.24 + CPU_SSE4_2 = (1 << 12) 8.25 } cpuFeatureFlags; 8.26 8.27 // cpuid information block. All info derived from executing cpuid with 8.28 @@ -240,22 +241,14 @@ 8.29 static CpuidInfo _cpuid_info; 8.30 8.31 // Extractors and predicates 8.32 - static bool is_extended_cpu_family() { 8.33 - const uint32_t Extended_Cpu_Family = 0xf; 8.34 - return _cpuid_info.std_cpuid1_eax.bits.family == Extended_Cpu_Family; 8.35 - } 8.36 static uint32_t extended_cpu_family() { 8.37 uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family; 8.38 - if (is_extended_cpu_family()) { 8.39 - result += _cpuid_info.std_cpuid1_eax.bits.ext_family; 8.40 - } 8.41 + result += _cpuid_info.std_cpuid1_eax.bits.ext_family; 8.42 return result; 8.43 } 8.44 static uint32_t extended_cpu_model() { 8.45 uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model; 8.46 - if (is_extended_cpu_family()) { 8.47 - result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4; 8.48 - } 8.49 + result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4; 8.50 return result; 8.51 } 8.52 static uint32_t cpu_stepping() { 8.53 @@ -293,6 +286,10 @@ 8.54 result |= CPU_SSSE3; 8.55 if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0) 8.56 result |= CPU_SSE4A; 8.57 + if (_cpuid_info.std_cpuid1_ecx.bits.sse4_1 != 0) 8.58 + result |= CPU_SSE4_1; 8.59 + if (_cpuid_info.std_cpuid1_ecx.bits.sse4_2 != 0) 8.60 + result |= CPU_SSE4_2; 8.61 return result; 8.62 } 8.63 8.64 @@ -380,7 +377,8 @@ 8.65 
static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } 8.66 static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } 8.67 static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } 8.68 - static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } 8.69 + static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; } 8.70 + static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; } 8.71 // 8.72 // AMD features 8.73 //
9.1 --- a/src/os/solaris/vm/os_solaris.cpp Tue Oct 14 06:58:58 2008 -0700 9.2 +++ b/src/os/solaris/vm/os_solaris.cpp Tue Oct 14 15:10:26 2008 -0700 9.3 @@ -3758,7 +3758,7 @@ 9.4 int maxClamped = MIN2(iaLimits.maxPrio, (int)iaInfo->ia_uprilim); 9.5 iaInfo->ia_upri = scale_to_lwp_priority(iaLimits.minPrio, maxClamped, newPrio); 9.6 iaInfo->ia_uprilim = IA_NOCHANGE; 9.7 - iaInfo->ia_nice = IA_NOCHANGE; 9.8 +// iaInfo->ia_nice = IA_NOCHANGE; 9.9 iaInfo->ia_mode = IA_NOCHANGE; 9.10 if (ThreadPriorityVerbose) { 9.11 tty->print_cr ("IA: [%d...%d] %d->%d\n",
10.1 --- a/src/share/vm/runtime/globals.hpp Tue Oct 14 06:58:58 2008 -0700 10.2 +++ b/src/share/vm/runtime/globals.hpp Tue Oct 14 15:10:26 2008 -0700 10.3 @@ -991,6 +991,12 @@ 10.4 product(bool, UseXmmI2F, false, \ 10.5 "Use SSE2 CVTDQ2PS instruction to convert Integer to Float") \ 10.6 \ 10.7 + product(bool, UseXMMForArrayCopy, false, \ 10.8 + "Use SSE2 MOVQ instruction for Arraycopy") \ 10.9 + \ 10.10 + product(bool, UseUnalignedLoadStores, false, \ 10.11 + "Use SSE2 MOVDQU instruction for Arraycopy") \ 10.12 + \ 10.13 product(intx, FieldsAllocationStyle, 1, \ 10.14 "0 - type based with oops first, 1 - with oops last") \ 10.15 \