Mon, 23 Jun 2008 14:11:12 -0700
6708714: Optimize long LShift on 32-bit x86
Summary: For small (1-3 bit) left shifts of long values in the 32-bit VM, use sequences of add+adc instructions instead of shld+shl on new AMD CPUs.
Reviewed-by: never
Contributed-by: shrinivas.joshi@amd.com
1.1 --- a/src/cpu/x86/vm/vm_version_x86_32.cpp Sat Jun 21 10:03:31 2008 -0700 1.2 +++ b/src/cpu/x86/vm/vm_version_x86_32.cpp Mon Jun 23 14:11:12 2008 -0700 1.3 @@ -307,6 +307,10 @@ 1.4 // Use it on new AMD cpus starting from Opteron. 1.5 UseAddressNop = true; 1.6 } 1.7 + if( supports_sse2() && FLAG_IS_DEFAULT(UseNewLongLShift) ) { 1.8 + // Use it on new AMD cpus starting from Opteron. 1.9 + UseNewLongLShift = true; 1.10 + } 1.11 if( FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper) ) { 1.12 if( supports_sse4a() ) { 1.13 UseXmmLoadAndClearUpper = true; // use movsd only on '10h' Opteron
2.1 --- a/src/cpu/x86/vm/x86_32.ad Sat Jun 21 10:03:31 2008 -0700 2.2 +++ b/src/cpu/x86/vm/x86_32.ad Mon Jun 23 14:11:12 2008 -0700 2.3 @@ -4754,6 +4754,33 @@ 2.4 interface(CONST_INTER); 2.5 %} 2.6 2.7 +operand immI_1() %{ 2.8 + predicate( n->get_int() == 1 ); 2.9 + match(ConI); 2.10 + 2.11 + op_cost(0); 2.12 + format %{ %} 2.13 + interface(CONST_INTER); 2.14 +%} 2.15 + 2.16 +operand immI_2() %{ 2.17 + predicate( n->get_int() == 2 ); 2.18 + match(ConI); 2.19 + 2.20 + op_cost(0); 2.21 + format %{ %} 2.22 + interface(CONST_INTER); 2.23 +%} 2.24 + 2.25 +operand immI_3() %{ 2.26 + predicate( n->get_int() == 3 ); 2.27 + match(ConI); 2.28 + 2.29 + op_cost(0); 2.30 + format %{ %} 2.31 + interface(CONST_INTER); 2.32 +%} 2.33 + 2.34 // Pointer Immediate 2.35 operand immP() %{ 2.36 match(ConP); 2.37 @@ -8943,6 +8970,63 @@ 2.38 ins_pipe( ialu_reg_long_mem ); 2.39 %} 2.40 2.41 +// Shift Left Long by 1 2.42 +instruct shlL_eReg_1(eRegL dst, immI_1 cnt, eFlagsReg cr) %{ 2.43 + predicate(UseNewLongLShift); 2.44 + match(Set dst (LShiftL dst cnt)); 2.45 + effect(KILL cr); 2.46 + ins_cost(100); 2.47 + format %{ "ADD $dst.lo,$dst.lo\n\t" 2.48 + "ADC $dst.hi,$dst.hi" %} 2.49 + ins_encode %{ 2.50 + __ addl($dst$$Register,$dst$$Register); 2.51 + __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register)); 2.52 + %} 2.53 + ins_pipe( ialu_reg_long ); 2.54 +%} 2.55 + 2.56 +// Shift Left Long by 2 2.57 +instruct shlL_eReg_2(eRegL dst, immI_2 cnt, eFlagsReg cr) %{ 2.58 + predicate(UseNewLongLShift); 2.59 + match(Set dst (LShiftL dst cnt)); 2.60 + effect(KILL cr); 2.61 + ins_cost(100); 2.62 + format %{ "ADD $dst.lo,$dst.lo\n\t" 2.63 + "ADC $dst.hi,$dst.hi\n\t" 2.64 + "ADD $dst.lo,$dst.lo\n\t" 2.65 + "ADC $dst.hi,$dst.hi" %} 2.66 + ins_encode %{ 2.67 + __ addl($dst$$Register,$dst$$Register); 2.68 + __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register)); 2.69 + __ addl($dst$$Register,$dst$$Register); 2.70 + __ 
adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register)); 2.71 + %} 2.72 + ins_pipe( ialu_reg_long ); 2.73 +%} 2.74 + 2.75 +// Shift Left Long by 3 2.76 +instruct shlL_eReg_3(eRegL dst, immI_3 cnt, eFlagsReg cr) %{ 2.77 + predicate(UseNewLongLShift); 2.78 + match(Set dst (LShiftL dst cnt)); 2.79 + effect(KILL cr); 2.80 + ins_cost(100); 2.81 + format %{ "ADD $dst.lo,$dst.lo\n\t" 2.82 + "ADC $dst.hi,$dst.hi\n\t" 2.83 + "ADD $dst.lo,$dst.lo\n\t" 2.84 + "ADC $dst.hi,$dst.hi\n\t" 2.85 + "ADD $dst.lo,$dst.lo\n\t" 2.86 + "ADC $dst.hi,$dst.hi" %} 2.87 + ins_encode %{ 2.88 + __ addl($dst$$Register,$dst$$Register); 2.89 + __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register)); 2.90 + __ addl($dst$$Register,$dst$$Register); 2.91 + __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register)); 2.92 + __ addl($dst$$Register,$dst$$Register); 2.93 + __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register)); 2.94 + %} 2.95 + ins_pipe( ialu_reg_long ); 2.96 +%} 2.97 + 2.98 // Shift Left Long by 1-31 2.99 instruct shlL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{ 2.100 match(Set dst (LShiftL dst cnt));
3.1 --- a/src/share/vm/runtime/globals.hpp Sat Jun 21 10:03:31 2008 -0700 3.2 +++ b/src/share/vm/runtime/globals.hpp Mon Jun 23 14:11:12 2008 -0700 3.3 @@ -946,6 +946,9 @@ 3.4 diagnostic(bool, UseIncDec, true, \ 3.5 "Use INC, DEC instructions on x86") \ 3.6 \ 3.7 + product(bool, UseNewLongLShift, false, \ 3.8 + "Use optimized bitwise shift left") \ 3.9 + \ 3.10 product(bool, UseStoreImmI16, true, \ 3.11 "Use store immediate 16-bits value instruction on x86") \ 3.12 \