Wed, 19 Oct 2016 15:22:23 +0800
#4659 C2: Fixed a performance bug that caused many unlock operations to go through the slow path.
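The field at ObjectMonitor::owner_offset_in_bytes() holds a pointer to the owning thread, so the cmpxchg on that field now installs TREG (the current thread) instead of SP.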
$ stress -c 100 -i 10
$ time java SSLEngineDeadlock
Before the patch:
real 0m29.566s
user 0m38.156s
sys 0m1.590s
After the patch:
real 0m26.213s
user 0m34.996s
sys 0m1.668s
The improvement in wall-clock (real) time is nearly 13% (29.566 s / 26.213 s ≈ 1.13).
src/cpu/mips/vm/assembler_mips.cpp | file | annotate | diff | comparison | revisions |
1.1 --- a/src/cpu/mips/vm/assembler_mips.cpp Wed Oct 19 14:29:12 2016 +0800 1.2 +++ b/src/cpu/mips/vm/assembler_mips.cpp Wed Oct 19 15:22:23 2016 +0800 1.3 @@ -3193,13 +3193,11 @@ 1.4 biased_locking_exit(objReg, tmpReg, DONE_LABEL); 1.5 } 1.6 1.7 - ld(tmpReg, Address(objReg, 0)) ; // Examine the object's markword 1.8 ld(AT, Address(boxReg, 0)) ; // Examine the displaced header 1.9 beq(AT, R0, DONE_LABEL) ; // 0 indicates recursive stack-lock 1.10 - //move(AT, 0x1); 1.11 - //delayed()->nop(); 1.12 delayed()->daddiu(AT, R0, 0x1); 1.13 1.14 + ld(tmpReg, Address(objReg, 0)) ; // Examine the object's markword 1.15 andi(AT, tmpReg, markOopDesc::monitor_value) ; // Inflated? 1.16 beq(AT, R0, Stacked) ; // Inflated? 1.17 delayed()->nop(); 1.18 @@ -3228,164 +3226,13 @@ 1.19 // IA32's memory-model is SPO, so STs are ordered with respect to 1.20 // each other and there's no need for an explicit barrier (fence). 1.21 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 1.22 -#ifdef OPT_THREAD 1.23 - move(boxReg, TREG); 1.24 -#else 1.25 - get_thread (boxReg) ; 1.26 +#ifndef OPT_THREAD 1.27 + get_thread (TREG) ; 1.28 #endif 1.29 1.30 -#ifndef _LP64 1.31 - 1.32 - // Note that we could employ various encoding schemes to reduce 1.33 - // the number of loads below (currently 4) to just 2 or 3. 1.34 - // Refer to the comments in synchronizer.cpp. 1.35 - // In practice the chain of fetches doesn't seem to impact performance, however. 1.36 - if ((EmitSync & 65536) == 0 && (EmitSync & 256)) { 1.37 - // Attempt to reduce branch density - AMD's branch predictor. 
1.38 - ld(AT, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 1.39 - xorr(boxReg, boxReg, AT); 1.40 - 1.41 - ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ; 1.42 - orr(boxReg, boxReg, AT); 1.43 - 1.44 - ld(AT, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 1.45 - orr(boxReg, boxReg, AT); 1.46 - 1.47 - ld(AT, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 1.48 - orr(boxReg, boxReg, AT); 1.49 - 1.50 - bne(boxReg, R0, DONE_LABEL); 1.51 - move(AT, R0); /* delay slot */ 1.52 - 1.53 - sw(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 1.54 - b(DONE_LABEL); 1.55 - move(AT, 0x1); /* delay slot */ 1.56 - } else { 1.57 - ld(AT, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 1.58 - xorr(boxReg, boxReg, AT); 1.59 - 1.60 - ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ; 1.61 - orr(boxReg, boxReg, AT); 1.62 - 1.63 - bne(boxReg, R0, DONE_LABEL); 1.64 - move(AT, R0); /* delay slot */ 1.65 - 1.66 - ld(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 1.67 - ld(AT, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 1.68 - orr(boxReg, boxReg, AT); 1.69 - 1.70 - bne(boxReg, R0, CheckSucc); 1.71 - move(AT, R0); /* delay slot */ 1.72 - 1.73 - sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 1.74 - b(DONE_LABEL); 1.75 - move(AT, 0x1); /* delay slot */ 1.76 - } 1.77 - 1.78 - // The Following code fragment (EmitSync & 65536) improves the performance of 1.79 - // contended applications and contended synchronization microbenchmarks. 1.80 - // Unfortunately the emission of the code - even though not executed - causes regressions 1.81 - // in scimark and jetstream, evidently because of $ effects. Replacing the code 1.82 - // with an equal number of never-executed NOPs results in the same regression. 1.83 - // We leave it off by default. 
1.84 - 1.85 - if ((EmitSync & 65536) != 0) { 1.86 - Label LSuccess, LGoSlowPath ; 1.87 - 1.88 - bind(CheckSucc) ; 1.89 - 1.90 - // Optional pre-test ... it's safe to elide this 1.91 - if ((EmitSync & 16) == 0) { 1.92 - ld(AT, Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2)) ; 1.93 - beq(AT, R0, LGoSlowPath); 1.94 - delayed()->nop(); 1.95 - } 1.96 - 1.97 - // We have a classic Dekker-style idiom: 1.98 - // ST m->_owner = 0 ; MEMBAR; LD m->_succ 1.99 - // There are a number of ways to implement the barrier: 1.100 - // (1) lock:andl &m->_owner, 0 1.101 - // is fast, but mask doesn't currently support the "ANDL M,IMM32" form. 1.102 - // LOCK: ANDL [ebx+Offset(_Owner)-2], 0 1.103 - // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8 1.104 - // (2) If supported, an explicit MFENCE is appealing. 1.105 - // In older IA32 processors MFENCE is slower than lock:add or xchg 1.106 - // particularly if the write-buffer is full as might be the case if 1.107 - // if stores closely precede the fence or fence-equivalent instruction. 1.108 - // In more modern implementations MFENCE appears faster, however. 1.109 - // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack 1.110 - // The $lines underlying the top-of-stack should be in M-state. 1.111 - // The locked add instruction is serializing, of course. 1.112 - // (4) Use xchg, which is serializing 1.113 - // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works 1.114 - // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0. 1.115 - // The integer condition codes will tell us if succ was 0. 1.116 - // Since _succ and _owner should reside in the same $line and 1.117 - // we just stored into _owner, it's likely that the $line 1.118 - // remains in M-state for the lock:orl. 1.119 - // 1.120 - // We currently use (3), although it's likely that switching to (2) 1.121 - // is correct for the future. 
1.122 - 1.123 - sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 1.124 - 1.125 - // Ratify _succ remains non-null 1.126 - ld(AT, Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2)) ; 1.127 - bne(AT, R0, LSuccess); 1.128 - delayed()->nop(); /* delay slot */ 1.129 - /* 1.130 - masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 1.131 - masm.jccb (Assembler::notZero, LSuccess) ; 1.132 - */ 1.133 - 1.134 - move(boxReg, R0) ; // box is really EAX 1.135 - 1.136 - cmpxchg(SP, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg); 1.137 - beq(AT, R0, LSuccess); 1.138 - delayed()->nop(); 1.139 - 1.140 - // Since we're low on registers we installed rsp as a placeholding in _owner. 1.141 - // Now install Self over rsp. This is safe as we're transitioning from 1.142 - // non-null to non=null 1.143 - get_thread (boxReg) ; 1.144 - sd(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 1.145 - // Intentional fall-through into LGoSlowPath ... 1.146 - 1.147 - bind(LGoSlowPath) ; 1.148 - ori(boxReg, boxReg, 1) ; // set ICC.ZF=0 to indicate failure 1.149 - b(DONE_LABEL) ; 1.150 - move(AT, R0) ; /* delay slot */ 1.151 - 1.152 - bind(LSuccess) ; 1.153 - move(boxReg, R0) ; // set ICC.ZF=1 to indicate success 1.154 - b(DONE_LABEL) ; 1.155 - move(AT, 0x1) ; /* delay slot */ 1.156 - } 1.157 - 1.158 - bind (Stacked) ; 1.159 - // It's not inflated and it's not recursively stack-locked and it's not biased. 1.160 - // It must be stack-locked. 1.161 - // Try to reset the header to displaced header. 1.162 - // The "box" value on the stack is stable, so we can reload 1.163 - // and be assured we observe the same value as above. 
1.164 - ld(tmpReg, Address(boxReg, 0)) ; 1.165 - 1.166 - cmpxchg(tmpReg, Address(objReg, 0), boxReg); // Uses EAX which is box 1.167 - // Intention fall-thru into DONE_LABEL 1.168 - 1.169 - 1.170 - // DONE_LABEL is a hot target - we'd really like to place it at the 1.171 - // start of cache line by padding with NOPs. 1.172 - // See the AMD and Intel software optimization manuals for the 1.173 - // most efficient "long" NOP encodings. 1.174 - // Unfortunately none of our alignment mechanisms suffice. 1.175 - if ((EmitSync & 65536) == 0) { 1.176 - bind (CheckSucc) ; 1.177 - } 1.178 -#else // _LP64 1.179 // It's inflated 1.180 - ld(AT, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 1.181 - xorr(boxReg, boxReg, AT); 1.182 + ld(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 1.183 + xorr(boxReg, boxReg, TREG); 1.184 1.185 ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ; 1.186 orr(boxReg, boxReg, AT); 1.187 @@ -3394,8 +3241,8 @@ 1.188 bne(boxReg, R0, DONE_LABEL); 1.189 delayed()->nop(); 1.190 1.191 - ld(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 1.192 - ld(AT, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 1.193 + ld(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 1.194 + ld(AT, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 1.195 orr(boxReg, boxReg, AT); 1.196 1.197 move(AT, R0); 1.198 @@ -3421,30 +3268,31 @@ 1.199 // are all faster when the write buffer is populated. 1.200 sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 1.201 if (os::is_MP()) { 1.202 - // lock (); 1.203 - //addl (Address(rsp, 0), 0); //? 
1.204 + // lock (); 1.205 } 1.206 ld(AT, Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2)) ; 1.207 bne(AT, R0, LSuccess); 1.208 delayed()->nop(); 1.209 1.210 +#ifndef OPT_THREAD 1.211 + get_thread (TREG) ; 1.212 +#endif 1.213 move(boxReg, R0) ; // box is really EAX 1.214 //if (os::is_MP()) { lock(); } 1.215 - cmpxchg(SP, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg); 1.216 + cmpxchg(TREG, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg); 1.217 beq(AT, R0, LSuccess); 1.218 delayed()->nop(); 1.219 // Intentional fall-through into slow-path 1.220 1.221 bind (LGoSlowPath); 1.222 - ori(boxReg, boxReg, 1) ; // set ICC.ZF=0 to indicate failure 1.223 move(AT, R0); 1.224 b(DONE_LABEL) ; 1.225 delayed()->nop(); 1.226 1.227 1.228 bind (LSuccess); 1.229 - move(boxReg, R0) ; // set ICC.ZF=1 to indicate success 1.230 - move(AT, 0x1) ; 1.231 + move(AT, 0); 1.232 + sltiu(AT, boxReg, 1) ; // set ICC.ZF=1 to indicate success 1.233 b(DONE_LABEL) ; 1.234 delayed()->nop(); 1.235 } 1.236 @@ -3452,12 +3300,11 @@ 1.237 bind (Stacked); 1.238 ld(tmpReg, Address(boxReg, 0)) ; 1.239 //if (os::is_MP()) { lock(); } 1.240 - cmpxchg(tmpReg, Address(objReg, 0), boxReg); // Uses EAX which is box 1.241 + cmpxchg(tmpReg, Address(objReg, 0), boxReg); 1.242 1.243 if (EmitSync & 65536) { 1.244 bind (CheckSucc); 1.245 } 1.246 -#endif 1.247 1.248 bind(DONE_LABEL); 1.249