#4659 C2: Fixed a performance bug that caused many unlock operations to go through the slow path.

Wed, 19 Oct 2016 15:22:23 +0800

author
aoqi
date
Wed, 19 Oct 2016 15:22:23 +0800
changeset 137
569333e03a95
parent 136
c6a2514e22d7
child 138
f1afd3b6c786

#4659 C2: Fixed a performance bug that caused many unlock operations to go through the slow path.
The field at ObjectMonitor::owner_offset_in_bytes() holds a pointer to the owning thread, so the fast-path cmpxchg now installs TREG there instead of SP.

$ stress -c 100 -i 10
$ time java SSLEngineDeadlock
Before the patch:
real 0m29.566s
user 0m38.156s
sys 0m1.590s

After the patch:
real 0m26.213s
user 0m34.996s
sys 0m1.668s

The improvement is nearly 13% (real time: 29.566s / 26.213s ≈ 1.13).

src/cpu/mips/vm/assembler_mips.cpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/cpu/mips/vm/assembler_mips.cpp	Wed Oct 19 14:29:12 2016 +0800
     1.2 +++ b/src/cpu/mips/vm/assembler_mips.cpp	Wed Oct 19 15:22:23 2016 +0800
     1.3 @@ -3193,13 +3193,11 @@
     1.4          biased_locking_exit(objReg, tmpReg, DONE_LABEL);
     1.5        }
     1.6  
     1.7 -      ld(tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
     1.8        ld(AT, Address(boxReg, 0)) ;            // Examine the displaced header
     1.9        beq(AT, R0, DONE_LABEL) ;      // 0 indicates recursive stack-lock
    1.10 -      //move(AT, 0x1);
    1.11 -      //delayed()->nop();
    1.12        delayed()->daddiu(AT, R0, 0x1);
    1.13  
    1.14 +      ld(tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
    1.15        andi(AT, tmpReg, markOopDesc::monitor_value) ;                     // Inflated?
    1.16        beq(AT, R0, Stacked) ;                     // Inflated?
    1.17        delayed()->nop();
    1.18 @@ -3228,164 +3226,13 @@
    1.19        // IA32's memory-model is SPO, so STs are ordered with respect to
    1.20        // each other and there's no need for an explicit barrier (fence).
    1.21        // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
    1.22 -#ifdef OPT_THREAD
    1.23 -      move(boxReg, TREG);
    1.24 -#else
    1.25 -      get_thread (boxReg) ;
    1.26 +#ifndef OPT_THREAD
    1.27 +      get_thread (TREG) ;
    1.28  #endif
    1.29  
    1.30 -#ifndef _LP64
    1.31 -
    1.32 -      // Note that we could employ various encoding schemes to reduce
    1.33 -      // the number of loads below (currently 4) to just 2 or 3.
    1.34 -      // Refer to the comments in synchronizer.cpp.
    1.35 -      // In practice the chain of fetches doesn't seem to impact performance, however.
    1.36 -      if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
    1.37 -        // Attempt to reduce branch density - AMD's branch predictor.
    1.38 -        ld(AT, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
    1.39 -        xorr(boxReg, boxReg, AT);
    1.40 -
    1.41 -        ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
    1.42 -        orr(boxReg, boxReg, AT);
    1.43 -
    1.44 -        ld(AT, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
    1.45 -        orr(boxReg, boxReg, AT);
    1.46 -
    1.47 -        ld(AT, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
    1.48 -        orr(boxReg, boxReg, AT);
    1.49 -
    1.50 -        bne(boxReg, R0, DONE_LABEL);
    1.51 -        move(AT, R0);	/* delay slot */
    1.52 -
    1.53 -        sw(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
    1.54 -        b(DONE_LABEL);
    1.55 -        move(AT, 0x1);	/* delay slot */
    1.56 -      } else {
    1.57 -        ld(AT, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
    1.58 -        xorr(boxReg, boxReg, AT);
    1.59 -
    1.60 -        ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
    1.61 -        orr(boxReg, boxReg, AT);
    1.62 -
    1.63 -        bne(boxReg, R0, DONE_LABEL);
    1.64 -        move(AT, R0);	/* delay slot */
    1.65 -
    1.66 -        ld(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
    1.67 -        ld(AT, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
    1.68 -        orr(boxReg, boxReg, AT);
    1.69 -
    1.70 -        bne(boxReg, R0, CheckSucc);
    1.71 -        move(AT, R0);	/* delay slot */
    1.72 -
    1.73 -        sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
    1.74 -        b(DONE_LABEL);
    1.75 -        move(AT, 0x1);	/* delay slot */
    1.76 -      }
    1.77 -
    1.78 -      // The Following code fragment (EmitSync & 65536) improves the performance of
    1.79 -      // contended applications and contended synchronization microbenchmarks.
    1.80 -      // Unfortunately the emission of the code - even though not executed - causes regressions
    1.81 -      // in scimark and jetstream, evidently because of $ effects.  Replacing the code
    1.82 -      // with an equal number of never-executed NOPs results in the same regression.
    1.83 -      // We leave it off by default.
    1.84 -
    1.85 -      if ((EmitSync & 65536) != 0) {
    1.86 -        Label LSuccess, LGoSlowPath ;
    1.87 -
    1.88 -        bind(CheckSucc) ;
    1.89 -
    1.90 -        // Optional pre-test ... it's safe to elide this
    1.91 -        if ((EmitSync & 16) == 0) {
    1.92 -          ld(AT, Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2)) ;
    1.93 -          beq(AT, R0, LGoSlowPath);
    1.94 -          delayed()->nop();
    1.95 -        }
    1.96 -
    1.97 -        // We have a classic Dekker-style idiom:
    1.98 -        //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
    1.99 -        // There are a number of ways to implement the barrier:
   1.100 -        // (1) lock:andl &m->_owner, 0
   1.101 -        //     is fast, but mask doesn't currently support the "ANDL M,IMM32" form.
   1.102 -        //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
   1.103 -        //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
   1.104 -        // (2) If supported, an explicit MFENCE is appealing.
   1.105 -        //     In older IA32 processors MFENCE is slower than lock:add or xchg
   1.106 -        //     particularly if the write-buffer is full as might be the case if
   1.107 -        //     if stores closely precede the fence or fence-equivalent instruction.
   1.108 -        //     In more modern implementations MFENCE appears faster, however.
   1.109 -        // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
   1.110 -        //     The $lines underlying the top-of-stack should be in M-state.
   1.111 -        //     The locked add instruction is serializing, of course.
   1.112 -        // (4) Use xchg, which is serializing
   1.113 -        //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
   1.114 -        // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
   1.115 -        //     The integer condition codes will tell us if succ was 0.
   1.116 -        //     Since _succ and _owner should reside in the same $line and
   1.117 -        //     we just stored into _owner, it's likely that the $line
   1.118 -        //     remains in M-state for the lock:orl.
   1.119 -        //
   1.120 -        // We currently use (3), although it's likely that switching to (2)
   1.121 -        // is correct for the future.
   1.122 -
   1.123 -        sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
   1.124 -
   1.125 -        // Ratify _succ remains non-null
   1.126 -        ld(AT, Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2)) ;
   1.127 -        bne(AT, R0, LSuccess);
   1.128 -        delayed()->nop();		/* delay slot */
   1.129 -        /*
   1.130 -           masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
   1.131 -           masm.jccb  (Assembler::notZero, LSuccess) ;
   1.132 -           */
   1.133 -
   1.134 -        move(boxReg, R0) ;                  // box is really EAX
   1.135 -
   1.136 -        cmpxchg(SP, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
   1.137 -        beq(AT, R0, LSuccess);
   1.138 -        delayed()->nop();
   1.139 -
   1.140 -        // Since we're low on registers we installed rsp as a placeholding in _owner.
   1.141 -        // Now install Self over rsp.  This is safe as we're transitioning from
   1.142 -        // non-null to non=null
   1.143 -        get_thread (boxReg) ;
   1.144 -        sd(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
   1.145 -        // Intentional fall-through into LGoSlowPath ...
   1.146 -
   1.147 -        bind(LGoSlowPath) ;
   1.148 -        ori(boxReg, boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
   1.149 -        b(DONE_LABEL) ;
   1.150 -        move(AT, R0) ;	/* delay slot */
   1.151 -
   1.152 -        bind(LSuccess) ;
   1.153 -        move(boxReg, R0) ;                 // set ICC.ZF=1 to indicate success
   1.154 -        b(DONE_LABEL) ;
   1.155 -        move(AT, 0x1) ;	/* delay slot */
   1.156 -      }
   1.157 -
   1.158 -      bind (Stacked) ;
   1.159 -      // It's not inflated and it's not recursively stack-locked and it's not biased.
   1.160 -      // It must be stack-locked.
   1.161 -      // Try to reset the header to displaced header.
   1.162 -      // The "box" value on the stack is stable, so we can reload
   1.163 -      // and be assured we observe the same value as above.
   1.164 -      ld(tmpReg, Address(boxReg, 0)) ;
   1.165 -
   1.166 -      cmpxchg(tmpReg, Address(objReg, 0), boxReg); // Uses EAX which is box
   1.167 -      // Intention fall-thru into DONE_LABEL
   1.168 -
   1.169 -
   1.170 -      // DONE_LABEL is a hot target - we'd really like to place it at the
   1.171 -      // start of cache line by padding with NOPs.
   1.172 -      // See the AMD and Intel software optimization manuals for the
   1.173 -      // most efficient "long" NOP encodings.
   1.174 -      // Unfortunately none of our alignment mechanisms suffice.
   1.175 -      if ((EmitSync & 65536) == 0) {
   1.176 -        bind (CheckSucc) ;
   1.177 -      }
   1.178 -#else // _LP64
   1.179        // It's inflated
   1.180 -      ld(AT, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
   1.181 -      xorr(boxReg, boxReg, AT);
   1.182 +      ld(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
   1.183 +      xorr(boxReg, boxReg, TREG);
   1.184  
   1.185        ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
   1.186        orr(boxReg, boxReg, AT);
   1.187 @@ -3394,8 +3241,8 @@
   1.188        bne(boxReg, R0, DONE_LABEL);
   1.189        delayed()->nop();
   1.190  
   1.191 -      ld(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
   1.192 -      ld(AT, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
   1.193 +      ld(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
   1.194 +      ld(AT, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
   1.195        orr(boxReg, boxReg, AT);
   1.196  
   1.197        move(AT, R0);
   1.198 @@ -3421,30 +3268,31 @@
   1.199          // are all faster when the write buffer is populated.
   1.200          sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
   1.201          if (os::is_MP()) {
   1.202 -          // lock (); 
   1.203 -          //addl (Address(rsp, 0), 0); //?
   1.204 +          // lock ();
   1.205          }
   1.206          ld(AT, Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2)) ;
   1.207          bne(AT, R0, LSuccess);
   1.208          delayed()->nop();
   1.209  
   1.210 +#ifndef OPT_THREAD
   1.211 +        get_thread (TREG) ;
   1.212 +#endif
   1.213          move(boxReg, R0) ;                  // box is really EAX
   1.214          //if (os::is_MP()) { lock(); }
   1.215 -        cmpxchg(SP, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
   1.216 +        cmpxchg(TREG, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
   1.217          beq(AT, R0, LSuccess);
   1.218          delayed()->nop();
   1.219          // Intentional fall-through into slow-path
   1.220  
   1.221          bind  (LGoSlowPath);
   1.222 -        ori(boxReg, boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
   1.223          move(AT, R0);
   1.224          b(DONE_LABEL) ;
   1.225          delayed()->nop();
   1.226  
   1.227  
   1.228          bind  (LSuccess);
   1.229 -        move(boxReg, R0) ;                 // set ICC.ZF=1 to indicate success
   1.230 -        move(AT, 0x1) ;
   1.231 +        move(AT, 0);
   1.232 +        sltiu(AT, boxReg, 1) ;                 // set ICC.ZF=1 to indicate success
   1.233          b(DONE_LABEL) ;
   1.234          delayed()->nop();
   1.235        }
   1.236 @@ -3452,12 +3300,11 @@
   1.237        bind  (Stacked);
   1.238        ld(tmpReg, Address(boxReg, 0)) ;
   1.239        //if (os::is_MP()) { lock(); }
   1.240 -      cmpxchg(tmpReg, Address(objReg, 0), boxReg); // Uses EAX which is box
   1.241 +      cmpxchg(tmpReg, Address(objReg, 0), boxReg);
   1.242  
   1.243        if (EmitSync & 65536) {
   1.244          bind (CheckSucc);
   1.245        }
   1.246 -#endif
   1.247  
   1.248        bind(DONE_LABEL);
   1.249  

mercurial