--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Mar 06 12:45:59 2014 +0400
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Feb 24 15:12:26 2014 -0800
@@ -98,217 +98,6 @@
   return Address::make_array(adr);
 }
 
-int MacroAssembler::biased_locking_enter(Register lock_reg,
-                                         Register obj_reg,
-                                         Register swap_reg,
-                                         Register tmp_reg,
-                                         bool swap_reg_contains_mark,
-                                         Label& done,
-                                         Label* slow_case,
-                                         BiasedLockingCounters* counters) {
-  assert(UseBiasedLocking, "why call this otherwise?");
-  assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg");
-  assert_different_registers(lock_reg, obj_reg, swap_reg);
-
-  if (PrintBiasedLockingStatistics && counters == NULL)
-    counters = BiasedLocking::counters();
-
-  bool need_tmp_reg = false;
-  if (tmp_reg == noreg) {
-    need_tmp_reg = true;
-    tmp_reg = lock_reg;
-  } else {
-    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
-  }
-  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
-  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
-  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
-  Address saved_mark_addr(lock_reg, 0);
-
-  // Biased locking
-  // See whether the lock is currently biased toward our thread and
-  // whether the epoch is still valid
-  // Note that the runtime guarantees sufficient alignment of JavaThread
-  // pointers to allow age to be placed into low bits
-  // First check to see whether biasing is even enabled for this object
-  Label cas_label;
-  int null_check_offset = -1;
-  if (!swap_reg_contains_mark) {
-    null_check_offset = offset();
-    movl(swap_reg, mark_addr);
-  }
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  movl(tmp_reg, swap_reg);
-  andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
-  cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  jcc(Assembler::notEqual, cas_label);
-  // The bias pattern is present in the object's header. Need to check
-  // whether the bias owner and the epoch are both still current.
-  // Note that because there is no current thread register on x86 we
-  // need to store off the mark word we read out of the object to
-  // avoid reloading it and needing to recheck invariants below. This
-  // store is unfortunate but it makes the overall code shorter and
-  // simpler.
-  movl(saved_mark_addr, swap_reg);
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  get_thread(tmp_reg);
-  xorl(swap_reg, tmp_reg);
-  if (swap_reg_contains_mark) {
-    null_check_offset = offset();
-  }
-  movl(tmp_reg, klass_addr);
-  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
-  andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->biased_lock_entry_count_addr()));
-  }
-  jcc(Assembler::equal, done);
-
-  Label try_revoke_bias;
-  Label try_rebias;
-
-  // At this point we know that the header has the bias pattern and
-  // that we are not the bias owner in the current epoch. We need to
-  // figure out more details about the state of the header in order to
-  // know what operations can be legally performed on the object's
-  // header.
-
-  // If the low three bits in the xor result aren't clear, that means
-  // the prototype header is no longer biased and we have to revoke
-  // the bias on this object.
-  testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
-  jcc(Assembler::notZero, try_revoke_bias);
-
-  // Biasing is still enabled for this data type. See whether the
-  // epoch of the current bias is still valid, meaning that the epoch
-  // bits of the mark word are equal to the epoch bits of the
-  // prototype header. (Note that the prototype header's epoch bits
-  // only change at a safepoint.) If not, attempt to rebias the object
-  // toward the current thread. Note that we must be absolutely sure
-  // that the current epoch is invalid in order to do this because
-  // otherwise the manipulations it performs on the mark word are
-  // illegal.
-  testl(swap_reg, markOopDesc::epoch_mask_in_place);
-  jcc(Assembler::notZero, try_rebias);
-
-  // The epoch of the current bias is still valid but we know nothing
-  // about the owner; it might be set or it might be clear. Try to
-  // acquire the bias of the object using an atomic operation. If this
-  // fails we will go in to the runtime to revoke the object's bias.
-  // Note that we first construct the presumed unbiased header so we
-  // don't accidentally blow away another thread's valid bias.
-  movl(swap_reg, saved_mark_addr);
-  andl(swap_reg,
-       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  get_thread(tmp_reg);
-  orl(tmp_reg, swap_reg);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  // If the biasing toward our thread failed, this means that
-  // another thread succeeded in biasing it toward itself and we
-  // need to revoke that bias. The revocation will occur in the
-  // interpreter runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_rebias);
-  // At this point we know the epoch has expired, meaning that the
-  // current "bias owner", if any, is actually invalid. Under these
-  // circumstances _only_, we are allowed to use the current header's
-  // value as the comparison value when doing the cas to acquire the
-  // bias in the current epoch. In other words, we allow transfer of
-  // the bias from one thread to another directly in this situation.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  get_thread(tmp_reg);
-  movl(swap_reg, klass_addr);
-  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
-  movl(swap_reg, saved_mark_addr);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  // If the biasing toward our thread failed, then another thread
-  // succeeded in biasing it toward itself and we need to revoke that
-  // bias. The revocation will occur in the runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_revoke_bias);
-  // The prototype mark in the klass doesn't have the bias bit set any
-  // more, indicating that objects of this data type are not supposed
-  // to be biased any more. We are going to try to reset the mark of
-  // this object to the prototype value and fall through to the
-  // CAS-based locking scheme. Note that if our CAS fails, it means
-  // that another thread raced us for the privilege of revoking the
-  // bias of this particular object, so it's okay to continue in the
-  // normal locking code.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  movl(swap_reg, saved_mark_addr);
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  movl(tmp_reg, klass_addr);
-  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  // Fall through to the normal CAS-based lock, because no matter what
-  // the result of the above CAS, some thread must have succeeded in
-  // removing the bias bit from the object's header.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
-  }
-
-  bind(cas_label);
-
-  return null_check_offset;
-}
 void MacroAssembler::call_VM_leaf_base(address entry_point,
                                        int number_of_arguments) {
   call(RuntimeAddress(entry_point));
@@ -726,165 +515,6 @@
   return array;
 }
 
-int MacroAssembler::biased_locking_enter(Register lock_reg,
-                                         Register obj_reg,
-                                         Register swap_reg,
-                                         Register tmp_reg,
-                                         bool swap_reg_contains_mark,
-                                         Label& done,
-                                         Label* slow_case,
-                                         BiasedLockingCounters* counters) {
-  assert(UseBiasedLocking, "why call this otherwise?");
-  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
-  assert(tmp_reg != noreg, "tmp_reg must be supplied");
-  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
-  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
-  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
-  Address saved_mark_addr(lock_reg, 0);
-
-  if (PrintBiasedLockingStatistics && counters == NULL)
-    counters = BiasedLocking::counters();
-
-  // Biased locking
-  // See whether the lock is currently biased toward our thread and
-  // whether the epoch is still valid
-  // Note that the runtime guarantees sufficient alignment of JavaThread
-  // pointers to allow age to be placed into low bits
-  // First check to see whether biasing is even enabled for this object
-  Label cas_label;
-  int null_check_offset = -1;
-  if (!swap_reg_contains_mark) {
-    null_check_offset = offset();
-    movq(swap_reg, mark_addr);
-  }
-  movq(tmp_reg, swap_reg);
-  andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
-  cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
-  jcc(Assembler::notEqual, cas_label);
-  // The bias pattern is present in the object's header. Need to check
-  // whether the bias owner and the epoch are both still current.
-  load_prototype_header(tmp_reg, obj_reg);
-  orq(tmp_reg, r15_thread);
-  xorq(tmp_reg, swap_reg);
-  andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
-  }
-  jcc(Assembler::equal, done);
-
-  Label try_revoke_bias;
-  Label try_rebias;
-
-  // At this point we know that the header has the bias pattern and
-  // that we are not the bias owner in the current epoch. We need to
-  // figure out more details about the state of the header in order to
-  // know what operations can be legally performed on the object's
-  // header.
-
-  // If the low three bits in the xor result aren't clear, that means
-  // the prototype header is no longer biased and we have to revoke
-  // the bias on this object.
-  testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
-  jcc(Assembler::notZero, try_revoke_bias);
-
-  // Biasing is still enabled for this data type. See whether the
-  // epoch of the current bias is still valid, meaning that the epoch
-  // bits of the mark word are equal to the epoch bits of the
-  // prototype header. (Note that the prototype header's epoch bits
-  // only change at a safepoint.) If not, attempt to rebias the object
-  // toward the current thread. Note that we must be absolutely sure
-  // that the current epoch is invalid in order to do this because
-  // otherwise the manipulations it performs on the mark word are
-  // illegal.
-  testq(tmp_reg, markOopDesc::epoch_mask_in_place);
-  jcc(Assembler::notZero, try_rebias);
-
-  // The epoch of the current bias is still valid but we know nothing
-  // about the owner; it might be set or it might be clear. Try to
-  // acquire the bias of the object using an atomic operation. If this
-  // fails we will go in to the runtime to revoke the object's bias.
-  // Note that we first construct the presumed unbiased header so we
-  // don't accidentally blow away another thread's valid bias.
-  andq(swap_reg,
-       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
-  movq(tmp_reg, swap_reg);
-  orq(tmp_reg, r15_thread);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgq(tmp_reg, Address(obj_reg, 0));
-  // If the biasing toward our thread failed, this means that
-  // another thread succeeded in biasing it toward itself and we
-  // need to revoke that bias. The revocation will occur in the
-  // interpreter runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_rebias);
-  // At this point we know the epoch has expired, meaning that the
-  // current "bias owner", if any, is actually invalid. Under these
-  // circumstances _only_, we are allowed to use the current header's
-  // value as the comparison value when doing the cas to acquire the
-  // bias in the current epoch. In other words, we allow transfer of
-  // the bias from one thread to another directly in this situation.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  load_prototype_header(tmp_reg, obj_reg);
-  orq(tmp_reg, r15_thread);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgq(tmp_reg, Address(obj_reg, 0));
-  // If the biasing toward our thread failed, then another thread
-  // succeeded in biasing it toward itself and we need to revoke that
-  // bias. The revocation will occur in the runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_revoke_bias);
-  // The prototype mark in the klass doesn't have the bias bit set any
-  // more, indicating that objects of this data type are not supposed
-  // to be biased any more. We are going to try to reset the mark of
-  // this object to the prototype value and fall through to the
-  // CAS-based locking scheme. Note that if our CAS fails, it means
-  // that another thread raced us for the privilege of revoking the
-  // bias of this particular object, so it's okay to continue in the
-  // normal locking code.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  load_prototype_header(tmp_reg, obj_reg);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgq(tmp_reg, Address(obj_reg, 0));
-  // Fall through to the normal CAS-based lock, because no matter what
-  // the result of the above CAS, some thread must have succeeded in
-  // removing the bias bit from the object's header.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
-  }
-
-  bind(cas_label);
-
-  return null_check_offset;
-}
-
 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
   Label L, E;
 
@@ -1360,9 +990,16 @@
 
 void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
   pushf();
-  if (os::is_MP())
-    lock();
-  incrementl(counter_addr);
+  if (reachable(counter_addr)) {
+    if (os::is_MP())
+      lock();
+    incrementl(as_Address(counter_addr));
+  } else {
+    lea(rscratch1, counter_addr);
+    if (os::is_MP())
+      lock();
+    incrementl(Address(rscratch1, 0));
+  }
  popf();
 }
 
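The new reachable() branch in atomic_incl exists because x86_64 RIP-relative addressing only encodes a signed 32-bit displacement; a counter that lives farther than +/-2GB from the emitted code must first be materialized in the scratch register (rscratch1) via lea. A minimal sketch of that reachability test, with hypothetical names (code_addr and target are assumptions for illustration, not HotSpot API):

    #include <cstdint>

    // Can 'target' be addressed RIP-relative from an instruction ending at
    // 'code_addr'? True iff the displacement fits in a signed 32-bit field.
    static bool rip_relative_reachable(uintptr_t code_addr, uintptr_t target) {
      intptr_t disp = (intptr_t)target - (intptr_t)code_addr;
      return disp == (intptr_t)(int32_t)disp;
    }

When the test fails, the code pays one extra lea but the increment itself remains a single locked read-modify-write instruction.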
@@ -1393,6 +1030,234 @@
   }
 }
 
+int MacroAssembler::biased_locking_enter(Register lock_reg,
+                                         Register obj_reg,
+                                         Register swap_reg,
+                                         Register tmp_reg,
+                                         bool swap_reg_contains_mark,
+                                         Label& done,
+                                         Label* slow_case,
+                                         BiasedLockingCounters* counters) {
+  assert(UseBiasedLocking, "why call this otherwise?");
+  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
+  LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); )
+  bool need_tmp_reg = false;
+  if (tmp_reg == noreg) {
+    need_tmp_reg = true;
+    tmp_reg = lock_reg;
+    assert_different_registers(lock_reg, obj_reg, swap_reg);
+  } else {
+    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
+  }
+  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
+  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
+  Address saved_mark_addr(lock_reg, 0);
+
+  if (PrintBiasedLockingStatistics && counters == NULL) {
+    counters = BiasedLocking::counters();
+  }
+  // Biased locking
+  // See whether the lock is currently biased toward our thread and
+  // whether the epoch is still valid
+  // Note that the runtime guarantees sufficient alignment of JavaThread
+  // pointers to allow age to be placed into low bits
+  // First check to see whether biasing is even enabled for this object
+  Label cas_label;
+  int null_check_offset = -1;
+  if (!swap_reg_contains_mark) {
+    null_check_offset = offset();
+    movptr(swap_reg, mark_addr);
+  }
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  movptr(tmp_reg, swap_reg);
+  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
+  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  jcc(Assembler::notEqual, cas_label);
+  // The bias pattern is present in the object's header. Need to check
+  // whether the bias owner and the epoch are both still current.
+#ifndef _LP64
+  // Note that because there is no current thread register on x86_32 we
+  // need to store off the mark word we read out of the object to
+  // avoid reloading it and needing to recheck invariants below. This
+  // store is unfortunate but it makes the overall code shorter and
+  // simpler.
+  movptr(saved_mark_addr, swap_reg);
+#endif
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  if (swap_reg_contains_mark) {
+    null_check_offset = offset();
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+#ifdef _LP64
+  orptr(tmp_reg, r15_thread);
+  xorptr(tmp_reg, swap_reg);
+  Register header_reg = tmp_reg;
+#else
+  xorptr(tmp_reg, swap_reg);
+  get_thread(swap_reg);
+  xorptr(swap_reg, tmp_reg);
+  Register header_reg = swap_reg;
+#endif
+  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
+  }
+  jcc(Assembler::equal, done);
+
+  Label try_revoke_bias;
+  Label try_rebias;
+
+  // At this point we know that the header has the bias pattern and
+  // that we are not the bias owner in the current epoch. We need to
+  // figure out more details about the state of the header in order to
+  // know what operations can be legally performed on the object's
+  // header.
+
+  // If the low three bits in the xor result aren't clear, that means
+  // the prototype header is no longer biased and we have to revoke
+  // the bias on this object.
+  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
+  jccb(Assembler::notZero, try_revoke_bias);
+
+  // Biasing is still enabled for this data type. See whether the
+  // epoch of the current bias is still valid, meaning that the epoch
+  // bits of the mark word are equal to the epoch bits of the
+  // prototype header. (Note that the prototype header's epoch bits
+  // only change at a safepoint.) If not, attempt to rebias the object
+  // toward the current thread. Note that we must be absolutely sure
+  // that the current epoch is invalid in order to do this because
+  // otherwise the manipulations it performs on the mark word are
+  // illegal.
+  testptr(header_reg, markOopDesc::epoch_mask_in_place);
+  jccb(Assembler::notZero, try_rebias);
+
+  // The epoch of the current bias is still valid but we know nothing
+  // about the owner; it might be set or it might be clear. Try to
+  // acquire the bias of the object using an atomic operation. If this
+  // fails we will go into the runtime to revoke the object's bias.
+  // Note that we first construct the presumed unbiased header so we
+  // don't accidentally blow away another thread's valid bias.
+  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
+  andptr(swap_reg,
+         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+#ifdef _LP64
+  movptr(tmp_reg, swap_reg);
+  orptr(tmp_reg, r15_thread);
+#else
+  get_thread(tmp_reg);
+  orptr(tmp_reg, swap_reg);
+#endif
+  if (os::is_MP()) {
+    lock();
+  }
+  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // If the biasing toward our thread failed, this means that
+  // another thread succeeded in biasing it toward itself and we
+  // need to revoke that bias. The revocation will occur in the
+  // interpreter runtime in the slow case.
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
+  }
+  if (slow_case != NULL) {
+    jcc(Assembler::notZero, *slow_case);
+  }
+  jmp(done);
+
+  bind(try_rebias);
+  // At this point we know the epoch has expired, meaning that the
+  // current "bias owner", if any, is actually invalid. Under these
+  // circumstances _only_, we are allowed to use the current header's
+  // value as the comparison value when doing the cas to acquire the
+  // bias in the current epoch. In other words, we allow transfer of
+  // the bias from one thread to another directly in this situation.
+  //
+  // FIXME: due to a lack of registers we currently blow away the age
+  // bits in this situation. Should attempt to preserve them.
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+#ifdef _LP64
+  orptr(tmp_reg, r15_thread);
+#else
+  get_thread(swap_reg);
+  orptr(tmp_reg, swap_reg);
+  movptr(swap_reg, saved_mark_addr);
+#endif
+  if (os::is_MP()) {
+    lock();
+  }
+  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // If the biasing toward our thread failed, then another thread
+  // succeeded in biasing it toward itself and we need to revoke that
+  // bias. The revocation will occur in the runtime in the slow case.
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
+  }
+  if (slow_case != NULL) {
+    jcc(Assembler::notZero, *slow_case);
+  }
+  jmp(done);
+
+  bind(try_revoke_bias);
+  // The prototype mark in the klass doesn't have the bias bit set any
+  // more, indicating that objects of this data type are not supposed
+  // to be biased any more. We are going to try to reset the mark of
+  // this object to the prototype value and fall through to the
+  // CAS-based locking scheme. Note that if our CAS fails, it means
+  // that another thread raced us for the privilege of revoking the
+  // bias of this particular object, so it's okay to continue in the
+  // normal locking code.
+  //
+  // FIXME: due to a lack of registers we currently blow away the age
+  // bits in this situation. Should attempt to preserve them.
+  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+  if (os::is_MP()) {
+    lock();
+  }
+  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // Fall through to the normal CAS-based lock, because no matter what
+  // the result of the above CAS, some thread must have succeeded in
+  // removing the bias bit from the object's header.
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
+  }
+
+  bind(cas_label);
+
+  return null_check_offset;
+}
+
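The owner-and-epoch test at the top of the merged function hinges on the mark-word layout (thread pointer, then epoch, age, biased bit, and lock bits), with XOR cancelling the fields that must match and the age bits masked away. A hedged C++ sketch of the same triage; the mask constants here are illustrative stand-ins, not the authoritative markOopDesc values:

    #include <cstdint>

    // Illustrative mark-word fields: [thread | epoch:2 | age:4 | biased:1 | lock:2]
    static const uintptr_t biased_lock_mask_in_place = 0x7;        // biased bit + lock bits
    static const uintptr_t biased_lock_pattern       = 0x5;        // 101b
    static const uintptr_t age_mask_in_place         = 0xfu << 3;
    static const uintptr_t epoch_mask_in_place       = 0x3u << 7;

    // First check before cas_label: is biasing even enabled for this object?
    static bool has_bias_pattern(uintptr_t mark) {
      return (mark & biased_lock_mask_in_place) == biased_lock_pattern;
    }

    // Equivalent of: load_prototype_header; or with self; xor with mark; mask age.
    // Zero means the object is already biased to us in the current epoch.
    static uintptr_t bias_xor(uintptr_t mark, uintptr_t prototype, uintptr_t self) {
      return (mark ^ (prototype | self)) & ~age_mask_in_place;
    }

    // The three-way branch the emitted code performs on that xor result.
    enum BiasAction { kBiasedToSelf, kRevokeBias, kTryRebias, kTryAcquireBias };
    static BiasAction triage(uintptr_t x) {
      if (x == 0)                        return kBiasedToSelf;   // jcc(equal, done)
      if (x & biased_lock_mask_in_place) return kRevokeBias;     // prototype no longer biased
      if (x & epoch_mask_in_place)       return kTryRebias;      // epoch expired
      return kTryAcquireBias;                                    // only the owner bits differ
    }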
 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
   assert(UseBiasedLocking, "why call this otherwise?");
 
@@ -1408,6 +1273,620 @@
   jcc(Assembler::equal, done);
 }
 
+#ifdef COMPILER2
+// Fast_Lock and Fast_Unlock used by C2
+
+// Because the transitions from emitted code to the runtime
+// monitorenter/exit helper stubs are so slow it's critical that
+// we inline both the stack-locking fast-path and the inflated fast path.
+//
+// See also: cmpFastLock and cmpFastUnlock.
+//
+// What follows is a specialized inline transliteration of the code
+// in slow_enter() and slow_exit(). If we're concerned about I$ bloat
+// another option would be to emit TrySlowEnter and TrySlowExit methods
+// at startup-time. These methods would accept arguments as
+// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
+// indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
+// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
+// In practice, however, the # of lock sites is bounded and is usually small.
+// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
+// if the processor uses simple bimodal branch predictors keyed by EIP,
+// since the helper routines would be called from multiple synchronization sites.
+//
+// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
+// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
+// to those specialized methods. That'd give us a mostly platform-independent
+// implementation that the JITs could optimize and inline at their pleasure.
+// Done correctly, the only time we'd need to cross to native code would be
+// to park() or unpark() threads. We'd also need a few more unsafe operators
+// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
+// (b) explicit barriers or fence operations.
+//
+// TODO:
+//
+// * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
+//   This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
+//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
+//   the lock operators would typically be faster than reifying Self.
+//
+// * Ideally I'd define the primitives as:
+//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
+//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
+//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
+//   Instead, we're stuck with the rather awkward and brittle register assignments below.
+//   Furthermore the register assignments are overconstrained, possibly resulting in
+//   sub-optimal code near the synchronization site.
+//
+// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
+//   Alternately, use a better sp-proximity test.
+//
+// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
+//   Either one is sufficient to uniquely identify a thread.
+//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
+//
+// * Intrinsify notify() and notifyAll() for the common cases where the
+//   object is locked by the calling thread but the waitlist is empty.
+//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
+//
+// * use jccb and jmpb instead of jcc and jmp to improve code density.
+//   But beware of excessive branch density on AMD Opterons.
+//
+// * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
+//   or failure of the fast-path. If the fast-path fails then we pass
+//   control to the slow-path, typically in C. In Fast_Lock and
+//   Fast_Unlock we often branch to DONE_LABEL, just to find that C2
+//   will emit a conditional branch immediately after the node.
+//   So we have branches to branches and lots of ICC.ZF games.
+//   Instead, it might be better to have C2 pass a "FailureLabel"
+//   into Fast_Lock and Fast_Unlock. In the case of success, control
+//   will drop through the node. ICC.ZF is undefined at exit.
+//   In the case of failure, the node will branch directly to the
+//   FailureLabel
+
+
+// obj: object to lock
+// box: on-stack box address (displaced header location) - KILLED
+// rax,: tmp -- KILLED
+// scr: tmp -- KILLED
+void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, BiasedLockingCounters* counters) {
+  // Ensure the register assignments are disjoint
+  guarantee (objReg != boxReg, "");
+  guarantee (objReg != tmpReg, "");
+  guarantee (objReg != scrReg, "");
+  guarantee (boxReg != tmpReg, "");
+  guarantee (boxReg != scrReg, "");
+  guarantee (tmpReg == rax, "");
+
+  if (counters != NULL) {
+    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()));
+  }
+  if (EmitSync & 1) {
+      // set box->dhw = unused_mark (3)
+      // Force all sync thru slow-path: slow_enter() and slow_exit()
+      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
+      cmpptr (rsp, (int32_t)NULL_WORD);
+  } else
+  if (EmitSync & 2) {
+      Label DONE_LABEL ;
+      if (UseBiasedLocking) {
+         // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
+         biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
+      }
+
+      movptr(tmpReg, Address(objReg, 0));           // fetch markword
+      orptr (tmpReg, 0x1);
+      movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
+      if (os::is_MP()) {
+        lock();
+      }
+      cmpxchgptr(boxReg, Address(objReg, 0));       // Updates tmpReg
+      jccb(Assembler::equal, DONE_LABEL);
+      // Recursive locking
+      subptr(tmpReg, rsp);
+      andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
+      movptr(Address(boxReg, 0), tmpReg);
+      bind(DONE_LABEL);
+  } else {
+    // Possible cases that we'll encounter in fast_lock
+    // ------------------------------------------------
+    // * Inflated
+    //    -- unlocked
+    //    -- Locked
+    //       = by self
+    //       = by other
+    // * biased
+    //    -- by Self
+    //    -- by other
+    // * neutral
+    // * stack-locked
+    //    -- by self
+    //       = sp-proximity test hits
+    //       = sp-proximity test generates false-negative
+    //    -- by other
+    //
+
+    Label IsInflated, DONE_LABEL;
+
+    // it's stack-locked, biased or neutral
+    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
+    // order to reduce the number of conditional branches in the most common cases.
+    // Beware -- there's a subtle invariant that fetch of the markword
+    // at [FETCH], below, will never observe a biased encoding (*101b).
+    // If this invariant is not held we risk exclusion (safety) failure.
+    if (UseBiasedLocking && !UseOptoBiasInlining) {
+      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
+    }
+
+    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
+    testl (tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
+    jccb  (Assembler::notZero, IsInflated);
+
+    // Attempt stack-locking ...
+    orptr (tmpReg, 0x1);
+    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
+    if (counters != NULL) {
+      cond_inc32(Assembler::equal,
+                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
+    }
+    jccb(Assembler::equal, DONE_LABEL);
+
+    // Recursive locking
+    subptr(tmpReg, rsp);
+    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
+    movptr(Address(boxReg, 0), tmpReg);
+    if (counters != NULL) {
+      cond_inc32(Assembler::equal,
+                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
+    }
+    jmpb(DONE_LABEL);
+
+    bind(IsInflated);
+#ifndef _LP64
+    // The object is inflated.
+    //
+    // TODO-FIXME: eliminate the ugly use of manifest constants:
+    //   Use markOopDesc::monitor_value instead of "2".
+    //   use markOop::unused_mark() instead of "3".
+    // The tmpReg value is an objectMonitor reference ORed with
+    // markOopDesc::monitor_value (2). We can either convert tmpReg to an
+    // objectmonitor pointer by masking off the "2" bit or we can just
+    // use tmpReg as an objectmonitor pointer but bias the objectmonitor
+    // field offsets with "-2" to compensate for and annul the low-order tag bit.
+    //
+    // I use the latter as it avoids AGI stalls.
+    // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
+    // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
+    //
+    #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
+
+    // boxReg refers to the on-stack BasicLock in the current frame.
+    // We'd like to write:
+    //   set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices.
+    // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
+    // additional latency as we have another ST in the store buffer that must drain.
+
+    if (EmitSync & 8192) {
+       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
+       get_thread (scrReg);
+       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
+       movptr(tmpReg, NULL_WORD);                // consider: xor vs mov
+       if (os::is_MP()) {
+         lock();
+       }
+       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    } else
+    if ((EmitSync & 128) == 0) {                 // avoid ST-before-CAS
+       movptr(scrReg, boxReg);
+       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
+
+       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
+       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+          // prefetchw [eax + Offset(_owner)-2]
+          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       }
+
+       if ((EmitSync & 64) == 0) {
+         // Optimistic form: consider XORL tmpReg,tmpReg
+         movptr(tmpReg, NULL_WORD);
+       } else {
+         // Can suffer RTS->RTO upgrades on shared or cold $ lines
+         // Test-And-CAS instead of CAS
+         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
+         testptr(tmpReg, tmpReg);                // Locked ?
+         jccb  (Assembler::notZero, DONE_LABEL);
+       }
+
+       // Appears unlocked - try to swing _owner from null to non-null.
+       // Ideally, I'd manifest "Self" with get_thread and then attempt
+       // to CAS the register containing Self into m->Owner.
+       // But we don't have enough registers, so instead we can either try to CAS
+       // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
+       // we later store "Self" into m->Owner. Transiently storing a stack address
+       // (rsp or the address of the box) into m->owner is harmless.
+       // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
+       if (os::is_MP()) {
+         lock();
+       }
+       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
+       jccb  (Assembler::notZero, DONE_LABEL);
+       get_thread (scrReg);                      // beware: clobbers ICCs
+       movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg);
+       xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success
+
+       // If the CAS fails we can either retry or pass control to the slow-path.
+       // We use the latter tactic.
+       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
+       // If the CAS was successful ...
+       //   Self has acquired the lock
+       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
+       // Intentional fall-through into DONE_LABEL ...
+    } else {
+       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
+       movptr(boxReg, tmpReg);
+
+       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
+       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+          // prefetchw [eax + Offset(_owner)-2]
+          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       }
+
+       if ((EmitSync & 64) == 0) {
+         // Optimistic form
+         xorptr  (tmpReg, tmpReg);
+       } else {
+         // Can suffer RTS->RTO upgrades on shared or cold $ lines
+         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
+         testptr(tmpReg, tmpReg);                // Locked ?
+         jccb  (Assembler::notZero, DONE_LABEL);
+       }
+
+       // Appears unlocked - try to swing _owner from null to non-null.
+       // Use either "Self" (in scr) or rsp as thread identity in _owner.
+       // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
+       get_thread (scrReg);
+       if (os::is_MP()) {
+         lock();
+       }
+       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+
+       // If the CAS fails we can either retry or pass control to the slow-path.
+       // We use the latter tactic.
+       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
+       // If the CAS was successful ...
+       //   Self has acquired the lock
+       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
+       // Intentional fall-through into DONE_LABEL ...
+    }
+#else // _LP64
+    // It's inflated
+
+    // TODO: someday avoid the ST-before-CAS penalty by
+    // relocating (deferring) the following ST.
+    // We should also think about trying a CAS without having
+    // fetched _owner. If the CAS is successful we may
+    // avoid an RTO->RTS upgrade on the $line.
+
+    // Without cast to int32_t a movptr will destroy r10 which is typically obj
+    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
+
+    mov    (boxReg, tmpReg);
+    movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    testptr(tmpReg, tmpReg);
+    jccb   (Assembler::notZero, DONE_LABEL);
+
+    // It's inflated and appears unlocked
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    // Intentional fall-through into DONE_LABEL ...
+
+#endif
+
+    // DONE_LABEL is a hot target - we'd really like to place it at the
+    // start of cache line by padding with NOPs.
+    // See the AMD and Intel software optimization manuals for the
+    // most efficient "long" NOP encodings.
+    // Unfortunately none of our alignment mechanisms suffice.
+    bind(DONE_LABEL);
+
+    // At DONE_LABEL the icc ZFlag is set as follows ...
+    // Fast_Unlock uses the same protocol.
+    // ZFlag == 1 -> Success
+    // ZFlag == 0 -> Failure - force control through the slow-path
+  }
+}
+
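The stack-locking fast path above ([FETCH] through the recursive-locking store) is compact but dense. A hedged C++ rendering of the same protocol over a std::atomic mark word may help; the box slot and the page-size proximity mask are simplified stand-ins for the emitted code, not HotSpot API:

    #include <atomic>
    #include <cstdint>

    // Returns true on fast-path success (ZF==1 in the emitted code).
    static bool stack_lock(std::atomic<uintptr_t>& mark_word,
                           uintptr_t* box,           // on-stack BasicLock slot
                           uintptr_t stack_pointer)  // rsp at the lock site
    {
      uintptr_t mark = mark_word.load() | 0x1;       // presume a neutral, unlocked header
      *box = mark;                                   // anticipate a successful CAS
      uintptr_t expected = mark;
      if (mark_word.compare_exchange_strong(expected, (uintptr_t)box)) {
        return true;                                 // we now stack-lock the object
      }
      // CAS failed: 'expected' holds the live mark. If it points into our own
      // frame this is a recursive enter; the sp-proximity test masks away the
      // in-page bits so a nearby stack address yields zero (assume 4K pages).
      uintptr_t diff = (expected - stack_pointer) & ~(uintptr_t)0xFFF;
      *box = diff;                                   // 0 => recursive stack-lock
      return diff == 0;
    }

On contention (nonzero diff) the emitted code leaves ZF clear, so C2's conditional branch after the node falls into the monitorenter slow path.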
+// obj: object to unlock
+// box: box address (displaced header location), killed. Must be EAX.
+// tmp: killed, cannot be obj nor box.
+//
+// Some commentary on balanced locking:
+//
+// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
+// Methods that don't have provably balanced locking are forced to run in the
+// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
+// The interpreter provides two properties:
+// I1:  At return-time the interpreter automatically and quietly unlocks any
+//      objects acquired in the current activation (frame). Recall that the
+//      interpreter maintains an on-stack list of locks currently held by
+//      a frame.
+// I2:  If a method attempts to unlock an object that is not held by the
+//      frame the interpreter throws IMSX.
+//
+// Let's say A(), which has provably balanced locking, acquires O and then calls B().
+// B() doesn't have provably balanced locking so it runs in the interpreter.
+// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
+// is still locked by A().
+//
+// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
+// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
+// should not be unlocked by "normal" java-level locking and vice-versa. The specification
+// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
+
+void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
+  guarantee (objReg != boxReg, "");
+  guarantee (objReg != tmpReg, "");
+  guarantee (boxReg != tmpReg, "");
+  guarantee (boxReg == rax, "");
+
+  if (EmitSync & 4) {
+    // Disable - inhibit all inlining. Force control through the slow-path
+    cmpptr (rsp, 0);
+  } else
+  if (EmitSync & 8) {
+    Label DONE_LABEL;
+    if (UseBiasedLocking) {
+       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
+    }
+    // Classic stack-locking code ...
+    // Check whether the displaced header is 0
+    //(=> recursive unlock)
+    movptr(tmpReg, Address(boxReg, 0));
+    testptr(tmpReg, tmpReg);
+    jccb(Assembler::zero, DONE_LABEL);
+    // If not recursive lock, reset the header to displaced header
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(tmpReg, Address(objReg, 0));   // Uses RAX which is box
+    bind(DONE_LABEL);
+  } else {
+    Label DONE_LABEL, Stacked, CheckSucc;
+
+    // Critically, the biased locking test must have precedence over
+    // and appear before the (box->dhw == 0) recursive stack-lock test.
+    if (UseBiasedLocking && !UseOptoBiasInlining) {
+       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
+    }
+
+    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);  // Examine the displaced header
+    movptr(tmpReg, Address(objReg, 0));              // Examine the object's markword
+    jccb  (Assembler::zero, DONE_LABEL);             // 0 indicates recursive stack-lock
+
+    testptr(tmpReg, 0x02);                           // Inflated?
+    jccb  (Assembler::zero, Stacked);
+
+    // It's inflated.
+    // Despite our balanced locking property we still check that m->_owner == Self
+    // as java routines or native JNI code called by this thread might
+    // have released the lock.
+    // Refer to the comments in synchronizer.cpp for how we might encode extra
+    // state in _succ so we can avoid fetching EntryList|cxq.
+    //
+    // I'd like to add more cases in fast_lock() and fast_unlock() --
+    // such as recursive enter and exit -- but we have to be wary of
+    // I$ bloat, T$ effects and BP$ effects.
+    //
+    // If there's no contention try a 1-0 exit. That is, exit without
+    // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
+    // we detect and recover from the race that the 1-0 exit admits.
+    //
+    // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
+    // before it STs null into _owner, releasing the lock. Updates
+    // to data protected by the critical section must be visible before
+    // we drop the lock (and thus before any other thread could acquire
+    // the lock and observe the fields protected by the lock).
+    // IA32's memory-model is SPO, so STs are ordered with respect to
+    // each other and there's no need for an explicit barrier (fence).
+    // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
+#ifndef _LP64
+    get_thread (boxReg);
+    if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+      // prefetchw [ebx + Offset(_owner)-2]
+      prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    }
+
+    // Note that we could employ various encoding schemes to reduce
+    // the number of loads below (currently 4) to just 2 or 3.
+    // Refer to the comments in synchronizer.cpp.
+    // In practice the chain of fetches doesn't seem to impact performance, however.
+    if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
+       // Attempt to reduce branch density - AMD's branch predictor.
+       xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+       jccb  (Assembler::notZero, DONE_LABEL);
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+       jmpb  (DONE_LABEL);
+    } else {
+       xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+       jccb  (Assembler::notZero, DONE_LABEL);
+       movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+       jccb  (Assembler::notZero, CheckSucc);
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+       jmpb  (DONE_LABEL);
+    }
+
+    // The following code fragment (EmitSync & 65536) improves the performance of
+    // contended applications and contended synchronization microbenchmarks.
+    // Unfortunately the emission of the code - even though not executed - causes regressions
+    // in scimark and jetstream, evidently because of $ effects. Replacing the code
+    // with an equal number of never-executed NOPs results in the same regression.
+    // We leave it off by default.
+
+    if ((EmitSync & 65536) != 0) {
+       Label LSuccess, LGoSlowPath ;
+
+       bind  (CheckSucc);
+
+       // Optional pre-test ... it's safe to elide this
+       if ((EmitSync & 16) == 0) {
+          cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+          jccb  (Assembler::zero, LGoSlowPath);
+       }
+
+       // We have a classic Dekker-style idiom:
+       //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
+       // There are a number of ways to implement the barrier:
+       // (1) lock:andl &m->_owner, 0
+       //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
+       //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
+       //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
+       // (2) If supported, an explicit MFENCE is appealing.
+       //     In older IA32 processors MFENCE is slower than lock:add or xchg
+       //     particularly if the write-buffer is full as might be the case
+       //     if stores closely precede the fence or fence-equivalent instruction.
+       //     In more modern implementations MFENCE appears faster, however.
+       // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
+       //     The $lines underlying the top-of-stack should be in M-state.
+       //     The locked add instruction is serializing, of course.
+       // (4) Use xchg, which is serializing
+       //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
+       // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
+       //     The integer condition codes will tell us if succ was 0.
+       //     Since _succ and _owner should reside in the same $line and
+       //     we just stored into _owner, it's likely that the $line
+       //     remains in M-state for the lock:orl.
+       //
+       // We currently use (3), although it's likely that switching to (2)
+       // is correct for the future.
+
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+       if (os::is_MP()) {
+          if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
+            mfence();
+          } else {
+            lock (); addptr(Address(rsp, 0), 0);
+          }
+       }
+       // Ratify _succ remains non-null
+       cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0);
+       jccb  (Assembler::notZero, LSuccess);
+
+       xorptr(boxReg, boxReg);                  // box is really EAX
+       if (os::is_MP()) { lock(); }
+       cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       jccb  (Assembler::notEqual, LSuccess);
+       // Since we're low on registers we installed rsp as a placeholder in _owner.
+       // Now install Self over rsp. This is safe as we're transitioning from
+       // non-null to non-null
+       get_thread (boxReg);
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
+       // Intentional fall-through into LGoSlowPath ...
+
+       bind  (LGoSlowPath);
+       orptr(boxReg, 1);                        // set ICC.ZF=0 to indicate failure
+       jmpb  (DONE_LABEL);
+
+       bind  (LSuccess);
+       xorptr(boxReg, boxReg);                  // set ICC.ZF=1 to indicate success
+       jmpb  (DONE_LABEL);
+    }
+
+    bind (Stacked);
+    // It's not inflated and it's not recursively stack-locked and it's not biased.
+    // It must be stack-locked.
+    // Try to reset the header to displaced header.
+    // The "box" value on the stack is stable, so we can reload
+    // and be assured we observe the same value as above.
+    movptr(tmpReg, Address(boxReg, 0));
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
+    // Intentional fall-thru into DONE_LABEL
+
+    // DONE_LABEL is a hot target - we'd really like to place it at the
+    // start of cache line by padding with NOPs.
+    // See the AMD and Intel software optimization manuals for the
+    // most efficient "long" NOP encodings.
+    // Unfortunately none of our alignment mechanisms suffice.
+    if ((EmitSync & 65536) == 0) {
+       bind (CheckSucc);
+    }
+#else // _LP64
+    // It's inflated
+    movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    xorptr(boxReg, r15_thread);
+    orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+    jccb  (Assembler::notZero, DONE_LABEL);
+    movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+    orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+    jccb  (Assembler::notZero, CheckSucc);
+    movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
+    jmpb  (DONE_LABEL);
+
+    if ((EmitSync & 65536) == 0) {
+       Label LSuccess, LGoSlowPath ;
+       bind  (CheckSucc);
+       cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+       jccb  (Assembler::zero, LGoSlowPath);
+
+       // I'd much rather use lock:andl m->_owner, 0 as it's faster than
+       // the explicit ST;MEMBAR combination, but masm doesn't currently support
+       // "ANDQ M,IMM". Don't use MFENCE here. lock:add to TOS, xchg, etc
+       // are all faster when the write buffer is populated.
+       movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
+       if (os::is_MP()) {
+          lock (); addl (Address(rsp, 0), 0);
+       }
+       cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+       jccb  (Assembler::notZero, LSuccess);
+
+       movptr (boxReg, (int32_t)NULL_WORD);     // box is really EAX
+       if (os::is_MP()) { lock(); }
+       cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       jccb  (Assembler::notEqual, LSuccess);
+       // Intentional fall-through into slow-path
+
+       bind  (LGoSlowPath);
+       orl   (boxReg, 1);                       // set ICC.ZF=0 to indicate failure
+       jmpb  (DONE_LABEL);
+
+       bind  (LSuccess);
+       testl (boxReg, 0);                       // set ICC.ZF=1 to indicate success
+       jmpb  (DONE_LABEL);
+    }
+
+    bind (Stacked);
+    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
+    if (os::is_MP()) { lock(); }
+    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
+
+    if (EmitSync & 65536) {
+       bind (CheckSucc);
+    }
+#endif
+    bind(DONE_LABEL);
+    // Avoid branch to branch on AMD processors
+    if (EmitSync & 32768) {
+       nop();
+    }
+  }
+}
+#endif // COMPILER2
+
 void MacroAssembler::c2bool(Register x) {
   // implements x == 0 ? 0 : 1
   // note: must only look at least-significant byte of x
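The 1-0 exit in fast_unlock deserves one more illustration. It is the classic "ST _owner = 0; MEMBAR; LD _succ" Dekker idiom the comments describe: drop the lock with a plain store, fence, then check whether a successor exists before paying for a CAS. A hedged C++ sketch over a simplified monitor; the field and function names are illustrative, not the ObjectMonitor API:

    #include <atomic>

    struct Monitor {
      std::atomic<void*> owner{nullptr};
      std::atomic<void*> succ{nullptr};    // presumed heir, set by contending threads
    };

    // Returns true when the unlock completed without waking a waiter (ZF==1).
    static bool one_zero_exit(Monitor& m, void* self) {
      m.owner.store(nullptr, std::memory_order_release);    // ST m->_owner = 0
      std::atomic_thread_fence(std::memory_order_seq_cst);  // MEMBAR (option 3 above)
      if (m.succ.load() != nullptr) {
        return true;                       // ratified: an heir exists and will retry
      }
      // No successor is visible: try to retake the lock so we can hand it off.
      void* expected = nullptr;
      if (!m.owner.compare_exchange_strong(expected, self)) {
        return true;                       // someone else already acquired the lock
      }
      return false;                        // we re-own the monitor; take the slow
                                           // path and wake a thread explicitly
    }

The fence between the owner store and the succ load is what closes the race the comments mention: without it, this thread could miss a successor that a contending thread published just after observing the lock as held.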