src/cpu/x86/vm/macroAssembler_x86.cpp

changeset 6356: 4d4ea046d32a
parent 6155: 61746b5f0ed3
child 6429: 606acabe7b5c
     1.1 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Mar 06 12:45:59 2014 +0400
     1.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Feb 24 15:12:26 2014 -0800
     1.3 @@ -98,217 +98,6 @@
     1.4    return Address::make_array(adr);
     1.5  }
     1.6  
     1.7 -int MacroAssembler::biased_locking_enter(Register lock_reg,
     1.8 -                                         Register obj_reg,
     1.9 -                                         Register swap_reg,
    1.10 -                                         Register tmp_reg,
    1.11 -                                         bool swap_reg_contains_mark,
    1.12 -                                         Label& done,
    1.13 -                                         Label* slow_case,
    1.14 -                                         BiasedLockingCounters* counters) {
    1.15 -  assert(UseBiasedLocking, "why call this otherwise?");
    1.16 -  assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg");
    1.17 -  assert_different_registers(lock_reg, obj_reg, swap_reg);
    1.18 -
    1.19 -  if (PrintBiasedLockingStatistics && counters == NULL)
    1.20 -    counters = BiasedLocking::counters();
    1.21 -
    1.22 -  bool need_tmp_reg = false;
    1.23 -  if (tmp_reg == noreg) {
    1.24 -    need_tmp_reg = true;
    1.25 -    tmp_reg = lock_reg;
    1.26 -  } else {
    1.27 -    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
    1.28 -  }
    1.29 -  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
    1.30 -  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
    1.31 -  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
    1.32 -  Address saved_mark_addr(lock_reg, 0);
    1.33 -
    1.34 -  // Biased locking
    1.35 -  // See whether the lock is currently biased toward our thread and
    1.36 -  // whether the epoch is still valid
    1.37 -  // Note that the runtime guarantees sufficient alignment of JavaThread
    1.38 -  // pointers to allow age to be placed into low bits
    1.39 -  // First check to see whether biasing is even enabled for this object
    1.40 -  Label cas_label;
    1.41 -  int null_check_offset = -1;
    1.42 -  if (!swap_reg_contains_mark) {
    1.43 -    null_check_offset = offset();
    1.44 -    movl(swap_reg, mark_addr);
    1.45 -  }
    1.46 -  if (need_tmp_reg) {
    1.47 -    push(tmp_reg);
    1.48 -  }
    1.49 -  movl(tmp_reg, swap_reg);
    1.50 -  andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
    1.51 -  cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
    1.52 -  if (need_tmp_reg) {
    1.53 -    pop(tmp_reg);
    1.54 -  }
    1.55 -  jcc(Assembler::notEqual, cas_label);
    1.56 -  // The bias pattern is present in the object's header. Need to check
    1.57 -  // whether the bias owner and the epoch are both still current.
    1.58 -  // Note that because there is no current thread register on x86 we
    1.59 -  // need to store off the mark word we read out of the object to
    1.60 -  // avoid reloading it and needing to recheck invariants below. This
    1.61 -  // store is unfortunate but it makes the overall code shorter and
    1.62 -  // simpler.
    1.63 -  movl(saved_mark_addr, swap_reg);
    1.64 -  if (need_tmp_reg) {
    1.65 -    push(tmp_reg);
    1.66 -  }
    1.67 -  get_thread(tmp_reg);
    1.68 -  xorl(swap_reg, tmp_reg);
    1.69 -  if (swap_reg_contains_mark) {
    1.70 -    null_check_offset = offset();
    1.71 -  }
    1.72 -  movl(tmp_reg, klass_addr);
    1.73 -  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
    1.74 -  andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
    1.75 -  if (need_tmp_reg) {
    1.76 -    pop(tmp_reg);
    1.77 -  }
    1.78 -  if (counters != NULL) {
    1.79 -    cond_inc32(Assembler::zero,
    1.80 -               ExternalAddress((address)counters->biased_lock_entry_count_addr()));
    1.81 -  }
    1.82 -  jcc(Assembler::equal, done);
    1.83 -
    1.84 -  Label try_revoke_bias;
    1.85 -  Label try_rebias;
    1.86 -
    1.87 -  // At this point we know that the header has the bias pattern and
    1.88 -  // that we are not the bias owner in the current epoch. We need to
    1.89 -  // figure out more details about the state of the header in order to
    1.90 -  // know what operations can be legally performed on the object's
    1.91 -  // header.
    1.92 -
    1.93 -  // If the low three bits in the xor result aren't clear, that means
    1.94 -  // the prototype header is no longer biased and we have to revoke
    1.95 -  // the bias on this object.
    1.96 -  testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
    1.97 -  jcc(Assembler::notZero, try_revoke_bias);
    1.98 -
    1.99 -  // Biasing is still enabled for this data type. See whether the
   1.100 -  // epoch of the current bias is still valid, meaning that the epoch
   1.101 -  // bits of the mark word are equal to the epoch bits of the
   1.102 -  // prototype header. (Note that the prototype header's epoch bits
   1.103 -  // only change at a safepoint.) If not, attempt to rebias the object
   1.104 -  // toward the current thread. Note that we must be absolutely sure
   1.105 -  // that the current epoch is invalid in order to do this because
   1.106 -  // otherwise the manipulations it performs on the mark word are
   1.107 -  // illegal.
   1.108 -  testl(swap_reg, markOopDesc::epoch_mask_in_place);
   1.109 -  jcc(Assembler::notZero, try_rebias);
   1.110 -
   1.111 -  // The epoch of the current bias is still valid but we know nothing
   1.112 -  // about the owner; it might be set or it might be clear. Try to
   1.113 -  // acquire the bias of the object using an atomic operation. If this
   1.114 -  // fails we will go in to the runtime to revoke the object's bias.
   1.115 -  // Note that we first construct the presumed unbiased header so we
   1.116 -  // don't accidentally blow away another thread's valid bias.
   1.117 -  movl(swap_reg, saved_mark_addr);
   1.118 -  andl(swap_reg,
   1.119 -       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
   1.120 -  if (need_tmp_reg) {
   1.121 -    push(tmp_reg);
   1.122 -  }
   1.123 -  get_thread(tmp_reg);
   1.124 -  orl(tmp_reg, swap_reg);
   1.125 -  if (os::is_MP()) {
   1.126 -    lock();
   1.127 -  }
   1.128 -  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
   1.129 -  if (need_tmp_reg) {
   1.130 -    pop(tmp_reg);
   1.131 -  }
   1.132 -  // If the biasing toward our thread failed, this means that
   1.133 -  // another thread succeeded in biasing it toward itself and we
   1.134 -  // need to revoke that bias. The revocation will occur in the
   1.135 -  // interpreter runtime in the slow case.
   1.136 -  if (counters != NULL) {
   1.137 -    cond_inc32(Assembler::zero,
   1.138 -               ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
   1.139 -  }
   1.140 -  if (slow_case != NULL) {
   1.141 -    jcc(Assembler::notZero, *slow_case);
   1.142 -  }
   1.143 -  jmp(done);
   1.144 -
   1.145 -  bind(try_rebias);
   1.146 -  // At this point we know the epoch has expired, meaning that the
   1.147 -  // current "bias owner", if any, is actually invalid. Under these
   1.148 -  // circumstances _only_, we are allowed to use the current header's
   1.149 -  // value as the comparison value when doing the cas to acquire the
   1.150 -  // bias in the current epoch. In other words, we allow transfer of
   1.151 -  // the bias from one thread to another directly in this situation.
   1.152 -  //
   1.153 -  // FIXME: due to a lack of registers we currently blow away the age
   1.154 -  // bits in this situation. Should attempt to preserve them.
   1.155 -  if (need_tmp_reg) {
   1.156 -    push(tmp_reg);
   1.157 -  }
   1.158 -  get_thread(tmp_reg);
   1.159 -  movl(swap_reg, klass_addr);
   1.160 -  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
   1.161 -  movl(swap_reg, saved_mark_addr);
   1.162 -  if (os::is_MP()) {
   1.163 -    lock();
   1.164 -  }
   1.165 -  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
   1.166 -  if (need_tmp_reg) {
   1.167 -    pop(tmp_reg);
   1.168 -  }
   1.169 -  // If the biasing toward our thread failed, then another thread
   1.170 -  // succeeded in biasing it toward itself and we need to revoke that
   1.171 -  // bias. The revocation will occur in the runtime in the slow case.
   1.172 -  if (counters != NULL) {
   1.173 -    cond_inc32(Assembler::zero,
   1.174 -               ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
   1.175 -  }
   1.176 -  if (slow_case != NULL) {
   1.177 -    jcc(Assembler::notZero, *slow_case);
   1.178 -  }
   1.179 -  jmp(done);
   1.180 -
   1.181 -  bind(try_revoke_bias);
   1.182 -  // The prototype mark in the klass doesn't have the bias bit set any
   1.183 -  // more, indicating that objects of this data type are not supposed
   1.184 -  // to be biased any more. We are going to try to reset the mark of
   1.185 -  // this object to the prototype value and fall through to the
   1.186 -  // CAS-based locking scheme. Note that if our CAS fails, it means
   1.187 -  // that another thread raced us for the privilege of revoking the
   1.188 -  // bias of this particular object, so it's okay to continue in the
   1.189 -  // normal locking code.
   1.190 -  //
   1.191 -  // FIXME: due to a lack of registers we currently blow away the age
   1.192 -  // bits in this situation. Should attempt to preserve them.
   1.193 -  movl(swap_reg, saved_mark_addr);
   1.194 -  if (need_tmp_reg) {
   1.195 -    push(tmp_reg);
   1.196 -  }
   1.197 -  movl(tmp_reg, klass_addr);
   1.198 -  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
   1.199 -  if (os::is_MP()) {
   1.200 -    lock();
   1.201 -  }
   1.202 -  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
   1.203 -  if (need_tmp_reg) {
   1.204 -    pop(tmp_reg);
   1.205 -  }
   1.206 -  // Fall through to the normal CAS-based lock, because no matter what
   1.207 -  // the result of the above CAS, some thread must have succeeded in
   1.208 -  // removing the bias bit from the object's header.
   1.209 -  if (counters != NULL) {
   1.210 -    cond_inc32(Assembler::zero,
   1.211 -               ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
   1.212 -  }
   1.213 -
   1.214 -  bind(cas_label);
   1.215 -
   1.216 -  return null_check_offset;
   1.217 -}
   1.218  void MacroAssembler::call_VM_leaf_base(address entry_point,
   1.219                                         int number_of_arguments) {
   1.220    call(RuntimeAddress(entry_point));
   1.221 @@ -726,165 +515,6 @@
   1.222    return array;
   1.223  }
   1.224  
   1.225 -int MacroAssembler::biased_locking_enter(Register lock_reg,
   1.226 -                                         Register obj_reg,
   1.227 -                                         Register swap_reg,
   1.228 -                                         Register tmp_reg,
   1.229 -                                         bool swap_reg_contains_mark,
   1.230 -                                         Label& done,
   1.231 -                                         Label* slow_case,
   1.232 -                                         BiasedLockingCounters* counters) {
   1.233 -  assert(UseBiasedLocking, "why call this otherwise?");
   1.234 -  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
   1.235 -  assert(tmp_reg != noreg, "tmp_reg must be supplied");
   1.236 -  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
   1.237 -  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
   1.238 -  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
   1.239 -  Address saved_mark_addr(lock_reg, 0);
   1.240 -
   1.241 -  if (PrintBiasedLockingStatistics && counters == NULL)
   1.242 -    counters = BiasedLocking::counters();
   1.243 -
   1.244 -  // Biased locking
   1.245 -  // See whether the lock is currently biased toward our thread and
   1.246 -  // whether the epoch is still valid
   1.247 -  // Note that the runtime guarantees sufficient alignment of JavaThread
   1.248 -  // pointers to allow age to be placed into low bits
   1.249 -  // First check to see whether biasing is even enabled for this object
   1.250 -  Label cas_label;
   1.251 -  int null_check_offset = -1;
   1.252 -  if (!swap_reg_contains_mark) {
   1.253 -    null_check_offset = offset();
   1.254 -    movq(swap_reg, mark_addr);
   1.255 -  }
   1.256 -  movq(tmp_reg, swap_reg);
   1.257 -  andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
   1.258 -  cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
   1.259 -  jcc(Assembler::notEqual, cas_label);
   1.260 -  // The bias pattern is present in the object's header. Need to check
   1.261 -  // whether the bias owner and the epoch are both still current.
   1.262 -  load_prototype_header(tmp_reg, obj_reg);
   1.263 -  orq(tmp_reg, r15_thread);
   1.264 -  xorq(tmp_reg, swap_reg);
   1.265 -  andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
   1.266 -  if (counters != NULL) {
   1.267 -    cond_inc32(Assembler::zero,
   1.268 -               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
   1.269 -  }
   1.270 -  jcc(Assembler::equal, done);
   1.271 -
   1.272 -  Label try_revoke_bias;
   1.273 -  Label try_rebias;
   1.274 -
   1.275 -  // At this point we know that the header has the bias pattern and
   1.276 -  // that we are not the bias owner in the current epoch. We need to
   1.277 -  // figure out more details about the state of the header in order to
   1.278 -  // know what operations can be legally performed on the object's
   1.279 -  // header.
   1.280 -
   1.281 -  // If the low three bits in the xor result aren't clear, that means
   1.282 -  // the prototype header is no longer biased and we have to revoke
   1.283 -  // the bias on this object.
   1.284 -  testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
   1.285 -  jcc(Assembler::notZero, try_revoke_bias);
   1.286 -
   1.287 -  // Biasing is still enabled for this data type. See whether the
   1.288 -  // epoch of the current bias is still valid, meaning that the epoch
   1.289 -  // bits of the mark word are equal to the epoch bits of the
   1.290 -  // prototype header. (Note that the prototype header's epoch bits
   1.291 -  // only change at a safepoint.) If not, attempt to rebias the object
   1.292 -  // toward the current thread. Note that we must be absolutely sure
   1.293 -  // that the current epoch is invalid in order to do this because
   1.294 -  // otherwise the manipulations it performs on the mark word are
   1.295 -  // illegal.
   1.296 -  testq(tmp_reg, markOopDesc::epoch_mask_in_place);
   1.297 -  jcc(Assembler::notZero, try_rebias);
   1.298 -
   1.299 -  // The epoch of the current bias is still valid but we know nothing
   1.300 -  // about the owner; it might be set or it might be clear. Try to
   1.301 -  // acquire the bias of the object using an atomic operation. If this
   1.302 -  // fails we will go in to the runtime to revoke the object's bias.
   1.303 -  // Note that we first construct the presumed unbiased header so we
   1.304 -  // don't accidentally blow away another thread's valid bias.
   1.305 -  andq(swap_reg,
   1.306 -       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
   1.307 -  movq(tmp_reg, swap_reg);
   1.308 -  orq(tmp_reg, r15_thread);
   1.309 -  if (os::is_MP()) {
   1.310 -    lock();
   1.311 -  }
   1.312 -  cmpxchgq(tmp_reg, Address(obj_reg, 0));
   1.313 -  // If the biasing toward our thread failed, this means that
   1.314 -  // another thread succeeded in biasing it toward itself and we
   1.315 -  // need to revoke that bias. The revocation will occur in the
   1.316 -  // interpreter runtime in the slow case.
   1.317 -  if (counters != NULL) {
   1.318 -    cond_inc32(Assembler::zero,
   1.319 -               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
   1.320 -  }
   1.321 -  if (slow_case != NULL) {
   1.322 -    jcc(Assembler::notZero, *slow_case);
   1.323 -  }
   1.324 -  jmp(done);
   1.325 -
   1.326 -  bind(try_rebias);
   1.327 -  // At this point we know the epoch has expired, meaning that the
   1.328 -  // current "bias owner", if any, is actually invalid. Under these
   1.329 -  // circumstances _only_, we are allowed to use the current header's
   1.330 -  // value as the comparison value when doing the cas to acquire the
   1.331 -  // bias in the current epoch. In other words, we allow transfer of
   1.332 -  // the bias from one thread to another directly in this situation.
   1.333 -  //
   1.334 -  // FIXME: due to a lack of registers we currently blow away the age
   1.335 -  // bits in this situation. Should attempt to preserve them.
   1.336 -  load_prototype_header(tmp_reg, obj_reg);
   1.337 -  orq(tmp_reg, r15_thread);
   1.338 -  if (os::is_MP()) {
   1.339 -    lock();
   1.340 -  }
   1.341 -  cmpxchgq(tmp_reg, Address(obj_reg, 0));
   1.342 -  // If the biasing toward our thread failed, then another thread
   1.343 -  // succeeded in biasing it toward itself and we need to revoke that
   1.344 -  // bias. The revocation will occur in the runtime in the slow case.
   1.345 -  if (counters != NULL) {
   1.346 -    cond_inc32(Assembler::zero,
   1.347 -               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
   1.348 -  }
   1.349 -  if (slow_case != NULL) {
   1.350 -    jcc(Assembler::notZero, *slow_case);
   1.351 -  }
   1.352 -  jmp(done);
   1.353 -
   1.354 -  bind(try_revoke_bias);
   1.355 -  // The prototype mark in the klass doesn't have the bias bit set any
   1.356 -  // more, indicating that objects of this data type are not supposed
   1.357 -  // to be biased any more. We are going to try to reset the mark of
   1.358 -  // this object to the prototype value and fall through to the
   1.359 -  // CAS-based locking scheme. Note that if our CAS fails, it means
   1.360 -  // that another thread raced us for the privilege of revoking the
   1.361 -  // bias of this particular object, so it's okay to continue in the
   1.362 -  // normal locking code.
   1.363 -  //
   1.364 -  // FIXME: due to a lack of registers we currently blow away the age
   1.365 -  // bits in this situation. Should attempt to preserve them.
   1.366 -  load_prototype_header(tmp_reg, obj_reg);
   1.367 -  if (os::is_MP()) {
   1.368 -    lock();
   1.369 -  }
   1.370 -  cmpxchgq(tmp_reg, Address(obj_reg, 0));
   1.371 -  // Fall through to the normal CAS-based lock, because no matter what
   1.372 -  // the result of the above CAS, some thread must have succeeded in
   1.373 -  // removing the bias bit from the object's header.
   1.374 -  if (counters != NULL) {
   1.375 -    cond_inc32(Assembler::zero,
   1.376 -               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
   1.377 -  }
   1.378 -
   1.379 -  bind(cas_label);
   1.380 -
   1.381 -  return null_check_offset;
   1.382 -}
   1.383 -
   1.384  void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
   1.385    Label L, E;
   1.386  
   1.387 @@ -1360,9 +990,16 @@
   1.388  
   1.389  void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
   1.390    pushf();
   1.391 -  if (os::is_MP())
   1.392 -    lock();
   1.393 -  incrementl(counter_addr);
   1.394 +  if (reachable(counter_addr)) {
   1.395 +    if (os::is_MP())
   1.396 +      lock();
   1.397 +    incrementl(as_Address(counter_addr));
   1.398 +  } else {
   1.399 +    lea(rscratch1, counter_addr);
   1.400 +    if (os::is_MP())
   1.401 +      lock();
   1.402 +    incrementl(Address(rscratch1, 0));
   1.403 +  }
   1.404    popf();
   1.405  }
   1.406  
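
The reworked atomic_incl() above first asks whether the counter is directly addressable; on x86_64 a memory operand can only carry a signed 32-bit RIP-relative displacement, so a counter outside that window is first materialized in rscratch1 with lea and then incremented through the register. Below is a minimal standalone sketch of that reachability decision, under the assumption that it reduces to a displacement-fits-in-32-bits test (names are illustrative, not HotSpot's):

#include <cstdint>

// Can 'target' be encoded as a signed 32-bit displacement relative to the end
// of the instruction?  If not, the address must be loaded into a scratch
// register before the locked increment.
static bool rip_reachable(uint64_t insn_end, uint64_t target) {
  int64_t disp = (int64_t)target - (int64_t)insn_end;
  return disp >= INT32_MIN && disp <= INT32_MAX;
}
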
   1.407 @@ -1393,6 +1030,234 @@
   1.408    }
   1.409  }
   1.410  
   1.411 +int MacroAssembler::biased_locking_enter(Register lock_reg,
   1.412 +                                         Register obj_reg,
   1.413 +                                         Register swap_reg,
   1.414 +                                         Register tmp_reg,
   1.415 +                                         bool swap_reg_contains_mark,
   1.416 +                                         Label& done,
   1.417 +                                         Label* slow_case,
   1.418 +                                         BiasedLockingCounters* counters) {
   1.419 +  assert(UseBiasedLocking, "why call this otherwise?");
   1.420 +  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
   1.421 +  LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); )
   1.422 +  bool need_tmp_reg = false;
   1.423 +  if (tmp_reg == noreg) {
   1.424 +    need_tmp_reg = true;
   1.425 +    tmp_reg = lock_reg;
   1.426 +    assert_different_registers(lock_reg, obj_reg, swap_reg);
   1.427 +  } else {
   1.428 +    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
   1.429 +  }
   1.430 +  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
   1.431 +  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
   1.432 +  Address saved_mark_addr(lock_reg, 0);
   1.433 +
   1.434 +  if (PrintBiasedLockingStatistics && counters == NULL) {
   1.435 +    counters = BiasedLocking::counters();
   1.436 +  }
   1.437 +  // Biased locking
   1.438 +  // See whether the lock is currently biased toward our thread and
   1.439 +  // whether the epoch is still valid
   1.440 +  // Note that the runtime guarantees sufficient alignment of JavaThread
   1.441 +  // pointers to allow age to be placed into low bits
   1.442 +  // First check to see whether biasing is even enabled for this object
   1.443 +  Label cas_label;
   1.444 +  int null_check_offset = -1;
   1.445 +  if (!swap_reg_contains_mark) {
   1.446 +    null_check_offset = offset();
   1.447 +    movptr(swap_reg, mark_addr);
   1.448 +  }
   1.449 +  if (need_tmp_reg) {
   1.450 +    push(tmp_reg);
   1.451 +  }
   1.452 +  movptr(tmp_reg, swap_reg);
   1.453 +  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
   1.454 +  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
   1.455 +  if (need_tmp_reg) {
   1.456 +    pop(tmp_reg);
   1.457 +  }
   1.458 +  jcc(Assembler::notEqual, cas_label);
   1.459 +  // The bias pattern is present in the object's header. Need to check
   1.460 +  // whether the bias owner and the epoch are both still current.
   1.461 +#ifndef _LP64
   1.462 +  // Note that because there is no current thread register on x86_32 we
   1.463 +  // need to store off the mark word we read out of the object to
   1.464 +  // avoid reloading it and needing to recheck invariants below. This
   1.465 +  // store is unfortunate but it makes the overall code shorter and
   1.466 +  // simpler.
   1.467 +  movptr(saved_mark_addr, swap_reg);
   1.468 +#endif
   1.469 +  if (need_tmp_reg) {
   1.470 +    push(tmp_reg);
   1.471 +  }
   1.472 +  if (swap_reg_contains_mark) {
   1.473 +    null_check_offset = offset();
   1.474 +  }
   1.475 +  load_prototype_header(tmp_reg, obj_reg);
   1.476 +#ifdef _LP64
   1.477 +  orptr(tmp_reg, r15_thread);
   1.478 +  xorptr(tmp_reg, swap_reg);
   1.479 +  Register header_reg = tmp_reg;
   1.480 +#else
   1.481 +  xorptr(tmp_reg, swap_reg);
   1.482 +  get_thread(swap_reg);
   1.483 +  xorptr(swap_reg, tmp_reg);
   1.484 +  Register header_reg = swap_reg;
   1.485 +#endif
   1.486 +  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
   1.487 +  if (need_tmp_reg) {
   1.488 +    pop(tmp_reg);
   1.489 +  }
   1.490 +  if (counters != NULL) {
   1.491 +    cond_inc32(Assembler::zero,
   1.492 +               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
   1.493 +  }
   1.494 +  jcc(Assembler::equal, done);
   1.495 +
   1.496 +  Label try_revoke_bias;
   1.497 +  Label try_rebias;
   1.498 +
   1.499 +  // At this point we know that the header has the bias pattern and
   1.500 +  // that we are not the bias owner in the current epoch. We need to
   1.501 +  // figure out more details about the state of the header in order to
   1.502 +  // know what operations can be legally performed on the object's
   1.503 +  // header.
   1.504 +
   1.505 +  // If the low three bits in the xor result aren't clear, that means
   1.506 +  // the prototype header is no longer biased and we have to revoke
   1.507 +  // the bias on this object.
   1.508 +  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
   1.509 +  jccb(Assembler::notZero, try_revoke_bias);
   1.510 +
   1.511 +  // Biasing is still enabled for this data type. See whether the
   1.512 +  // epoch of the current bias is still valid, meaning that the epoch
   1.513 +  // bits of the mark word are equal to the epoch bits of the
   1.514 +  // prototype header. (Note that the prototype header's epoch bits
   1.515 +  // only change at a safepoint.) If not, attempt to rebias the object
   1.516 +  // toward the current thread. Note that we must be absolutely sure
   1.517 +  // that the current epoch is invalid in order to do this because
   1.518 +  // otherwise the manipulations it performs on the mark word are
   1.519 +  // illegal.
   1.520 +  testptr(header_reg, markOopDesc::epoch_mask_in_place);
   1.521 +  jccb(Assembler::notZero, try_rebias);
   1.522 +
   1.523 +  // The epoch of the current bias is still valid but we know nothing
   1.524 +  // about the owner; it might be set or it might be clear. Try to
   1.525 +  // acquire the bias of the object using an atomic operation. If this
   1.526 +  // fails we will go in to the runtime to revoke the object's bias.
   1.527 +  // Note that we first construct the presumed unbiased header so we
   1.528 +  // don't accidentally blow away another thread's valid bias.
   1.529 +  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
   1.530 +  andptr(swap_reg,
   1.531 +         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
   1.532 +  if (need_tmp_reg) {
   1.533 +    push(tmp_reg);
   1.534 +  }
   1.535 +#ifdef _LP64
   1.536 +  movptr(tmp_reg, swap_reg);
   1.537 +  orptr(tmp_reg, r15_thread);
   1.538 +#else
   1.539 +  get_thread(tmp_reg);
   1.540 +  orptr(tmp_reg, swap_reg);
   1.541 +#endif
   1.542 +  if (os::is_MP()) {
   1.543 +    lock();
   1.544 +  }
   1.545 +  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
   1.546 +  if (need_tmp_reg) {
   1.547 +    pop(tmp_reg);
   1.548 +  }
   1.549 +  // If the biasing toward our thread failed, this means that
   1.550 +  // another thread succeeded in biasing it toward itself and we
   1.551 +  // need to revoke that bias. The revocation will occur in the
   1.552 +  // interpreter runtime in the slow case.
   1.553 +  if (counters != NULL) {
   1.554 +    cond_inc32(Assembler::zero,
   1.555 +               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
   1.556 +  }
   1.557 +  if (slow_case != NULL) {
   1.558 +    jcc(Assembler::notZero, *slow_case);
   1.559 +  }
   1.560 +  jmp(done);
   1.561 +
   1.562 +  bind(try_rebias);
   1.563 +  // At this point we know the epoch has expired, meaning that the
   1.564 +  // current "bias owner", if any, is actually invalid. Under these
   1.565 +  // circumstances _only_, we are allowed to use the current header's
   1.566 +  // value as the comparison value when doing the cas to acquire the
   1.567 +  // bias in the current epoch. In other words, we allow transfer of
   1.568 +  // the bias from one thread to another directly in this situation.
   1.569 +  //
   1.570 +  // FIXME: due to a lack of registers we currently blow away the age
   1.571 +  // bits in this situation. Should attempt to preserve them.
   1.572 +  if (need_tmp_reg) {
   1.573 +    push(tmp_reg);
   1.574 +  }
   1.575 +  load_prototype_header(tmp_reg, obj_reg);
   1.576 +#ifdef _LP64
   1.577 +  orptr(tmp_reg, r15_thread);
   1.578 +#else
   1.579 +  get_thread(swap_reg);
   1.580 +  orptr(tmp_reg, swap_reg);
   1.581 +  movptr(swap_reg, saved_mark_addr);
   1.582 +#endif
   1.583 +  if (os::is_MP()) {
   1.584 +    lock();
   1.585 +  }
   1.586 +  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
   1.587 +  if (need_tmp_reg) {
   1.588 +    pop(tmp_reg);
   1.589 +  }
   1.590 +  // If the biasing toward our thread failed, then another thread
   1.591 +  // succeeded in biasing it toward itself and we need to revoke that
   1.592 +  // bias. The revocation will occur in the runtime in the slow case.
   1.593 +  if (counters != NULL) {
   1.594 +    cond_inc32(Assembler::zero,
   1.595 +               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
   1.596 +  }
   1.597 +  if (slow_case != NULL) {
   1.598 +    jcc(Assembler::notZero, *slow_case);
   1.599 +  }
   1.600 +  jmp(done);
   1.601 +
   1.602 +  bind(try_revoke_bias);
   1.603 +  // The prototype mark in the klass doesn't have the bias bit set any
   1.604 +  // more, indicating that objects of this data type are not supposed
   1.605 +  // to be biased any more. We are going to try to reset the mark of
   1.606 +  // this object to the prototype value and fall through to the
   1.607 +  // CAS-based locking scheme. Note that if our CAS fails, it means
   1.608 +  // that another thread raced us for the privilege of revoking the
   1.609 +  // bias of this particular object, so it's okay to continue in the
   1.610 +  // normal locking code.
   1.611 +  //
   1.612 +  // FIXME: due to a lack of registers we currently blow away the age
   1.613 +  // bits in this situation. Should attempt to preserve them.
   1.614 +  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
   1.615 +  if (need_tmp_reg) {
   1.616 +    push(tmp_reg);
   1.617 +  }
   1.618 +  load_prototype_header(tmp_reg, obj_reg);
   1.619 +  if (os::is_MP()) {
   1.620 +    lock();
   1.621 +  }
   1.622 +  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
   1.623 +  if (need_tmp_reg) {
   1.624 +    pop(tmp_reg);
   1.625 +  }
   1.626 +  // Fall through to the normal CAS-based lock, because no matter what
   1.627 +  // the result of the above CAS, some thread must have succeeded in
   1.628 +  // removing the bias bit from the object's header.
   1.629 +  if (counters != NULL) {
   1.630 +    cond_inc32(Assembler::zero,
   1.631 +               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
   1.632 +  }
   1.633 +
   1.634 +  bind(cas_label);
   1.635 +
   1.636 +  return null_check_offset;
   1.637 +}
   1.638 +
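
The unified biased_locking_enter() begins with the triage the comments describe: mask the low three bits of the mark word and compare against the *101b biased pattern before any thread or epoch checks. A standalone sketch of that first test, using the bit values the surrounding code assumes (illustrative constants, not the markOopDesc declarations themselves):

#include <cstdint>

// Low three bits of the mark word: 2 lock bits + 1 biased-lock bit.
// The pattern 101b means biased locking is in use for this object.
constexpr uintptr_t kBiasedLockMaskInPlace = 0x7;
constexpr uintptr_t kBiasedLockPattern     = 0x5;

static bool has_bias_pattern(uintptr_t mark) {
  return (mark & kBiasedLockMaskInPlace) == kBiasedLockPattern;
}
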
   1.639  void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
   1.640    assert(UseBiasedLocking, "why call this otherwise?");
   1.641  
   1.642 @@ -1408,6 +1273,620 @@
   1.643    jcc(Assembler::equal, done);
   1.644  }
   1.645  
   1.646 +#ifdef COMPILER2
   1.647 +// Fast_Lock and Fast_Unlock used by C2
   1.648 +
   1.649 +// Because the transitions from emitted code to the runtime
   1.650 +// monitorenter/exit helper stubs are so slow it's critical that
   1.651 +// we inline both the stack-locking fast-path and the inflated fast path.
   1.652 +//
   1.653 +// See also: cmpFastLock and cmpFastUnlock.
   1.654 +//
   1.655 +// What follows is a specialized inline transliteration of the code
   1.656 +// in slow_enter() and slow_exit().  If we're concerned about I$ bloat
   1.657 +// another option would be to emit TrySlowEnter and TrySlowExit methods
   1.658 +// at startup-time.  These methods would accept arguments as
   1.659 +// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
   1.660 +// indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
   1.661 +// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
   1.662 +// In practice, however, the # of lock sites is bounded and is usually small.
   1.663 +// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
    1.664 +// if the processor uses simple bimodal branch predictors keyed by EIP,
    1.665 +// since the helper routines would be called from multiple synchronization
   1.666 +// sites.
   1.667 +//
   1.668 +// An even better approach would be write "MonitorEnter()" and "MonitorExit()"
   1.669 +// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
   1.670 +// to those specialized methods.  That'd give us a mostly platform-independent
   1.671 +// implementation that the JITs could optimize and inline at their pleasure.
    1.672 +// Done correctly, the only time we'd need to cross to native code would be
   1.673 +// to park() or unpark() threads.  We'd also need a few more unsafe operators
   1.674 +// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
    1.675 +// (b) emit explicit barriers or fence operations.
   1.676 +//
   1.677 +// TODO:
   1.678 +//
   1.679 +// *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
   1.680 +//    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
   1.681 +//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
   1.682 +//    the lock operators would typically be faster than reifying Self.
   1.683 +//
   1.684 +// *  Ideally I'd define the primitives as:
   1.685 +//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
   1.686 +//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
   1.687 +//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
    1.688 +//    Instead, we're stuck with the rather awkward and brittle register assignments below.
   1.689 +//    Furthermore the register assignments are overconstrained, possibly resulting in
   1.690 +//    sub-optimal code near the synchronization site.
   1.691 +//
   1.692 +// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
   1.693 +//    Alternately, use a better sp-proximity test.
   1.694 +//
   1.695 +// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
   1.696 +//    Either one is sufficient to uniquely identify a thread.
   1.697 +//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
   1.698 +//
   1.699 +// *  Intrinsify notify() and notifyAll() for the common cases where the
    1.700 +//    object is locked by the calling thread but the waitlist is empty, to
   1.701 +//    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
   1.702 +//
   1.703 +// *  use jccb and jmpb instead of jcc and jmp to improve code density.
   1.704 +//    But beware of excessive branch density on AMD Opterons.
   1.705 +//
   1.706 +// *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
   1.707 +//    or failure of the fast-path.  If the fast-path fails then we pass
   1.708 +//    control to the slow-path, typically in C.  In Fast_Lock and
   1.709 +//    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
   1.710 +//    will emit a conditional branch immediately after the node.
   1.711 +//    So we have branches to branches and lots of ICC.ZF games.
   1.712 +//    Instead, it might be better to have C2 pass a "FailureLabel"
   1.713 +//    into Fast_Lock and Fast_Unlock.  In the case of success, control
   1.714 +//    will drop through the node.  ICC.ZF is undefined at exit.
   1.715 +//    In the case of failure, the node will branch directly to the
   1.716 +//    FailureLabel
   1.717 +
   1.718 +
   1.719 +// obj: object to lock
   1.720 +// box: on-stack box address (displaced header location) - KILLED
   1.721 +// rax,: tmp -- KILLED
   1.722 +// scr: tmp -- KILLED
   1.723 +void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, BiasedLockingCounters* counters) {
    1.724 +  // Ensure the register assignments are disjoint
   1.725 +  guarantee (objReg != boxReg, "");
   1.726 +  guarantee (objReg != tmpReg, "");
   1.727 +  guarantee (objReg != scrReg, "");
   1.728 +  guarantee (boxReg != tmpReg, "");
   1.729 +  guarantee (boxReg != scrReg, "");
   1.730 +  guarantee (tmpReg == rax, "");
   1.731 +
   1.732 +  if (counters != NULL) {
   1.733 +    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()));
   1.734 +  }
   1.735 +  if (EmitSync & 1) {
   1.736 +      // set box->dhw = unused_mark (3)
   1.737 +      // Force all sync thru slow-path: slow_enter() and slow_exit()
   1.738 +      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
   1.739 +      cmpptr (rsp, (int32_t)NULL_WORD);
   1.740 +  } else
   1.741 +  if (EmitSync & 2) {
   1.742 +      Label DONE_LABEL ;
   1.743 +      if (UseBiasedLocking) {
   1.744 +         // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
   1.745 +         biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
   1.746 +      }
   1.747 +
   1.748 +      movptr(tmpReg, Address(objReg, 0));           // fetch markword
   1.749 +      orptr (tmpReg, 0x1);
   1.750 +      movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
   1.751 +      if (os::is_MP()) {
   1.752 +        lock();
   1.753 +      }
   1.754 +      cmpxchgptr(boxReg, Address(objReg, 0));       // Updates tmpReg
   1.755 +      jccb(Assembler::equal, DONE_LABEL);
   1.756 +      // Recursive locking
   1.757 +      subptr(tmpReg, rsp);
   1.758 +      andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
   1.759 +      movptr(Address(boxReg, 0), tmpReg);
   1.760 +      bind(DONE_LABEL);
   1.761 +  } else {
   1.762 +    // Possible cases that we'll encounter in fast_lock
   1.763 +    // ------------------------------------------------
   1.764 +    // * Inflated
   1.765 +    //    -- unlocked
   1.766 +    //    -- Locked
   1.767 +    //       = by self
   1.768 +    //       = by other
   1.769 +    // * biased
   1.770 +    //    -- by Self
   1.771 +    //    -- by other
   1.772 +    // * neutral
   1.773 +    // * stack-locked
   1.774 +    //    -- by self
   1.775 +    //       = sp-proximity test hits
   1.776 +    //       = sp-proximity test generates false-negative
   1.777 +    //    -- by other
   1.778 +    //
   1.779 +
   1.780 +    Label IsInflated, DONE_LABEL;
   1.781 +
   1.782 +    // it's stack-locked, biased or neutral
   1.783 +    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
   1.784 +    // order to reduce the number of conditional branches in the most common cases.
   1.785 +    // Beware -- there's a subtle invariant that fetch of the markword
   1.786 +    // at [FETCH], below, will never observe a biased encoding (*101b).
   1.787 +    // If this invariant is not held we risk exclusion (safety) failure.
   1.788 +    if (UseBiasedLocking && !UseOptoBiasInlining) {
   1.789 +      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
   1.790 +    }
   1.791 +
   1.792 +    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
   1.793 +    testl (tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
   1.794 +    jccb  (Assembler::notZero, IsInflated);
   1.795 +
   1.796 +    // Attempt stack-locking ...
   1.797 +    orptr (tmpReg, 0x1);
   1.798 +    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
   1.799 +    if (os::is_MP()) {
   1.800 +      lock();
   1.801 +    }
   1.802 +    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
   1.803 +    if (counters != NULL) {
   1.804 +      cond_inc32(Assembler::equal,
   1.805 +                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
   1.806 +    }
   1.807 +    jccb(Assembler::equal, DONE_LABEL);
   1.808 +
   1.809 +    // Recursive locking
   1.810 +    subptr(tmpReg, rsp);
   1.811 +    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
   1.812 +    movptr(Address(boxReg, 0), tmpReg);
   1.813 +    if (counters != NULL) {
   1.814 +      cond_inc32(Assembler::equal,
   1.815 +                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
   1.816 +    }
   1.817 +    jmpb(DONE_LABEL);
   1.818 +
   1.819 +    bind(IsInflated);
   1.820 +#ifndef _LP64
   1.821 +    // The object is inflated.
   1.822 +    //
   1.823 +    // TODO-FIXME: eliminate the ugly use of manifest constants:
   1.824 +    //   Use markOopDesc::monitor_value instead of "2".
   1.825 +    //   use markOop::unused_mark() instead of "3".
   1.826 +    // The tmpReg value is an objectMonitor reference ORed with
   1.827 +    // markOopDesc::monitor_value (2).   We can either convert tmpReg to an
   1.828 +    // objectmonitor pointer by masking off the "2" bit or we can just
   1.829 +    // use tmpReg as an objectmonitor pointer but bias the objectmonitor
   1.830 +    // field offsets with "-2" to compensate for and annul the low-order tag bit.
   1.831 +    //
   1.832 +    // I use the latter as it avoids AGI stalls.
   1.833 +    // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
   1.834 +    // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
   1.835 +    //
   1.836 +    #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
   1.837 +
   1.838 +    // boxReg refers to the on-stack BasicLock in the current frame.
   1.839 +    // We'd like to write:
   1.840 +    //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
    1.841 +    // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
   1.842 +    // additional latency as we have another ST in the store buffer that must drain.
   1.843 +
   1.844 +    if (EmitSync & 8192) {
   1.845 +       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
   1.846 +       get_thread (scrReg);
   1.847 +       movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
   1.848 +       movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
   1.849 +       if (os::is_MP()) {
   1.850 +         lock();
   1.851 +       }
   1.852 +       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
   1.853 +    } else
   1.854 +    if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
   1.855 +       movptr(scrReg, boxReg);
   1.856 +       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
   1.857 +
   1.858 +       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
   1.859 +       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
   1.860 +          // prefetchw [eax + Offset(_owner)-2]
   1.861 +          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
   1.862 +       }
   1.863 +
   1.864 +       if ((EmitSync & 64) == 0) {
   1.865 +         // Optimistic form: consider XORL tmpReg,tmpReg
   1.866 +         movptr(tmpReg, NULL_WORD);
   1.867 +       } else {
   1.868 +         // Can suffer RTS->RTO upgrades on shared or cold $ lines
   1.869 +         // Test-And-CAS instead of CAS
   1.870 +         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
   1.871 +         testptr(tmpReg, tmpReg);                   // Locked ?
   1.872 +         jccb  (Assembler::notZero, DONE_LABEL);
   1.873 +       }
   1.874 +
   1.875 +       // Appears unlocked - try to swing _owner from null to non-null.
   1.876 +       // Ideally, I'd manifest "Self" with get_thread and then attempt
   1.877 +       // to CAS the register containing Self into m->Owner.
   1.878 +       // But we don't have enough registers, so instead we can either try to CAS
   1.879 +       // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
   1.880 +       // we later store "Self" into m->Owner.  Transiently storing a stack address
   1.881 +       // (rsp or the address of the box) into  m->owner is harmless.
   1.882 +       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
   1.883 +       if (os::is_MP()) {
   1.884 +         lock();
   1.885 +       }
   1.886 +       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
   1.887 +       movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
   1.888 +       jccb  (Assembler::notZero, DONE_LABEL);
   1.889 +       get_thread (scrReg);                    // beware: clobbers ICCs
   1.890 +       movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg);
   1.891 +       xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
   1.892 +
   1.893 +       // If the CAS fails we can either retry or pass control to the slow-path.
   1.894 +       // We use the latter tactic.
   1.895 +       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
   1.896 +       // If the CAS was successful ...
   1.897 +       //   Self has acquired the lock
   1.898 +       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
   1.899 +       // Intentional fall-through into DONE_LABEL ...
   1.900 +    } else {
   1.901 +       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
   1.902 +       movptr(boxReg, tmpReg);
   1.903 +
   1.904 +       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
   1.905 +       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
   1.906 +          // prefetchw [eax + Offset(_owner)-2]
   1.907 +          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
   1.908 +       }
   1.909 +
   1.910 +       if ((EmitSync & 64) == 0) {
   1.911 +         // Optimistic form
   1.912 +         xorptr  (tmpReg, tmpReg);
   1.913 +       } else {
   1.914 +         // Can suffer RTS->RTO upgrades on shared or cold $ lines
   1.915 +         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
   1.916 +         testptr(tmpReg, tmpReg);                   // Locked ?
   1.917 +         jccb  (Assembler::notZero, DONE_LABEL);
   1.918 +       }
   1.919 +
   1.920 +       // Appears unlocked - try to swing _owner from null to non-null.
   1.921 +       // Use either "Self" (in scr) or rsp as thread identity in _owner.
   1.922 +       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
   1.923 +       get_thread (scrReg);
   1.924 +       if (os::is_MP()) {
   1.925 +         lock();
   1.926 +       }
   1.927 +       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
   1.928 +
   1.929 +       // If the CAS fails we can either retry or pass control to the slow-path.
   1.930 +       // We use the latter tactic.
   1.931 +       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
   1.932 +       // If the CAS was successful ...
   1.933 +       //   Self has acquired the lock
   1.934 +       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
   1.935 +       // Intentional fall-through into DONE_LABEL ...
   1.936 +    }
   1.937 +#else // _LP64
   1.938 +    // It's inflated
   1.939 +
   1.940 +    // TODO: someday avoid the ST-before-CAS penalty by
   1.941 +    // relocating (deferring) the following ST.
   1.942 +    // We should also think about trying a CAS without having
   1.943 +    // fetched _owner.  If the CAS is successful we may
   1.944 +    // avoid an RTO->RTS upgrade on the $line.
   1.945 +
   1.946 +    // Without cast to int32_t a movptr will destroy r10 which is typically obj
   1.947 +    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
   1.948 +
   1.949 +    mov    (boxReg, tmpReg);
   1.950 +    movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
   1.951 +    testptr(tmpReg, tmpReg);
   1.952 +    jccb   (Assembler::notZero, DONE_LABEL);
   1.953 +
   1.954 +    // It's inflated and appears unlocked
   1.955 +    if (os::is_MP()) {
   1.956 +      lock();
   1.957 +    }
   1.958 +    cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
   1.959 +    // Intentional fall-through into DONE_LABEL ...
   1.960 +
   1.961 +#endif
   1.962 +
   1.963 +    // DONE_LABEL is a hot target - we'd really like to place it at the
   1.964 +    // start of cache line by padding with NOPs.
   1.965 +    // See the AMD and Intel software optimization manuals for the
   1.966 +    // most efficient "long" NOP encodings.
   1.967 +    // Unfortunately none of our alignment mechanisms suffice.
   1.968 +    bind(DONE_LABEL);
   1.969 +
   1.970 +    // At DONE_LABEL the icc ZFlag is set as follows ...
   1.971 +    // Fast_Unlock uses the same protocol.
   1.972 +    // ZFlag == 1 -> Success
   1.973 +    // ZFlag == 0 -> Failure - force control through the slow-path
   1.974 +  }
   1.975 +}
   1.976 +
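
For the stack-locking fast path in fast_lock() above, the emitted sequence is: set the "unlocked" bit in the fetched mark, store that value into the on-stack box as the displaced header, then cmpxchg the box address into the object header with the anticipated mark as comparand. A hedged standalone sketch of the same shape, with std::atomic standing in for the emitted lock cmpxchg (the recursive case, where the CAS fails but the old mark is rsp-proximate, instead stores 0 into the box):

#include <atomic>
#include <cstdint>

struct BasicLock { uintptr_t displaced_header; };   // the on-stack "box"

static bool try_stack_lock(std::atomic<uintptr_t>& obj_mark, BasicLock* box) {
  uintptr_t expected = obj_mark.load(std::memory_order_relaxed) | 0x1;  // unlocked mark
  box->displaced_header = expected;                 // anticipate a successful CAS
  return obj_mark.compare_exchange_strong(expected, (uintptr_t)box);    // install box address
}
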
   1.977 +// obj: object to unlock
   1.978 +// box: box address (displaced header location), killed.  Must be EAX.
   1.979 +// tmp: killed, cannot be obj nor box.
   1.980 +//
   1.981 +// Some commentary on balanced locking:
   1.982 +//
   1.983 +// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
   1.984 +// Methods that don't have provably balanced locking are forced to run in the
   1.985 +// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
   1.986 +// The interpreter provides two properties:
   1.987 +// I1:  At return-time the interpreter automatically and quietly unlocks any
    1.988 +//      objects acquired by the current activation (frame).  Recall that the
   1.989 +//      interpreter maintains an on-stack list of locks currently held by
   1.990 +//      a frame.
   1.991 +// I2:  If a method attempts to unlock an object that is not held by the
    1.992 +//      frame, the interpreter throws IMSX.
   1.993 +//
    1.994 +// Let's say A(), which has provably balanced locking, acquires O and then calls B().
   1.995 +// B() doesn't have provably balanced locking so it runs in the interpreter.
   1.996 +// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
   1.997 +// is still locked by A().
   1.998 +//
   1.999 +// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
  1.1000 +// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
  1.1001 +// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
  1.1002 +// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
  1.1003 +
  1.1004 +void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  1.1005 +  guarantee (objReg != boxReg, "");
  1.1006 +  guarantee (objReg != tmpReg, "");
  1.1007 +  guarantee (boxReg != tmpReg, "");
  1.1008 +  guarantee (boxReg == rax, "");
  1.1009 +
  1.1010 +  if (EmitSync & 4) {
  1.1011 +    // Disable - inhibit all inlining.  Force control through the slow-path
  1.1012 +    cmpptr (rsp, 0);
  1.1013 +  } else
  1.1014 +  if (EmitSync & 8) {
  1.1015 +    Label DONE_LABEL;
  1.1016 +    if (UseBiasedLocking) {
  1.1017 +       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  1.1018 +    }
  1.1019 +    // Classic stack-locking code ...
  1.1020 +    // Check whether the displaced header is 0
  1.1021 +    //(=> recursive unlock)
  1.1022 +    movptr(tmpReg, Address(boxReg, 0));
  1.1023 +    testptr(tmpReg, tmpReg);
  1.1024 +    jccb(Assembler::zero, DONE_LABEL);
  1.1025 +    // If not recursive lock, reset the header to displaced header
  1.1026 +    if (os::is_MP()) {
  1.1027 +      lock();
  1.1028 +    }
  1.1029 +    cmpxchgptr(tmpReg, Address(objReg, 0));   // Uses RAX which is box
  1.1030 +    bind(DONE_LABEL);
  1.1031 +  } else {
  1.1032 +    Label DONE_LABEL, Stacked, CheckSucc;
  1.1033 +
  1.1034 +    // Critically, the biased locking test must have precedence over
  1.1035 +    // and appear before the (box->dhw == 0) recursive stack-lock test.
  1.1036 +    if (UseBiasedLocking && !UseOptoBiasInlining) {
  1.1037 +       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  1.1038 +    }
  1.1039 +
  1.1040 +    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
  1.1041 +    movptr(tmpReg, Address(objReg, 0));             // Examine the object's markword
  1.1042 +    jccb  (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
  1.1043 +
  1.1044 +    testptr(tmpReg, 0x02);                          // Inflated?
  1.1045 +    jccb  (Assembler::zero, Stacked);
  1.1046 +
  1.1047 +    // It's inflated.
  1.1048 +    // Despite our balanced locking property we still check that m->_owner == Self
  1.1049 +    // as java routines or native JNI code called by this thread might
  1.1050 +    // have released the lock.
  1.1051 +    // Refer to the comments in synchronizer.cpp for how we might encode extra
  1.1052 +    // state in _succ so we can avoid fetching EntryList|cxq.
  1.1053 +    //
  1.1054 +    // I'd like to add more cases in fast_lock() and fast_unlock() --
  1.1055 +    // such as recursive enter and exit -- but we have to be wary of
  1.1056 +    // I$ bloat, T$ effects and BP$ effects.
  1.1057 +    //
  1.1058 +    // If there's no contention try a 1-0 exit.  That is, exit without
  1.1059 +    // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  1.1060 +    // we detect and recover from the race that the 1-0 exit admits.
  1.1061 +    //
  1.1062 +    // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
  1.1063 +    // before it STs null into _owner, releasing the lock.  Updates
  1.1064 +    // to data protected by the critical section must be visible before
  1.1065 +    // we drop the lock (and thus before any other thread could acquire
  1.1066 +    // the lock and observe the fields protected by the lock).
  1.1067 +    // IA32's memory-model is SPO, so STs are ordered with respect to
  1.1068 +    // each other and there's no need for an explicit barrier (fence).
  1.1069 +    // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  1.1070 +#ifndef _LP64
  1.1071 +    get_thread (boxReg);
  1.1072 +    if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
  1.1073 +      // prefetchw [ebx + Offset(_owner)-2]
  1.1074 +      prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
  1.1075 +    }
  1.1076 +
  1.1077 +    // Note that we could employ various encoding schemes to reduce
  1.1078 +    // the number of loads below (currently 4) to just 2 or 3.
  1.1079 +    // Refer to the comments in synchronizer.cpp.
  1.1080 +    // In practice the chain of fetches doesn't seem to impact performance, however.
  1.1081 +    if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
  1.1082 +       // Attempt to reduce branch density - AMD's branch predictor.
  1.1083 +       xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
  1.1084 +       orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
  1.1085 +       orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
  1.1086 +       orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
  1.1087 +       jccb  (Assembler::notZero, DONE_LABEL);
  1.1088 +       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
  1.1089 +       jmpb  (DONE_LABEL);
  1.1090 +    } else {
  1.1091 +       xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
  1.1092 +       orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
  1.1093 +       jccb  (Assembler::notZero, DONE_LABEL);
  1.1094 +       movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
  1.1095 +       orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
  1.1096 +       jccb  (Assembler::notZero, CheckSucc);
  1.1097 +       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
  1.1098 +       jmpb  (DONE_LABEL);
  1.1099 +    }
  1.1100 +
  1.1101 +    // The following code fragment (EmitSync & 65536) improves the performance of
  1.1102 +    // contended applications and contended synchronization microbenchmarks.
  1.1103 +    // Unfortunately the emission of the code - even though not executed - causes regressions
  1.1104 +    // in scimark and jetstream, evidently because of $ effects.  Replacing the code
  1.1105 +    // with an equal number of never-executed NOPs results in the same regression.
  1.1106 +    // We leave it off by default.
  1.1107 +
  1.1108 +    if ((EmitSync & 65536) != 0) {
  1.1109 +       Label LSuccess, LGoSlowPath ;
  1.1110 +
  1.1111 +       bind  (CheckSucc);
  1.1112 +
  1.1113 +       // Optional pre-test ... it's safe to elide this
  1.1114 +       if ((EmitSync & 16) == 0) {
  1.1115 +          cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
  1.1116 +          jccb  (Assembler::zero, LGoSlowPath);
  1.1117 +       }
  1.1118 +
  1.1119 +       // We have a classic Dekker-style idiom:
  1.1120 +       //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
  1.1121 +       // There are a number of ways to implement the barrier:
  1.1122 +       // (1) lock:andl &m->_owner, 0
  1.1123 +       //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
  1.1124 +       //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
  1.1125 +       //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
  1.1126 +       // (2) If supported, an explicit MFENCE is appealing.
  1.1127 +       //     In older IA32 processors MFENCE is slower than lock:add or xchg
  1.1128 +       //     particularly if the write-buffer is full, as might be the case
  1.1129 +       //     if stores closely precede the fence or fence-equivalent instruction.
  1.1130 +       //     In more modern implementations MFENCE appears faster, however.
  1.1131 +       // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
  1.1132 +       //     The $lines underlying the top-of-stack should be in M-state.
  1.1133 +       //     The locked add instruction is serializing, of course.
  1.1134 +       // (4) Use xchg, which is serializing
  1.1135 +       //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
  1.1136 +       // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
  1.1137 +       //     The integer condition codes will tell us if succ was 0.
  1.1138 +       //     Since _succ and _owner should reside in the same $line and
  1.1139 +       //     we just stored into _owner, it's likely that the $line
  1.1140 +       //     remains in M-state for the lock:orl.
  1.1141 +       //
  1.1142 +       // We currently use (3), although it's likely that switching to (2)
  1.1143 +       // is correct for the future.
  1.1144 +
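       // Roughly what the sequence emitted below does, using option (3)
       // (sketch only; MFENCE may be substituted when FenceInstruction selects it):
       //   m->_owner = NULL;                      // ST: drop the lock
       //   lock; addl [esp], 0;                   // fence via locked RMW on top-of-stack
       //   if (m->_succ != NULL) goto LSuccess;   // LD: a successor exists and will re-acquire
       //   // otherwise try to re-acquire via CAS; if we get it back, go slow to wake a successor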
  1.1145 +       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
  1.1146 +       if (os::is_MP()) {
  1.1147 +          if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
  1.1148 +            mfence();
  1.1149 +          } else {
  1.1150 +            lock (); addptr(Address(rsp, 0), 0);
  1.1151 +          }
  1.1152 +       }
  1.1153 +       // Ratify _succ remains non-null
  1.1154 +       cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0);
  1.1155 +       jccb  (Assembler::notZero, LSuccess);
  1.1156 +
  1.1157 +       xorptr(boxReg, boxReg);                  // box is really EAX
  1.1158 +       if (os::is_MP()) { lock(); }
  1.1159 +       cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
  1.1160 +       jccb  (Assembler::notEqual, LSuccess);
  1.1161 +       // Since we're low on registers we installed rsp as a placeholder in _owner.
  1.1162 +       // Now install Self over rsp.  This is safe as we're transitioning from
  1.1163 +       // non-null to non-null.
  1.1164 +       get_thread (boxReg);
  1.1165 +       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
  1.1166 +       // Intentional fall-through into LGoSlowPath ...
  1.1167 +
  1.1168 +       bind  (LGoSlowPath);
  1.1169 +       orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  1.1170 +       jmpb  (DONE_LABEL);
  1.1171 +
  1.1172 +       bind  (LSuccess);
  1.1173 +       xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
  1.1174 +       jmpb  (DONE_LABEL);
  1.1175 +    }
  1.1176 +
  1.1177 +    bind (Stacked);
  1.1178 +    // It's not inflated and it's not recursively stack-locked and it's not biased.
  1.1179 +    // It must be stack-locked.
  1.1180 +    // Try to reset the header to displaced header.
  1.1181 +    // The "box" value on the stack is stable, so we can reload
  1.1182 +    // and be assured we observe the same value as above.
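    // In effect: if (obj->mark == box) obj->mark = displaced_header; -- a CAS that
    // restores the saved header only if we still own the stack-lock (pseudocode only).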
  1.1183 +    movptr(tmpReg, Address(boxReg, 0));
  1.1184 +    if (os::is_MP()) {
  1.1185 +      lock();
  1.1186 +    }
  1.1187 +    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
  1.1188 +    // Intentional fall-through into DONE_LABEL
  1.1189 +
  1.1190 +    // DONE_LABEL is a hot target - we'd really like to place it at the
  1.1191 +    // start of cache line by padding with NOPs.
  1.1192 +    // See the AMD and Intel software optimization manuals for the
  1.1193 +    // most efficient "long" NOP encodings.
  1.1194 +    // Unfortunately none of our alignment mechanisms suffice.
  1.1195 +    if ((EmitSync & 65536) == 0) {
  1.1196 +       bind (CheckSucc);
  1.1197 +    }
  1.1198 +#else // _LP64
  1.1199 +    // It's inflated
  1.1200 +    movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
  1.1201 +    xorptr(boxReg, r15_thread);
  1.1202 +    orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
  1.1203 +    jccb  (Assembler::notZero, DONE_LABEL);
  1.1204 +    movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
  1.1205 +    orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
  1.1206 +    jccb  (Assembler::notZero, CheckSucc);
  1.1207 +    movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
  1.1208 +    jmpb  (DONE_LABEL);
  1.1209 +
  1.1210 +    if ((EmitSync & 65536) == 0) {
  1.1211 +      Label LSuccess, LGoSlowPath ;
  1.1212 +      bind  (CheckSucc);
  1.1213 +      cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
  1.1214 +      jccb  (Assembler::zero, LGoSlowPath);
  1.1215 +
  1.1216 +      // I'd much rather use lock:andl m->_owner, 0 as it's faster than
  1.1217 +      // the explicit ST;MEMBAR combination, but masm doesn't currently support
  1.1218 +      // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc
  1.1219 +      // are all faster when the write buffer is populated.
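      // The hypothetical single-instruction alternative referred to above would be
      // (not emitted -- masm lacks an "ANDQ mem,imm" encoding):
      //   lock; andq [tmpReg + ObjectMonitor::owner_offset_in_bytes() - 2], 0
      // The plain ST of NULL_WORD plus lock:addl to top-of-stack below serves the same purpose.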
  1.1220 +      movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
  1.1221 +      if (os::is_MP()) {
  1.1222 +         lock (); addl (Address(rsp, 0), 0);
  1.1223 +      }
  1.1224 +      cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
  1.1225 +      jccb  (Assembler::notZero, LSuccess);
  1.1226 +
  1.1227 +      movptr (boxReg, (int32_t)NULL_WORD);                   // box is really EAX
  1.1228 +      if (os::is_MP()) { lock(); }
  1.1229 +      cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
  1.1230 +      jccb  (Assembler::notEqual, LSuccess);
  1.1231 +      // Intentional fall-through into slow-path
  1.1232 +
  1.1233 +      bind  (LGoSlowPath);
  1.1234 +      orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  1.1235 +      jmpb  (DONE_LABEL);
  1.1236 +
  1.1237 +      bind  (LSuccess);
  1.1238 +      testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  1.1239 +      jmpb  (DONE_LABEL);
  1.1240 +    }
  1.1241 +
  1.1242 +    bind  (Stacked);
  1.1243 +    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
  1.1244 +    if (os::is_MP()) { lock(); }
  1.1245 +    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
  1.1246 +
  1.1247 +    if (EmitSync & 65536) {
  1.1248 +       bind (CheckSucc);
  1.1249 +    }
  1.1250 +#endif
  1.1251 +    bind(DONE_LABEL);
  1.1252 +    // Avoid branch to branch on AMD processors
  1.1253 +    if (EmitSync & 32768) {
  1.1254 +       nop();
  1.1255 +    }
  1.1256 +  }
  1.1257 +}
  1.1258 +#endif // COMPILER2
  1.1259 +
  1.1260  void MacroAssembler::c2bool(Register x) {
  1.1261    // implements x == 0 ? 0 : 1
  1.1262    // note: must only look at least-significant byte of x
