src/cpu/mips/vm/macroAssembler_mips.cpp

author       fujie
date         Wed, 08 Nov 2017 09:28:23 +0800
changeset    8006:b70d88852ac9
parent       8004:941851413ebf
child        8009:0477693968a6
permissions  -rw-r--r--

[GC] 17 out of 18 jtreg tests for g1 have passed (the same as x86 with jdk8u60-b32).

     1 /*
     2  * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright (c) 2017, Loongson Technology. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/assembler.hpp"
    28 #include "asm/assembler.inline.hpp"
    29 #include "asm/macroAssembler.inline.hpp"
    30 #include "compiler/disassembler.hpp"
    31 #include "gc_interface/collectedHeap.inline.hpp"
    32 #include "interpreter/interpreter.hpp"
    33 #include "memory/cardTableModRefBS.hpp"
    34 #include "memory/resourceArea.hpp"
    35 #include "memory/universe.hpp"
    36 #include "prims/methodHandles.hpp"
    37 #include "runtime/biasedLocking.hpp"
    38 #include "runtime/interfaceSupport.hpp"
    39 #include "runtime/objectMonitor.hpp"
    40 #include "runtime/os.hpp"
    41 #include "runtime/sharedRuntime.hpp"
    42 #include "runtime/stubRoutines.hpp"
    43 #include "utilities/macros.hpp"
    44 #if INCLUDE_ALL_GCS
    45 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
    46 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
    47 #include "gc_implementation/g1/heapRegion.hpp"
    48 #endif // INCLUDE_ALL_GCS
    50 // Implementation of MacroAssembler
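       // Debug scratch areas: save_registers()/restore_registers() below spill all 32 GPRs
       // and 32 FPRs into a buffer laid out like these static arrays (base address passed
       // in A0); print() dumps such a buffer.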
    52 intptr_t MacroAssembler::i[32] = {0};
    53 float MacroAssembler::f[32] = {0.0};
    55 void MacroAssembler::print(outputStream *s) {
    56   unsigned int k;
    57   for(k=0; k<sizeof(i)/sizeof(i[0]); k++) {
    58     s->print_cr("i%d = 0x%.16lx", k, i[k]);
    59   }
    60   s->cr();
    62   for(k=0; k<sizeof(f)/sizeof(f[0]); k++) {
    63     s->print_cr("f%d = %f", k, f[k]);
    64   }
    65   s->cr();
    66 }
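       // Classic offsetof idiom: compute the byte offset of i[k]/f[k] within the layout by
       // doing address arithmetic on a null MacroAssembler pointer (nothing is dereferenced).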
    68 int MacroAssembler::i_offset(unsigned int k) { return (intptr_t)&((MacroAssembler*)0)->i[k]; }
    69 int MacroAssembler::f_offset(unsigned int k) { return (intptr_t)&((MacroAssembler*)0)->f[k]; }
    71 void MacroAssembler::save_registers(MacroAssembler *masm) {
    72 #define __ masm->
    73   for(int k=0; k<32; k++) {
    74     __ sw (as_Register(k), A0, i_offset(k));
    75   }
    77   for(int k=0; k<32; k++) {
    78     __ swc1 (as_FloatRegister(k), A0, f_offset(k));
    79   }
    80 #undef __
    81 }
    83 void MacroAssembler::restore_registers(MacroAssembler *masm) {
    84 #define __ masm->
    85   for(int k=0; k<32; k++) {
    86     __ lw (as_Register(k), A0, i_offset(k));
    87   }
    89   for(int k=0; k<32; k++) {
    90     __ lwc1 (as_FloatRegister(k), A0, f_offset(k));
    91   }
    92 #undef __
    93 }
    96 void MacroAssembler::pd_patch_instruction(address branch, address target) {
    97   jint& stub_inst = *(jint*) branch;
    99 /*
   100   move(AT, RA); // dadd
   101   emit_long(insn_ORRI(regimm_op, 0, bgezal_op, 1));
   102   nop();
   103   lui(T9, 0); // to be patched
   104   ori(T9, T9, 0);
   105   daddu(T9, T9, RA);
   106   move(RA, AT);
   107   jr(T9);
   108  */
   109   if(special(stub_inst) == dadd_op) {
   110     jint *pc = (jint *)branch;
   112     assert(opcode(pc[3]) == lui_op
   113           && opcode(pc[4]) == ori_op
   114           && special(pc[5]) == daddu_op, "Not a branch label patch");
   115     if(!(opcode(pc[3]) == lui_op
   116           && opcode(pc[4]) == ori_op
   117           && special(pc[5]) == daddu_op)) { tty->print_cr("Not a branch label patch"); }
   119     int offset = target - branch;
   120     if (!is_simm16(offset))
   121     {
   122       pc[3] = (pc[3] & 0xffff0000) | high16(offset - 12);
   123       pc[4] = (pc[4] & 0xffff0000) | low16(offset - 12);
   124     }
   125     else
   126     {
   127       /* revert to "b + nop" */
   128       CodeBuffer cb(branch, 4 * 10);
   129       MacroAssembler masm(&cb);
   130 #define __ masm.
   131       __ b(target);
   132       __ nop();
   133       __ nop();
   134       __ nop();
   135       __ nop();
   136       __ nop();
   137       __ nop();
   138       __ nop();
   139     }
   140     return;
   141   }
   143 #ifndef PRODUCT
   144   if (!is_simm16((target - branch - 4) >> 2))
   145   {
   146     tty->print_cr("Illegal patching: target=0x%lx", target);
   147     int *p = (int *)branch;
   148     for (int i = -10; i < 10; i++)
   149     {
   150        tty->print("0x%lx, ", p[i]);
   151     }
   152     tty->print_cr("");
   153   }
   154 #endif
   156   stub_inst = patched_branch(target - branch, stub_inst, 0);
   157 }
   159 static inline address first_cache_address() {
   160   return CodeCache::low_bound() + sizeof(HeapBlock::Header);
   161 }
   163 static inline address last_cache_address() {
   164   return CodeCache::high_bound() - Assembler::InstructionSize;
   165 }
   167 int MacroAssembler::call_size(address target, bool far, bool patchable) {
   168   if (patchable) return 6 << Assembler::LogInstructionSize;
   169   if (!far) return 2 << Assembler::LogInstructionSize; // jal + nop
   170   return (insts_for_set64((jlong)target) + 2) << Assembler::LogInstructionSize;
   171 }
   173 // Can we reach target using jal/j from anywhere
   174 // in the code cache (because code can be relocated)?
   175 bool MacroAssembler::reachable_from_cache(address target) {
   176   address cl = first_cache_address();
   177   address ch = last_cache_address();
   179   return fit_in_jal(target, cl) && fit_in_jal(target, ch);
   180 }
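       // A MIPS j/jal keeps the upper bits of PC+4 and can only reach targets inside the
       // same 256MB-aligned segment, so a target is reachable from anywhere in the code
       // cache only if a jal works from both the lowest and the highest cache address.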
   182 void MacroAssembler::general_jump(address target) {
   183   if (reachable_from_cache(target)) {
   184     j(target);
   185     nop();
   186   } else {
   187     set64(T9, (long)target);
   188     jr(T9);
   189     nop();
   190   }
   191 }
   193 int MacroAssembler::insts_for_general_jump(address target) {
   194   if (reachable_from_cache(target)) {
   195     //j(target);
   196     //nop();
   197     return 2;
   198   } else {
   199     //set64(T9, (long)target);
   200     //jr(T9);
   201     //nop();
   202     return insts_for_set64((jlong)target) + 2;
   203   }
   204 }
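       // Both forms of a patchable jump occupy exactly 6 instruction slots (cf. call_size),
       // presumably so the site can later be re-patched in place between the near "j" form
       // and the full patchable_set48 + jr form; the leading nops pad the near form.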
   206 void MacroAssembler::patchable_jump(address target) {
   207   if (reachable_from_cache(target)) {
   208     nop();
   209     nop();
   210     nop();
   211     nop();
   212     j(target);
   213     nop();
   214   } else {
   215     patchable_set48(T9, (long)target);
   216     jr(T9);
   217     nop();
   218   }
   219 }
   221 int MacroAssembler::insts_for_patchable_jump(address target) {
   222   return 6;
   223 }
   225 void MacroAssembler::general_call(address target) {
   226   if (reachable_from_cache(target)) {
   227     jal(target);
   228     nop();
   229   } else {
   230     set64(T9, (long)target);
   231     jalr(T9);
   232     nop();
   233   }
   234 }
   236 int MacroAssembler::insts_for_general_call(address target) {
   237   if (reachable_from_cache(target)) {
   238     //jal(target);
   239     //nop();
   240     return 2;
   241   } else {
   242     //set64(T9, (long)target);
   243     //jalr(T9);
   244     //nop();
   245     return insts_for_set64((jlong)target) + 2;
   246   }
   247 }
   249 void MacroAssembler::patchable_call(address target) {
   250   if (reachable_from_cache(target)) {
   251     nop();
   252     nop();
   253     nop();
   254     nop();
   255     jal(target);
   256     nop();
   257   } else {
   258     patchable_set48(T9, (long)target);
   259     jalr(T9);
   260     nop();
   261   }
   262 }
   264 int MacroAssembler::insts_for_patchable_call(address target) {
   265   return 6;
   266 }
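       // Far conditional branches: beq/bne only encode a signed 16-bit word offset
       // (about +-128KB), so when the target is out of range the condition is inverted
       // and used to skip over an unconditional b_far to the real target.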
   268 void MacroAssembler::beq_far(Register rs, Register rt, address entry)
   269 {
   270   u_char * cur_pc = pc();
   272   /* Jin: Near/Far jump */
   273   if(is_simm16((entry - pc() - 4) / 4))
   274   {
   275     Assembler::beq(rs, rt, offset(entry));
   276   }
   277   else
   278   {
   279     Label not_jump;
   280     bne(rs, rt, not_jump);
   281     delayed()->nop();
   283     b_far(entry);
   284     delayed()->nop();
   286     bind(not_jump);
   287     has_delay_slot();
   288   }
   289 }
   291 void MacroAssembler::beq_far(Register rs, Register rt, Label& L)
   292 {
   293   if (L.is_bound()) {
   294     beq_far(rs, rt, target(L));
   295   } else {
   296     u_char * cur_pc = pc();
   297     Label not_jump;
   298     bne(rs, rt, not_jump);
   299     delayed()->nop();
   301     b_far(L);
   302     delayed()->nop();
   304     bind(not_jump);
   305     has_delay_slot();
   306   }
   307 }
   309 void MacroAssembler::bne_far(Register rs, Register rt, address entry)
   310 {
   311   u_char * cur_pc = pc();
   313   /* Jin: Near/Far jump */
   314   if(is_simm16((entry - pc() - 4) / 4))
   315   {
   316     Assembler::bne(rs, rt, offset(entry));
   317   }
   318   else
   319   {
   320     Label not_jump;
   321     beq(rs, rt, not_jump);
   322     delayed()->nop();
   324     b_far(entry);
   325     delayed()->nop();
   327     bind(not_jump);
   328     has_delay_slot();
   329   }
   330 }
   332 void MacroAssembler::bne_far(Register rs, Register rt, Label& L)
   333 {
   334   if (L.is_bound()) {
   335     bne_far(rs, rt, target(L));
   336   } else {
   337     u_char * cur_pc = pc();
   338     Label not_jump;
   339     beq(rs, rt, not_jump);
   340     delayed()->nop();
   342     b_far(L);
   343     delayed()->nop();
   345     bind(not_jump);
   346     has_delay_slot();
   347   }
   348 }
   350 void MacroAssembler::b_far(Label& L)
   351 {
   352   if (L.is_bound()) {
   353     b_far(target(L));
   354   } else {
   355   volatile address dest = target(L);
   356 /*
   357 MacroAssembler::pd_patch_instruction branch=55651ed514, target=55651ef6d8
   358    0x00000055651ed514: dadd at, ra, zero
   359    0x00000055651ed518: [4110001]bgezal zero, 0x00000055651ed520
   361    0x00000055651ed51c: sll zero, zero, 0
   362    0x00000055651ed520: lui t9, 0x0
   363    0x00000055651ed524: ori t9, t9, 0x21b8
   364    0x00000055651ed528: daddu t9, t9, ra
   365    0x00000055651ed52c: dadd ra, at, zero
   366    0x00000055651ed530: jr t9
   367    0x00000055651ed534: sll zero, zero, 0
   368 */
   369   move(AT, RA);
   370   emit_long(insn_ORRI(regimm_op, 0, bgezal_op, 1));
   371   nop();
   372   lui(T9, 0); // to be patched
   373   ori(T9, T9, 0);
   374   daddu(T9, T9, RA);
   375   move(RA, AT);
   376   jr(T9);
   377   }
   378 }
   380 void MacroAssembler::b_far(address entry)
   381 {
   382   u_char * cur_pc = pc();
   384   /* Jin: Near/Far jump */
   385   if(is_simm16((entry - pc() - 4) / 4))
   386   {
   387     b(offset(entry));
   388   }
   389   else
   390   {
   391     /* address must be bounded */
   392     move(AT, RA);
   393     emit_long(insn_ORRI(regimm_op, 0, bgezal_op, 1));
   394     nop();
   395     li32(T9, entry - pc());
   396     daddu(T9, T9, RA);
   397     move(RA, AT);
   398     jr(T9);
   399   }
   400 }
   402 void MacroAssembler::ld_ptr(Register rt, Register offset, Register base) {
   403   addu_long(AT, base, offset);
   404   ld_ptr(rt, 0, AT);
   405 }
   407 void MacroAssembler::st_ptr(Register rt, Register offset, Register base) {
   408   addu_long(AT, base, offset);
   409   st_ptr(rt, 0, AT);
   410 }
   412 void MacroAssembler::ld_long(Register rt, Register offset, Register base) {
   413   addu_long(AT, base, offset);
   414   ld_long(rt, 0, AT);
   415 }
   417 void MacroAssembler::st_long(Register rt, Register offset, Register base) {
   418   addu_long(AT, base, offset);
   419   st_long(rt, 0, AT);
   420 }
   422 Address MacroAssembler::as_Address(AddressLiteral adr) {
   423   return Address(adr.target(), adr.rspec());
   424 }
   426 Address MacroAssembler::as_Address(ArrayAddress adr) {
   427   return Address::make_array(adr);
   428 }
   430 // tmp_reg1 and tmp_reg2 should be saved outside of atomic_inc32 (caller saved).
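       // LL/SC loop: reload and retry until the store-conditional succeeds. The sync()
       // before ll on non-3A2000 parts is presumably a memory-ordering workaround for
       // older Loongson cores.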
   431 void MacroAssembler::atomic_inc32(address counter_addr, int inc, Register tmp_reg1, Register tmp_reg2) {
   432   Label again;
   434   li(tmp_reg1, counter_addr);
   435   bind(again);
   436   if(!Use3A2000) sync();
   437   ll(tmp_reg2, tmp_reg1, 0);
   438   addi(tmp_reg2, tmp_reg2, inc);
   439   sc(tmp_reg2, tmp_reg1, 0);
   440   beq(tmp_reg2, R0, again);
   441   delayed()->nop();
   442 }
   444 int MacroAssembler::biased_locking_enter(Register lock_reg,
   445                                          Register obj_reg,
   446                                          Register swap_reg,
   447                                          Register tmp_reg,
   448                                          bool swap_reg_contains_mark,
   449                                          Label& done,
   450                                          Label* slow_case,
   451                                          BiasedLockingCounters* counters) {
   452   assert(UseBiasedLocking, "why call this otherwise?");
   453   bool need_tmp_reg = false;
   454   if (tmp_reg == noreg) {
   455     need_tmp_reg = true;
   456     tmp_reg = T9;
   457   }
   458   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, AT);
   459   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
   460   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
   461   Address saved_mark_addr(lock_reg, 0);
   463   // Biased locking
   464   // See whether the lock is currently biased toward our thread and
   465   // whether the epoch is still valid
   466   // Note that the runtime guarantees sufficient alignment of JavaThread
   467   // pointers to allow age to be placed into low bits
   468   // First check to see whether biasing is even enabled for this object
   469   Label cas_label;
   470   int null_check_offset = -1;
   471   if (!swap_reg_contains_mark) {
   472     null_check_offset = offset();
   473     ld_ptr(swap_reg, mark_addr);
   474   }
   476   if (need_tmp_reg) {
   477     push(tmp_reg);
   478   }
   479   move(tmp_reg, swap_reg);
   480   andi(tmp_reg, tmp_reg, markOopDesc::biased_lock_mask_in_place);
   481 #ifdef _LP64
   482   daddi(AT, R0, markOopDesc::biased_lock_pattern);
   483   dsub(AT, AT, tmp_reg);
   484 #else
   485   addi(AT, R0, markOopDesc::biased_lock_pattern);
   486   sub(AT, AT, tmp_reg);
   487 #endif
   488   if (need_tmp_reg) {
   489     pop(tmp_reg);
   490   }
   492   bne(AT, R0, cas_label);
   493   delayed()->nop();
   496   // The bias pattern is present in the object's header. Need to check
   497   // whether the bias owner and the epoch are both still current.
   498   // Note that because there is no current thread register on MIPS we
   499   // need to store off the mark word we read out of the object to
   500   // avoid reloading it and needing to recheck invariants below. This
   501   // store is unfortunate but it makes the overall code shorter and
   502   // simpler.
   503   st_ptr(swap_reg, saved_mark_addr);
   504   if (need_tmp_reg) {
   505     push(tmp_reg);
   506   }
   507   if (swap_reg_contains_mark) {
   508     null_check_offset = offset();
   509   }
   510   load_prototype_header(tmp_reg, obj_reg);
   511   xorr(tmp_reg, tmp_reg, swap_reg);
   512   get_thread(swap_reg);
   513   xorr(swap_reg, swap_reg, tmp_reg);
   515   move(AT, ~((int) markOopDesc::age_mask_in_place));
   516   andr(swap_reg, swap_reg, AT);
   518   if (PrintBiasedLockingStatistics) {
   519     Label L;
   520     bne(swap_reg, R0, L);
   521     delayed()->nop();
   522     push(tmp_reg);
   523     push(A0);
   524     atomic_inc32((address)BiasedLocking::biased_lock_entry_count_addr(), 1, A0, tmp_reg);
   525     pop(A0);
   526     pop(tmp_reg);
   527     bind(L);
   528   }
   529   if (need_tmp_reg) {
   530     pop(tmp_reg);
   531   }
   532   beq(swap_reg, R0, done);
   533   delayed()->nop();
   534   Label try_revoke_bias;
   535   Label try_rebias;
   537   // At this point we know that the header has the bias pattern and
   538   // that we are not the bias owner in the current epoch. We need to
   539   // figure out more details about the state of the header in order to
   540   // know what operations can be legally performed on the object's
   541   // header.
   543   // If the low three bits in the xor result aren't clear, that means
   544   // the prototype header is no longer biased and we have to revoke
   545   // the bias on this object.
   547   move(AT, markOopDesc::biased_lock_mask_in_place);
   548   andr(AT, swap_reg, AT);
   549   bne(AT, R0, try_revoke_bias);
   550   delayed()->nop();
   551   // Biasing is still enabled for this data type. See whether the
   552   // epoch of the current bias is still valid, meaning that the epoch
   553   // bits of the mark word are equal to the epoch bits of the
   554   // prototype header. (Note that the prototype header's epoch bits
   555   // only change at a safepoint.) If not, attempt to rebias the object
   556   // toward the current thread. Note that we must be absolutely sure
   557   // that the current epoch is invalid in order to do this because
   558   // otherwise the manipulations it performs on the mark word are
   559   // illegal.
   561   move(AT, markOopDesc::epoch_mask_in_place);
   562   andr(AT, swap_reg, AT);
   563   bne(AT, R0, try_rebias);
   564   delayed()->nop();
   565   // The epoch of the current bias is still valid but we know nothing
   566   // about the owner; it might be set or it might be clear. Try to
   567   // acquire the bias of the object using an atomic operation. If this
   568   // fails we will go in to the runtime to revoke the object's bias.
   569   // Note that we first construct the presumed unbiased header so we
   570   // don't accidentally blow away another thread's valid bias.
   572   ld_ptr(swap_reg, saved_mark_addr);
   574   move(AT, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
   575   andr(swap_reg, swap_reg, AT);
   577   if (need_tmp_reg) {
   578     push(tmp_reg);
   579   }
   580   get_thread(tmp_reg);
   581   orr(tmp_reg, tmp_reg, swap_reg);
   582   //if (os::is_MP()) {
   583   //  sync();
   584   //}
   585   cmpxchg(tmp_reg, Address(obj_reg, 0), swap_reg);
   586   if (need_tmp_reg) {
   587     pop(tmp_reg);
   588   }
   589   // If the biasing toward our thread failed, this means that
   590   // another thread succeeded in biasing it toward itself and we
   591   // need to revoke that bias. The revocation will occur in the
   592   // interpreter runtime in the slow case.
   593   if (PrintBiasedLockingStatistics) {
   594     Label L;
   595     bne(AT, R0, L);
   596     delayed()->nop();
   597     push(tmp_reg);
   598     push(A0);
   599     atomic_inc32((address)BiasedLocking::anonymously_biased_lock_entry_count_addr(), 1, A0, tmp_reg);
   600     pop(A0);
   601     pop(tmp_reg);
   602     bind(L);
   603   }
   604   if (slow_case != NULL) {
   605     beq_far(AT, R0, *slow_case);
   606     delayed()->nop();
   607   }
   608   b(done);
   609   delayed()->nop();
   611   bind(try_rebias);
   612   // At this point we know the epoch has expired, meaning that the
   613   // current "bias owner", if any, is actually invalid. Under these
   614   // circumstances _only_, we are allowed to use the current header's
   615   // value as the comparison value when doing the cas to acquire the
   616   // bias in the current epoch. In other words, we allow transfer of
   617   // the bias from one thread to another directly in this situation.
   618   //
   619   // FIXME: due to a lack of registers we currently blow away the age
   620   // bits in this situation. Should attempt to preserve them.
   621   if (need_tmp_reg) {
   622     push(tmp_reg);
   623   }
   624   load_prototype_header(tmp_reg, obj_reg);
   625   get_thread(swap_reg);
   626   orr(tmp_reg, tmp_reg, swap_reg);
   627   ld_ptr(swap_reg, saved_mark_addr);
   629   //if (os::is_MP()) {
   630   //  sync();
   631   //}
   632   cmpxchg(tmp_reg, Address(obj_reg, 0), swap_reg);
   633   if (need_tmp_reg) {
   634     pop(tmp_reg);
   635   }
   636   // If the biasing toward our thread failed, then another thread
   637   // succeeded in biasing it toward itself and we need to revoke that
   638   // bias. The revocation will occur in the runtime in the slow case.
   639   if (PrintBiasedLockingStatistics) {
   640     Label L;
   641     bne(AT, R0, L);
   642     delayed()->nop();
   643     push(AT);
   644     push(tmp_reg);
   645     atomic_inc32((address)BiasedLocking::rebiased_lock_entry_count_addr(), 1, AT, tmp_reg);
   646     pop(tmp_reg);
   647     pop(AT);
   648     bind(L);
   649   }
   650   if (slow_case != NULL) {
   651     beq_far(AT, R0, *slow_case);
   652     delayed()->nop();
   653   }
   655   b(done);
   656   delayed()->nop();
   657   bind(try_revoke_bias);
   658   // The prototype mark in the klass doesn't have the bias bit set any
   659   // more, indicating that objects of this data type are not supposed
   660   // to be biased any more. We are going to try to reset the mark of
   661   // this object to the prototype value and fall through to the
   662   // CAS-based locking scheme. Note that if our CAS fails, it means
   663   // that another thread raced us for the privilege of revoking the
   664   // bias of this particular object, so it's okay to continue in the
   665   // normal locking code.
   666   //
   667   // FIXME: due to a lack of registers we currently blow away the age
   668   // bits in this situation. Should attempt to preserve them.
   669   ld_ptr(swap_reg, saved_mark_addr);
   671   if (need_tmp_reg) {
   672     push(tmp_reg);
   673   }
   674   load_prototype_header(tmp_reg, obj_reg);
   675   //if (os::is_MP()) {
   676   // lock();
   677   //}
   678   cmpxchg(tmp_reg, Address(obj_reg, 0), swap_reg);
   679   if (need_tmp_reg) {
   680     pop(tmp_reg);
   681   }
   682   // Fall through to the normal CAS-based lock, because no matter what
   683   // the result of the above CAS, some thread must have succeeded in
   684   // removing the bias bit from the object's header.
   685   if (PrintBiasedLockingStatistics) {
   686     Label L;
   687     bne(AT, R0, L);
   688     delayed()->nop();
   689     push(AT);
   690     push(tmp_reg);
   691     atomic_inc32((address)BiasedLocking::revoked_lock_entry_count_addr(), 1, AT, tmp_reg);
   692     pop(tmp_reg);
   693     pop(AT);
   694     bind(L);
   695   }
   697   bind(cas_label);
   698   return null_check_offset;
   699 }
   701 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
   702   assert(UseBiasedLocking, "why call this otherwise?");
   704   // Check for biased locking unlock case, which is a no-op
   705   // Note: we do not have to check the thread ID for two reasons.
   706   // First, the interpreter checks for IllegalMonitorStateException at
   707   // a higher level. Second, if the bias was revoked while we held the
   708   // lock, the object could not be rebiased toward another thread, so
   709   // the bias bit would be clear.
   710 #ifdef _LP64
   711   ld(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
   712   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
   713   daddi(AT, R0, markOopDesc::biased_lock_pattern);
   714 #else
   715   lw(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
   716   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
   717   addi(AT, R0, markOopDesc::biased_lock_pattern);
   718 #endif
   720   beq(AT, temp_reg, done);
   721   delayed()->nop();
   722 }
   724 // NOTE: unlike the x86 version, we do not increment SP after the call; this may be a problem, FIXME.
   725 // The stack pointer adjustment is needed: see InterpreterMacroAssembler::super_call_VM_leaf.
   726 // This method handles the stack itself, so callers need not reserve stack space for the arguments.
   727 void MacroAssembler::call_VM_leaf_base(address entry_point,
   728     int number_of_arguments) {
   729   //call(RuntimeAddress(entry_point));
   730   //increment(rsp, number_of_arguments * wordSize);
   731   Label L, E;
   733   assert(number_of_arguments <= 4, "just check");
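       // The ABI requires 16-byte stack alignment at calls: if SP is not 16-byte aligned
       // (it is always at least 8-byte aligned), push 8 bytes around the call; otherwise
       // call directly.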
   735   andi(AT, SP, 0xf);
   736   beq(AT, R0, L);
   737   delayed()->nop();
   738   daddi(SP, SP, -8);
   739   call(entry_point, relocInfo::runtime_call_type);
   740   delayed()->nop();
   741   daddi(SP, SP, 8);
   742   b(E);
   743   delayed()->nop();
   745   bind(L);
   746   call(entry_point, relocInfo::runtime_call_type);
   747   delayed()->nop();
   748   bind(E);
   749 }
   752 void MacroAssembler::jmp(address entry) {
   753   patchable_set48(T9, (long)entry);
   754   jr(T9);
   755 }
   757 void MacroAssembler::jmp(address entry, relocInfo::relocType rtype) {
   758   switch (rtype) {
   759     case relocInfo::runtime_call_type:
   760     case relocInfo::none:
   761       jmp(entry);
   762       break;
   763     default:
   764       {
   765       InstructionMark im(this);
   766       relocate(rtype);
   767       patchable_set48(T9, (long)entry);
   768       jr(T9);
   769       }
   770       break;
   771   }
   772 }
   774 void MacroAssembler::call(address entry) {
   775 // C/C++ code assumes T9 holds the entry point, so we always move entry into T9.
   776 // Maybe there is a more graceful way to handle this. FIXME
   777 // For more info, see class NativeCall.
   778 #ifndef _LP64
   779   move(T9, (int)entry);
   780 #else
   781   patchable_set48(T9, (long)entry);
   782 #endif
   783   jalr(T9);
   784 }
   786 void MacroAssembler::call(address entry, relocInfo::relocType rtype) {
   787   switch (rtype) {
   788     case relocInfo::runtime_call_type:
   789     case relocInfo::none:
   790       call(entry);
   791       break;
   792     default:
   793       {
   794       InstructionMark im(this);
   795       relocate(rtype);
   796       call(entry);
   797       }
   798       break;
   799   }
   800 }
   802 void MacroAssembler::call(address entry, RelocationHolder& rh)
   803 {
   804   switch (rh.type()) {
   805     case relocInfo::runtime_call_type:
   806     case relocInfo::none:
   807       call(entry);
   808       break;
   809     default:
   810       {
   811       InstructionMark im(this);
   812       relocate(rh);
   813       call(entry);
   814       }
   815       break;
   816   }
   817 }
   819 void MacroAssembler::ic_call(address entry) {
   820   RelocationHolder rh = virtual_call_Relocation::spec(pc());
   821   patchable_set48(IC_Klass, (long)Universe::non_oop_word());
   822   assert(entry != NULL, "call most probably wrong");
   823   InstructionMark im(this);
   824   relocate(rh);
   825   patchable_call(entry);
   826 }
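       // Normalize a C/C++ boolean: leave 0 as is, collapse any non-zero value to 1.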
   828 void MacroAssembler::c2bool(Register r) {
   829   Label L;
   830   Assembler::beq(r, R0, L);
   831   delayed()->nop();
   832   move(r, 1);
   833   bind(L);
   834 }
   836 #ifndef PRODUCT
   837 extern "C" void findpc(intptr_t x);
   838 #endif
   840 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
   841   // In order to get locks to work, we need to fake an in_VM state
   842   JavaThread* thread = JavaThread::current();
   843   JavaThreadState saved_state = thread->thread_state();
   844   thread->set_thread_state(_thread_in_vm);
   845   if (ShowMessageBoxOnError) {
   846     JavaThread* thread = JavaThread::current();
   847     JavaThreadState saved_state = thread->thread_state();
   848     thread->set_thread_state(_thread_in_vm);
   849     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
   850       ttyLocker ttyl;
   851       BytecodeCounter::print();
   852     }
   853     // To see where a verify_oop failed, get $ebx+40/X for this frame.
   854     // This is the value of eip which points to where verify_oop will return.
   855     if (os::message_box(msg, "Execution stopped, print registers?")) {
   856       ttyLocker ttyl;
   857       tty->print_cr("eip = 0x%08x", eip);
   858 #ifndef PRODUCT
   859       tty->cr();
   860       findpc(eip);
   861       tty->cr();
   862 #endif
   863       tty->print_cr("rax, = 0x%08x", rax);
   864       tty->print_cr("rbx, = 0x%08x", rbx);
   865       tty->print_cr("rcx = 0x%08x", rcx);
   866       tty->print_cr("rdx = 0x%08x", rdx);
   867       tty->print_cr("rdi = 0x%08x", rdi);
   868       tty->print_cr("rsi = 0x%08x", rsi);
   869       tty->print_cr("rbp, = 0x%08x", rbp);
   870       tty->print_cr("rsp = 0x%08x", rsp);
   871       BREAKPOINT;
   872     }
   873   } else {
   874     ttyLocker ttyl;
   875     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
   876     assert(false, "DEBUG MESSAGE");
   877   }
   878   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
   879 }
   881 void MacroAssembler::debug(char* msg/*, RegistersForDebugging* regs*/) {
   882   if ( ShowMessageBoxOnError ) {
   883     JavaThreadState saved_state = JavaThread::current()->thread_state();
   884     JavaThread::current()->set_thread_state(_thread_in_vm);
   885     {
   886       // In order to get locks to work, we need to fake an in_VM state
   887       ttyLocker ttyl;
   888       ::tty->print_cr("EXECUTION STOPPED: %s\n", msg);
   889       if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
   890         BytecodeCounter::print();
   891       }
   893       //      if (os::message_box(msg, "Execution stopped, print registers?"))
   894       //        regs->print(::tty);
   895     }
   896     ThreadStateTransition::transition(JavaThread::current(), _thread_in_vm, saved_state);
   897   }
   898   else
   899     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
   900 }
   903 void MacroAssembler::stop(const char* msg) {
   904   li(A0, (long)msg);
   905 #ifndef _LP64
   906   // reserve space for argument. added by yjl 7/10/2005
   907   addiu(SP, SP, - 1 * wordSize);
   908 #endif
   909   call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
   910   delayed()->nop();
   911 #ifndef _LP64
   912   //restore space for argument
   913   addiu(SP, SP, 1 * wordSize);
   914 #endif
   915   brk(17);
   916 }
   918 void MacroAssembler::warn(const char* msg) {
   919 #ifdef _LP64
   920   pushad();
   921   li(A0, (long)msg);
   922   push(S2);
   923   move(AT, -(StackAlignmentInBytes));
   924   move(S2, SP);     // use S2 as a sender SP holder
   925   andr(SP, SP, AT); // align stack as required by ABI
   926   call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
   927   delayed()->nop();
   928   move(SP, S2);     // use S2 as a sender SP holder
   929   pop(S2);
   930   popad();
   931 #else
   932   pushad();
   933   addi(SP, SP, -4);
   934   sw(A0, SP, -1 * wordSize);
   935   li(A0, (long)msg);
   936   addi(SP, SP, -1 * wordSize);
   937   call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
   938   delayed()->nop();
   939   addi(SP, SP, 1 * wordSize);
   940   lw(A0, SP, -1 * wordSize);
   941   addi(SP, SP, 4);
   942   popad();
   943 #endif
   944 }
   946 void MacroAssembler::print_reg(Register reg) {
   947 /*
   948 char *s = getenv("PRINT_REG");
   949 if (s == NULL)
   950   return;
   951 if (strcmp(s, "1") != 0)
   952   return;
   953 */
   954   void * cur_pc = pc();
   955   pushad();
   956   NOT_LP64(push(FP);)
   958   li(A0, (long)reg->name());
   959   if (reg == SP)
   960     addiu(A1, SP, wordSize * 23); //23 registers saved in pushad()
   961   else if (reg == A0)
   962     ld(A1, SP, wordSize * 19); //A0 has been modified by li(A0, (long)reg->name()). Ugly Code!
   963   else
   964     move(A1, reg);
   965   li(A2, (long)cur_pc);
   966   push(S2);
   967   move(AT, -(StackAlignmentInBytes));
   968   move(S2, SP);     // use S2 as a sender SP holder
   969   andr(SP, SP, AT); // align stack as required by ABI
   970   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_reg_with_pc),relocInfo::runtime_call_type);
   971   delayed()->nop();
   972   move(SP, S2);     // use S2 as a sender SP holder
   973   pop(S2);
   974   NOT_LP64(pop(FP);)
   975   popad();
   977 /*
   978   pushad();
   979 #ifdef _LP64
   980   if (reg == SP)
   981     addiu(A0, SP, wordSize * 23); //23 registers saved in pushad()
   982   else
   983     move(A0, reg);
   984   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_long),relocInfo::runtime_call_type);
   985   delayed()->nop();
   986 #else
   987   push(FP);
   988   move(A0, reg);
   989   dsrl32(A1, reg, 0);
   990   //call(CAST_FROM_FN_PTR(address, SharedRuntime::print_int),relocInfo::runtime_call_type);
   991   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_long),relocInfo::runtime_call_type);
   992   delayed()->nop();
   993   pop(FP);
   994 #endif
   995   popad();
   996   pushad();
   997   NOT_LP64(push(FP);)
   998   char b[50];
   999   sprintf((char *)b, " pc: %p\n",cur_pc);
  1000   li(A0, (long)(char *)b);
  1001   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_str),relocInfo::runtime_call_type);
  1002   delayed()->nop();
  1003   NOT_LP64(pop(FP);)
  1004   popad();
  1005 */
  1006 }
  1008 void MacroAssembler::print_reg(FloatRegister reg) {
  1009   void * cur_pc = pc();
  1010   pushad();
  1011   NOT_LP64(push(FP);)
  1012   li(A0, (long)reg->name());
  1013   push(S2);
  1014   move(AT, -(StackAlignmentInBytes));
  1015   move(S2, SP);     // use S2 as a sender SP holder
  1016   andr(SP, SP, AT); // align stack as required by ABI
  1017   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_str),relocInfo::runtime_call_type);
  1018   delayed()->nop();
  1019   move(SP, S2);     // use S2 as a sender SP holder
  1020   pop(S2);
  1021   NOT_LP64(pop(FP);)
  1022   popad();
  1024   pushad();
  1025   NOT_LP64(push(FP);)
  1026 #if 1
  1027   move(FP, SP);
  1028   move(AT, -(StackAlignmentInBytes));
  1029   andr(SP , SP , AT);
  1030   mov_d(F12, reg);
  1031   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_double),relocInfo::runtime_call_type);
  1032   delayed()->nop();
  1033   move(SP, FP);
  1034 #else
  1035   mov_s(F12, reg);
  1036   //call(CAST_FROM_FN_PTR(address, SharedRuntime::print_float),relocInfo::runtime_call_type);
  1037   //delayed()->nop();
  1038 #endif
  1039   NOT_LP64(pop(FP);)
  1040   popad();
  1042 #if 0
  1043   pushad();
  1044   NOT_LP64(push(FP);)
  1045   char* b = new char[50];
  1046   sprintf(b, " pc: %p\n", cur_pc);
  1047   li(A0, (long)b);
  1048   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_str),relocInfo::runtime_call_type);
  1049   delayed()->nop();
  1050   NOT_LP64(pop(FP);)
  1051   popad();
  1052 #endif
  1053 }
  1055 void MacroAssembler::increment(Register reg, int imm) {
  1056   if (!imm) return;
  1057   if (is_simm16(imm)) {
  1058 #ifdef _LP64
  1059     daddiu(reg, reg, imm);
  1060 #else
  1061     addiu(reg, reg, imm);
  1062 #endif
  1063   } else {
  1064     move(AT, imm);
  1065 #ifdef _LP64
  1066     daddu(reg, reg, AT);
  1067 #else
  1068     addu(reg, reg, AT);
  1069 #endif
  1070   }
  1071 }
  1073 void MacroAssembler::decrement(Register reg, int imm) {
  1074   increment(reg, -imm);
  1075 }
  1078 void MacroAssembler::call_VM(Register oop_result,
  1079                              address entry_point,
  1080                              bool check_exceptions) {
  1081   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  1082 }
  1084 void MacroAssembler::call_VM(Register oop_result,
  1085                              address entry_point,
  1086                              Register arg_1,
  1087                              bool check_exceptions) {
  1088   if (arg_1!=A1) move(A1, arg_1);
  1089   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  1090 }
  1092 void MacroAssembler::call_VM(Register oop_result,
  1093                              address entry_point,
  1094                              Register arg_1,
  1095                              Register arg_2,
  1096                              bool check_exceptions) {
  1097   if (arg_1!=A1) move(A1, arg_1);
  1098   if (arg_2!=A2) move(A2, arg_2);
  1099   assert(arg_2 != A1, "smashed argument");
  1100   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  1101 }
  1103 void MacroAssembler::call_VM(Register oop_result,
  1104                              address entry_point,
  1105                              Register arg_1,
  1106                              Register arg_2,
  1107                              Register arg_3,
  1108                              bool check_exceptions) {
  1109   if (arg_1!=A1) move(A1, arg_1);
  1110   if (arg_2!=A2) move(A2, arg_2); assert(arg_2 != A1, "smashed argument");
  1111   if (arg_3!=A3) move(A3, arg_3); assert(arg_3 != A1 && arg_3 != A2, "smashed argument");
  1112   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  1113 }
  1115 void MacroAssembler::call_VM(Register oop_result,
  1116                              Register last_java_sp,
  1117                              address entry_point,
  1118                              int number_of_arguments,
  1119                              bool check_exceptions) {
  1120   call_VM_base(oop_result, NOREG, last_java_sp, entry_point, number_of_arguments, check_exceptions);
  1121 }
  1123 void MacroAssembler::call_VM(Register oop_result,
  1124                              Register last_java_sp,
  1125                              address entry_point,
  1126                              Register arg_1,
  1127                              bool check_exceptions) {
  1128   if (arg_1 != A1) move(A1, arg_1);
  1129   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
  1130 }
  1132 void MacroAssembler::call_VM(Register oop_result,
  1133                              Register last_java_sp,
  1134                              address entry_point,
  1135                              Register arg_1,
  1136                              Register arg_2,
  1137                              bool check_exceptions) {
  1138   if (arg_1 != A1) move(A1, arg_1);
  1139   if (arg_2 != A2) move(A2, arg_2); assert(arg_2 != A1, "smashed argument");
  1140   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
  1141 }
  1143 void MacroAssembler::call_VM(Register oop_result,
  1144                              Register last_java_sp,
  1145                              address entry_point,
  1146                              Register arg_1,
  1147                              Register arg_2,
  1148                              Register arg_3,
  1149                              bool check_exceptions) {
  1150   if (arg_1 != A1) move(A1, arg_1);
  1151   if (arg_2 != A2) move(A2, arg_2); assert(arg_2 != A1, "smashed argument");
  1152   if (arg_3 != A3) move(A3, arg_3); assert(arg_3 != A1 && arg_3 != A2, "smashed argument");
  1153   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
  1154 }
  1156 void MacroAssembler::call_VM_base(Register oop_result,
  1157                                   Register java_thread,
  1158                                   Register last_java_sp,
  1159                                   address  entry_point,
  1160                                   int      number_of_arguments,
  1161                                   bool     check_exceptions) {
  1163   address before_call_pc;
  1164   // determine java_thread register
  1165   if (!java_thread->is_valid()) {
  1166 #ifndef OPT_THREAD
  1167     java_thread = T2;
  1168     get_thread(java_thread);
  1169 #else
  1170     java_thread = TREG;
  1171 #endif
  1172   }
  1173   // determine last_java_sp register
  1174   if (!last_java_sp->is_valid()) {
  1175     last_java_sp = SP;
  1176   }
  1177   // debugging support
  1178   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  1179   assert(number_of_arguments <= 4   , "cannot have more than 4 arguments");
  1180   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  1181   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
  1183   assert(last_java_sp != FP, "this code doesn't work for last_java_sp == fp, which currently can't portably work anyway since C2 doesn't save ebp");
  1185   // set last Java frame before call
  1186   before_call_pc = (address)pc();
  1187   set_last_Java_frame(java_thread, last_java_sp, FP, before_call_pc);
  1189   // do the call
  1190   move(A0, java_thread);
  1191   call(entry_point, relocInfo::runtime_call_type);
  1192   delayed()->nop();
  1194   // restore the thread (cannot use the pushed argument since arguments
  1195   // may be overwritten by C code generated by an optimizing compiler);
  1196   // however can use the register value directly if it is callee saved.
  1197 #ifndef OPT_THREAD
  1198   get_thread(java_thread);
  1199 #else
  1200 #ifdef ASSERT
  1201   {
  1202     Label L;
  1203     get_thread(AT);
  1204     beq(java_thread, AT, L);
  1205     delayed()->nop();
  1206     stop("MacroAssembler::call_VM_base: edi not callee saved?");
  1207     bind(L);
  1208   }
  1209 #endif
  1210 #endif
  1212   // discard thread and arguments
  1213   ld_ptr(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  1214   // reset last Java frame
  1215   reset_last_Java_frame(java_thread, false, true);
  1217   check_and_handle_popframe(java_thread);
  1218   check_and_handle_earlyret(java_thread);
  1219   if (check_exceptions) {
  1220     // check for pending exceptions (java_thread is set upon return)
  1221     Label L;
  1222 #ifdef _LP64
  1223     ld(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  1224 #else
  1225     lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  1226 #endif
  1227     beq(AT, R0, L);
  1228     delayed()->nop();
  1229     li(AT, before_call_pc);
  1230     push(AT);
  1231     jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  1232     delayed()->nop();
  1233     bind(L);
  1234   }
  1236   // get oop result if there is one and reset the value in the thread
  1237   if (oop_result->is_valid()) {
  1238 #ifdef _LP64
  1239     ld(oop_result, java_thread, in_bytes(JavaThread::vm_result_offset()));
  1240     sd(R0, java_thread, in_bytes(JavaThread::vm_result_offset()));
  1241 #else
  1242     lw(oop_result, java_thread, in_bytes(JavaThread::vm_result_offset()));
  1243     sw(R0, java_thread, in_bytes(JavaThread::vm_result_offset()));
  1244 #endif
  1245     verify_oop(oop_result);
  1246   }
  1247 }
  1249 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  1251   move(V0, SP);
  1252   //we also reserve space for java_thread here
  1253 #ifndef _LP64
  1254   daddi(SP, SP, (1 + number_of_arguments) * (- wordSize));
  1255 #endif
  1256   move(AT, -(StackAlignmentInBytes));
  1257   andr(SP, SP, AT);
  1258   call_VM_base(oop_result, NOREG, V0, entry_point, number_of_arguments, check_exceptions);
  1259 }
  1262 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  1263   call_VM_leaf_base(entry_point, number_of_arguments);
  1264 }
  1266 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  1267   if (arg_0 != A0) move(A0, arg_0);
  1268   call_VM_leaf(entry_point, 1);
  1269 }
  1271 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  1272   if (arg_0 != A0) move(A0, arg_0);
  1273   if (arg_1 != A1) move(A1, arg_1); assert(arg_1 != A0, "smashed argument");
  1274   call_VM_leaf(entry_point, 2);
  1275 }
  1277 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  1278   if (arg_0 != A0) move(A0, arg_0);
  1279   if (arg_1 != A1) move(A1, arg_1); assert(arg_1 != A0, "smashed argument");
  1280   if (arg_2 != A2) move(A2, arg_2); assert(arg_2 != A0 && arg_2 != A1, "smashed argument");
  1281   call_VM_leaf(entry_point, 3);
  1282 }
  1283 void MacroAssembler::super_call_VM_leaf(address entry_point) {
  1284   MacroAssembler::call_VM_leaf_base(entry_point, 0);
  1285 }
  1288 void MacroAssembler::super_call_VM_leaf(address entry_point,
  1289                                                    Register arg_1) {
  1290   if (arg_1 != A0) move(A0, arg_1);
  1291   MacroAssembler::call_VM_leaf_base(entry_point, 1);
  1292 }
  1295 void MacroAssembler::super_call_VM_leaf(address entry_point,
  1296                                                    Register arg_1,
  1297                                                    Register arg_2) {
  1298   if (arg_1 != A0) move(A0, arg_1);
  1299   if (arg_2 != A1) move(A1, arg_2); assert(arg_2 != A0, "smashed argument");
  1300   MacroAssembler::call_VM_leaf_base(entry_point, 2);
  1301 }
  1302 void MacroAssembler::super_call_VM_leaf(address entry_point,
  1303                                                    Register arg_1,
  1304                                                    Register arg_2,
  1305                                                    Register arg_3) {
  1306   if (arg_1 != A0) move(A0, arg_1);
  1307   if (arg_2 != A1) move(A1, arg_2); assert(arg_2 != A0, "smashed argument");
  1308   if (arg_3 != A2) move(A2, arg_3); assert(arg_3 != A0 && arg_3 != A1, "smashed argument");
  1309   MacroAssembler::call_VM_leaf_base(entry_point, 3);
  1310 }
  1312 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
  1313 }
  1315 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
  1316 }
  1318 void MacroAssembler::null_check(Register reg, int offset) {
  1319   if (needs_explicit_null_check(offset)) {
  1320     // provoke OS NULL exception if reg = NULL by
  1321     // accessing M[reg] w/o changing any (non-CC) registers
  1322     // NOTE: cmpl is plenty here to provoke a segv
  1323     lw(AT, reg, 0);
  1324     // Note: should probably use testl(rax, Address(reg, 0));
  1325     //       may be shorter code (however, this version of
  1326     //       testl needs to be implemented first)
  1327   } else {
  1328     // nothing to do, (later) access of M[reg + offset]
  1329     // will provoke OS NULL exception if reg = NULL
  1330   }
  1331 }
  1333 void MacroAssembler::enter() {
  1334   push2(RA, FP);
  1335   move(FP, SP);
  1336 }
  1338 void MacroAssembler::leave() {
  1339 #ifndef _LP64
  1340   //move(SP, FP);
  1341   //pop2(FP, RA);
  1342   addi(SP, FP, 2 * wordSize);
  1343   lw(RA, SP, - 1 * wordSize);
  1344   lw(FP, SP, - 2 * wordSize);
  1345 #else
  1346   daddi(SP, FP, 2 * wordSize);
  1347   ld(RA, SP, - 1 * wordSize);
  1348   ld(FP, SP, - 2 * wordSize);
  1349 #endif
  1350 }
  1351 /*
  1352 void MacroAssembler::os_breakpoint() {
  1353   // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  1354   // (e.g., MSVC can't call ps() otherwise)
  1355   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
  1356 }
  1357 */
  1358 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
  1359   // determine java_thread register
  1360   if (!java_thread->is_valid()) {
  1361 #ifndef OPT_THREAD
  1362     java_thread = T1;
  1363     get_thread(java_thread);
  1364 #else
  1365     java_thread = TREG;
  1366 #endif
  1367   }
  1368   // we must set sp to zero to clear frame
  1369   st_ptr(R0, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  1370   // must clear fp, so that compiled frames are not confused; it is possible
  1371   // that we need it only for debugging
  1372   if(clear_fp)
  1373     st_ptr(R0, java_thread, in_bytes(JavaThread::last_Java_fp_offset()));
  1375   if (clear_pc)
  1376     st_ptr(R0, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
  1377 }
  1379 void MacroAssembler::reset_last_Java_frame(bool clear_fp,
  1380                                            bool clear_pc) {
  1381   Register thread = TREG;
  1382 #ifndef OPT_THREAD
  1383   get_thread(thread);
  1384 #endif
  1385   // we must set sp to zero to clear frame
  1386   sd(R0, Address(thread, JavaThread::last_Java_sp_offset()));
  1387   // must clear fp, so that compiled frames are not confused; it is
  1388   // possible that we need it only for debugging
  1389   if (clear_fp) {
  1390     sd(R0, Address(thread, JavaThread::last_Java_fp_offset()));
  1391   }
  1393   if (clear_pc) {
  1394     sd(R0, Address(thread, JavaThread::last_Java_pc_offset()));
  1395   }
  1396 }
  1398 // Write serialization page so VM thread can do a pseudo remote membar.
  1399 // We use the current thread pointer to calculate a thread specific
  1400 // offset to write to within the page. This minimizes bus traffic
  1401 // due to cache line collision.
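       // E.g. with a 4K page: the store goes to offset (thread >> shift) & (4096 - 4),
       // an int-aligned, thread-specific slot within the serialization page.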
  1402 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  1403   move(tmp, thread);
  1404   srl(tmp, tmp, os::get_serialize_page_shift_count());
  1405   move(AT, (os::vm_page_size() - sizeof(int)));
  1406   andr(tmp, tmp, AT);
  1407   sw(tmp, Address(tmp, (intptr_t)os::get_memory_serialize_page()));
  1408 }
  1410 // Calls to C land
  1411 //
  1412 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
  1413 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
  1414 // has to be reset to 0. This is required to allow proper stack traversal.
  1415 void MacroAssembler::set_last_Java_frame(Register java_thread,
  1416                                          Register last_java_sp,
  1417                                          Register last_java_fp,
  1418                                          address  last_java_pc) {
  1419   // determine java_thread register
  1420   if (!java_thread->is_valid()) {
  1421 #ifndef OPT_THREAD
  1422     java_thread = T2;
  1423     get_thread(java_thread);
  1424 #else
  1425     java_thread = TREG;
  1426 #endif
  1427   }
  1428   // determine last_java_sp register
  1429   if (!last_java_sp->is_valid()) {
  1430     last_java_sp = SP;
  1431   }
  1433   // last_java_fp is optional
  1435   if (last_java_fp->is_valid()) {
  1436     st_ptr(last_java_fp, java_thread, in_bytes(JavaThread::last_Java_fp_offset()));
  1437   }
  1439   // last_java_pc is optional
  1441   if (last_java_pc != NULL) {
  1442     relocate(relocInfo::internal_pc_type);
  1443     patchable_set48(AT, (long)last_java_pc);
  1444     st_ptr(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
  1445   }
  1446   st_ptr(last_java_sp, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  1447 }
  1449 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
  1450                                          Register last_java_fp,
  1451                                          address  last_java_pc) {
  1452   // determine last_java_sp register
  1453   if (!last_java_sp->is_valid()) {
  1454     last_java_sp = SP;
  1455   }
  1457   Register thread = TREG;
  1458 #ifndef OPT_THREAD
  1459   get_thread(thread);
  1460 #endif
  1461   // last_java_fp is optional
  1462   if (last_java_fp->is_valid()) {
  1463     sd(last_java_fp, Address(thread, JavaThread::last_Java_fp_offset()));
  1464   }
  1466   // last_java_pc is optional
  1467   if (last_java_pc != NULL) {
  1468     Address java_pc(thread,
  1469                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
  1470     li(AT, (intptr_t)(last_java_pc));
  1471     sd(AT, java_pc);
  1472   }
  1474   sd(last_java_sp, Address(thread, JavaThread::last_Java_sp_offset()));
  1475 }
  1478 //////////////////////////////////////////////////////////////////////////////////
  1479 #if INCLUDE_ALL_GCS
  1481 void MacroAssembler::g1_write_barrier_pre(Register obj,
  1482                                           Register pre_val,
  1483                                           Register thread,
  1484                                           Register tmp,
  1485                                           bool tosca_live,
  1486                                           bool expand_call) {
  1488   // If expand_call is true then we expand the call_VM_leaf macro
  1489   // directly to skip generating the check by
  1490   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
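       // SATB pre-barrier outline: skip everything while concurrent marking is inactive;
       // otherwise load the previous value, skip if it is null, try to enqueue it in the
       // thread-local SATB buffer, and fall back to the g1_wb_pre runtime call when the
       // buffer is full (index == 0).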
  1492 #ifdef _LP64
  1493   assert(thread == TREG, "must be");
  1494 #endif // _LP64
  1496   Label done;
  1497   Label runtime;
  1499   assert(pre_val != noreg, "check this code");
  1501   if (obj != noreg) {
  1502     assert_different_registers(obj, pre_val, tmp);
  1503     assert(pre_val != V0, "check this code");
  1504   }
  1506   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
  1507                                        PtrQueue::byte_offset_of_active()));
  1508   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
  1509                                        PtrQueue::byte_offset_of_index()));
  1510   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
  1511                                        PtrQueue::byte_offset_of_buf()));
  1514   // Is marking active?
  1515   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
  1516     lw(AT, in_progress);
  1517   } else {
  1518     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
  1519     lb(AT, in_progress);
  1520   }
  1521   beq(AT, R0, done);
  1522   nop();
  1524   // Do we need to load the previous value?
  1525   if (obj != noreg) {
  1526     load_heap_oop(pre_val, Address(obj, 0));
  1527   }
  1529   // Is the previous value null?
  1530   beq(pre_val, R0, done);
  1531   nop();
  1533   // Can we store original value in the thread's buffer?
  1534   // Is index == 0?
  1535   // (The index field is typed as size_t.)
  1537   ld(tmp, index);
  1538   beq(tmp, R0, runtime);
  1539   nop();
  1541   daddiu(tmp, tmp, -1 * wordSize);
  1542   sd(tmp, index);
  1543   ld(AT, buffer);
  1544   daddu(tmp, tmp, AT);
  1546   // Record the previous value
  1547   sd(pre_val, tmp, 0);
  1548   beq(R0, R0, done);
  1549   nop();
  1551   bind(runtime);
  1552   // save the live input values
  1553   if (tosca_live) push(V0);
  1555   if (obj != noreg && obj != V0) push(obj);
  1557   if (pre_val != V0) push(pre_val);
  1559   // Calling the runtime using the regular call_VM_leaf mechanism generates
  1560   // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
  1561   // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
  1562   //
  1563   // If we are generating the pre-barrier without a frame (e.g. in the
  1564   // intrinsified Reference.get() routine) then ebp might be pointing to
  1565   // the caller frame and so this check will most likely fail at runtime.
  1566   //
  1567   // Expanding the call directly bypasses the generation of the check.
  1568   // So when we do not have a full interpreter frame on the stack
  1569   // expand_call should be passed true.
  1571   NOT_LP64( push(thread); )
  1573   if (expand_call) {
  1574     LP64_ONLY( assert(pre_val != A1, "smashed arg"); )
  1575     if (thread != A1) move(A1, thread);
  1576     if (pre_val != A0) move(A0, pre_val);
  1577     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  1578   } else {
  1579     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  1580   }
  1582   NOT_LP64( pop(thread); )
  1584   // save the live input values
  1585   if (pre_val != V0)
  1586     pop(pre_val);
  1588   if (obj != noreg && obj != V0)
  1589     pop(obj);
   1591   if (tosca_live) pop(V0);
   1593   bind(done);
   1594 }
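// A minimal pseudo-C sketch of the SATB enqueue logic implemented above;
// the queue accessors are illustrative names, not the real PtrQueue API:
//
//   void g1_pre_barrier(JavaThread* t, oop pre_val) {
//     if (!satb_active(t)) return;        // the lw/lb of in_progress + beq
//     if (pre_val == NULL) return;        // nothing to record
//     intptr_t index = satb_index(t);
//     if (index == 0) {                   // buffer full -> runtime call
//       SharedRuntime::g1_wb_pre(pre_val, t);
//     } else {
//       index -= wordSize;                // bump the index down one slot
//       satb_set_index(t, index);
//       *(oop*)(satb_buf(t) + index) = pre_val;   // record previous value
//     }
//   }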
  1596 void MacroAssembler::g1_write_barrier_post(Register store_addr,
  1597                                            Register new_val,
  1598                                            Register thread,
  1599                                            Register tmp,
  1600                                            Register tmp2) {
  1601   assert(tmp  != AT, "must be");
  1602   assert(tmp2 != AT, "must be");
  1603 #ifdef _LP64
  1604   assert(thread == TREG, "must be");
  1605 #endif // _LP64
  1607   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
  1608                                        PtrQueue::byte_offset_of_index()));
  1609   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
  1610                                        PtrQueue::byte_offset_of_buf()));
  1612   BarrierSet* bs = Universe::heap()->barrier_set();
  1613   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  1614   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
  1616   Label done;
  1617   Label runtime;
  1619   // Does store cross heap regions?
  1620   xorr(AT, store_addr, new_val);
  1621   dsrl(AT, AT, HeapRegion::LogOfHRGrainBytes);
  1622   beq(AT, R0, done);
  1623   nop();
  1626   // crosses regions, storing NULL?
  1627   beq(new_val, R0, done);
  1628   nop();
  1630   // storing region crossing non-NULL, is card already dirty?
  1631   const Register card_addr = tmp;
  1632   const Register cardtable = tmp2;
  1634   move(card_addr, store_addr);
  1635   dsrl(card_addr, card_addr, CardTableModRefBS::card_shift);
  1636   // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
  1637   // a valid address and therefore is not properly handled by the relocation code.
  1638   set64(cardtable, (intptr_t)ct->byte_map_base);
  1639   daddu(card_addr, card_addr, cardtable);
  1641   lb(AT, card_addr, 0);
  1642   daddiu(AT, AT, -1 * (int)G1SATBCardTableModRefBS::g1_young_card_val());
  1643   beq(AT, R0, done);
  1644   nop();
  1646   sync();
  1647   lb(AT, card_addr, 0);
   1648   daddiu(AT, AT, -1 * (int)CardTableModRefBS::dirty_card_val());
  1649   beq(AT, R0, done);
  1650   nop();
  1653   // storing a region crossing, non-NULL oop, card is clean.
  1654   // dirty card and log.
  1655   move(AT, (int)CardTableModRefBS::dirty_card_val()); 
  1656   sb(AT, card_addr, 0);
  1658   lw(AT, queue_index);
  1659   beq(AT, R0, runtime);
  1660   nop();
  1661   daddiu(AT, AT, -1 * wordSize);
  1662   sw(AT, queue_index);
  1663   ld(tmp2, buffer);
  1664 #ifdef _LP64
  1665   ld(AT, queue_index);
  1666   daddu(tmp2, tmp2, AT);
  1667   sd(card_addr, tmp2, 0);
  1668 #else
  1669   lw(AT, queue_index);
  1670   addu32(tmp2, tmp2, AT);
  1671   sw(card_addr, tmp2, 0);
  1672 #endif
  1673   beq(R0, R0, done);
  1674   nop();
  1676   bind(runtime);
  1677   // save the live input values
  1678   push(store_addr);
  1679   push(new_val);
  1680 #ifdef _LP64
  1681   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, TREG);
  1682 #else
  1683   push(thread);
  1684   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  1685   pop(thread);
  1686 #endif
  1687   pop(new_val);
  1688   pop(store_addr);
   1690   bind(done);
   1691 }
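// The same filtering chain in pseudo-C (illustrative helper names only):
//
//   void g1_post_barrier(void* store_addr, oop new_val, JavaThread* t) {
//     if ((((uintptr_t)store_addr ^ (uintptr_t)new_val)
//          >> HeapRegion::LogOfHRGrainBytes) == 0) return;  // same region
//     if (new_val == NULL) return;                          // storing NULL
//     jbyte* card = ct->byte_map_base
//                   + ((uintptr_t)store_addr >> CardTableModRefBS::card_shift);
//     if (*card == g1_young_card_val()) return;             // young card
//     storeload_fence();                                    // the sync() above
//     if (*card == dirty_card_val()) return;                // already dirty
//     *card = dirty_card_val();                             // dirty it, then
//     enqueue(t->dirty_card_queue(), card);                 // buffer or runtime
//   }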
  1693 #endif // INCLUDE_ALL_GCS
  1694 //////////////////////////////////////////////////////////////////////////////////
  1697 void MacroAssembler::store_check(Register obj) {
  1698   // Does a store check for the oop in register obj. The content of
  1699   // register obj is destroyed afterwards.
  1700   store_check_part_1(obj);
   1701   store_check_part_2(obj);
   1702 }
  1704 void MacroAssembler::store_check(Register obj, Address dst) {
   1705   store_check(obj);
   1706 }
   1709 // split the store check operation so that other instructions can be scheduled in between
  1710 void MacroAssembler::store_check_part_1(Register obj) {
  1711   BarrierSet* bs = Universe::heap()->barrier_set();
  1712   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  1713 #ifdef _LP64
  1714   dsrl(obj, obj, CardTableModRefBS::card_shift);
  1715 #else
  1716   shr(obj, CardTableModRefBS::card_shift);
   1717 #endif
   1718 }
  1720 void MacroAssembler::store_check_part_2(Register obj) {
  1721   BarrierSet* bs = Universe::heap()->barrier_set();
  1722   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  1723   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  1724   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
  1726   set64(AT, (long)ct->byte_map_base);
  1727 #ifdef _LP64
  1728   dadd(AT, AT, obj);
  1729 #else
  1730   add(AT, AT, obj);
  1731 #endif
  1732   if (UseConcMarkSweepGC) sync();
   1733   sb(R0, AT, 0);
   1734 }
  1736 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
  1737 void MacroAssembler::tlab_allocate(Register obj, Register var_size_in_bytes, int con_size_in_bytes,
  1738                                    Register t1, Register t2, Label& slow_case) {
  1739   assert_different_registers(obj, var_size_in_bytes, t1, t2, AT);
  1741   Register end = t2;
  1742 #ifndef OPT_THREAD
  1743   Register thread = t1;
  1744   get_thread(thread);
  1745 #else
  1746   Register thread = TREG;
  1747 #endif
   1748   verify_tlab(t1, t2); // blows t1 & t2
  1750   ld_ptr(obj, thread, in_bytes(JavaThread::tlab_top_offset()));
  1752   if (var_size_in_bytes == NOREG) {
   1753     // I don't think we need to move con_size_in_bytes to a register first.
   1754     // by yjl 8/17/2005
  1755     assert(is_simm16(con_size_in_bytes), "fixme by moving imm to a register first");
  1756     addi(end, obj, con_size_in_bytes);
  1757   } else {
   1758     add(end, obj, var_size_in_bytes);
   1759   }
  1761   ld_ptr(AT, thread, in_bytes(JavaThread::tlab_end_offset()));
  1762   sltu(AT, AT, end);
  1763   bne_far(AT, R0, slow_case);
  1764   delayed()->nop();
  1767   // update the tlab top pointer
  1768   st_ptr(end, thread, in_bytes(JavaThread::tlab_top_offset()));
  1770   // recover var_size_in_bytes if necessary
  1771   /*if (var_size_in_bytes == end) {
  1772     sub(var_size_in_bytes, end, obj);
  1773     }*/
   1775   verify_tlab(t1, t2);
   1776 }
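// What the code above boils down to, as a pseudo-C sketch (the accessors
// are illustrative; the real offsets come from JavaThread::tlab_*_offset()):
//
//   HeapWord* tlab_allocate(JavaThread* t, size_t size_in_bytes) {
//     HeapWord* obj = t->tlab_top();
//     HeapWord* end = (HeapWord*)((char*)obj + size_in_bytes);  // addi/add
//     if (end > t->tlab_end()) goto slow_case;                  // sltu + bne_far
//     t->set_tlab_top(end);   // plain store: the TLAB is thread-local,
//     return obj;             // so no CAS is needed here
//   }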
  1778 // Defines obj, preserves var_size_in_bytes
  1779 void MacroAssembler::eden_allocate(Register obj, Register var_size_in_bytes, int con_size_in_bytes,
  1780                                    Register t1, Register t2, Label& slow_case) {
  1781   assert_different_registers(obj, var_size_in_bytes, t1, AT);
  1782   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) { //by yyq
  1783     // No allocation in the shared eden.
  1784     b_far(slow_case);
  1785     delayed()->nop();
  1786   } else {
  1788 #ifndef _LP64
  1789     Address heap_top(t1, Assembler::split_low((intptr_t)Universe::heap()->top_addr()));
  1790     lui(t1, split_high((intptr_t)Universe::heap()->top_addr()));
  1791 #else
  1792     Address heap_top(t1);
  1793     li(t1, (long)Universe::heap()->top_addr());
  1794 #endif
  1795     ld_ptr(obj, heap_top);
  1797     Register end = t2;
  1798     Label retry;
  1800     bind(retry);
  1801     if (var_size_in_bytes == NOREG) {
   1802       // I don't think we need to move con_size_in_bytes to a register first.
  1803       assert(is_simm16(con_size_in_bytes), "fixme by moving imm to a register first");
  1804       addi(end, obj, con_size_in_bytes);
  1805     } else {
   1806       add(end, obj, var_size_in_bytes);
   1807     }
  1808     // if end < obj then we wrapped around => object too long => slow case
  1809     sltu(AT, end, obj);
  1810     bne_far(AT, R0, slow_case);
  1811     delayed()->nop();
  1813     li(AT, (long)Universe::heap()->end_addr());
  1814     sltu(AT, AT, end);
  1815     bne_far(AT, R0, slow_case);
  1816     delayed()->nop();
   1817     // Compare obj with the top addr, and if still equal, store end (the new
   1818     // top addr) at the address of the top addr pointer. On MIPS, cmpxchg
   1819     // leaves AT = 1 on success and AT = 0 otherwise (x86 used ZF and a lock prefix).
  1820     //if (os::is_MP()) {
  1821     //  sync();
  1822     //}
  1824     // if someone beat us on the allocation, try again, otherwise continue
  1825     cmpxchg(end, heap_top, obj);
  1826     beq_far(AT, R0, retry);    //by yyq
   1827     delayed()->nop();
   1828   }
   1829 }
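// The retry loop above, sketched in pseudo-C. Atomic::cmpxchg_ptr stands in
// for the cmpxchg macro (which reports success/failure in AT on MIPS):
//
//   HeapWord* eden_allocate(size_t size_in_bytes) {
//     for (;;) {                                             // bind(retry)
//       HeapWord* obj = *Universe::heap()->top_addr();
//       HeapWord* end = (HeapWord*)((char*)obj + size_in_bytes);
//       if (end < obj) goto slow_case;                       // wrapped around
//       if (end > *Universe::heap()->end_addr()) goto slow_case;
//       // publish 'end' only if top is still 'obj'; otherwise someone
//       // beat us to the allocation and we retry
//       if (Atomic::cmpxchg_ptr(end, Universe::heap()->top_addr(), obj) == obj)
//         return obj;
//     }
//   }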
  1832 // C2 doesn't invoke this one.
  1833 void MacroAssembler::tlab_refill(Label& retry, Label& try_eden, Label& slow_case) {
  1834   Register top = T0;
  1835   Register t1  = T1;
  1836 /* Jin: tlab_refill() is called in
  1838      [c1_Runtime1_mips.cpp] Runtime1::generate_code_for(new_type_array_id);
   1840   In generate_code_for(), T2 has been assigned as a register (length), which is used
  1841  after calling tlab_refill();
  1842   Therefore, tlab_refill() should not use T2.
  1844  Source:
  1846 Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException
  1847         at java.lang.System.arraycopy(Native Method)
  1848         at java.util.Arrays.copyOf(Arrays.java:2799)  <-- alloc_array
  1849         at sun.misc.Resource.getBytes(Resource.java:117)
  1850         at java.net.URLClassLoader.defineClass(URLClassLoader.java:273)
  1851         at java.net.URLClassLoader.findClass(URLClassLoader.java:205)
  1852         at java.lang.ClassLoader.loadClass(ClassLoader.java:321)
  1853  */
  1854   Register t2  = T9;
  1855   Register t3  = T3;
  1856   Register thread_reg = T8;
  1857   Label do_refill, discard_tlab;
  1858   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) { //by yyq
  1859     // No allocation in the shared eden.
  1860     b(slow_case);
   1861     delayed()->nop();
   1862   }
  1864   get_thread(thread_reg);
  1866   ld_ptr(top, thread_reg, in_bytes(JavaThread::tlab_top_offset()));
  1867   ld_ptr(t1, thread_reg, in_bytes(JavaThread::tlab_end_offset()));
  1869   // calculate amount of free space
  1870   sub(t1, t1, top);
  1871   shr(t1, LogHeapWordSize);
  1873   // Retain tlab and allocate object in shared space if
  1874   // the amount free in the tlab is too large to discard.
  1875   ld_ptr(t2, thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset()));
  1876   slt(AT, t2, t1);
  1877   beq(AT, R0, discard_tlab);
  1878   delayed()->nop();
  1880   // Retain
  1882 #ifndef _LP64
  1883   move(AT, ThreadLocalAllocBuffer::refill_waste_limit_increment());
  1884 #else
  1885   li(AT, ThreadLocalAllocBuffer::refill_waste_limit_increment());
  1886 #endif
  1887   add(t2, t2, AT);
  1888   st_ptr(t2, thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset()));
  1890   if (TLABStats) {
  1891     // increment number of slow_allocations
  1892     lw(AT, thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset()));
  1893     addiu(AT, AT, 1);
   1894     sw(AT, thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset()));
   1895   }
  1896   b(try_eden);
  1897   delayed()->nop();
  1899   bind(discard_tlab);
  1900   if (TLABStats) {
  1901     // increment number of refills
  1902     lw(AT, thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset()));
  1903     addi(AT, AT, 1);
  1904     sw(AT, thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset()));
  1905     // accumulate wastage -- t1 is amount free in tlab
  1906     lw(AT, thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset()));
  1907     add(AT, AT, t1);
   1908     sw(AT, thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset()));
   1909   }
  1911   // if tlab is currently allocated (top or end != null) then
  1912   // fill [top, end + alignment_reserve) with array object
  1913   beq(top, R0, do_refill);
  1914   delayed()->nop();
  1916   // set up the mark word
  1917   li(AT, (long)markOopDesc::prototype()->copy_set_hash(0x2));
  1918   st_ptr(AT, top, oopDesc::mark_offset_in_bytes());
  1920   // set the length to the remaining space
  1921   addi(t1, t1, - typeArrayOopDesc::header_size(T_INT));
  1922   addi(t1, t1, ThreadLocalAllocBuffer::alignment_reserve());
  1923   shl(t1, log2_intptr(HeapWordSize/sizeof(jint)));
  1924   sw(t1, top, arrayOopDesc::length_offset_in_bytes());
  1926   // set klass to intArrayKlass
  1927 #ifndef _LP64
  1928   lui(AT, split_high((intptr_t)Universe::intArrayKlassObj_addr()));
  1929   lw(t1, AT, split_low((intptr_t)Universe::intArrayKlassObj_addr()));
  1930 #else
  1931   li(AT, (intptr_t)Universe::intArrayKlassObj_addr());
  1932   ld_ptr(t1, AT, 0);
  1933 #endif
  1934   //st_ptr(t1, top, oopDesc::klass_offset_in_bytes());
  1935   store_klass(top, t1);
  1937   // refill the tlab with an eden allocation
  1938   bind(do_refill);
  1939   ld_ptr(t1, thread_reg, in_bytes(JavaThread::tlab_size_offset()));
  1940   shl(t1, LogHeapWordSize);
  1941   // add object_size ??
  1942   eden_allocate(top, t1, 0, t2, t3, slow_case);
  1944   // Check that t1 was preserved in eden_allocate.
  1945 #ifdef ASSERT
  1946   if (UseTLAB) {
  1947     Label ok;
  1948     assert_different_registers(thread_reg, t1);
  1949     ld_ptr(AT, thread_reg, in_bytes(JavaThread::tlab_size_offset()));
  1950     shl(AT, LogHeapWordSize);
  1951     beq(AT, t1, ok);
  1952     delayed()->nop();
  1953     stop("assert(t1 != tlab size)");
  1954     should_not_reach_here();
   1956     bind(ok);
   1957   }
  1958 #endif
  1959   st_ptr(top, thread_reg, in_bytes(JavaThread::tlab_start_offset()));
  1960   st_ptr(top, thread_reg, in_bytes(JavaThread::tlab_top_offset()));
  1961   add(top, top, t1);
  1962   addi(top, top, - ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  1963   st_ptr(top, thread_reg, in_bytes(JavaThread::tlab_end_offset()));
  1964   verify_tlab(t1, t2);
  1965   b(retry);
   1966   delayed()->nop();
   1967 }
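// The retain-vs-discard decision above, in short (the free space and the
// waste limit are both measured in heap words):
//
//   size_t free = tlab_end - tlab_top;
//   if (free > refill_waste_limit) {
//     // retain: too much free space to throw away; bump the waste limit
//     // and allocate this one object directly in the shared eden
//   } else {
//     // discard: overwrite the leftover space with an int[] filler object
//     // so the heap stays parseable, then refill a fresh TLAB from eden
//   }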
  1969 static const double     pi_4 =  0.7853981633974483;
   1971 // The x86 version is too clumsy; I don't think we need that fuss. Maybe I'm wrong. FIXME
  1972 // must get argument(a double) in F12/F13
  1973 //void MacroAssembler::trigfunc(char trig, bool preserve_cpu_regs, int num_fpu_regs_in_use) {
   1974 //We need to preserve the registers which may be modified during the call. @Jerome
  1975 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
   1976 //save all modified registers here
  1977 //  if (preserve_cpu_regs) {
  1978 //  }
   1979 //FIXME: in the disassembly of trigfunc, only V0, V1, T9, SP and RA are used, so we only save V0, V1 and T9
   1980   pushad();
   1981 //we should reserve the stack space before we call
   1982   addi(SP, SP, -wordSize * 2);
   1983   switch (trig) {
   1984     case 's':
   1985       call( CAST_FROM_FN_PTR(address, SharedRuntime::dsin), relocInfo::runtime_call_type );
  1986       delayed()->nop();
  1987       break;
  1988     case 'c':
  1989       call( CAST_FROM_FN_PTR(address, SharedRuntime::dcos), relocInfo::runtime_call_type );
  1990       delayed()->nop();
  1991       break;
  1992     case 't':
  1993       call( CAST_FROM_FN_PTR(address, SharedRuntime::dtan), relocInfo::runtime_call_type );
  1994       delayed()->nop();
  1995       break;
   1996     default: assert(false, "bad intrinsic");
   1997       break;
   1998   }
  2001   addi(SP, SP, wordSize * 2);
  2002   popad();
  2003 //  if (preserve_cpu_regs) {
   2004 //  }
   2005 }
  2007 #ifdef _LP64
  2008 void MacroAssembler::li(Register rd, long imm) {
  2009   if (imm <= max_jint && imm >= min_jint) {
  2010     li32(rd, (int)imm);
  2011   } else if (julong(imm) <= 0xFFFFFFFF) {
  2012     assert_not_delayed();
  2013     // lui sign-extends, so we can't use that.
  2014     ori(rd, R0, julong(imm) >> 16);
  2015     dsll(rd, rd, 16);
  2016     ori(rd, rd, split_low(imm));
  2019   } else if ((imm > 0) && is_simm16(imm >> 32)) {
  2020     /* A 48-bit address */
  2021     li48(rd, imm);
  2022   } else {
   2023     li64(rd, imm);
   2024   }
   2025 }
  2026 #else
  2027 void MacroAssembler::li(Register rd, long imm) {
   2028   li32(rd, (int)imm);
   2029 }
  2030 #endif
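// Selection logic of li(), summarized (counts are instruction counts):
//
//   min_jint <= imm <= max_jint         -> li32: addiu, or lui [+ ori]   (1-2)
//   (julong)imm <= 0xFFFFFFFF           -> ori + dsll + ori              (3)
//   imm > 0 && is_simm16(imm >> 32)     -> li48: lui + ori + dsll + ori  (4)
//   otherwise                           -> li64                          (6)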
  2032 void MacroAssembler::li32(Register reg, int imm) {
  2033   if (is_simm16(imm)) {
  2034     /* Jin: for imm < 0, we should use addi instead of addiu.
  2036      *  java.lang.StringCoding$StringDecoder.decode(jobject, jint, jint)
  2038      *  78 move [int:-1|I] [a0|I]
  2039      *    : daddi a0, zero, 0xffffffff  (correct)
  2040      *    : daddiu a0, zero, 0xffffffff (incorrect)
  2041      */
  2042     if (imm >= 0)
  2043       addiu(reg, R0, imm);
  2044     else
  2045       addi(reg, R0, imm);
  2046   } else {
  2047     lui(reg, split_low(imm >> 16));
  2048     if (split_low(imm))
   2049       ori(reg, reg, split_low(imm));
   2050   }
   2051 }
  2053 #ifdef _LP64
  2054 void MacroAssembler::set64(Register d, jlong value) {
  2055   assert_not_delayed();
  2057   int hi = (int)(value >> 32);
  2058   int lo = (int)(value & ~0);
  2060   if (value == lo) {  // 32-bit integer
  2061     if (is_simm16(value)) {
  2062       daddiu(d, R0, value);
  2063     } else {
  2064       lui(d, split_low(value >> 16));
  2065       if (split_low(value)) {
   2066         ori(d, d, split_low(value));
   2067       }
   2068     }
  2069   } else if (hi == 0) {  // hardware zero-extends to upper 32
  2070       ori(d, R0, julong(value) >> 16);
  2071       dsll(d, d, 16);
  2072       if (split_low(value)) {
   2073         ori(d, d, split_low(value));
   2074       }
   2075   } else if ((value > 0) && is_simm16(value >> 32)) {  // li48
  2076     // 4 insts
  2077     li48(d, value);
  2078   } else {  // li64
  2079     // 6 insts
   2080     li64(d, value);
   2081   }
   2082 }
  2085 int MacroAssembler::insts_for_set64(jlong value) {
  2086   int hi = (int)(value >> 32);
  2087   int lo = (int)(value & ~0);
  2089   int count = 0;
  2091   if (value == lo) {  // 32-bit integer
  2092     if (is_simm16(value)) {
  2093       //daddiu(d, R0, value);
  2094       count++;
  2095     } else {
  2096       //lui(d, split_low(value >> 16));
  2097       count++;
  2098       if (split_low(value)) {
  2099         //ori(d, d, split_low(value));
   2100         count++;
   2101       }
   2102     }
  2103   } else if (hi == 0) {  // hardware zero-extends to upper 32
  2104       //ori(d, R0, julong(value) >> 16);
  2105       //dsll(d, d, 16);
  2106       count += 2;
  2107       if (split_low(value)) {
  2108         //ori(d, d, split_low(value));
   2109         count++;
   2110       }
   2111   } else if ((value > 0) && is_simm16(value >> 32)) {  // li48
  2112     // 4 insts
  2113     //li48(d, value);
  2114     count += 4;
  2115   } else {  // li64
  2116     // 6 insts
  2117     //li64(d, value);
   2118     count += 6;
   2119   }
   2121   return count;
   2122 }
  2124 void MacroAssembler::patchable_set48(Register d, jlong value) {
  2125   assert_not_delayed();
  2127   int hi = (int)(value >> 32);
  2128   int lo = (int)(value & ~0);
  2130   int count = 0;
  2132   if (value == lo) {  // 32-bit integer
  2133     if (is_simm16(value)) {
  2134       daddiu(d, R0, value);
  2135       count += 1;
  2136     } else {
  2137       lui(d, split_low(value >> 16));
  2138       count += 1;
  2139       if (split_low(value)) {
  2140         ori(d, d, split_low(value));
   2141         count += 1;
   2142       }
   2143     }
  2144   } else if (hi == 0) {  // hardware zero-extends to upper 32
  2145       ori(d, R0, julong(value) >> 16);
  2146       dsll(d, d, 16);
  2147       count += 2;
  2148       if (split_low(value)) {
  2149         ori(d, d, split_low(value));
   2150         count += 1;
   2151       }
   2152   } else if ((value > 0) && is_simm16(value >> 32)) {  // li48
  2153     // 4 insts
  2154     li48(d, value);
  2155     count += 4;
  2156   } else {  // li64
   2157     tty->print_cr("value = 0x%lx", value);
   2158     guarantee(false, "Not supported yet !");
   2159   }
   2161   for (; count < 4; count++) {
   2162     nop();
   2163   }
   2164 }
  2166 void MacroAssembler::patchable_set32(Register d, jlong value) {
  2167   assert_not_delayed();
  2169   int hi = (int)(value >> 32);
  2170   int lo = (int)(value & ~0);
  2172   int count = 0;
  2174   if (value == lo) {  // 32-bit integer
  2175     if (is_simm16(value)) {
  2176       daddiu(d, R0, value);
  2177       count += 1;
  2178     } else {
  2179       lui(d, split_low(value >> 16));
  2180       count += 1;
  2181       if (split_low(value)) {
  2182         ori(d, d, split_low(value));
   2183         count += 1;
   2184       }
   2185     }
  2186   } else if (hi == 0) {  // hardware zero-extends to upper 32
  2187       ori(d, R0, julong(value) >> 16);
  2188       dsll(d, d, 16);
  2189       count += 2;
  2190       if (split_low(value)) {
  2191         ori(d, d, split_low(value));
   2192         count += 1;
   2193       }
  2194   } else {
   2195     tty->print_cr("value = 0x%lx", value);
   2196     guarantee(false, "Not supported yet !");
   2197   }
   2199   for (; count < 3; count++) {
   2200     nop();
   2201   }
   2202 }
  2204 void MacroAssembler::patchable_call32(Register d, jlong value) {
  2205   assert_not_delayed();
  2207   int hi = (int)(value >> 32);
  2208   int lo = (int)(value & ~0);
  2210   int count = 0;
  2212   if (value == lo) {  // 32-bit integer
  2213     if (is_simm16(value)) {
  2214       daddiu(d, R0, value);
  2215       count += 1;
  2216     } else {
  2217       lui(d, split_low(value >> 16));
  2218       count += 1;
  2219       if (split_low(value)) {
  2220         ori(d, d, split_low(value));
   2221         count += 1;
   2222       }
   2223     }
  2224   } else {
   2225     tty->print_cr("value = 0x%lx", value);
   2226     guarantee(false, "Not supported yet !");
   2227   }
   2229   for (; count < 2; count++) {
   2230     nop();
   2231   }
   2232 }
  2234 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  2235   assert(UseCompressedClassPointers, "should only be used for compressed header");
  2236   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  2238   int klass_index = oop_recorder()->find_index(k);
  2239   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  2240   long narrowKlass = (long)Klass::encode_klass(k);
  2242   relocate(rspec, Assembler::narrow_oop_operand);
   2243   patchable_set48(dst, narrowKlass);
   2244 }
  2247 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  2248   assert(UseCompressedOops, "should only be used for compressed header");
  2249   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  2251   int oop_index = oop_recorder()->find_index(obj);
  2252   RelocationHolder rspec = oop_Relocation::spec(oop_index);
  2254   relocate(rspec, Assembler::narrow_oop_operand);
   2255   patchable_set48(dst, oop_index);
   2256 }
  2258 void MacroAssembler::li64(Register rd, long imm) {
  2259   assert_not_delayed();
  2260   lui(rd, imm >> 48);
  2261   ori(rd, rd, split_low(imm >> 32));
  2262   dsll(rd, rd, 16);
  2263   ori(rd, rd, split_low(imm >> 16));
  2264   dsll(rd, rd, 16);
   2265   ori(rd, rd, split_low(imm));
   2266 }
  2268 void MacroAssembler::li48(Register rd, long imm) {
  2269   assert_not_delayed();
  2270   assert(is_simm16(imm >> 32), "Not a 48-bit address");
  2271   lui(rd, imm >> 32);
  2272   ori(rd, rd, split_low(imm >> 16));
  2273   dsll(rd, rd, 16);
   2274   ori(rd, rd, split_low(imm));
   2275 }
  2276 #endif
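// Worked li48 expansion, assuming imm = 0x123456789ABC (is_simm16(0x1234) holds):
//
//   lui  rd, 0x1234        // rd = 0x0000_0000_1234_0000
//   ori  rd, rd, 0x5678    // rd = 0x0000_0000_1234_5678
//   dsll rd, rd, 16        // rd = 0x0000_1234_5678_0000
//   ori  rd, rd, 0x9ABC    // rd = 0x0000_1234_5678_9ABC
//
// li64 follows the same pattern but starts from bits 63..48, using three
// ori steps and two dsll steps (6 instructions in total).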
   2277 // NOTE: we do not push eax as on i486; x86 saves eax because it
   2278 // uses eax as the jump register.
  2279 void MacroAssembler::verify_oop(Register reg, const char* s) {
  2280   /*
  2281      if (!VerifyOops) return;
  2283   // Pass register number to verify_oop_subroutine
  2284   char* b = new char[strlen(s) + 50];
  2285   sprintf(b, "verify_oop: %s: %s", reg->name(), s);
  2286   push(rax);                          // save rax,
  2287   push(reg);                          // pass register argument
  2288   ExternalAddress buffer((address) b);
  2289   // avoid using pushptr, as it modifies scratch registers
  2290   // and our contract is not to modify anything
  2291   movptr(rax, buffer.addr());
  2292   push(rax);
  2293   // call indirectly to solve generation ordering problem
  2294   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  2295   call(rax);
  2296    */
  2297   if (!VerifyOops) return;
  2298   const char * b = NULL;
  2299   stringStream ss;
  2300   ss.print("verify_oop: %s: %s", reg->name(), s);
  2301   b = code_string(ss.as_string());
  2302 #ifdef _LP64
  2303   pushad();
  2304   move(A1, reg);
  2305   li(A0, (long)b);
  2306   li(AT, (long)StubRoutines::verify_oop_subroutine_entry_address());
  2307   ld(T9, AT, 0);
  2308   jalr(T9);
  2309   delayed()->nop();
  2310   popad();
  2311 #else
  2312   // Pass register number to verify_oop_subroutine
  2313   sw(T0, SP, - wordSize);
  2314   sw(T1, SP, - 2*wordSize);
  2315   sw(RA, SP, - 3*wordSize);
   2316   sw(A0, SP, - 4*wordSize);
   2317   sw(A1, SP, - 5*wordSize);
   2318   sw(AT, SP, - 6*wordSize);
   2319   sw(T9, SP, - 7*wordSize);
  2320   addiu(SP, SP, - 7 * wordSize);
  2321   move(A1, reg);
  2322   li(A0, (long)b);
  2323   // call indirectly to solve generation ordering problem
  2324   li(AT, (long)StubRoutines::verify_oop_subroutine_entry_address());
  2325   lw(T9, AT, 0);
  2326   jalr(T9);
  2327   delayed()->nop();
  2328   lw(T0, SP, 6* wordSize);
  2329   lw(T1, SP, 5* wordSize);
  2330   lw(RA, SP, 4* wordSize);
  2331   lw(A0, SP, 3* wordSize);
  2332   lw(A1, SP, 2* wordSize);
  2333   lw(AT, SP, 1* wordSize);
  2334   lw(T9, SP, 0* wordSize);
  2335   addiu(SP, SP, 7 * wordSize);
   2336 #endif
   2337 }
  2340 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  2341   if (!VerifyOops) {
  2342     nop();
   2343     return;
   2344   }
  2345   // Pass register number to verify_oop_subroutine
  2346   const char * b = NULL;
  2347   stringStream ss;
  2348   ss.print("verify_oop_addr: %s",  s);
  2349   b = code_string(ss.as_string());
  2351   st_ptr(T0, SP, - wordSize);
  2352   st_ptr(T1, SP, - 2*wordSize);
  2353   st_ptr(RA, SP, - 3*wordSize);
  2354   st_ptr(A0, SP, - 4*wordSize);
  2355   st_ptr(A1, SP, - 5*wordSize);
  2356   st_ptr(AT, SP, - 6*wordSize);
  2357   st_ptr(T9, SP, - 7*wordSize);
  2358   ld_ptr(A1, addr);   // addr may use SP, so load from it before change SP
  2359   addiu(SP, SP, - 7 * wordSize);
  2361   li(A0, (long)b);
  2362   // call indirectly to solve generation ordering problem
  2363   li(AT, (long)StubRoutines::verify_oop_subroutine_entry_address());
  2364   ld_ptr(T9, AT, 0);
  2365   jalr(T9);
  2366   delayed()->nop();
  2367   ld_ptr(T0, SP, 6* wordSize);
  2368   ld_ptr(T1, SP, 5* wordSize);
  2369   ld_ptr(RA, SP, 4* wordSize);
  2370   ld_ptr(A0, SP, 3* wordSize);
  2371   ld_ptr(A1, SP, 2* wordSize);
  2372   ld_ptr(AT, SP, 1* wordSize);
  2373   ld_ptr(T9, SP, 0* wordSize);
   2374   addiu(SP, SP, 7 * wordSize);
   2375 }
  2377 // used registers :  T0, T1
  2378 void MacroAssembler::verify_oop_subroutine() {
  2379   // RA: ra
  2380   // A0: char* error message
  2381   // A1: oop   object to verify
  2383   Label exit, error;
  2384   // increment counter
  2385   li(T0, (long)StubRoutines::verify_oop_count_addr());
  2386   lw(AT, T0, 0);
  2387 #ifdef _LP64
  2388   daddi(AT, AT, 1);
  2389 #else
  2390   addi(AT, AT, 1);
  2391 #endif
  2392   sw(AT, T0, 0);
  2394   // make sure object is 'reasonable'
  2395   beq(A1, R0, exit);         // if obj is NULL it is ok
  2396   delayed()->nop();
  2398   // Check if the oop is in the right area of memory
  2399   //const int oop_mask = Universe::verify_oop_mask();
  2400   //const int oop_bits = Universe::verify_oop_bits();
  2401   const uintptr_t oop_mask = Universe::verify_oop_mask();
  2402   const uintptr_t oop_bits = Universe::verify_oop_bits();
  2403   li(AT, oop_mask);
  2404   andr(T0, A1, AT);
  2405   li(AT, oop_bits);
  2406   bne(T0, AT, error);
  2407   delayed()->nop();
  2409   // make sure klass is 'reasonable'
   2410   // for compressed oops
   2411   reinit_heapbase();
  2413   load_klass(T0, A1);
  2414   beq(T0, R0, error);                        // if klass is NULL it is broken
  2415   delayed()->nop();
  2416   #if 0
  2417   //FIXME:wuhui.
  2418   // Check if the klass is in the right area of memory
  2419   //const int klass_mask = Universe::verify_klass_mask();
  2420   //const int klass_bits = Universe::verify_klass_bits();
  2421   const uintptr_t klass_mask = Universe::verify_klass_mask();
  2422   const uintptr_t klass_bits = Universe::verify_klass_bits();
  2424   li(AT, klass_mask);
  2425   andr(T1, T0, AT);
  2426   li(AT, klass_bits);
  2427   bne(T1, AT, error);
  2428   delayed()->nop();
  2429   // make sure klass' klass is 'reasonable'
  2430   //add for compressedoops
  2431   load_klass(T0, T0);
  2432   beq(T0, R0, error);  // if klass' klass is NULL it is broken
  2433   delayed()->nop();
  2435   li(AT, klass_mask);
  2436   andr(T1, T0, AT);
  2437   li(AT, klass_bits);
  2438   bne(T1, AT, error);
  2439   delayed()->nop();     // if klass not in right area of memory it is broken too.
  2440 #endif
  2441   // return if everything seems ok
  2442   bind(exit);
  2444   jr(RA);
  2445   delayed()->nop();
  2447   // handle errors
  2448   bind(error);
  2449   pushad();
  2450 #ifndef _LP64
  2451   addi(SP, SP, (-1) * wordSize);
  2452 #endif
  2453   call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
  2454   delayed()->nop();
  2455 #ifndef _LP64
  2456   addiu(SP, SP, 1 * wordSize);
  2457 #endif
  2458   popad();
  2459   jr(RA);
   2460   delayed()->nop();
   2461 }
  2463 void MacroAssembler::verify_tlab(Register t1, Register t2) {
  2464 #ifdef ASSERT
  2465   assert_different_registers(t1, t2, AT);
  2466   if (UseTLAB && VerifyOops) {
  2467     Label next, ok;
  2469     get_thread(t1);
  2471     ld_ptr(t2, t1, in_bytes(JavaThread::tlab_top_offset()));
  2472     ld_ptr(AT, t1, in_bytes(JavaThread::tlab_start_offset()));
  2473     sltu(AT, t2, AT);
  2474     beq(AT, R0, next);
  2475     delayed()->nop();
  2477     stop("assert(top >= start)");
  2479     bind(next);
  2480     ld_ptr(AT, t1, in_bytes(JavaThread::tlab_end_offset()));
  2481     sltu(AT, AT, t2);
  2482     beq(AT, R0, ok);
  2483     delayed()->nop();
  2485     stop("assert(top <= end)");
   2487     bind(ok);
   2488   }
   2490 #endif
   2491 }
   2492 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
   2493                                                       Register tmp,
   2494                                                       int offset) {
   2495   intptr_t value = *delayed_value_addr;
   2496   if (value != 0)
   2497     return RegisterOrConstant(value + offset);
   2498   AddressLiteral a(delayed_value_addr);
   2499   // load indirectly to solve generation ordering problem
   2500   //movptr(tmp, ExternalAddress((address) delayed_value_addr));
   2501   //ld(tmp, a);
   2502   if (offset != 0)
   2503     daddi(tmp, tmp, offset);
   2505   return RegisterOrConstant(tmp);
   2506 }
  2508 void MacroAssembler::hswap(Register reg) {
  2509   //short
  2510   //andi(reg, reg, 0xffff);
  2511   srl(AT, reg, 8);
  2512   sll(reg, reg, 24);
  2513   sra(reg, reg, 16);
   2514   orr(reg, reg, AT);
   2515 }
  2517 void MacroAssembler::huswap(Register reg) {
  2518 #ifdef _LP64
  2519   dsrl(AT, reg, 8);
  2520   dsll(reg, reg, 24);
  2521   dsrl(reg, reg, 16);
  2522   orr(reg, reg, AT);
  2523   andi(reg, reg, 0xffff);
  2524 #else
  2525   //andi(reg, reg, 0xffff);
  2526   srl(AT, reg, 8);
  2527   sll(reg, reg, 24);
  2528   srl(reg, reg, 16);
  2529   orr(reg, reg, AT);
   2530 #endif
   2531 }
   2533 // Something funny: do this with only one more register (AT).
  2534 // 32 bits
  2535 void MacroAssembler::swap(Register reg) {
  2536   srl(AT, reg, 8);
  2537   sll(reg, reg, 24);
  2538   orr(reg, reg, AT);
  2539   //reg : 4 1 2 3
  2540   srl(AT, AT, 16);
  2541   xorr(AT, AT, reg);
  2542   andi(AT, AT, 0xff);
  2543   //AT : 0 0 0 1^3);
  2544   xorr(reg, reg, AT);
  2545   //reg : 4 1 2 1
  2546   sll(AT, AT, 16);
  2547   xorr(reg, reg, AT);
   2548   //reg : 4 3 2 1
   2549 }
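// Worked example with reg = 0x11223344 (bytes labeled 1 2 3 4 as above):
//
//   srl  AT, reg, 8    // AT  = 0x00112233
//   sll  reg, reg, 24  // reg = 0x44000000
//   orr  reg, reg, AT  // reg = 0x44112233          (4 1 2 3)
//   srl  AT, AT, 16    // AT  = 0x00000011
//   xorr AT, AT, reg   // low byte = 0x33 ^ 0x11
//   andi AT, AT, 0xff  // AT  = 0x00000022          (1 ^ 3)
//   xorr reg, reg, AT  // reg = 0x44112211          (4 1 2 1)
//   sll  AT, AT, 16    // AT  = 0x00220000
//   xorr reg, reg, AT  // reg = 0x44332211          (4 3 2 1)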
  2551 #ifdef _LP64
  2553 /* do 32-bit CAS using MIPS64 lld/scd
  2555   Jin: cas_int should only compare 32-bits of the memory value.
   2556        However, lld/scd will do a 64-bit operation, which violates the intention of cas_int.
   2557        To simulate a 32-bit atomic operation, the value loaded with LLD should be split into
   2558        two halves, and only the low-32 bits are compared. If they are equal, the low-32 bits of newval,
   2559        plus the high-32 bits of the memory value, are stored together with SCD.
  2561 Example:
  2563       double d = 3.1415926;
  2564       System.err.println("hello" + d);
  2566   sun.misc.FloatingDecimal$1.<init>()
  2568    `- java.util.concurrent.atomic.AtomicInteger::compareAndSet()
  2570   38 cas_int [a7a7|J] [a0|I] [a6|I]
  2571 // a0: 0xffffffffe8ea9f63 pc: 0x55647f3354
  2572 // a6: 0x4ab325aa
  2574 again:
  2575    0x00000055647f3c5c: lld at, 0x0(a7)                          ; 64-bit load, "0xe8ea9f63"
  2577    0x00000055647f3c60: sll t9, at, 0                            ; t9: low-32 bits (sign extended)
  2578    0x00000055647f3c64: dsrl32 t8, at, 0                         ; t8: high-32 bits
  2579    0x00000055647f3c68: dsll32 t8, t8, 0
  2580    0x00000055647f3c6c: bne t9, a0, 0x00000055647f3c9c           ; goto nequal
  2581    0x00000055647f3c70: sll zero, zero, 0
  2583    0x00000055647f3c74: ori v1, zero, 0xffffffff                 ; v1: low-32 bits of newval (sign unextended)
  2584    0x00000055647f3c78: dsll v1, v1, 16                          ; v1 = a6 & 0xFFFFFFFF;
  2585    0x00000055647f3c7c: ori v1, v1, 0xffffffff
  2586    0x00000055647f3c80: and v1, a6, v1
  2587    0x00000055647f3c84: or at, t8, v1
  2588    0x00000055647f3c88: scd at, 0x0(a7)
  2589    0x00000055647f3c8c: beq at, zero, 0x00000055647f3c5c         ; goto again
  2590    0x00000055647f3c90: sll zero, zero, 0
  2591    0x00000055647f3c94: beq zero, zero, 0x00000055647f45ac       ; goto done
  2592    0x00000055647f3c98: sll zero, zero, 0
  2593 nequal:
  2594    0x00000055647f45a4: dadd a0, t9, zero
  2595    0x00000055647f45a8: dadd at, zero, zero
  2596 done:
  2597 */
  2599 void MacroAssembler::cmpxchg32(Register x_reg, Address dest, Register c_reg) {
  2600   /* 2012/11/11 Jin: MIPS64 can use ll/sc for 32-bit atomic memory access */
  2601   Label done, again, nequal;
  2603   bind(again);
  2605   if(!Use3A2000) sync();
  2606   ll(AT, dest);
  2607   bne(AT, c_reg, nequal);
  2608   delayed()->nop();
  2610   move(AT, x_reg);
  2611   sc(AT, dest);
  2612   beq(AT, R0, again);
  2613   delayed()->nop();
  2614   b(done);
  2615   delayed()->nop();
  2617   // not xchged
  2618   bind(nequal);
  2619   sync();
  2620   move(c_reg, AT);
  2621   move(AT, R0);
   2623   bind(done);
   2624 }
  2625 #endif  // cmpxchg32
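// Semantics of cmpxchg32/cmpxchg, as a pseudo-C sketch. On success AT is
// left as 1 (the sc/scd result); on failure AT is 0 and c_reg is updated
// to the value observed in memory:
//
//   bool cmpxchg32(int x_reg, int* dest, int& c_reg) {
//     int old = *dest;              // ll
//     if (old != c_reg) {           // bne -> nequal
//       c_reg = old;                // report what we saw
//       return false;               // AT = 0
//     }
//     *dest = x_reg;                // sc; the ll/sc loop retries on failure
//     return true;                  // AT = 1
//   }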
  2627 void MacroAssembler::cmpxchg(Register x_reg, Address dest, Register c_reg) {
  2628   Label done, again, nequal;
  2630   bind(again);
  2631 #ifdef _LP64
  2632   if(!Use3A2000) sync();
  2633   lld(AT, dest);
  2634 #else
  2635   if(!Use3A2000) sync();
  2636   ll(AT, dest);
  2637 #endif
  2638   bne(AT, c_reg, nequal);
  2639   delayed()->nop();
  2641   move(AT, x_reg);
  2642 #ifdef _LP64
  2643   scd(AT, dest);
  2644 #else
  2645   sc(AT, dest);
  2646 #endif
  2647   beq(AT, R0, again);
  2648   delayed()->nop();
  2649   b(done);
  2650   delayed()->nop();
  2652   // not xchged
  2653   bind(nequal);
  2654   sync();
  2655   move(c_reg, AT);
  2656   move(AT, R0);
   2658   bind(done);
   2659 }
  2661 void MacroAssembler::cmpxchg8(Register x_regLo, Register x_regHi, Address dest, Register c_regLo, Register c_regHi) {
  2662   Label done, again, nequal;
  2664   Register x_reg = x_regLo;
  2665   dsll32(x_regHi, x_regHi, 0);
  2666   dsll32(x_regLo, x_regLo, 0);
  2667   dsrl32(x_regLo, x_regLo, 0);
  2668   orr(x_reg, x_regLo, x_regHi);
  2670   Register c_reg = c_regLo;
  2671   dsll32(c_regHi, c_regHi, 0);
  2672   dsll32(c_regLo, c_regLo, 0);
  2673   dsrl32(c_regLo, c_regLo, 0);
  2674   orr(c_reg, c_regLo, c_regHi);
  2676   bind(again);
   2678   if (!Use3A2000) sync();
  2679   lld(AT, dest);
  2680   bne(AT, c_reg, nequal);
  2681   delayed()->nop();
  2683   //move(AT, x_reg);
  2684   dadd(AT, x_reg, R0);
  2685   scd(AT, dest);
  2686   beq(AT, R0, again);
  2687   delayed()->nop();
  2688   b(done);
  2689   delayed()->nop();
  2691   // not xchged
  2692   bind(nequal);
  2693   sync();
  2694   //move(c_reg, AT);
  2695   //move(AT, R0);
  2696   dadd(c_reg, AT, R0);
  2697   dadd(AT, R0, R0);
   2698   bind(done);
   2699 }
   2701 // Be sure the three registers are different
  2702 void MacroAssembler::rem_s(FloatRegister fd, FloatRegister fs, FloatRegister ft, FloatRegister tmp) {
  2703   assert_different_registers(tmp, fs, ft);
  2704   div_s(tmp, fs, ft);
  2705   trunc_l_s(tmp, tmp);
  2706   cvt_s_l(tmp, tmp);
  2707   mul_s(tmp, tmp, ft);
   2708   sub_s(fd, fs, tmp);
   2709 }
   2711 // Be sure the three registers are different
  2712 void MacroAssembler::rem_d(FloatRegister fd, FloatRegister fs, FloatRegister ft, FloatRegister tmp) {
  2713   assert_different_registers(tmp, fs, ft);
  2714   div_d(tmp, fs, ft);
  2715   trunc_l_d(tmp, tmp);
  2716   cvt_d_l(tmp, tmp);
  2717   mul_d(tmp, tmp, ft);
   2718   sub_d(fd, fs, tmp);
   2719 }
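// Both helpers compute fd = fs - trunc(fs / ft) * ft, i.e. a remainder with
// the quotient rounded toward zero, matching C's fmod sign convention:
//
//   double rem_d(double fs, double ft) {
//     double q = (double)(long long)(fs / ft);   // div_d, trunc_l_d, cvt_d_l
//     return fs - q * ft;                        // mul_d, sub_d
//   }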
  2721 // Fast_Lock and Fast_Unlock used by C2
  2723 // Because the transitions from emitted code to the runtime
  2724 // monitorenter/exit helper stubs are so slow it's critical that
  2725 // we inline both the stack-locking fast-path and the inflated fast path.
  2726 //
  2727 // See also: cmpFastLock and cmpFastUnlock.
  2728 //
  2729 // What follows is a specialized inline transliteration of the code
  2730 // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
  2731 // another option would be to emit TrySlowEnter and TrySlowExit methods
  2732 // at startup-time.  These methods would accept arguments as
  2733 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
  2734 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
  2735 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
  2736 // In practice, however, the # of lock sites is bounded and is usually small.
  2737 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
   2738 // if the processor uses simple bimodal branch predictors keyed by EIP,
   2739 // since the helper routines would be called from multiple synchronization
   2740 // sites.
  2741 //
  2742 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
  2743 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
  2744 // to those specialized methods.  That'd give us a mostly platform-independent
  2745 // implementation that the JITs could optimize and inline at their pleasure.
   2746 // Done correctly, the only time we'd need to cross to native code would be
  2747 // to park() or unpark() threads.  We'd also need a few more unsafe operators
  2748 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
  2749 // (b) explicit barriers or fence operations.
  2750 //
  2751 // TODO:
  2752 //
  2753 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
  2754 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
  2755 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
  2756 //    the lock operators would typically be faster than reifying Self.
  2757 //
  2758 // *  Ideally I'd define the primitives as:
  2759 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
  2760 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
  2761 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
  2762 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
  2763 //    Furthermore the register assignments are overconstrained, possibly resulting in
  2764 //    sub-optimal code near the synchronization site.
  2765 //
  2766 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
  2767 //    Alternately, use a better sp-proximity test.
  2768 //
  2769 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
  2770 //    Either one is sufficient to uniquely identify a thread.
  2771 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
  2772 //
  2773 // *  Intrinsify notify() and notifyAll() for the common cases where the
  2774 //    object is locked by the calling thread but the waitlist is empty.
  2775 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
  2776 //
  2777 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
  2778 //    But beware of excessive branch density on AMD Opterons.
  2779 //
  2780 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
  2781 //    or failure of the fast-path.  If the fast-path fails then we pass
  2782 //    control to the slow-path, typically in C.  In Fast_Lock and
  2783 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
  2784 //    will emit a conditional branch immediately after the node.
  2785 //    So we have branches to branches and lots of ICC.ZF games.
  2786 //    Instead, it might be better to have C2 pass a "FailureLabel"
  2787 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
  2788 //    will drop through the node.  ICC.ZF is undefined at exit.
  2789 //    In the case of failure, the node will branch directly to the
  2790 //    FailureLabel
  2793 // obj: object to lock
  2794 // box: on-stack box address (displaced header location) - KILLED
  2795 // rax,: tmp -- KILLED
  2796 // scr: tmp -- KILLED
  2797 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg) {
   2799   // Ensure the register assignments are disjoint
  2800   guarantee (objReg != boxReg, "") ;
  2801   guarantee (objReg != tmpReg, "") ;
  2802   guarantee (objReg != scrReg, "") ;
  2803   guarantee (boxReg != tmpReg, "") ;
  2804   guarantee (boxReg != scrReg, "") ;
  2807   block_comment("FastLock");
  2812   if (PrintBiasedLockingStatistics) {
  2813     push(tmpReg);
  2814     atomic_inc32((address)BiasedLocking::total_entry_count_addr(), 1, AT, tmpReg);
   2815     pop(tmpReg);
   2816   }
  2818   if (EmitSync & 1) {
  2819     move(AT, 0x0);
  2820     return;
  2821   } else
  2822     if (EmitSync & 2) {
  2823       Label DONE_LABEL ;
  2824       if (UseBiasedLocking) {
  2825         // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
   2826         biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL);
   2827       }
  2829       ld(tmpReg, Address(objReg, 0)) ;          // fetch markword
  2830       ori(tmpReg, tmpReg, 0x1);
  2831       sd(tmpReg, Address(boxReg, 0));           // Anticipate successful CAS
  2833       cmpxchg(boxReg, Address(objReg, 0), tmpReg);          // Updates tmpReg
  2834       bne(AT, R0, DONE_LABEL);
  2835       delayed()->nop();
  2837       // Recursive locking
  2838       dsubu(tmpReg, tmpReg, SP);
  2839       li(AT, (7 - os::vm_page_size() ));
  2840       andr(tmpReg, tmpReg, AT);
  2841       sd(tmpReg, Address(boxReg, 0));
  2842       bind(DONE_LABEL) ;
  2843     } else {
  2844       // Possible cases that we'll encounter in fast_lock
  2845       // ------------------------------------------------
  2846       // * Inflated
  2847       //    -- unlocked
  2848       //    -- Locked
  2849       //       = by self
  2850       //       = by other
  2851       // * biased
  2852       //    -- by Self
  2853       //    -- by other
  2854       // * neutral
  2855       // * stack-locked
  2856       //    -- by self
  2857       //       = sp-proximity test hits
  2858       //       = sp-proximity test generates false-negative
  2859       //    -- by other
  2860       //
  2862       Label IsInflated, DONE_LABEL, PopDone ;
  2864       // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  2865       // order to reduce the number of conditional branches in the most common cases.
  2866       // Beware -- there's a subtle invariant that fetch of the markword
  2867       // at [FETCH], below, will never observe a biased encoding (*101b).
  2868       // If this invariant is not held we risk exclusion (safety) failure.
  2869       if (UseBiasedLocking && !UseOptoBiasInlining) {
   2870         biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL);
   2871       }
  2873       ld(tmpReg, Address(objReg, 0)) ;         //Fetch the markword of the object.
  2874       andi(AT, tmpReg, markOopDesc::monitor_value);
  2875       bne(AT, R0, IsInflated);                      // inflated vs stack-locked|neutral|bias
  2876       delayed()->nop();
  2878       // Attempt stack-locking ...
  2879       ori (tmpReg, tmpReg, markOopDesc::unlocked_value);
  2880       sd(tmpReg, Address(boxReg, 0));          // Anticipate successful CAS
  2881       //if (os::is_MP()) {
  2882       //  sync();
  2883       //}
  2885       cmpxchg(boxReg, Address(objReg, 0), tmpReg);           // Updates tmpReg
  2886       //AT == 1: unlocked
  2888       if (PrintBiasedLockingStatistics) {
  2889         Label L;
  2890         beq(AT, R0, L);
  2891         delayed()->nop();
  2892         push(T0);
  2893         push(T1);
  2894         atomic_inc32((address)BiasedLocking::fast_path_entry_count_addr(), 1, T0, T1);
  2895         pop(T1);
  2896         pop(T0);
   2897         bind(L);
   2898       }
  2899       bne(AT, R0, DONE_LABEL);
  2900       delayed()->nop();
  2902       // Recursive locking
  2903       // The object is stack-locked: markword contains stack pointer to BasicLock.
  2904       // Locked by current thread if difference with current SP is less than one page.
  2905       dsubu(tmpReg, tmpReg, SP);
  2906       li(AT, 7 - os::vm_page_size() );
  2907       andr(tmpReg, tmpReg, AT);
  2908       sd(tmpReg, Address(boxReg, 0));
  2909       if (PrintBiasedLockingStatistics) {
  2910         Label L;
  2911         // tmpReg == 0 => BiasedLocking::_fast_path_entry_count++
  2912         bne(tmpReg, R0, L);
  2913         delayed()->nop();
  2914         push(T0);
  2915         push(T1);
  2916         atomic_inc32((address)BiasedLocking::fast_path_entry_count_addr(), 1, T0, T1);
  2917         pop(T1);
  2918         pop(T0);
   2919         bind(L);
   2920       }
  2921       sltiu(AT, tmpReg, 1); /* AT = (tmpReg == 0) ? 1 : 0 */
  2923       b(DONE_LABEL) ;
  2924       delayed()->nop();
  2926       bind(IsInflated) ;
  2927       // The object's monitor m is unlocked iff m->owner == NULL,
  2928       // otherwise m->owner may contain a thread or a stack address.
  2930       // TODO: someday avoid the ST-before-CAS penalty by
  2931       // relocating (deferring) the following ST.
  2932       // We should also think about trying a CAS without having
  2933       // fetched _owner.  If the CAS is successful we may
  2934       // avoid an RTO->RTS upgrade on the $line.
  2935       // Without cast to int32_t a movptr will destroy r10 which is typically obj
  2936       li(AT, (int32_t)intptr_t(markOopDesc::unused_mark()));
  2937       sd(AT, Address(boxReg, 0));
  2939       move(boxReg, tmpReg) ;
  2940       ld(tmpReg, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
  2941       // if (m->owner != 0) => AT = 0, goto slow path.
  2942       move(AT, R0);
  2943       bne(tmpReg, R0, DONE_LABEL);
  2944       delayed()->nop();
  2946 #ifndef OPT_THREAD
  2947       get_thread (TREG) ;
  2948 #endif
  2949       // It's inflated and appears unlocked
  2950       //if (os::is_MP()) {
  2951       //  sync();
  2952       //}
  2953       cmpxchg(TREG, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), tmpReg) ;
  2954       // Intentional fall-through into DONE_LABEL ...
  2957       // DONE_LABEL is a hot target - we'd really like to place it at the
  2958       // start of cache line by padding with NOPs.
  2959       // See the AMD and Intel software optimization manuals for the
  2960       // most efficient "long" NOP encodings.
  2961       // Unfortunately none of our alignment mechanisms suffice.
  2962       bind(DONE_LABEL);
  2964       // At DONE_LABEL the AT is set as follows ...
  2965       // Fast_Unlock uses the same protocol.
  2966       // AT == 1 -> Success
  2967       // AT == 0 -> Failure - force control through the slow-path
  2969       // Avoid branch-to-branch on AMD processors
  2970       // This appears to be superstition.
   2971       if (EmitSync & 32) nop() ;
   2972     }
   2973 }
  2976 // obj: object to unlock
  2977 // box: box address (displaced header location), killed.  Must be EAX.
  2978 // rbx,: killed tmp; cannot be obj nor box.
  2979 //
  2980 // Some commentary on balanced locking:
  2981 //
  2982 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
  2983 // Methods that don't have provably balanced locking are forced to run in the
  2984 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
  2985 // The interpreter provides two properties:
  2986 // I1:  At return-time the interpreter automatically and quietly unlocks any
   2987 //      objects acquired by the current activation (frame).  Recall that the
  2988 //      interpreter maintains an on-stack list of locks currently held by
  2989 //      a frame.
   2990 // I2:  If a method attempts to unlock an object that is not held by
   2991 //      the frame, the interpreter throws IMSX.
  2992 //
  2993 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
  2994 // B() doesn't have provably balanced locking so it runs in the interpreter.
  2995 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
  2996 // is still locked by A().
  2997 //
  2998 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
  2999 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
  3000 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
  3001 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
  3003 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  3005   guarantee (objReg != boxReg, "") ;
  3006   guarantee (objReg != tmpReg, "") ;
  3007   guarantee (boxReg != tmpReg, "") ;
  3011   block_comment("FastUnlock");
  3014   if (EmitSync & 4) {
  3015     // Disable - inhibit all inlining.  Force control through the slow-path
  3016     move(AT, 0x0);
  3017     return;
  3018   } else
  3019     if (EmitSync & 8) {
  3020       Label DONE_LABEL ;
  3021       if (UseBiasedLocking) {
   3022         biased_locking_exit(objReg, tmpReg, DONE_LABEL);
   3023       }
  3024       // classic stack-locking code ...
  3025       ld(tmpReg, Address(boxReg, 0)) ;
  3026       beq(tmpReg, R0, DONE_LABEL) ;
  3027       move(AT, 0x1);  // delay slot
  3029       cmpxchg(tmpReg, Address(objReg, 0), boxReg);          // Uses EAX which is box
  3030       bind(DONE_LABEL);
  3031     } else {
  3032       Label DONE_LABEL, Stacked, CheckSucc, Inflated ;
  3034       // Critically, the biased locking test must have precedence over
  3035       // and appear before the (box->dhw == 0) recursive stack-lock test.
  3036       if (UseBiasedLocking && !UseOptoBiasInlining) {
   3037         biased_locking_exit(objReg, tmpReg, DONE_LABEL);
   3038       }
  3040       ld(AT, Address(boxReg, 0)) ;            // Examine the displaced header
  3041       beq(AT, R0, DONE_LABEL) ;      // 0 indicates recursive stack-lock
  3042       delayed()->daddiu(AT, R0, 0x1);
  3044       ld(tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
  3045       andi(AT, tmpReg, markOopDesc::monitor_value) ;                     // Inflated?
  3046       beq(AT, R0, Stacked) ;                     // Inflated?
  3047       delayed()->nop();
  3049       bind(Inflated) ;
  3050       // It's inflated.
  3051       // Despite our balanced locking property we still check that m->_owner == Self
  3052       // as java routines or native JNI code called by this thread might
  3053       // have released the lock.
  3054       // Refer to the comments in synchronizer.cpp for how we might encode extra
  3055       // state in _succ so we can avoid fetching EntryList|cxq.
  3056       //
  3057       // I'd like to add more cases in fast_lock() and fast_unlock() --
  3058       // such as recursive enter and exit -- but we have to be wary of
  3059       // I$ bloat, T$ effects and BP$ effects.
  3060       //
  3061       // If there's no contention try a 1-0 exit.  That is, exit without
  3062       // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  3063       // we detect and recover from the race that the 1-0 exit admits.
  3064       //
  3065       // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
  3066       // before it STs null into _owner, releasing the lock.  Updates
  3067       // to data protected by the critical section must be visible before
  3068       // we drop the lock (and thus before any other thread could acquire
  3069       // the lock and observe the fields protected by the lock).
  3070       // IA32's memory-model is SPO, so STs are ordered with respect to
  3071       // each other and there's no need for an explicit barrier (fence).
  3072       // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  3073 #ifndef OPT_THREAD
  3074       get_thread (TREG) ;
  3075 #endif
  3077       // It's inflated
  3078       ld(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
  3079       xorr(boxReg, boxReg, TREG);
  3081       ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
  3082       orr(boxReg, boxReg, AT);
  3084       move(AT, R0);
  3085       bne(boxReg, R0, DONE_LABEL);
  3086       delayed()->nop();
  3088       ld(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
  3089       ld(AT, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
  3090       orr(boxReg, boxReg, AT);
  3092       move(AT, R0);
  3093       bne(boxReg, R0, DONE_LABEL);
  3094       delayed()->nop();
  3096       sync();
  3097       sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
  3098       move(AT, 0x1);
  3099       b(DONE_LABEL);
  3100       delayed()->nop();
  3102       bind  (Stacked);
  3103       ld(tmpReg, Address(boxReg, 0)) ;
  3104       //if (os::is_MP()) { sync(); }
  3105       cmpxchg(tmpReg, Address(objReg, 0), boxReg);
  3107       if (EmitSync & 65536) {
   3108         bind (CheckSucc);
   3109       }
  3111       bind(DONE_LABEL);
  3113       // Avoid branch to branch on AMD processors
   3114       if (EmitSync & 32768) { nop() ; }
   3115     }
   3116 }
  3118 void MacroAssembler::align(int modulus) {
   3119   while (offset() % modulus != 0) nop();
   3120 }
  3123 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
   3124   //Unimplemented();
   3125 }
  3127 #ifdef _LP64
  3128 Register caller_saved_registers[] = {AT, V0, V1, A0, A1, A2, A3, A4, A5, A6, A7, T0, T1, T2, T3, T8, T9, GP, RA, FP};
  3130 /* FIXME: Jin: In MIPS64, F0~23 are all caller-saved registers */
  3131 FloatRegister caller_saved_fpu_registers[] = {F0, F12, F13};
  3132 #else
  3133 Register caller_saved_registers[] = {AT, V0, V1, A0, A1, A2, A3, T4, T5, T6, T7, T0, T1, T2, T3, T8, T9, GP, RA, FP};
  3135 Register caller_saved_fpu_registers[] = {};
  3136 #endif
   3138 // We preserve all caller-saved registers
   3139 void MacroAssembler::pushad() {
  3140   int i;
  3142   /* Fixed-point registers */
  3143   int len = sizeof(caller_saved_registers) / sizeof(caller_saved_registers[0]);
  3144   daddi(SP, SP, -1 * len * wordSize);
   3145   for (i = 0; i < len; i++)
   3146   {
  3147 #ifdef _LP64
  3148     sd(caller_saved_registers[i], SP, (len - i - 1) * wordSize);
  3149 #else
  3150     sw(caller_saved_registers[i], SP, (len - i - 1) * wordSize);
   3151 #endif
   3152   }
  3154   /* Floating-point registers */
  3155   len = sizeof(caller_saved_fpu_registers) / sizeof(caller_saved_fpu_registers[0]);
  3156   daddi(SP, SP, -1 * len * wordSize);
   3157   for (i = 0; i < len; i++)
   3158   {
  3159 #ifdef _LP64
  3160     sdc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize);
  3161 #else
  3162     swc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize);
   3163 #endif
   3164   }
  3165 };
   3167 void MacroAssembler::popad() {
  3168   int i;
  3170   /* Floating-point registers */
  3171   int len = sizeof(caller_saved_fpu_registers) / sizeof(caller_saved_fpu_registers[0]);
  3172   for (i = 0; i < len; i++)
  3174 #ifdef _LP64
  3175     ldc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize);
  3176 #else
  3177     lwc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize);
  3178 #endif
  3180   daddi(SP, SP, len * wordSize);
  3182   /* Fixed-point registers */
  3183   len = sizeof(caller_saved_registers) / sizeof(caller_saved_registers[0]);
  3184   for (i = 0; i < len; i++)
  3186 #ifdef _LP64
  3187     ld(caller_saved_registers[i], SP, (len - i - 1) * wordSize);
  3188 #else
  3189     lw(caller_saved_registers[i], SP, (len - i - 1) * wordSize);
  3190 #endif
  3192   daddi(SP, SP, len * wordSize);
  3193 };
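// A sketch of the frame pushad() builds (for the LP64 lists above; the
// offsets follow from the loops, nothing here is emitted separately):
//
//   [SP + (gpr_len + fpr_len - 1) * wordSize]  caller_saved_registers[0] (AT)
//   ...
//   [SP + fpr_len * wordSize]                  caller_saved_registers[gpr_len-1] (FP)
//   [SP + (fpr_len - 1) * wordSize]            caller_saved_fpu_registers[0] (F0)
//   ...
//   [SP + 0]                                   caller_saved_fpu_registers[fpr_len-1]
//
// popad() restores the floating-point block first and then the fixed-point
// block, the exact reverse of pushad().  gpr_len/fpr_len are hypothetical
// names for the two array lengths.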
void MacroAssembler::push2(Register reg1, Register reg2) {
#ifdef _LP64
  daddi(SP, SP, -16);
  sd(reg2, SP, 0);
  sd(reg1, SP, 8);
#else
  addi(SP, SP, -8);
  sw(reg2, SP, 0);
  sw(reg1, SP, 4);
#endif
}
void MacroAssembler::pop2(Register reg1, Register reg2) {
#ifdef _LP64
  ld(reg1, SP, 0);
  ld(reg2, SP, 8);
  daddi(SP, SP, 16);
#else
  lw(reg1, SP, 0);
  lw(reg2, SP, 4);
  addi(SP, SP, 8);
#endif
}
// Used for the UseCompressedOops/UseCompressedClassPointers options.
void MacroAssembler::load_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedClassPointers) {
    lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else
#endif
    ld(dst, src, oopDesc::klass_offset_in_bytes());
}
void MacroAssembler::store_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    sw(src, dst, oopDesc::klass_offset_in_bytes());
  } else {
#endif
    sd(src, dst, oopDesc::klass_offset_in_bytes());
#ifdef _LP64
  }
#endif
}
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ld(dst, Address(dst, Klass::prototype_header_offset()));
}
#ifdef _LP64
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    sw(src, dst, oopDesc::klass_gap_offset_in_bytes());
  }
}
void MacroAssembler::load_heap_oop(Register dst, Address src) {
  if (UseCompressedOops) {
    lwu(dst, src);
    decode_heap_oop(dst);
  } else {
    ld(dst, src);
  }
}
void MacroAssembler::store_heap_oop(Address dst, Register src) {
  if (UseCompressedOops) {
    assert(!dst.uses(src), "not enough registers");
    encode_heap_oop(src);
    sw(src, dst);
  } else {
    sd(src, dst);
  }
}
void MacroAssembler::store_heap_oop_null(Address dst) {
  if (UseCompressedOops) {
    sw(R0, dst);
  } else {
    sd(R0, dst);
  }
}
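// Note: these heap-oop helpers only perform the (de)compression and the raw
// memory access.  GC write barriers (e.g. the G1 pre/post barriers emitted
// elsewhere in this file) are the callers' responsibility.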
#ifdef ASSERT
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
}
#endif
// Algorithm must match oop.inline.hpp encode_heap_oop.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop:heap base corrupted?");
#endif
  verify_oop(r, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shr(r, LogMinObjAlignmentInBytes);
    }
    return;
  }

  movz(r, S5_heapbase, r);
  dsub(r, r, S5_heapbase);
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shr(r, LogMinObjAlignmentInBytes);
  }
}
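// A sketch of what the base != NULL path above computes:
//
//   if (r == NULL) r = S5_heapbase;        // movz maps NULL to the base...
//   narrow = (r - S5_heapbase) >> shift;   // ...so NULL encodes as 0
//
// S5_heapbase is kept live in a saved register by reinit_heapbase() below.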
void MacroAssembler::encode_heap_oop(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop:heap base corrupted?");
#endif
  verify_oop(src, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      dsrl(dst, src, LogMinObjAlignmentInBytes);
    } else {
      if (dst != src) move(dst, src);
    }
  } else {
    if (dst == src) {
      movz(dst, S5_heapbase, dst);
      dsub(dst, dst, S5_heapbase);
      if (Universe::narrow_oop_shift() != 0) {
        assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
        shr(dst, LogMinObjAlignmentInBytes);
      }
    } else {
      dsub(dst, src, S5_heapbase);
      if (Universe::narrow_oop_shift() != 0) {
        assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
        shr(dst, LogMinObjAlignmentInBytes);
      }
      movz(dst, R0, src);
    }
  }
}
void MacroAssembler::encode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should be compressed");
#ifdef ASSERT
  if (CheckCompressedOops) {
    Label ok;
    bne(r, R0, ok);
    delayed()->nop();
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    dsub(r, r, S5_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shr(r, LogMinObjAlignmentInBytes);
  }
}
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should be compressed");
#ifdef ASSERT
  if (CheckCompressedOops) {
    Label ok;
    bne(src, R0, ok);
    delayed()->nop();
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  if (Universe::narrow_oop_base() != NULL) {
    dsub(dst, src, S5_heapbase);
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shr(dst, LogMinObjAlignmentInBytes);
    }
  } else {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      dsrl(dst, src, LogMinObjAlignmentInBytes);
    } else {
      if (dst != src) move(dst, src);
    }
  }
}
void MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shl(r, LogMinObjAlignmentInBytes);
    }
  } else {
    move(AT, r);
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shl(r, LogMinObjAlignmentInBytes);
    }
    dadd(r, r, S5_heapbase);
    movz(r, R0, AT);
  }
  verify_oop(r, "broken oop in decode_heap_oop");
}
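// Decoding mirrors the encoding: oop = (narrow << shift) + S5_heapbase.  AT
// snapshots the original narrow value so the final movz(r, R0, AT) restores
// NULL when the narrow oop was 0.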
void MacroAssembler::decode_heap_oop(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      if (dst != src) nop(); // DON'T DELETE THIS GUY.
      dsll(dst, src, LogMinObjAlignmentInBytes);
    } else {
      if (dst != src) move(dst, src);
    }
  } else {
    if (dst == src) {
      move(AT, dst);
      if (Universe::narrow_oop_shift() != 0) {
        assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
        shl(dst, LogMinObjAlignmentInBytes);
      }
      dadd(dst, dst, S5_heapbase);
      movz(dst, R0, AT);
    } else {
      if (Universe::narrow_oop_shift() != 0) {
        assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
        dsll(dst, src, LogMinObjAlignmentInBytes);
        daddu(dst, dst, S5_heapbase);
      } else {
        daddu(dst, src, S5_heapbase);
      }
      movz(dst, R0, src);
    }
  }
  verify_oop(dst, "broken oop in decode_heap_oop");
}
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shl(r, LogMinObjAlignmentInBytes);
    if (Universe::narrow_oop_base() != NULL) {
      daddu(r, r, S5_heapbase);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  //lea(dst, Address(S5_heapbase, src, Address::times_8, 0));
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      dsll(dst, src, LogMinObjAlignmentInBytes);
      daddu(dst, dst, S5_heapbase);
    } else {
      dsll(dst, src, LogMinObjAlignmentInBytes);
      if (Universe::narrow_oop_base() != NULL) {
        daddu(dst, dst, S5_heapbase);
      }
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      move(dst, src);
    }
  }
}
void MacroAssembler::encode_klass_not_null(Register r) {
  if (Universe::narrow_klass_base() != NULL) {
    assert(r != AT, "Encoding a klass in AT");
    set64(AT, (int64_t)Universe::narrow_klass_base());
    dsub(r, r, AT);
  }
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    shr(r, LogKlassAlignmentInBytes);
  }
  // Not necessary for MIPS at all.
  //if (Universe::narrow_klass_base() != NULL) {
  //  reinit_heapbase();
  //}
}
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (dst == src) {
    encode_klass_not_null(src);
  } else {
    if (Universe::narrow_klass_base() != NULL) {
      set64(dst, (int64_t)Universe::narrow_klass_base());
      dsub(dst, src, dst);
      if (Universe::narrow_klass_shift() != 0) {
        assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
        shr(dst, LogKlassAlignmentInBytes);
      }
    } else {
      if (Universe::narrow_klass_shift() != 0) {
        assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
        dsrl(dst, src, LogKlassAlignmentInBytes);
      } else {
        move(dst, src);
      }
    }
  }
}
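// In short: narrow_klass = (klass - narrow_klass_base) >> narrow_klass_shift.
// Unlike the heap-oop paths there is no NULL case to preserve here: the
// *_not_null encoders are only used where the klass pointer is known to be
// non-NULL.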
// Function instr_size_for_decode_klass_not_null() counts the instructions
// generated by decode_klass_not_null(register r) and reinit_heapbase(),
// when (Universe::heap() != NULL).  Hence, if the instructions they
// generate change, then this method needs to be updated.
int MacroAssembler::instr_size_for_decode_klass_not_null() {
  assert (UseCompressedClassPointers, "only for compressed klass ptrs");
  if (Universe::narrow_klass_base() != NULL) {
    // Worst case: set64(AT, base) + shift + daddu, plus a set64 for
    // reinit_heapbase(); each MIPS instruction is 4 bytes.
    return (Universe::narrow_klass_shift() == 0 ? 4 * 9 : 4 * 10);
  } else {
    // Just the shift, if any.
    return (Universe::narrow_klass_shift() == 0 ? 4 * 0 : 4 * 1);
  }
}
void MacroAssembler::decode_klass_not_null(Register r) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert(r != AT, "Decoding a klass in AT");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    shl(r, LogKlassAlignmentInBytes);
  }
  if (Universe::narrow_klass_base() != NULL) {
    set64(AT, (int64_t)Universe::narrow_klass_base());
    daddu(r, r, AT);
    // Not necessary for MIPS at all.
    //reinit_heapbase();
  }
}
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (dst == src) {
    decode_klass_not_null(dst);
  } else {
    // Cannot assert, unverified entry point counts instructions (see .ad file)
    // vtableStubs also counts instructions in pd_code_size_limit.
    // Also do not verify_oop as this is called by verify_oop.
    set64(dst, (int64_t)Universe::narrow_klass_base());
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
      dsll(AT, src, Address::times_8);
      daddu(dst, dst, AT);
    } else {
      daddu(dst, src, dst);
    }
  }
}
void MacroAssembler::incrementl(Register reg, int value) {
  if (value == min_jint) {
    move(AT, value);
    LP64_ONLY(addu32(reg, reg, AT)) NOT_LP64(addu(reg, reg, AT));
    return;
  }
  if (value <  0) { decrementl(reg, -value); return; }
  if (value == 0) {                        ; return; }

  if (Assembler::is_simm16(value)) {
    NOT_LP64(addiu(reg, reg, value));
    LP64_ONLY(move(AT, value); addu32(reg, reg, AT));
  } else {
    move(AT, value);
    LP64_ONLY(addu32(reg, reg, AT)) NOT_LP64(addu(reg, reg, AT));
  }
}
void MacroAssembler::decrementl(Register reg, int value) {
  if (value == min_jint) {
    move(AT, value);
    LP64_ONLY(subu32(reg, reg, AT)) NOT_LP64(subu(reg, reg, AT));
    return;
  }
  if (value <  0) { incrementl(reg, -value); return; }
  if (value == 0) {                        ; return; }

  if (Assembler::is_simm16(value)) {
    NOT_LP64(addiu(reg, reg, -value));
    LP64_ONLY(move(AT, value); subu32(reg, reg, AT));
  } else {
    move(AT, value);
    LP64_ONLY(subu32(reg, reg, AT)) NOT_LP64(subu(reg, reg, AT));
  }
}
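// Note: on LP64 both helpers clobber AT and use the 32-bit add/sub forms
// (addu32/subu32), mirroring x86 incl/decl semantics.  For example,
// incrementl(T0, 5) emits roughly: move(AT, 5); addu32(T0, T0, AT).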
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops || UseCompressedClassPointers) {
    if (Universe::heap() != NULL) {
      if (Universe::narrow_oop_base() == NULL) {
        move(S5_heapbase, R0);
      } else {
        set64(S5_heapbase, (int64_t)Universe::narrow_ptrs_base());
      }
    } else {
      set64(S5_heapbase, (intptr_t)Universe::narrow_ptrs_base_addr());
      ld(S5_heapbase, S5_heapbase, 0);
    }
  }
}
#endif // _LP64
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  // Inlined, unified form of the interpreter's gen_subtype_check.
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  _masm->li(AT, (address)flag_addr);
  _masm->lb(AT, AT, 0);
  _masm->addi(AT, AT, -value);
  _masm->beq(AT, R0, _label);
  _masm->delayed()->nop();
}
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  //cmpptr(sub_klass, super_klass);
  //local_jcc(Assembler::equal, *L_success);
  beq(sub_klass, super_klass, *L_success);
  delayed()->nop();

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    lwu(temp_reg, super_klass, sco_offset);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  dsll(AT, super_check_offset.register_or_noreg(), Address::times_1);
  daddu(AT, sub_klass, AT);
  ld(AT, AT, super_check_offset.constant_or_zero() * Address::times_1);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    beq(super_klass, AT, *L_success);
    delayed()->nop();
    addi(AT, super_check_offset.as_register(), -sc_offset);
    if (L_failure == &L_fallthrough) {
      beq(AT, R0, *L_slow_path);
      delayed()->nop();
    } else {
      bne(AT, R0, *L_failure);
      delayed()->nop();
      b(*L_slow_path);
      delayed()->nop();
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      beq(super_klass, AT, *L_success);
      delayed()->nop();
    } else {
      bne(super_klass, AT, *L_slow_path);
      delayed()->nop();
      b(*L_success);
      delayed()->nop();
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      beq(super_klass, AT, *L_success);
      delayed()->nop();
    } else {
      bne(super_klass, AT, *L_failure);
      delayed()->nop();
      b(*L_success);
      delayed()->nop();
    }
  }

  bind(L_fallthrough);
}
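// A typical use of the two-phase check (a sketch only; the register and label
// choices here are illustrative, not required by the interface):
//
//   Label L_ok, L_bad;
//   check_klass_subtype_fast_path(T0, T1, T2, &L_ok, &L_bad, NULL);
//   // NULL slow-path label: a super-cache miss falls through to the scan.
//   check_klass_subtype_slow_path(T0, T1, T2, noreg, &L_ok, NULL);
//   bind(L_bad);   // not a subtype
//   ...
//   bind(L_ok);    // subtype confirmed
//
// check_klass_subtype() above packages exactly this pattern.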
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
  else
    temp2_reg = T9;
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // A couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // (On x86 this scan is a repne_scan; on MIPS it is an explicit loop.)

#ifndef PRODUCT
  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  ld(temp_reg, secondary_supers_addr);
  // Load the array length.  (Positive movl does right thing on LP64.)
  lw(temp2_reg, Address(temp_reg, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  daddiu(temp_reg, temp_reg, Array<Klass*>::base_offset_in_bytes());

  // 2013/4/3 Jin: OpenJDK8 never compresses klass pointers in the
  // secondary-super array, so plain ld is correct here.
  Label Loop, subtype;
  bind(Loop);
  beq(temp2_reg, R0, *L_failure);
  delayed()->nop();
  ld(AT, temp_reg, 0);
  beq(AT, super_klass, subtype);
  delayed()->daddi(temp_reg, temp_reg, 1 * wordSize);
  b(Loop);
  delayed()->daddi(temp2_reg, temp2_reg, -1);

  // Success.  Cache the super we found and proceed in triumph.
  bind(subtype);
  sd(super_klass, super_cache_addr);
  if (L_success != &L_fallthrough) {
    b(*L_success);
    delayed()->nop();
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
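// The scan above implements, in effect:
//
//   for (int i = 0; i < length; i++) {
//     if (secondary_supers->at(i) == super_klass) {
//       sub_klass->set_secondary_super_cache(super_klass);
//       goto *L_success;
//     }
//   }
//   goto *L_failure;
//
// (Pseudo-C for illustration only.  The branch-delay slots fold the pointer
// increment and the counter decrement into the branches themselves.)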
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  sd(R0, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  sd(R0, Address(java_thread, JavaThread::vm_result_2_offset()));
}
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  Register             scale_reg    = NOREG;
  Address::ScaleFactor scale_factor = Address::no_scale;
  if (arg_slot.is_constant()) {
    offset += arg_slot.as_constant() * stackElementSize;
  } else {
    scale_reg    = arg_slot.as_register();
    scale_factor = Address::times_8;
  }
  // 2014/07/31 Fu: We don't push RA on stack in prepare_invoke.
  //  offset += wordSize;           // return PC is on stack
  if (scale_reg == NOREG) {
    return Address(SP, offset);
  } else {
    dsll(scale_reg, scale_reg, scale_factor);
    daddu(scale_reg, SP, scale_reg);
    return Address(scale_reg, offset);
  }
}
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(dst2 != noreg, "second dest register required");
    lw(dst,  src);
    lw(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case  8:  ld(dst, src); break;
#endif
  case  4:  lw(dst, src); break;
  case  2:  is_signed ? lh(dst, src) : lhu(dst, src); break;
  case  1:  is_signed ? lb(dst, src) : lbu(dst, src); break;
  default:  ShouldNotReachHere();
  }
}
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(src2 != noreg, "second source register required");
    sw(src,  dst);
    sw(src2, dst.plus_disp(BytesPerInt));
    break;
#else
  case  8:  sd(src, dst); break;
#endif
  case  4:  sw(src, dst); break;
  case  2:  sh(src, dst); break;
  case  1:  sb(src, dst); break;
  default:  ShouldNotReachHere();
  }
}
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  lw(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  dsll(scan_temp, scan_temp, times_vte_scale);
  daddu(scan_temp, recv_klass, scan_temp);
  daddiu(scan_temp, scan_temp, vtable_base);
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for InstanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  if (itable_index.is_constant()) {
    set64(AT, (int)itable_index.as_constant());
    dsll(AT, AT, (int)Address::times_ptr);
  } else {
    dsll(AT, itable_index.as_register(), (int)Address::times_ptr);
  }
  daddu(AT, AT, recv_klass);
  daddiu(recv_klass, AT, itentry_off);

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ld(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));

    if (peel) {
      beq(intf_klass, method_result, found_method);
      nop();
    } else {
      bne(intf_klass, method_result, search);
      nop();
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    beq(method_result, R0, L_no_such_interface);
    nop();
    daddiu(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  lw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  //ld(method_result, Address(recv_klass, scan_temp, Address::times_1));
  if (UseLoongsonISA) {
    gsldx(method_result, recv_klass, scan_temp, 0);
  } else {
    daddu(AT, recv_klass, scan_temp);
    ld(method_result, AT, 0);
  }
}
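// The 'peel' loop above unrolls the first scan iteration: in the common case
// the first itableOffsetEntry already matches intf_klass and control reaches
// found_method after a single taken branch; only on a mismatch do we enter
// the search loop that walks the remaining entries in scan_step increments.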
// Virtual method calling.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  Register tmp = GP;
  push(tmp);

  if (vtable_index.is_constant()) {
    assert_different_registers(recv_klass, method_result, tmp);
  } else {
    assert_different_registers(recv_klass, method_result, vtable_index.as_register(), tmp);
  }
  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
  // Address vtable_entry_addr(recv_klass,
  //                           vtable_index, Address::times_ptr,
  //                           base + vtableEntry::method_offset_in_bytes());
  if (vtable_index.is_constant()) {
    set64(AT, vtable_index.as_constant());
    dsll(AT, AT, (int)Address::times_ptr);
  } else {
    dsll(AT, vtable_index.as_register(), (int)Address::times_ptr);
  }
  set64(tmp, base + vtableEntry::method_offset_in_bytes());
  daddu(tmp, tmp, AT);
  daddu(tmp, tmp, recv_klass);
  ld(method_result, tmp, 0);

  pop(tmp);
}
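// The address arithmetic above computes, in effect:
//
//   method_result = *(recv_klass
//                     + InstanceKlass::vtable_start_offset() * wordSize
//                     + vtable_index * wordSize
//                     + vtableEntry::method_offset_in_bytes());
//
// e.g. a constant vtable_index of 5 on LP64 loads the Method* from
// recv_klass + base + 5 * 8 + method_offset.  GP is borrowed (push/pop) as a
// scratch register because AT already holds the scaled index.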
