diff -r 11d997b1e656 -r 52ea28d233d2 src/cpu/mips/vm/macroAssembler_mips.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cpu/mips/vm/macroAssembler_mips.cpp	Thu Sep 07 09:12:16 2017 +0800
@@ -0,0 +1,3844 @@
+/*
+ * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017, Loongson Technology. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "compiler/disassembler.hpp"
+#include "gc_interface/collectedHeap.inline.hpp"
+#include "interpreter/interpreter.hpp"
+#include "memory/cardTableModRefBS.hpp"
+#include "memory/resourceArea.hpp"
+#include "memory/universe.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/biasedLocking.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/objectMonitor.hpp"
+#include "runtime/os.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "utilities/macros.hpp"
+#if INCLUDE_ALL_GCS
+#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
+#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
+#include "gc_implementation/g1/heapRegion.hpp"
+#endif // INCLUDE_ALL_GCS
+
+// Implementation of MacroAssembler
+
+intptr_t MacroAssembler::i[32] = {0};
+float MacroAssembler::f[32] = {0.0};
+
+void MacroAssembler::print(outputStream *s) {
+  unsigned int k;
+  for (k = 0; k < 32; k++) {
+    s->print_cr("i%d = 0x%.16lx", k, i[k]);
+  }
+  s->cr();
+
+  for (k = 0; k < 32; k++) {
+    s->print_cr("f%d = %f", k, f[k]);
+  }
+  s->cr();
+}
+
+int MacroAssembler::i_offset(unsigned int k) { return (intptr_t)&((MacroAssembler*)0)->i[k]; }
+int MacroAssembler::f_offset(unsigned int k) { return (intptr_t)&((MacroAssembler*)0)->f[k]; }
+
+void MacroAssembler::save_registers(MacroAssembler *masm) {
+#define __ masm->
+  for (int k = 0; k < 32; k++) {
+    __ sw(as_Register(k), A0, i_offset(k));
+  }
+
+  for (int k = 0; k < 32; k++) {
+    __ swc1(as_FloatRegister(k), A0, f_offset(k));
+  }
+#undef __
+}
+
+void MacroAssembler::restore_registers(MacroAssembler *masm) {
+#define __ masm->
+  for (int k = 0; k < 32; k++) {
+    __ lw(as_Register(k), A0, i_offset(k));
+  }
+
+  for (int k = 0; k < 32; k++) {
+    __ lwc1(as_FloatRegister(k), A0, f_offset(k));
+  }
+#undef __
+}
+
+
+void MacroAssembler::pd_patch_instruction(address branch, address target) {
+  jint& stub_inst = *(jint*)branch;
+
+  /* The long-branch stub emitted for an unbound label looks like this
+   * (see b_far(Label&) below):
+   *
+   *   move(AT, RA);                                     // dadd
+   *   emit_long(insn_ORRI(regimm_op, 0, bgezal_op, 1));
+   *   nop();
+   *   lui(T9, 0);                                       // to be patched
+   *   ori(T9, T9, 0);
+   *   daddu(T9, T9, RA);
+   *   move(RA, AT);
+   *   jr(T9);
+   */
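+  // For illustration (not from the original change): when the label turns
+  // out to be beyond simm16 range, the patch below rewrites the lui/ori pair
+  // at pc[3]/pc[4] with the 32-bit byte offset from the bgezal's return
+  // address to the target. The bgezal writes RA = branch + 12, and the stub
+  // computes T9 = offset + RA, hence the `offset - 12` correction. Using the
+  // disassembly shown in b_far(Label&) below, target - branch = 0x21c4:
+  //   offset - 12     = 0x21b8
+  //   pc[3] (lui t9)  gets high16(0x21b8) = 0x0000
+  //   pc[4] (ori t9)  gets low16 (0x21b8) = 0x21b8
+  //   at run time: T9 = (0 << 16) | 0x21b8; T9 += RA; jr T9 lands on target.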
+  if (special(stub_inst) == dadd_op) {
+    jint *pc = (jint *)branch;
+
+    assert(opcode(pc[3]) == lui_op
+        && opcode(pc[4]) == ori_op
+        && special(pc[5]) == daddu_op, "Not a branch label patch");
+    if (!(opcode(pc[3]) == lui_op
+        && opcode(pc[4]) == ori_op
+        && special(pc[5]) == daddu_op)) {
+      tty->print_cr("Not a branch label patch");
+    }
+
+    int offset = target - branch;
+    if (!is_simm16(offset)) {
+      pc[3] = (pc[3] & 0xffff0000) | high16(offset - 12);
+      pc[4] = (pc[4] & 0xffff0000) | low16(offset - 12);
+    } else {
+      /* revert to "beq + nop" */
+      CodeBuffer cb(branch, 4 * 10);
+      MacroAssembler masm(&cb);
+#define __ masm.
+      __ b(target);
+      __ nop();
+      __ nop();
+      __ nop();
+      __ nop();
+      __ nop();
+      __ nop();
+      __ nop();
+    }
+    return;
+  }
+
+#ifndef PRODUCT
+  if (!is_simm16((target - branch - 4) >> 2)) {
+    tty->print_cr("Illegal patching: target=0x%lx", target);
+    int *p = (int *)branch;
+    for (int i = -10; i < 10; i++) {
+      tty->print("0x%x, ", p[i]);
+    }
+    tty->print_cr("");
+  }
+#endif
+
+  stub_inst = patched_branch(target - branch, stub_inst, 0);
+}
+
+static inline address first_cache_address() {
+  return CodeCache::low_bound() + sizeof(HeapBlock::Header);
+}
+
+static inline address last_cache_address() {
+  return CodeCache::high_bound() - Assembler::InstructionSize;
+}
+
+int MacroAssembler::call_size(address target, bool far, bool patchable) {
+  if (patchable) return 6 << Assembler::LogInstructionSize;
+  if (!far) return 2 << Assembler::LogInstructionSize; // jal + nop
+  return (insts_for_set64((jlong)target) + 2) << Assembler::LogInstructionSize;
+}
+
+// Can we reach target using jal/j from anywhere
+// in the code cache (because code can be relocated)?
+bool MacroAssembler::reachable_from_cache(address target) {
+  address cl = first_cache_address();
+  address ch = last_cache_address();
+
+  return fit_in_jal(target, cl) && fit_in_jal(target, ch);
+}
+
+void MacroAssembler::general_jump(address target) {
+  if (reachable_from_cache(target)) {
+    j(target);
+    nop();
+  } else {
+    set64(T9, (long)target);
+    jr(T9);
+    nop();
+  }
+}
+
+int MacroAssembler::insts_for_general_jump(address target) {
+  if (reachable_from_cache(target)) {
+    //j(target);
+    //nop();
+    return 2;
+  } else {
+    //set64(T9, (long)target);
+    //jr(T9);
+    //nop();
+    return insts_for_set64((jlong)target) + 2;
+  }
+}
+
+void MacroAssembler::patchable_jump(address target) {
+  if (reachable_from_cache(target)) {
+    nop();
+    nop();
+    nop();
+    nop();
+    j(target);
+    nop();
+  } else {
+    patchable_set48(T9, (long)target);
+    jr(T9);
+    nop();
+  }
+}
+
+int MacroAssembler::insts_for_patchable_jump(address target) {
+  return 6;
+}
+
+void MacroAssembler::general_call(address target) {
+  if (reachable_from_cache(target)) {
+    jal(target);
+    nop();
+  } else {
+    set64(T9, (long)target);
+    jalr(T9);
+    nop();
+  }
+}
+
+int MacroAssembler::insts_for_general_call(address target) {
+  if (reachable_from_cache(target)) {
+    //jal(target);
+    //nop();
+    return 2;
+  } else {
+    //set64(T9, (long)target);
+    //jalr(T9);
+    //nop();
+    return insts_for_set64((jlong)target) + 2;
+  }
+}
+
+void MacroAssembler::patchable_call(address target) {
+  if (reachable_from_cache(target)) {
+    nop();
+    nop();
+    nop();
+    nop();
+    jal(target);
+    nop();
+  } else {
+    patchable_set48(T9, (long)target);
+    jalr(T9);
+    nop();
+  }
+}
+
+int MacroAssembler::insts_for_patchable_call(address target) {
+  return 6;
+}
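+
+// A worked range check (illustrative, not part of the original change):
+// MIPS conditional branches encode a signed 16-bit word offset, so a near
+// beq/bne reaches pc + 4 + 4*simm16, roughly +/-128 KB. That is why the
+// *_far variants below test is_simm16((entry - pc() - 4) / 4) and, when the
+// target is out of range, invert the condition around a b_far instead:
+//   entry - pc() = 0x40000 (256 KB)
+//   (0x40000 - 4) / 4 = 0xffff = 65535 > 32767  -> not simm16, take far path
+//   emit: bne rs, rt, not_jump; b_far(entry); not_jump: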
+
+void MacroAssembler::beq_far(Register rs, Register rt, address entry) {
+  u_char * cur_pc = pc();
+
+  /* Jin: Near/Far jump */
+  if (is_simm16((entry - pc() - 4) / 4)) {
+    Assembler::beq(rs, rt, offset(entry));
+  } else {
+    Label not_jump;
+    bne(rs, rt, not_jump);
+    delayed()->nop();
+
+    b_far(entry);
+    delayed()->nop();
+
+    bind(not_jump);
+    has_delay_slot();
+  }
+}
+
+void MacroAssembler::beq_far(Register rs, Register rt, Label& L) {
+  if (L.is_bound()) {
+    beq_far(rs, rt, target(L));
+  } else {
+    u_char * cur_pc = pc();
+    Label not_jump;
+    bne(rs, rt, not_jump);
+    delayed()->nop();
+
+    b_far(L);
+    delayed()->nop();
+
+    bind(not_jump);
+    has_delay_slot();
+  }
+}
+
+void MacroAssembler::bne_far(Register rs, Register rt, address entry) {
+  u_char * cur_pc = pc();
+
+  /* Jin: Near/Far jump */
+  if (is_simm16((entry - pc() - 4) / 4)) {
+    Assembler::bne(rs, rt, offset(entry));
+  } else {
+    Label not_jump;
+    beq(rs, rt, not_jump);
+    delayed()->nop();
+
+    b_far(entry);
+    delayed()->nop();
+
+    bind(not_jump);
+    has_delay_slot();
+  }
+}
+
+void MacroAssembler::bne_far(Register rs, Register rt, Label& L) {
+  if (L.is_bound()) {
+    bne_far(rs, rt, target(L));
+  } else {
+    u_char * cur_pc = pc();
+    Label not_jump;
+    beq(rs, rt, not_jump);
+    delayed()->nop();
+
+    b_far(L);
+    delayed()->nop();
+
+    bind(not_jump);
+    has_delay_slot();
+  }
+}
+
+void MacroAssembler::b_far(Label& L) {
+  if (L.is_bound()) {
+    b_far(target(L));
+  } else {
+    volatile address dest = target(L);
+    /*
+     * MacroAssembler::pd_patch_instruction branch=55651ed514, target=55651ef6d8
+     *   0x00000055651ed514: dadd at, ra, zero
+     *   0x00000055651ed518: [4110001] bgezal zero, 0x00000055651ed520
+     *
+     *   0x00000055651ed51c: sll zero, zero, 0
+     *   0x00000055651ed520: lui t9, 0x0
+     *   0x00000055651ed524: ori t9, t9, 0x21b8
+     *   0x00000055651ed528: daddu t9, t9, ra
+     *   0x00000055651ed52c: dadd ra, at, zero
+     *   0x00000055651ed530: jr t9
+     *   0x00000055651ed534: sll zero, zero, 0
+     */
+    move(AT, RA);
+    emit_long(insn_ORRI(regimm_op, 0, bgezal_op, 1));
+    nop();
+    lui(T9, 0); // to be patched
+    ori(T9, T9, 0);
+    daddu(T9, T9, RA);
+    move(RA, AT);
+    jr(T9);
+  }
+}
+
+void MacroAssembler::b_far(address entry) {
+  u_char * cur_pc = pc();
+
+  /* Jin: Near/Far jump */
+  if (is_simm16((entry - pc() - 4) / 4)) {
+    b(offset(entry));
+  } else {
+    /* address must be bounded */
+    move(AT, RA);
+    emit_long(insn_ORRI(regimm_op, 0, bgezal_op, 1));
+    nop();
+    li32(T9, entry - pc());
+    daddu(T9, T9, RA);
+    move(RA, AT);
+    jr(T9);
+  }
+}
+
+void MacroAssembler::ld_ptr(Register rt, Register offset, Register base) {
+  addu_long(AT, base, offset);
+  ld_ptr(rt, 0, AT);
+}
+
+void MacroAssembler::st_ptr(Register rt, Register offset, Register base) {
+  addu_long(AT, base, offset);
+  st_ptr(rt, 0, AT);
+}
+
+void MacroAssembler::ld_long(Register rt, Register offset, Register base) {
+  addu_long(AT, base, offset);
+  ld_long(rt, 0, AT);
+}
+
+void MacroAssembler::st_long(Register rt, Register offset, Register base) {
+  addu_long(AT, base, offset);
+  st_long(rt, 0, AT);
+}
+
+Address MacroAssembler::as_Address(AddressLiteral adr) {
+  return Address(adr.target(), adr.rspec());
+}
+
+Address MacroAssembler::as_Address(ArrayAddress adr) {
+  return Address::make_array(adr);
+}
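+
+// Background sketch for the LL/SC loop in atomic_inc32 below (illustrative
+// only): ll load-links the word at [tmp_reg1]; sc stores conditionally and
+// writes 1 into its source register on success, 0 if another CPU touched the
+// line in between, in which case we retry:
+//
+//   again:
+//     ll   tmp2, 0(tmp1)      # linked load of the counter
+//     addi tmp2, tmp2, inc    # bump it
+//     sc   tmp2, 0(tmp1)      # try to store; tmp2 := 1 on success, 0 on failure
+//     beq  tmp2, zero, again  # lost the race, retry
+//
+// The leading sync() is a conservative memory barrier for parts where ll
+// alone does not order accesses (guarded by !Use3A2000).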
+
+// tmp_reg1 and tmp_reg2 should be saved outside of atomic_inc32 (caller saved).
+void MacroAssembler::atomic_inc32(address counter_addr, int inc, Register tmp_reg1, Register tmp_reg2) {
+  Label again;
+
+  li(tmp_reg1, counter_addr);
+  bind(again);
+  if (!Use3A2000) sync();
+  ll(tmp_reg2, tmp_reg1, 0);
+  addi(tmp_reg2, tmp_reg2, inc);
+  sc(tmp_reg2, tmp_reg1, 0);
+  beq(tmp_reg2, R0, again);
+  delayed()->nop();
+}
+
+int MacroAssembler::biased_locking_enter(Register lock_reg,
+                                         Register obj_reg,
+                                         Register swap_reg,
+                                         Register tmp_reg,
+                                         bool swap_reg_contains_mark,
+                                         Label& done,
+                                         Label* slow_case,
+                                         BiasedLockingCounters* counters) {
+  assert(UseBiasedLocking, "why call this otherwise?");
+  bool need_tmp_reg = false;
+  if (tmp_reg == noreg) {
+    need_tmp_reg = true;
+    tmp_reg = T9;
+  }
+  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, AT);
+  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
+  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
+  Address saved_mark_addr(lock_reg, 0);
+
+  // Biased locking
+  // See whether the lock is currently biased toward our thread and
+  // whether the epoch is still valid
+  // Note that the runtime guarantees sufficient alignment of JavaThread
+  // pointers to allow age to be placed into low bits
+  // First check to see whether biasing is even enabled for this object
+  Label cas_label;
+  int null_check_offset = -1;
+  if (!swap_reg_contains_mark) {
+    null_check_offset = offset();
+    ld_ptr(swap_reg, mark_addr);
+  }
+
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  move(tmp_reg, swap_reg);
+  andi(tmp_reg, tmp_reg, markOopDesc::biased_lock_mask_in_place);
+#ifdef _LP64
+  daddi(AT, R0, markOopDesc::biased_lock_pattern);
+  dsub(AT, AT, tmp_reg);
+#else
+  addi(AT, R0, markOopDesc::biased_lock_pattern);
+  sub(AT, AT, tmp_reg);
+#endif
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+
+  bne(AT, R0, cas_label);
+  delayed()->nop();
+
+
+  // The bias pattern is present in the object's header. Need to check
+  // whether the bias owner and the epoch are both still current.
+  // Note that because there is no current thread register on MIPS we
+  // need to store off the mark word we read out of the object to
+  // avoid reloading it and needing to recheck invariants below. This
+  // store is unfortunate but it makes the overall code shorter and
+  // simpler.
+  st_ptr(swap_reg, saved_mark_addr);
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  if (swap_reg_contains_mark) {
+    null_check_offset = offset();
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+  xorr(tmp_reg, tmp_reg, swap_reg);
+  get_thread(swap_reg);
+  xorr(swap_reg, swap_reg, tmp_reg);
+
+  move(AT, ~((int) markOopDesc::age_mask_in_place));
+  andr(swap_reg, swap_reg, AT);
+
+  if (PrintBiasedLockingStatistics) {
+    Label L;
+    bne(swap_reg, R0, L);
+    delayed()->nop();
+    push(tmp_reg);
+    push(A0);
+    atomic_inc32((address)BiasedLocking::biased_lock_entry_count_addr(), 1, A0, tmp_reg);
+    pop(A0);
+    pop(tmp_reg);
+    bind(L);
+  }
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  beq(swap_reg, R0, done);
+  delayed()->nop();
+  Label try_revoke_bias;
+  Label try_rebias;
+
+  // At this point we know that the header has the bias pattern and
+  // that we are not the bias owner in the current epoch. We need to
+  // figure out more details about the state of the header in order to
+  // know what operations can be legally performed on the object's
+  // header.
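+
+  // Sketch of how the xor result is decoded below (illustrative; bit
+  // positions follow markOopDesc): after
+  //   swap_reg = (prototype_header ^ mark ^ thread) & ~age_mask
+  // the tests read:
+  //   swap_reg == 0                 -> biased to us, epoch valid: done
+  //   biased_lock bits differ       -> class no longer biasable: revoke
+  //   epoch bits differ             -> stale epoch: rebias to us
+  //   otherwise (owner bits differ) -> try to CAS ourselves in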
+
+  // If the low three bits in the xor result aren't clear, that means
+  // the prototype header is no longer biased and we have to revoke
+  // the bias on this object.
+
+  move(AT, markOopDesc::biased_lock_mask_in_place);
+  andr(AT, swap_reg, AT);
+  bne(AT, R0, try_revoke_bias);
+  delayed()->nop();
+  // Biasing is still enabled for this data type. See whether the
+  // epoch of the current bias is still valid, meaning that the epoch
+  // bits of the mark word are equal to the epoch bits of the
+  // prototype header. (Note that the prototype header's epoch bits
+  // only change at a safepoint.) If not, attempt to rebias the object
+  // toward the current thread. Note that we must be absolutely sure
+  // that the current epoch is invalid in order to do this because
+  // otherwise the manipulations it performs on the mark word are
+  // illegal.
+
+  move(AT, markOopDesc::epoch_mask_in_place);
+  andr(AT, swap_reg, AT);
+  bne(AT, R0, try_rebias);
+  delayed()->nop();
+  // The epoch of the current bias is still valid but we know nothing
+  // about the owner; it might be set or it might be clear. Try to
+  // acquire the bias of the object using an atomic operation. If this
+  // fails we will go in to the runtime to revoke the object's bias.
+  // Note that we first construct the presumed unbiased header so we
+  // don't accidentally blow away another thread's valid bias.
+
+  ld_ptr(swap_reg, saved_mark_addr);
+
+  move(AT, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
+  andr(swap_reg, swap_reg, AT);
+
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  get_thread(tmp_reg);
+  orr(tmp_reg, tmp_reg, swap_reg);
+  //if (os::is_MP()) {
+  //  sync();
+  //}
+  cmpxchg(tmp_reg, Address(obj_reg, 0), swap_reg);
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // If the biasing toward our thread failed, this means that
+  // another thread succeeded in biasing it toward itself and we
+  // need to revoke that bias. The revocation will occur in the
+  // interpreter runtime in the slow case.
+  if (PrintBiasedLockingStatistics) {
+    Label L;
+    bne(AT, R0, L);
+    delayed()->nop();
+    push(tmp_reg);
+    push(A0);
+    atomic_inc32((address)BiasedLocking::anonymously_biased_lock_entry_count_addr(), 1, A0, tmp_reg);
+    pop(A0);
+    pop(tmp_reg);
+    bind(L);
+  }
+  if (slow_case != NULL) {
+    beq_far(AT, R0, *slow_case);
+    delayed()->nop();
+  }
+  b(done);
+  delayed()->nop();
+
+  bind(try_rebias);
+  // At this point we know the epoch has expired, meaning that the
+  // current "bias owner", if any, is actually invalid. Under these
+  // circumstances _only_, we are allowed to use the current header's
+  // value as the comparison value when doing the cas to acquire the
+  // bias in the current epoch. In other words, we allow transfer of
+  // the bias from one thread to another directly in this situation.
+  //
+  // FIXME: due to a lack of registers we currently blow away the age
+  // bits in this situation. Should attempt to preserve them.
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+  get_thread(swap_reg);
+  orr(tmp_reg, tmp_reg, swap_reg);
+  ld_ptr(swap_reg, saved_mark_addr);
+
+  //if (os::is_MP()) {
+  //  sync();
+  //}
+  cmpxchg(tmp_reg, Address(obj_reg, 0), swap_reg);
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // If the biasing toward our thread failed, then another thread
+  // succeeded in biasing it toward itself and we need to revoke that
+  // bias. The revocation will occur in the runtime in the slow case.
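+  // Note on the flag tested below (illustrative): in this port, cmpxchg
+  // leaves its success indication in AT (non-zero on a successful CAS, zero
+  // on failure), which is why a failed CAS falls through to *slow_case via
+  // beq_far(AT, R0, ...).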
+  if (PrintBiasedLockingStatistics) {
+    Label L;
+    bne(AT, R0, L);
+    delayed()->nop();
+    push(AT);
+    push(tmp_reg);
+    atomic_inc32((address)BiasedLocking::rebiased_lock_entry_count_addr(), 1, AT, tmp_reg);
+    pop(tmp_reg);
+    pop(AT);
+    bind(L);
+  }
+  if (slow_case != NULL) {
+    beq_far(AT, R0, *slow_case);
+    delayed()->nop();
+  }
+
+  b(done);
+  delayed()->nop();
+  bind(try_revoke_bias);
+  // The prototype mark in the klass doesn't have the bias bit set any
+  // more, indicating that objects of this data type are not supposed
+  // to be biased any more. We are going to try to reset the mark of
+  // this object to the prototype value and fall through to the
+  // CAS-based locking scheme. Note that if our CAS fails, it means
+  // that another thread raced us for the privilege of revoking the
+  // bias of this particular object, so it's okay to continue in the
+  // normal locking code.
+  //
+  // FIXME: due to a lack of registers we currently blow away the age
+  // bits in this situation. Should attempt to preserve them.
+  ld_ptr(swap_reg, saved_mark_addr);
+
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+  //if (os::is_MP()) {
+  //  lock();
+  //}
+  cmpxchg(tmp_reg, Address(obj_reg, 0), swap_reg);
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // Fall through to the normal CAS-based lock, because no matter what
+  // the result of the above CAS, some thread must have succeeded in
+  // removing the bias bit from the object's header.
+  if (PrintBiasedLockingStatistics) {
+    Label L;
+    bne(AT, R0, L);
+    delayed()->nop();
+    push(AT);
+    push(tmp_reg);
+    atomic_inc32((address)BiasedLocking::revoked_lock_entry_count_addr(), 1, AT, tmp_reg);
+    pop(tmp_reg);
+    pop(AT);
+    bind(L);
+  }
+
+  bind(cas_label);
+  return null_check_offset;
+}
+
+void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
+  assert(UseBiasedLocking, "why call this otherwise?");
+
+  // Check for biased locking unlock case, which is a no-op
+  // Note: we do not have to check the thread ID for two reasons.
+  // First, the interpreter checks for IllegalMonitorStateException at
+  // a higher level. Second, if the bias was revoked while we held the
+  // lock, the object could not be rebiased toward another thread, so
+  // the bias bit would be clear.
+#ifdef _LP64
+  ld(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
+  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
+  daddi(AT, R0, markOopDesc::biased_lock_pattern);
+#else
+  lw(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
+  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
+  addi(AT, R0, markOopDesc::biased_lock_pattern);
+#endif
+
+  beq(AT, temp_reg, done);
+  delayed()->nop();
+}
+
+// NOTE: we don't increment the SP after the call like the x86 version does; maybe this is a problem, FIXME.
+// The stack pointer adjustment is needed, see InterpreterMacroAssembler::super_call_VM_leaf.
+// This method handles the stack problem, so you need not preserve stack space for the argument now.
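+// Illustrative sketch of the alignment dance in call_VM_leaf_base below
+// (assumes the n64 ABI requirement that SP be 16-byte aligned at calls):
+//   andi AT, SP, 0xf        # low four bits of SP
+//   beq  AT, R0, L          # already aligned: call directly
+//   daddi SP, SP, -8        # SP was 8 mod 16: push 8 bytes to realign
+//   ... call ...; daddi SP, SP, 8
+// e.g. SP = 0x...ff28 -> AT = 8 -> subtract 8 -> 0x...ff20 is 16-byte aligned.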
+void MacroAssembler::call_VM_leaf_base(address entry_point,
+                                       int number_of_arguments) {
+  //call(RuntimeAddress(entry_point));
+  //increment(rsp, number_of_arguments * wordSize);
+  Label L, E;
+
+  assert(number_of_arguments <= 4, "just check");
+
+  andi(AT, SP, 0xf);
+  beq(AT, R0, L);
+  delayed()->nop();
+  daddi(SP, SP, -8);
+  call(entry_point, relocInfo::runtime_call_type);
+  delayed()->nop();
+  daddi(SP, SP, 8);
+  b(E);
+  delayed()->nop();
+
+  bind(L);
+  call(entry_point, relocInfo::runtime_call_type);
+  delayed()->nop();
+  bind(E);
+}
+
+
+void MacroAssembler::jmp(address entry) {
+  patchable_set48(T9, (long)entry);
+  jr(T9);
+}
+
+void MacroAssembler::jmp(address entry, relocInfo::relocType rtype) {
+  switch (rtype) {
+    case relocInfo::runtime_call_type:
+    case relocInfo::none:
+      jmp(entry);
+      break;
+    default:
+      {
+        InstructionMark im(this);
+        relocate(rtype);
+        patchable_set48(T9, (long)entry);
+        jr(T9);
+      }
+      break;
+  }
+}
+
+void MacroAssembler::call(address entry) {
+// C/C++ code assumes T9 is the entry point, so we always move entry into T9.
+// Maybe there is a more graceful way to handle this. FIXME
+// For more info, see class NativeCall.
+#ifndef _LP64
+  move(T9, (int)entry);
+#else
+  patchable_set48(T9, (long)entry);
+#endif
+  jalr(T9);
+}
+
+void MacroAssembler::call(address entry, relocInfo::relocType rtype) {
+  switch (rtype) {
+    case relocInfo::runtime_call_type:
+    case relocInfo::none:
+      call(entry);
+      break;
+    default:
+      {
+        InstructionMark im(this);
+        relocate(rtype);
+        call(entry);
+      }
+      break;
+  }
+}
+
+void MacroAssembler::call(address entry, RelocationHolder& rh) {
+  switch (rh.type()) {
+    case relocInfo::runtime_call_type:
+    case relocInfo::none:
+      call(entry);
+      break;
+    default:
+      {
+        InstructionMark im(this);
+        relocate(rh);
+        call(entry);
+      }
+      break;
+  }
+}
+
+void MacroAssembler::ic_call(address entry) {
+  RelocationHolder rh = virtual_call_Relocation::spec(pc());
+  patchable_set48(IC_Klass, (long)Universe::non_oop_word());
+  assert(entry != NULL, "call most probably wrong");
+  InstructionMark im(this);
+  relocate(rh);
+  patchable_call(entry);
+}
+
+void MacroAssembler::c2bool(Register r) {
+  Label L;
+  Assembler::beq(r, R0, L);
+  delayed()->nop();
+  move(r, 1);
+  bind(L);
+}
+
+#ifndef PRODUCT
+extern "C" void findpc(intptr_t x);
+#endif
+
+void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
+  // In order to get locks to work, we need to fake an in_VM state
+  JavaThread* thread = JavaThread::current();
+  JavaThreadState saved_state = thread->thread_state();
+  thread->set_thread_state(_thread_in_vm);
+  if (ShowMessageBoxOnError) {
+    JavaThread* thread = JavaThread::current();
+    JavaThreadState saved_state = thread->thread_state();
+    thread->set_thread_state(_thread_in_vm);
+    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
+      ttyLocker ttyl;
+      BytecodeCounter::print();
+    }
+    // To see where a verify_oop failed, get $ebx+40/X for this frame.
+    // This is the value of eip which points to where verify_oop will return.
+    if (os::message_box(msg, "Execution stopped, print registers?")) {
+      ttyLocker ttyl;
+      tty->print_cr("eip = 0x%08x", eip);
+#ifndef PRODUCT
+      tty->cr();
+      findpc(eip);
+      tty->cr();
+#endif
+      tty->print_cr("rax, = 0x%08x", rax);
+      tty->print_cr("rbx, = 0x%08x", rbx);
+      tty->print_cr("rcx = 0x%08x", rcx);
+      tty->print_cr("rdx = 0x%08x", rdx);
+      tty->print_cr("rdi = 0x%08x", rdi);
+      tty->print_cr("rsi = 0x%08x", rsi);
+      tty->print_cr("rbp, = 0x%08x", rbp);
+      tty->print_cr("rsp = 0x%08x", rsp);
+      BREAKPOINT;
+    }
+  } else {
+    ttyLocker ttyl;
+    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
+    assert(false, "DEBUG MESSAGE");
+  }
+  ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
+}
+
+void MacroAssembler::debug(char* msg/*, RegistersForDebugging* regs*/) {
+  if (ShowMessageBoxOnError) {
+    JavaThreadState saved_state = JavaThread::current()->thread_state();
+    JavaThread::current()->set_thread_state(_thread_in_vm);
+    {
+      // In order to get locks to work, we need to fake an in_VM state
+      ttyLocker ttyl;
+      ::tty->print_cr("EXECUTION STOPPED: %s\n", msg);
+      if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
+        BytecodeCounter::print();
+      }
+
+      // if (os::message_box(msg, "Execution stopped, print registers?"))
+      //   regs->print(::tty);
+    }
+    ThreadStateTransition::transition(JavaThread::current(), _thread_in_vm, saved_state);
+  }
+  else
+    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
+}
+
+
+void MacroAssembler::stop(const char* msg) {
+  li(A0, (long)msg);
+#ifndef _LP64
+  //reserve space for the argument. added by yjl 7/10/2005
+  addiu(SP, SP, -1 * wordSize);
+#endif
+  call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
+  delayed()->nop();
+#ifndef _LP64
+  //restore space for the argument
+  addiu(SP, SP, 1 * wordSize);
+#endif
+  brk(17);
+}
+
+void MacroAssembler::warn(const char* msg) {
+#ifdef _LP64
+  pushad();
+  li(A0, (long)msg);
+  push(S2);
+  move(AT, -(StackAlignmentInBytes));
+  move(S2, SP);     // use S2 as a sender SP holder
+  andr(SP, SP, AT); // align stack as required by ABI
+  call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
+  delayed()->nop();
+  move(SP, S2);     // use S2 as a sender SP holder
+  pop(S2);
+  popad();
+#else
+  pushad();
+  addi(SP, SP, -4);
+  sw(A0, SP, -1 * wordSize);
+  li(A0, (long)msg);
+  addi(SP, SP, -1 * wordSize);
+  call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
+  delayed()->nop();
+  addi(SP, SP, 1 * wordSize);
+  lw(A0, SP, -1 * wordSize);
+  addi(SP, SP, 4);
+  popad();
+#endif
+}
+
+void MacroAssembler::print_reg(Register reg) {
+/*
+  char *s = getenv("PRINT_REG");
+  if (s == NULL)
+    return;
+  if (strcmp(s, "1") != 0)
+    return;
+*/
+  void * cur_pc = pc();
+  pushad();
+  NOT_LP64(push(FP);)
+
+  li(A0, (long)reg->name());
+  if (reg == SP)
+    addiu(A1, SP, wordSize * 23); //23 registers saved in pushad()
+  else if (reg == A0)
+    ld(A1, SP, wordSize * 19);    //A0 has been modified by li(A0, (long)reg->name()). Ugly code!
+  else
+    move(A1, reg);
+  li(A2, (long)cur_pc);
+  push(S2);
+  move(AT, -(StackAlignmentInBytes));
+  move(S2, SP);     // use S2 as a sender SP holder
+  andr(SP, SP, AT); // align stack as required by ABI
+  call(CAST_FROM_FN_PTR(address, SharedRuntime::print_reg_with_pc), relocInfo::runtime_call_type);
+  delayed()->nop();
+  move(SP, S2);     // use S2 as a sender SP holder
+  pop(S2);
+  NOT_LP64(pop(FP);)
+  popad();
+
+/*
+  pushad();
+#ifdef _LP64
+  if (reg == SP)
+    addiu(A0, SP, wordSize * 23); //23 registers saved in pushad()
+  else
+    move(A0, reg);
+  call(CAST_FROM_FN_PTR(address, SharedRuntime::print_long), relocInfo::runtime_call_type);
+  delayed()->nop();
+#else
+  push(FP);
+  move(A0, reg);
+  dsrl32(A1, reg, 0);
+  //call(CAST_FROM_FN_PTR(address, SharedRuntime::print_int), relocInfo::runtime_call_type);
+  call(CAST_FROM_FN_PTR(address, SharedRuntime::print_long), relocInfo::runtime_call_type);
+  delayed()->nop();
+  pop(FP);
+#endif
+  popad();
+  pushad();
+  NOT_LP64(push(FP);)
+  char b[50];
+  sprintf((char *)b, " pc: %p\n", cur_pc);
+  li(A0, (long)(char *)b);
+  call(CAST_FROM_FN_PTR(address, SharedRuntime::print_str), relocInfo::runtime_call_type);
+  delayed()->nop();
+  NOT_LP64(pop(FP);)
+  popad();
+*/
+}
+
+void MacroAssembler::print_reg(FloatRegister reg) {
+  void * cur_pc = pc();
+  pushad();
+  NOT_LP64(push(FP);)
+  li(A0, (long)reg->name());
+  push(S2);
+  move(AT, -(StackAlignmentInBytes));
+  move(S2, SP);     // use S2 as a sender SP holder
+  andr(SP, SP, AT); // align stack as required by ABI
+  call(CAST_FROM_FN_PTR(address, SharedRuntime::print_str), relocInfo::runtime_call_type);
+  delayed()->nop();
+  move(SP, S2);     // use S2 as a sender SP holder
+  pop(S2);
+  NOT_LP64(pop(FP);)
+  popad();
+
+  pushad();
+  NOT_LP64(push(FP);)
+#if 1
+  move(FP, SP);
+  move(AT, -(StackAlignmentInBytes));
+  andr(SP, SP, AT);
+  mov_d(F12, reg);
+  call(CAST_FROM_FN_PTR(address, SharedRuntime::print_double), relocInfo::runtime_call_type);
+  delayed()->nop();
+  move(SP, FP);
+#else
+  mov_s(F12, reg);
+  //call(CAST_FROM_FN_PTR(address, SharedRuntime::print_float), relocInfo::runtime_call_type);
+  //delayed()->nop();
+#endif
+  NOT_LP64(pop(FP);)
+  popad();
+
+#if 0
+  pushad();
+  NOT_LP64(push(FP);)
+  char* b = new char[50];
+  sprintf(b, " pc: %p\n", cur_pc);
+  li(A0, (long)b);
+  call(CAST_FROM_FN_PTR(address, SharedRuntime::print_str), relocInfo::runtime_call_type);
+  delayed()->nop();
+  NOT_LP64(pop(FP);)
+  popad();
+#endif
+}
+
+void MacroAssembler::increment(Register reg, int imm) {
+  if (!imm) return;
+  if (is_simm16(imm)) {
+#ifdef _LP64
+    daddiu(reg, reg, imm);
+#else
+    addiu(reg, reg, imm);
+#endif
+  } else {
+    move(AT, imm);
+#ifdef _LP64
+    daddu(reg, reg, AT);
+#else
+    addu(reg, reg, AT);
+#endif
+  }
+}
+
+void MacroAssembler::decrement(Register reg, int imm) {
+  increment(reg, -imm);
+}
+
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             bool check_exceptions) {
+  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             Register arg_1,
+                             bool check_exceptions) {
+  if (arg_1 != A1) move(A1, arg_1);
+  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             bool check_exceptions) {
+  if (arg_1 != A1) move(A1, arg_1);
+  if (arg_2 != A2) move(A2, arg_2);
+  assert(arg_2 != A1, "smashed argument");
+  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
+}
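+
+// Why the "smashed argument" asserts (illustrative): the moves above are
+// emitted in order, so if a caller happened to pass arg_2 in A1, then
+//   move(A1, arg_1)   // clobbers A1 == arg_2
+//   move(A2, arg_2)   // now copies the already-overwritten value
+// The asserts catch such aliasing; they do not reorder the moves.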
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             Register arg_3,
+                             bool check_exceptions) {
+  if (arg_1 != A1) move(A1, arg_1);
+  if (arg_2 != A2) move(A2, arg_2); assert(arg_2 != A1, "smashed argument");
+  if (arg_3 != A3) move(A3, arg_3); assert(arg_3 != A1 && arg_3 != A2, "smashed argument");
+  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             int number_of_arguments,
+                             bool check_exceptions) {
+  call_VM_base(oop_result, NOREG, last_java_sp, entry_point, number_of_arguments, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             Register arg_1,
+                             bool check_exceptions) {
+  if (arg_1 != A1) move(A1, arg_1);
+  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             bool check_exceptions) {
+  if (arg_1 != A1) move(A1, arg_1);
+  if (arg_2 != A2) move(A2, arg_2); assert(arg_2 != A1, "smashed argument");
+  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             Register arg_3,
+                             bool check_exceptions) {
+  if (arg_1 != A1) move(A1, arg_1);
+  if (arg_2 != A2) move(A2, arg_2); assert(arg_2 != A1, "smashed argument");
+  if (arg_3 != A3) move(A3, arg_3); assert(arg_3 != A1 && arg_3 != A2, "smashed argument");
+  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
+}
+
+void MacroAssembler::call_VM_base(Register oop_result,
+                                  Register java_thread,
+                                  Register last_java_sp,
+                                  address entry_point,
+                                  int number_of_arguments,
+                                  bool check_exceptions) {
+
+  address before_call_pc;
+  // determine java_thread register
+  if (!java_thread->is_valid()) {
+#ifndef OPT_THREAD
+    java_thread = T2;
+    get_thread(java_thread);
+#else
+    java_thread = TREG;
+#endif
+  }
+  // determine last_java_sp register
+  if (!last_java_sp->is_valid()) {
+    last_java_sp = SP;
+  }
+  // debugging support
+  assert(number_of_arguments >= 0  , "cannot have negative number of arguments");
+  assert(number_of_arguments <= 4  , "cannot have more than 4 arguments");
+  assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
+  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
+
+  assert(last_java_sp != FP, "this code doesn't work for last_java_sp == fp, which currently can't portably work anyway since C2 doesn't save ebp");
+
+  // set last Java frame before call
+  before_call_pc = (address)pc();
+  set_last_Java_frame(java_thread, last_java_sp, FP, before_call_pc);
+
+  // do the call
+  move(A0, java_thread);
+  call(entry_point, relocInfo::runtime_call_type);
+  delayed()->nop();
+
+  // restore the thread (cannot use the pushed argument since arguments
+  // may be overwritten by C code generated by an optimizing compiler);
+  // however can use the register value directly if it is callee saved.
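+  // (Illustrative note: under the MIPS ABI, S0..S7 are callee-saved, so if
+  // java_thread was materialized in one of them it survives the C call and
+  // the ASSERT block below only re-checks it; any other register must be
+  // reloaded via get_thread().)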
+#ifndef OPT_THREAD
+  if (java_thread >= S0 && java_thread <= S7) {
+#ifdef ASSERT
+    { Label L;
+      get_thread(AT);
+      beq(java_thread, AT, L);
+      delayed()->nop();
+      stop("MacroAssembler::call_VM_base: java_thread not callee saved?");
+      bind(L);
+    }
+#endif
+  } else {
+    get_thread(java_thread);
+  }
+#endif
+
+  // discard thread and arguments
+  ld_ptr(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
+  // reset last Java frame
+  reset_last_Java_frame(java_thread, false, true);
+
+  check_and_handle_popframe(java_thread);
+  check_and_handle_earlyret(java_thread);
+  if (check_exceptions) {
+    // check for pending exceptions (java_thread is set upon return)
+    Label L;
+#ifdef _LP64
+    ld(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
+#else
+    lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
+#endif
+    beq(AT, R0, L);
+    delayed()->nop();
+    li(AT, before_call_pc);
+    push(AT);
+    jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
+    delayed()->nop();
+    bind(L);
+  }
+
+  // get oop result if there is one and reset the value in the thread
+  if (oop_result->is_valid()) {
+#ifdef _LP64
+    ld(oop_result, java_thread, in_bytes(JavaThread::vm_result_offset()));
+    sd(R0, java_thread, in_bytes(JavaThread::vm_result_offset()));
+#else
+    lw(oop_result, java_thread, in_bytes(JavaThread::vm_result_offset()));
+    sw(R0, java_thread, in_bytes(JavaThread::vm_result_offset()));
+#endif
+    verify_oop(oop_result);
+  }
+}
+
+void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
+
+  move(V0, SP);
+  //we also reserve space for java_thread here
+#ifndef _LP64
+  daddi(SP, SP, (1 + number_of_arguments) * (-wordSize));
+#endif
+  move(AT, -(StackAlignmentInBytes));
+  andr(SP, SP, AT);
+  call_VM_base(oop_result, NOREG, V0, entry_point, number_of_arguments, check_exceptions);
+
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
+  call_VM_leaf_base(entry_point, number_of_arguments);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
+  if (arg_0 != A0) move(A0, arg_0);
+  call_VM_leaf(entry_point, 1);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
+  if (arg_0 != A0) move(A0, arg_0);
+  if (arg_1 != A1) move(A1, arg_1); assert(arg_1 != A0, "smashed argument");
+  call_VM_leaf(entry_point, 2);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
+  if (arg_0 != A0) move(A0, arg_0);
+  if (arg_1 != A1) move(A1, arg_1); assert(arg_1 != A0, "smashed argument");
+  if (arg_2 != A2) move(A2, arg_2); assert(arg_2 != A0 && arg_2 != A1, "smashed argument");
+  call_VM_leaf(entry_point, 3);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point) {
+  MacroAssembler::call_VM_leaf_base(entry_point, 0);
+}
+
+
+void MacroAssembler::super_call_VM_leaf(address entry_point,
+                                        Register arg_1) {
+  if (arg_1 != A0) move(A0, arg_1);
+  MacroAssembler::call_VM_leaf_base(entry_point, 1);
+}
+
+
+void MacroAssembler::super_call_VM_leaf(address entry_point,
+                                        Register arg_1,
+                                        Register arg_2) {
+  if (arg_1 != A0) move(A0, arg_1);
+  if (arg_2 != A1) move(A1, arg_2); assert(arg_2 != A0, "smashed argument");
+  MacroAssembler::call_VM_leaf_base(entry_point, 2);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point,
+                                        Register arg_1,
+                                        Register arg_2,
+                                        Register arg_3) {
+  if (arg_1 != A0) move(A0, arg_1);
+  if (arg_2 != A1) move(A1, arg_2); assert(arg_2 != A0, "smashed argument");
+  if (arg_3 != A2) move(A2, arg_3); assert(arg_3 != A0 && arg_3 != A1, "smashed argument");
+  MacroAssembler::call_VM_leaf_base(entry_point, 3);
+}
+
+void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
+}
+
+void MacroAssembler::check_and_handle_popframe(Register java_thread) {
+}
+
+void MacroAssembler::null_check(Register reg, int offset) {
+  if (needs_explicit_null_check(offset)) {
+    // provoke OS NULL exception if reg = NULL by
+    // accessing M[reg] w/o changing any (non-CC) registers
+    // NOTE: cmpl is plenty here to provoke a segv
+    lw(AT, reg, 0);
+/* Jin
+    nop();
+    nop();
+    nop();
+*/
+    // Note: should probably use testl(rax, Address(reg, 0));
+    //       may be shorter code (however, this version of
+    //       testl needs to be implemented first)
+  } else {
+    // nothing to do, (later) access of M[reg + offset]
+    // will provoke OS NULL exception if reg = NULL
+  }
+}
+
+void MacroAssembler::enter() {
+  push2(RA, FP);
+  move(FP, SP);
+}
+
+void MacroAssembler::leave() {
+#ifndef _LP64
+  //move(SP, FP);
+  //pop2(FP, RA);
+  addi(SP, FP, 2 * wordSize);
+  lw(RA, SP, -1 * wordSize);
+  lw(FP, SP, -2 * wordSize);
+#else
+  daddi(SP, FP, 2 * wordSize);
+  ld(RA, SP, -1 * wordSize);
+  ld(FP, SP, -2 * wordSize);
+#endif
+}
+/*
+void MacroAssembler::os_breakpoint() {
+  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
+  // (e.g., MSVC can't call ps() otherwise)
+  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
+}
+*/
+void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
+  // determine java_thread register
+  if (!java_thread->is_valid()) {
+#ifndef OPT_THREAD
+    java_thread = T1;
+    get_thread(java_thread);
+#else
+    java_thread = TREG;
+#endif
+  }
+  // we must set sp to zero to clear frame
+  st_ptr(R0, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
+  // must clear fp, so that compiled frames are not confused; it is possible
+  // that we need it only for debugging
+  if (clear_fp)
+    st_ptr(R0, java_thread, in_bytes(JavaThread::last_Java_fp_offset()));
+
+  if (clear_pc)
+    st_ptr(R0, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
+}
+
+void MacroAssembler::reset_last_Java_frame(bool clear_fp,
+                                           bool clear_pc) {
+  Register thread = TREG;
+#ifndef OPT_THREAD
+  get_thread(thread);
+#endif
+  // we must set sp to zero to clear frame
+  sd(R0, Address(thread, JavaThread::last_Java_sp_offset()));
+  // must clear fp, so that compiled frames are not confused; it is
+  // possible that we need it only for debugging
+  if (clear_fp) {
+    sd(R0, Address(thread, JavaThread::last_Java_fp_offset()));
+  }
+
+  if (clear_pc) {
+    sd(R0, Address(thread, JavaThread::last_Java_pc_offset()));
+  }
+}
+
+// Write serialization page so VM thread can do a pseudo remote membar.
+// We use the current thread pointer to calculate a thread specific
+// offset to write to within the page. This minimizes bus traffic
+// due to cache line collision.
+void MacroAssembler::serialize_memory(Register thread, Register tmp) {
+  move(tmp, thread);
+  srl(tmp, tmp, os::get_serialize_page_shift_count());
+  move(AT, (os::vm_page_size() - sizeof(int)));
+  andr(tmp, tmp, AT);
+  sw(tmp, Address(tmp, (intptr_t)os::get_memory_serialize_page()));
+}
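+
+// Worked example for serialize_memory above (illustrative): with a 4 KB
+// page and shift count n, the store offset is
+//   offset = (thread >> n) & (page_size - sizeof(int))
+// so distinct threads tend to hit distinct, int-aligned words of the
+// serialization page. E.g. thread = 0x2b3f159000, n = 12:
+//   (0x2b3f159000 >> 12) & 0xffc = 0x158
+// The mask 0xffc (4096 - 4) both bounds the offset within the page and
+// clears the low two bits, keeping the store int-aligned.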
+
+// Calls to C land
+//
+// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
+// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
+// has to be reset to 0. This is required to allow proper stack traversal.
+void MacroAssembler::set_last_Java_frame(Register java_thread,
+                                         Register last_java_sp,
+                                         Register last_java_fp,
+                                         address last_java_pc) {
+  // determine java_thread register
+  if (!java_thread->is_valid()) {
+#ifndef OPT_THREAD
+    java_thread = T2;
+    get_thread(java_thread);
+#else
+    java_thread = TREG;
+#endif
+  }
+  // determine last_java_sp register
+  if (!last_java_sp->is_valid()) {
+    last_java_sp = SP;
+  }
+
+  // last_java_fp is optional
+
+  if (last_java_fp->is_valid()) {
+    st_ptr(last_java_fp, java_thread, in_bytes(JavaThread::last_Java_fp_offset()));
+  }
+
+  // last_java_pc is optional
+
+  if (last_java_pc != NULL) {
+    relocate(relocInfo::internal_pc_type);
+    patchable_set48(AT, (long)last_java_pc);
+    st_ptr(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
+  }
+  st_ptr(last_java_sp, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
+}
+
+void MacroAssembler::set_last_Java_frame(Register last_java_sp,
+                                         Register last_java_fp,
+                                         address last_java_pc) {
+  // determine last_java_sp register
+  if (!last_java_sp->is_valid()) {
+    last_java_sp = SP;
+  }
+
+  Register thread = TREG;
+#ifndef OPT_THREAD
+  get_thread(thread);
+#endif
+  // last_java_fp is optional
+  if (last_java_fp->is_valid()) {
+    sd(last_java_fp, Address(thread, JavaThread::last_Java_fp_offset()));
+  }
+
+  // last_java_pc is optional
+  if (last_java_pc != NULL) {
+    Address java_pc(thread,
+                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
+    li(AT, (intptr_t)(last_java_pc));
+    sd(AT, java_pc);
+  }
+
+  sd(last_java_sp, Address(thread, JavaThread::last_Java_sp_offset()));
+}
+
+//////////////////////////////////////////////////////////////////////////////////
+#if INCLUDE_ALL_GCS
+
+void MacroAssembler::g1_write_barrier_pre(Register obj,
+#ifndef _LP64
+                                          Register thread,
+#endif
+                                          Register tmp,
+                                          Register tmp2,
+                                          bool tosca_live) {
+  Unimplemented();
+}
+
+void MacroAssembler::g1_write_barrier_post(Register store_addr,
+                                           Register new_val,
+#ifndef _LP64
+                                           Register thread,
+#endif
+                                           Register tmp,
+                                           Register tmp2) {
+
+  Unimplemented();
+}
+
+#endif // INCLUDE_ALL_GCS
+//////////////////////////////////////////////////////////////////////////////////
+
+
+void MacroAssembler::store_check(Register obj) {
+  // Does a store check for the oop in register obj. The content of
+  // register obj is destroyed afterwards.
+  store_check_part_1(obj);
+  store_check_part_2(obj);
+}
+
+void MacroAssembler::store_check(Register obj, Address dst) {
+  store_check(obj);
+}
+
+
+// split the store check operation so that other instructions can be scheduled in between
+void MacroAssembler::store_check_part_1(Register obj) {
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+#ifdef _LP64
+  dsrl(obj, obj, CardTableModRefBS::card_shift);
+#else
+  shr(obj, CardTableModRefBS::card_shift);
+#endif
+}
+
+void MacroAssembler::store_check_part_2(Register obj) {
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+  li(AT, (long)ct->byte_map_base);
+#ifdef _LP64
+  dadd(AT, AT, obj);
+#else
+  add(AT, AT, obj);
+#endif
+  sb(R0, AT, 0);
+  sync();
+}
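+
+// Worked example for the card-table barrier above (illustrative; assumes
+// the usual 512-byte cards, i.e. CardTableModRefBS::card_shift == 9):
+// storing into an object at 0x2aaab6780 dirties
+//   card index = 0x2aaab6780 >> 9 = 0x15555b3
+//   sb zero, byte_map_base[0x15555b3]
+// one byte write per 512-byte heap range; the trailing sync() publishes the
+// dirty byte to the GC threads.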
+
+// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
+void MacroAssembler::tlab_allocate(Register obj, Register var_size_in_bytes, int con_size_in_bytes,
+                                   Register t1, Register t2, Label& slow_case) {
+  assert_different_registers(obj, var_size_in_bytes, t1, t2, AT);
+
+  Register end = t2;
+#ifndef OPT_THREAD
+  Register thread = t1;
+  get_thread(thread);
+#else
+  Register thread = TREG;
+#endif
+  verify_tlab(t1, t2); //blows t1&t2
+
+  ld_ptr(obj, thread, in_bytes(JavaThread::tlab_top_offset()));
+
+  if (var_size_in_bytes == NOREG) {
+    // I don't think we need to move con_size_in_bytes to a register first.
+    // by yjl 8/17/2005
+    assert(is_simm16(con_size_in_bytes), "fixme by moving imm to a register first");
+    addi(end, obj, con_size_in_bytes);
+  } else {
+    add(end, obj, var_size_in_bytes);
+  }
+
+  ld_ptr(AT, thread, in_bytes(JavaThread::tlab_end_offset()));
+  sltu(AT, AT, end);
+  bne_far(AT, R0, slow_case);
+  delayed()->nop();
+
+
+  // update the tlab top pointer
+  st_ptr(end, thread, in_bytes(JavaThread::tlab_top_offset()));
+
+  // recover var_size_in_bytes if necessary
+  /*if (var_size_in_bytes == end) {
+    sub(var_size_in_bytes, end, obj);
+  }*/
+
+  verify_tlab(t1, t2);
+}
+
+// Defines obj, preserves var_size_in_bytes
+void MacroAssembler::eden_allocate(Register obj, Register var_size_in_bytes, int con_size_in_bytes,
+                                   Register t1, Register t2, Label& slow_case) {
+  assert_different_registers(obj, var_size_in_bytes, t1, AT);
+  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) { //by yyq
+    // No allocation in the shared eden.
+    b_far(slow_case);
+    delayed()->nop();
+  } else {
+
+#ifndef _LP64
+    Address heap_top(t1, Assembler::split_low((intptr_t)Universe::heap()->top_addr()));
+    lui(t1, split_high((intptr_t)Universe::heap()->top_addr()));
+#else
+    Address heap_top(t1);
+    li(t1, (long)Universe::heap()->top_addr());
+#endif
+    ld_ptr(obj, heap_top);
+
+    Register end = t2;
+    Label retry;
+
+    bind(retry);
+    if (var_size_in_bytes == NOREG) {
+      // I don't think we need to move con_size_in_bytes to a register first.
+      assert(is_simm16(con_size_in_bytes), "fixme by moving imm to a register first");
+      addi(end, obj, con_size_in_bytes);
+    } else {
+      add(end, obj, var_size_in_bytes);
+    }
+    // if end < obj then we wrapped around => object too long => slow case
+    sltu(AT, end, obj);
+    bne_far(AT, R0, slow_case);
+    delayed()->nop();
+
+    li(AT, (long)Universe::heap()->end_addr());
+    sltu(AT, AT, end);
+    bne_far(AT, R0, slow_case);
+    delayed()->nop();
+    // Compare obj with the top addr, and if still equal, store the new top addr in
+    // end at the address of the top addr pointer. Sets ZF if was equal, and clears
+    // it otherwise. Use lock prefix for atomicity on MPs.
+    //if (os::is_MP()) {
+    //  sync();
+    //}
+
+    // if someone beat us on the allocation, try again, otherwise continue
+    cmpxchg(end, heap_top, obj);
+    beq_far(AT, R0, retry); //by yyq
+    delayed()->nop();
+
+  }
+}
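+
+// Notes on the two guards above (illustrative): sltu(AT, end, obj) detects
+// address wrap-around (end < obj can only happen if obj + size overflowed),
+// and the comparison against Universe::heap()->end_addr() rejects blocks
+// that would run past the eden boundary; either way we take slow_case.
+// The final cmpxchg(end, heap_top, obj) publishes the new top only if no
+// other thread advanced it since obj was loaded; otherwise we loop to retry.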
+
+// C2 doesn't invoke this one.
+void MacroAssembler::tlab_refill(Label& retry, Label& try_eden, Label& slow_case) {
+  Register top = T0;
+  Register t1  = T1;
+/* Jin: tlab_refill() is called in
+
+     [c1_Runtime1_mips.cpp] Runtime1::generate_code_for(new_type_array_id);
+
+   In generate_code_for(), T2 has been assigned as a register(length), which is used
+   after calling tlab_refill();
+   Therefore, tlab_refill() should not use T2.
+
+   Source:
+
+Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException
+        at java.lang.System.arraycopy(Native Method)
+        at java.util.Arrays.copyOf(Arrays.java:2799)  <-- alloc_array
+        at sun.misc.Resource.getBytes(Resource.java:117)
+        at java.net.URLClassLoader.defineClass(URLClassLoader.java:273)
+        at java.net.URLClassLoader.findClass(URLClassLoader.java:205)
+        at java.lang.ClassLoader.loadClass(ClassLoader.java:321)
+ */
+  Register t2 = T9;
+  Register t3 = T3;
+  Register thread_reg = T8;
+  Label do_refill, discard_tlab;
+  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) { //by yyq
+    // No allocation in the shared eden.
+    b(slow_case);
+    delayed()->nop();
+  }
+
+  get_thread(thread_reg);
+
+  ld_ptr(top, thread_reg, in_bytes(JavaThread::tlab_top_offset()));
+  ld_ptr(t1, thread_reg, in_bytes(JavaThread::tlab_end_offset()));
+
+  // calculate amount of free space
+  sub(t1, t1, top);
+  shr(t1, LogHeapWordSize);
+
+  // Retain tlab and allocate object in shared space if
+  // the amount free in the tlab is too large to discard.
+  ld_ptr(t2, thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset()));
+  slt(AT, t2, t1);
+  beq(AT, R0, discard_tlab);
+  delayed()->nop();
+
+  // Retain
+
+#ifndef _LP64
+  move(AT, ThreadLocalAllocBuffer::refill_waste_limit_increment());
+#else
+  li(AT, ThreadLocalAllocBuffer::refill_waste_limit_increment());
+#endif
+  add(t2, t2, AT);
+  st_ptr(t2, thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset()));
+
+  if (TLABStats) {
+    // increment number of slow_allocations
+    lw(AT, thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset()));
+    addiu(AT, AT, 1);
+    sw(AT, thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset()));
+  }
+  b(try_eden);
+  delayed()->nop();
+
+  bind(discard_tlab);
+  if (TLABStats) {
+    // increment number of refills
+    lw(AT, thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset()));
+    addi(AT, AT, 1);
+    sw(AT, thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset()));
+    // accumulate wastage -- t1 is amount free in tlab
+    lw(AT, thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset()));
+    add(AT, AT, t1);
+    sw(AT, thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset()));
+  }
+
+  // if tlab is currently allocated (top or end != null) then
+  // fill [top, end + alignment_reserve) with array object
+  beq(top, R0, do_refill);
+  delayed()->nop();
+
+  // set up the mark word
+  li(AT, (long)markOopDesc::prototype()->copy_set_hash(0x2));
+  st_ptr(AT, top, oopDesc::mark_offset_in_bytes());
+
+  // set the length to the remaining space
+  addi(t1, t1, -typeArrayOopDesc::header_size(T_INT));
+  addi(t1, t1, ThreadLocalAllocBuffer::alignment_reserve());
+  shl(t1, log2_intptr(HeapWordSize / sizeof(jint)));
+  sw(t1, top, arrayOopDesc::length_offset_in_bytes());
+
+  // set klass to intArrayKlass
+#ifndef _LP64
+  lui(AT, split_high((intptr_t)Universe::intArrayKlassObj_addr()));
+  lw(t1, AT, split_low((intptr_t)Universe::intArrayKlassObj_addr()));
+#else
+  li(AT, (intptr_t)Universe::intArrayKlassObj_addr());
+  ld_ptr(t1, AT, 0);
+#endif
+  //st_ptr(t1, top, oopDesc::klass_offset_in_bytes());
+  store_klass(top, t1);
+
+  // refill the tlab with an eden allocation
+  bind(do_refill);
+  ld_ptr(t1, thread_reg, in_bytes(JavaThread::tlab_size_offset()));
+  shl(t1, LogHeapWordSize);
+  // add object_size ??
+  eden_allocate(top, t1, 0, t2, t3, slow_case);
+
+  // Check that t1 was preserved in eden_allocate.
+#ifdef ASSERT
+  if (UseTLAB) {
+    Label ok;
+    assert_different_registers(thread_reg, t1);
+    ld_ptr(AT, thread_reg, in_bytes(JavaThread::tlab_size_offset()));
+    shl(AT, LogHeapWordSize);
+    beq(AT, t1, ok);
+    delayed()->nop();
+    stop("assert(t1 != tlab size)");
+    should_not_reach_here();
+
+    bind(ok);
+  }
+#endif
+  st_ptr(top, thread_reg, in_bytes(JavaThread::tlab_start_offset()));
+  st_ptr(top, thread_reg, in_bytes(JavaThread::tlab_top_offset()));
+  add(top, top, t1);
+  addi(top, top, -ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
+  st_ptr(top, thread_reg, in_bytes(JavaThread::tlab_end_offset()));
+  verify_tlab(t1, t2);
+  b(retry);
+  delayed()->nop();
+}
+
+static const double pi_4 = 0.7853981633974483;
+
+// the x86 version is too clumsy; I don't think we need that fuss. Maybe I'm wrong, FIXME
+// must get the argument (a double) in F12/F13
+//void MacroAssembler::trigfunc(char trig, bool preserve_cpu_regs, int num_fpu_regs_in_use) {
+//We need to preserve the registers that may be modified during the call @Jerome
+void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
+//save all modified registers here
+//  if (preserve_cpu_regs) {
+//  }
+//FIXME: in the disassembly of trigfunc, only V0, V1, T9, SP and RA are used, so we only save V0, V1, T9
+  pushad();
+//we should preserve the stack space before we call
+  addi(SP, SP, -wordSize * 2);
+  switch (trig) {
+    case 's':
+      call(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), relocInfo::runtime_call_type);
+      delayed()->nop();
+      break;
+    case 'c':
+      call(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), relocInfo::runtime_call_type);
+      delayed()->nop();
+      break;
+    case 't':
+      call(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), relocInfo::runtime_call_type);
+      delayed()->nop();
+      break;
+    default:
+      assert(false, "bad intrinsic");
+      break;
+  }
+
+  addi(SP, SP, wordSize * 2);
+  popad();
+//  if (preserve_cpu_regs) {
+//  }
+}
+
+#ifdef _LP64
+void MacroAssembler::li(Register rd, long imm) {
+  if (imm <= max_jint && imm >= min_jint) {
+    li32(rd, (int)imm);
+  } else if (julong(imm) <= 0xFFFFFFFF) {
+    assert_not_delayed();
+    // lui sign-extends, so we can't use that.
+    ori(rd, R0, julong(imm) >> 16);
+    dsll(rd, rd, 16);
+    ori(rd, rd, split_low(imm));
+  //aoqi_test
+  //} else if ((imm > 0) && ((imm >> 48) == 0)) {
+  } else if ((imm > 0) && is_simm16(imm >> 32)) {
+    /* A 48-bit address */
+    li48(rd, imm);
+  } else {
+    li64(rd, imm);
+  }
+}
+#else
+void MacroAssembler::li(Register rd, long imm) {
+  li32(rd, (int)imm);
+}
+#endif
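+
+// Worked example of the immediate-size buckets used by li() above and
+// set64() below (illustrative): for imm = 0x0000123456789abc,
+// is_simm16(imm >> 32) holds, so the 4-instruction li48 form applies:
+//   lui  rd, 0x1234       # rd = 0x0000000012340000 (imm16 << 16, sign-extended)
+//   ori  rd, rd, 0x5678   # rd = 0x0000000012345678
+//   dsll rd, rd, 16       # rd = 0x0000123456780000
+//   ori  rd, rd, 0x9abc   # rd = 0x0000123456789abc
+// A value that fits in 32 bits uses li32 (1-2 instructions); anything else
+// needs the 6-instruction li64.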
+
+void MacroAssembler::li32(Register reg, int imm) {
+  if (is_simm16(imm)) {
+    /* Jin: for imm < 0, we should use addi instead of addiu.
+     *
+     * java.lang.StringCoding$StringDecoder.decode(jobject, jint, jint)
+     *
+     *  78 move [int:-1|I] [a0|I]
+     *    : daddi  a0, zero, 0xffffffff (correct)
+     *    : daddiu a0, zero, 0xffffffff (incorrect)
+     */
+    if (imm >= 0)
+      addiu(reg, R0, imm);
+    else
+      addi(reg, R0, imm);
+  } else {
+    lui(reg, split_low(imm >> 16));
+    if (split_low(imm))
+      ori(reg, reg, split_low(imm));
+  }
+}
+
+#ifdef _LP64
+void MacroAssembler::set64(Register d, jlong value) {
+  assert_not_delayed();
+
+  int hi = (int)(value >> 32);
+  int lo = (int)(value & ~0);
+
+  if (value == lo) {  // 32-bit integer
+    if (is_simm16(value)) {
+      daddiu(d, R0, value);
+    } else {
+      lui(d, split_low(value >> 16));
+      if (split_low(value)) {
+        ori(d, d, split_low(value));
+      }
+    }
+  } else if (hi == 0) {  // hardware zero-extends to upper 32
+    ori(d, R0, julong(value) >> 16);
+    dsll(d, d, 16);
+    if (split_low(value)) {
+      ori(d, d, split_low(value));
+    }
+  } else if ((value > 0) && is_simm16(value >> 32)) {  // li48
+    // 4 insts
+    li48(d, value);
+  } else {  // li64
+    // 6 insts
+    li64(d, value);
+  }
+}
+
+
+int MacroAssembler::insts_for_set64(jlong value) {
+  int hi = (int)(value >> 32);
+  int lo = (int)(value & ~0);
+
+  int count = 0;
+
+  if (value == lo) {  // 32-bit integer
+    if (is_simm16(value)) {
+      //daddiu(d, R0, value);
+      count++;
+    } else {
+      //lui(d, split_low(value >> 16));
+      count++;
+      if (split_low(value)) {
+        //ori(d, d, split_low(value));
+        count++;
+      }
+    }
+  } else if (hi == 0) {  // hardware zero-extends to upper 32
+    //ori(d, R0, julong(value) >> 16);
+    //dsll(d, d, 16);
+    count += 2;
+    if (split_low(value)) {
+      //ori(d, d, split_low(value));
+      count++;
+    }
+  } else if ((value > 0) && is_simm16(value >> 32)) {  // li48
+    // 4 insts
+    //li48(d, value);
+    count += 4;
+  } else {  // li64
+    // 6 insts
+    //li64(d, value);
+    count += 6;
+  }
+
+  return count;
+}
+
+void MacroAssembler::patchable_set48(Register d, jlong value) {
+  assert_not_delayed();
+
+  int hi = (int)(value >> 32);
+  int lo = (int)(value & ~0);
+
+  int count = 0;
+
+  if (value == lo) {  // 32-bit integer
+    if (is_simm16(value)) {
+      daddiu(d, R0, value);
+      count += 1;
+    } else {
+      lui(d, split_low(value >> 16));
+      count += 1;
+      if (split_low(value)) {
+        ori(d, d, split_low(value));
+        count += 1;
+      }
+    }
+  } else if (hi == 0) {  // hardware zero-extends to upper 32
+    ori(d, R0, julong(value) >> 16);
+    dsll(d, d, 16);
+    count += 2;
+    if (split_low(value)) {
+      ori(d, d, split_low(value));
+      count += 1;
+    }
+  } else if ((value > 0) && is_simm16(value >> 32)) {  // li48
+    // 4 insts
+    li48(d, value);
+    count += 4;
+  } else {  // li64
+    tty->print_cr("value = 0x%lx", value);
+    guarantee(false, "Not supported yet !");
+  }
+
+  for (; count < 4; count++) {
+    nop();
+  }
+}
+
+void MacroAssembler::patchable_set32(Register d, jlong value) {
+  assert_not_delayed();
+
+  int hi = (int)(value >> 32);
+  int lo = (int)(value & ~0);
+
+  int count = 0;
+
+  if (value == lo) {  // 32-bit integer
+    if (is_simm16(value)) {
+      daddiu(d, R0, value);
+      count += 1;
+    } else {
+      lui(d, split_low(value >> 16));
+      count += 1;
+      if (split_low(value)) {
+        ori(d, d, split_low(value));
+        count += 1;
+      }
+    }
+  } else if (hi == 0) {  // hardware zero-extends to upper 32
+    ori(d, R0, julong(value) >> 16);
+    dsll(d, d, 16);
+    count += 2;
+    if (split_low(value)) {
+      ori(d, d, split_low(value));
+      count += 1;
+    }
+  } else {
+    tty->print_cr("value = 0x%lx", value);
+    guarantee(false, "Not supported yet !");
+  }
+
+  for (; count < 3; count++) {
+    nop();
+  }
+}
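+
+// The patchable_* forms above pad with nops up to a fixed instruction count
+// (4 for patchable_set48, 3 for patchable_set32, 2 for patchable_call32
+// below), so a later patch can rewrite the site in place with the longest
+// encoding it may need. For example, a 48-bit site always occupies 16 bytes:
+// either lui/ori/dsll/ori, or a shorter form followed by nops.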
+
+void MacroAssembler::patchable_call32(Register d, jlong value) {
+  assert_not_delayed();
+
+  int hi = (int)(value >> 32);
+  int lo = (int)(value & ~0);
+
+  int count = 0;
+
+  if (value == lo) {  // 32-bit integer
+    if (is_simm16(value)) {
+      daddiu(d, R0, value);
+      count += 1;
+    } else {
+      lui(d, split_low(value >> 16));
+      count += 1;
+      if (split_low(value)) {
+        ori(d, d, split_low(value));
+        count += 1;
+      }
+    }
+  } else {
+    tty->print_cr("value = 0x%lx", value);
+    guarantee(false, "Not supported yet !");
+  }
+
+  for (; count < 2; count++) {
+    nop();
+  }
+}
+
+void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
+  assert(UseCompressedClassPointers, "should only be used for compressed header");
+  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
+
+  int klass_index = oop_recorder()->find_index(k);
+  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
+  long narrowKlass = (long)Klass::encode_klass(k);
+
+  relocate(rspec, Assembler::narrow_oop_operand);
+  patchable_set48(dst, narrowKlass);
+}
+
+
+void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
+  assert(UseCompressedOops, "should only be used for compressed header");
+  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
+
+  int oop_index = oop_recorder()->find_index(obj);
+  RelocationHolder rspec = oop_Relocation::spec(oop_index);
+
+  relocate(rspec, Assembler::narrow_oop_operand);
+  patchable_set48(dst, oop_index);
+}
+
+void MacroAssembler::li64(Register rd, long imm) {
+  assert_not_delayed();
+  lui(rd, imm >> 48);
+  ori(rd, rd, split_low(imm >> 32));
+  dsll(rd, rd, 16);
+  ori(rd, rd, split_low(imm >> 16));
+  dsll(rd, rd, 16);
+  ori(rd, rd, split_low(imm));
+}
+
+void MacroAssembler::li48(Register rd, long imm) {
+  assert_not_delayed();
+  assert(is_simm16(imm >> 32), "Not a 48-bit address");
+  lui(rd, imm >> 32);
+  ori(rd, rd, split_low(imm >> 16));
+  dsll(rd, rd, 16);
+  ori(rd, rd, split_low(imm));
+}
+#endif
+// NOTE: unlike the x86 port, we do not push rax here; x86 saves rax
+// because it uses rax as the jump register.
+void MacroAssembler::verify_oop(Register reg, const char* s) {
+  /*
+  if (!VerifyOops) return;
+
+  // Pass register number to verify_oop_subroutine
+  char* b = new char[strlen(s) + 50];
+  sprintf(b, "verify_oop: %s: %s", reg->name(), s);
+  push(rax);                          // save rax,
+  push(reg);                          // pass register argument
+  ExternalAddress buffer((address) b);
+  // avoid using pushptr, as it modifies scratch registers
+  // and our contract is not to modify anything
+  movptr(rax, buffer.addr());
+  push(rax);
+  // call indirectly to solve generation ordering problem
+  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
+  call(rax);
+  */
+  if (!VerifyOops) return;
+  const char* b = NULL;
+  stringStream ss;
+  ss.print("verify_oop: %s: %s", reg->name(), s);
+  b = code_string(ss.as_string());
+#ifdef _LP64
+  pushad();
+  move(A1, reg);
+  li(A0, (long)b);
+  li(AT, (long)StubRoutines::verify_oop_subroutine_entry_address());
+  ld(T9, AT, 0);
+  jalr(T9);
+  delayed()->nop();
+  popad();
+#else
+  // Pass register number to verify_oop_subroutine
+  sw(T0, SP, - wordSize);
+  sw(T1, SP, - 2*wordSize);
+  sw(RA, SP, - 3*wordSize);
+  sw(A0, SP, - 4*wordSize);
+  sw(A1, SP, - 5*wordSize);
+  sw(AT, SP, - 6*wordSize);
+  sw(T9, SP, - 7*wordSize);
+  addiu(SP, SP, - 7 * wordSize);
+  move(A1, reg);
+  li(A0, (long)b);
+  // call indirectly to solve generation ordering problem
+  li(AT, (long)StubRoutines::verify_oop_subroutine_entry_address());
+  lw(T9, AT, 0);
+  jalr(T9);
+  delayed()->nop();
+  lw(T0, SP, 6* wordSize);
+  lw(T1, SP, 5* wordSize);
+  lw(RA, SP, 4* wordSize);
+  lw(A0, SP, 3* wordSize);
+  lw(A1, SP, 2* wordSize);
+  lw(AT, SP, 1* wordSize);
+  lw(T9, SP, 0* wordSize);
+  addiu(SP, SP, 7 * wordSize);
+#endif
+}
+
+
+void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
+  if (!VerifyOops) {
+    nop();
+    return;
+  }
+  // Pass register number to verify_oop_subroutine
+  const char* b = NULL;
+  stringStream ss;
+  ss.print("verify_oop_addr: %s", s);
+  b = code_string(ss.as_string());
+
+  st_ptr(T0, SP, - wordSize);
+  st_ptr(T1, SP, - 2*wordSize);
+  st_ptr(RA, SP, - 3*wordSize);
+  st_ptr(A0, SP, - 4*wordSize);
+  st_ptr(A1, SP, - 5*wordSize);
+  st_ptr(AT, SP, - 6*wordSize);
+  st_ptr(T9, SP, - 7*wordSize);
+  ld_ptr(A1, addr);   // addr may use SP, so load from it before changing SP
+  addiu(SP, SP, - 7 * wordSize);
+
+  li(A0, (long)b);
+  // call indirectly to solve generation ordering problem
+  li(AT, (long)StubRoutines::verify_oop_subroutine_entry_address());
+  ld_ptr(T9, AT, 0);
+  jalr(T9);
+  delayed()->nop();
+  ld_ptr(T0, SP, 6* wordSize);
+  ld_ptr(T1, SP, 5* wordSize);
+  ld_ptr(RA, SP, 4* wordSize);
+  ld_ptr(A0, SP, 3* wordSize);
+  ld_ptr(A1, SP, 2* wordSize);
+  ld_ptr(AT, SP, 1* wordSize);
+  ld_ptr(T9, SP, 0* wordSize);
+  addiu(SP, SP, 7 * wordSize);
+}
+
+// used registers : T0, T1
+void MacroAssembler::verify_oop_subroutine() {
+  // RA: ra
+  // A0: char* error message
+  // A1: oop object to verify
+
+  Label exit, error;
+  // increment counter
+  li(T0, (long)StubRoutines::verify_oop_count_addr());
+  lw(AT, T0, 0);
+#ifdef _LP64
+  daddi(AT, AT, 1);
+#else
+  addi(AT, AT, 1);
+#endif
+  sw(AT, T0, 0);
+
+  // make sure object is 'reasonable'
+  beq(A1, R0, exit);   // if obj is NULL it is ok
+  delayed()->nop();
+
+  // Check if the oop is in the right area of memory
+  //const int oop_mask = Universe::verify_oop_mask();
+  //const int oop_bits = Universe::verify_oop_bits();
+  const uintptr_t oop_mask = Universe::verify_oop_mask();
+  const uintptr_t
oop_bits = Universe::verify_oop_bits(); + li(AT, oop_mask); + andr(T0, A1, AT); + li(AT, oop_bits); + bne(T0, AT, error); + delayed()->nop(); + + // make sure klass is 'reasonable' + //add for compressedoops + reinit_heapbase(); + //add for compressedoops + load_klass(T0, A1); + beq(T0, R0, error); // if klass is NULL it is broken + delayed()->nop(); + #if 0 + //FIXME:wuhui. + // Check if the klass is in the right area of memory + //const int klass_mask = Universe::verify_klass_mask(); + //const int klass_bits = Universe::verify_klass_bits(); + const uintptr_t klass_mask = Universe::verify_klass_mask(); + const uintptr_t klass_bits = Universe::verify_klass_bits(); + + li(AT, klass_mask); + andr(T1, T0, AT); + li(AT, klass_bits); + bne(T1, AT, error); + delayed()->nop(); + // make sure klass' klass is 'reasonable' + //add for compressedoops + load_klass(T0, T0); + beq(T0, R0, error); // if klass' klass is NULL it is broken + delayed()->nop(); + + li(AT, klass_mask); + andr(T1, T0, AT); + li(AT, klass_bits); + bne(T1, AT, error); + delayed()->nop(); // if klass not in right area of memory it is broken too. +#endif + // return if everything seems ok + bind(exit); + + jr(RA); + delayed()->nop(); + + // handle errors + bind(error); + pushad(); +#ifndef _LP64 + addi(SP, SP, (-1) * wordSize); +#endif + call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type); + delayed()->nop(); +#ifndef _LP64 + addiu(SP, SP, 1 * wordSize); +#endif + popad(); + jr(RA); + delayed()->nop(); +} + +void MacroAssembler::verify_tlab(Register t1, Register t2) { +#ifdef ASSERT + assert_different_registers(t1, t2, AT); + if (UseTLAB && VerifyOops) { + Label next, ok; + + get_thread(t1); + + ld_ptr(t2, t1, in_bytes(JavaThread::tlab_top_offset())); + ld_ptr(AT, t1, in_bytes(JavaThread::tlab_start_offset())); + sltu(AT, t2, AT); + beq(AT, R0, next); + delayed()->nop(); + + stop("assert(top >= start)"); + + bind(next); + ld_ptr(AT, t1, in_bytes(JavaThread::tlab_end_offset())); + sltu(AT, AT, t2); + beq(AT, R0, ok); + delayed()->nop(); + + stop("assert(top <= end)"); + + bind(ok); + + } +#endif +} + RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, + Register tmp, + int offset) { + intptr_t value = *delayed_value_addr; + if (value != 0) + return RegisterOrConstant(value + offset); + AddressLiteral a(delayed_value_addr); + // load indirectly to solve generation ordering problem + //movptr(tmp, ExternalAddress((address) delayed_value_addr)); + //ld(tmp, a); + if (offset != 0) + daddi(tmp,tmp, offset); + + return RegisterOrConstant(tmp); + } + +void MacroAssembler::hswap(Register reg) { + //short + //andi(reg, reg, 0xffff); + srl(AT, reg, 8); + sll(reg, reg, 24); + sra(reg, reg, 16); + orr(reg, reg, AT); +} + +void MacroAssembler::huswap(Register reg) { +#ifdef _LP64 + dsrl(AT, reg, 8); + dsll(reg, reg, 24); + dsrl(reg, reg, 16); + orr(reg, reg, AT); + andi(reg, reg, 0xffff); +#else + //andi(reg, reg, 0xffff); + srl(AT, reg, 8); + sll(reg, reg, 24); + srl(reg, reg, 16); + orr(reg, reg, AT); +#endif +} + +// something funny to do this will only one more register AT +// 32 bits +void MacroAssembler::swap(Register reg) { + srl(AT, reg, 8); + sll(reg, reg, 24); + orr(reg, reg, AT); + //reg : 4 1 2 3 + srl(AT, AT, 16); + xorr(AT, AT, reg); + andi(AT, AT, 0xff); + //AT : 0 0 0 1^3); + xorr(reg, reg, AT); + //reg : 4 1 2 1 + sll(AT, AT, 16); + xorr(reg, reg, AT); + //reg : 4 3 2 1 +} + +#ifdef _LP64 + +/* do 32-bit CAS using MIPS64 lld/scd + + Jin: cas_int should only compare 
32 bits of the memory value.
+     However, lld/scd will do a 64-bit operation, which violates the intention of cas_int.
+     To simulate a 32-bit atomic operation, the value loaded with LLD should be split into
+     two halves, and only the low 32 bits are compared.  If they are equal, the low 32 bits
+     of newval, plus the high 32 bits of the memory value, are stored together with SCD.
+
+Example:
+
+      double d = 3.1415926;
+      System.err.println("hello" + d);
+
+  sun.misc.FloatingDecimal$1.<clinit>()
+   |
+   `- java.util.concurrent.atomic.AtomicInteger::compareAndSet()
+
+  38 cas_int [a7a7|J] [a0|I] [a6|I]
+// a0: 0xffffffffe8ea9f63 pc: 0x55647f3354
+// a6: 0x4ab325aa
+
+again:
+   0x00000055647f3c5c: lld at, 0x0(a7)                    ; 64-bit load, "0xe8ea9f63"
+
+   0x00000055647f3c60: sll t9, at, 0                      ; t9: low-32 bits (sign extended)
+   0x00000055647f3c64: dsrl32 t8, at, 0                   ; t8: high-32 bits
+   0x00000055647f3c68: dsll32 t8, t8, 0
+   0x00000055647f3c6c: bne t9, a0, 0x00000055647f3c9c     ; goto nequal
+   0x00000055647f3c70: sll zero, zero, 0
+
+   0x00000055647f3c74: ori v1, zero, 0xffffffff           ; v1: low-32 bits of newval (sign unextended)
+   0x00000055647f3c78: dsll v1, v1, 16                    ; v1 = a6 & 0xFFFFFFFF;
+   0x00000055647f3c7c: ori v1, v1, 0xffffffff
+   0x00000055647f3c80: and v1, a6, v1
+   0x00000055647f3c84: or at, t8, v1
+   0x00000055647f3c88: scd at, 0x0(a7)
+   0x00000055647f3c8c: beq at, zero, 0x00000055647f3c5c   ; goto again
+   0x00000055647f3c90: sll zero, zero, 0
+   0x00000055647f3c94: beq zero, zero, 0x00000055647f45ac ; goto done
+   0x00000055647f3c98: sll zero, zero, 0
+nequal:
+   0x00000055647f45a4: dadd a0, t9, zero
+   0x00000055647f45a8: dadd at, zero, zero
+done:
+*/
+
+void MacroAssembler::cmpxchg32(Register x_reg, Address dest, Register c_reg) {
+  /* 2012/11/11 Jin: MIPS64 can use ll/sc for 32-bit atomic memory access */
+  Label done, again, nequal;
+
+  bind(again);
+
+  if(!Use3A2000) sync();
+  ll(AT, dest);
+  bne(AT, c_reg, nequal);
+  delayed()->nop();
+
+  move(AT, x_reg);
+  sc(AT, dest);
+  beq(AT, R0, again);
+  delayed()->nop();
+  b(done);
+  delayed()->nop();
+
+  // not xchged
+  bind(nequal);
+  sync();
+  move(c_reg, AT);
+  move(AT, R0);
+
+  bind(done);
+}
+#endif  // cmpxchg32
+
+void MacroAssembler::cmpxchg(Register x_reg, Address dest, Register c_reg) {
+  Label done, again, nequal;
+
+  bind(again);
+#ifdef _LP64
+  if(!Use3A2000) sync();
+  lld(AT, dest);
+#else
+  if(!Use3A2000) sync();
+  ll(AT, dest);
+#endif
+  bne(AT, c_reg, nequal);
+  delayed()->nop();
+
+  move(AT, x_reg);
+#ifdef _LP64
+  scd(AT, dest);
+#else
+  sc(AT, dest);
+#endif
+  beq(AT, R0, again);
+  delayed()->nop();
+  b(done);
+  delayed()->nop();
+
+  // not xchged
+  bind(nequal);
+  sync();
+  move(c_reg, AT);
+  move(AT, R0);
+
+  bind(done);
+}
+
+void MacroAssembler::cmpxchg8(Register x_regLo, Register x_regHi, Address dest, Register c_regLo, Register c_regHi) {
+  Label done, again, nequal;
+
+  Register x_reg = x_regLo;
+  dsll32(x_regHi, x_regHi, 0);
+  dsll32(x_regLo, x_regLo, 0);
+  dsrl32(x_regLo, x_regLo, 0);
+  orr(x_reg, x_regLo, x_regHi);
+
+  Register c_reg = c_regLo;
+  dsll32(c_regHi, c_regHi, 0);
+  dsll32(c_regLo, c_regLo, 0);
+  dsrl32(c_regLo, c_regLo, 0);
+  orr(c_reg, c_regLo, c_regHi);
+
+  bind(again);
+
+  if(!Use3A2000) sync();
+  lld(AT, dest);
+  bne(AT, c_reg, nequal);
+  delayed()->nop();
+
+  //move(AT, x_reg);
+  dadd(AT, x_reg, R0);
+  scd(AT, dest);
+  beq(AT, R0, again);
+  delayed()->nop();
+  b(done);
+  delayed()->nop();
+
+  // not xchged
+  bind(nequal);
+  sync();
+  //move(c_reg, AT);
+  //move(AT, R0);
+  dadd(c_reg, AT, R0);
+  dadd(AT, R0, R0);
+  bind(done);
+}
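+// Note on the result protocol shared by cmpxchg/cmpxchg32/cmpxchg8 above:
+// on exit, AT == 1 iff the compare-and-swap succeeded; on failure AT == 0
+// and the compare register has been updated to the value observed in memory,
+// so a caller can retry with the freshly loaded value.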
+// make sure the three FP registers are different
+void MacroAssembler::rem_s(FloatRegister fd, FloatRegister fs, FloatRegister ft, FloatRegister tmp) {
+  assert_different_registers(tmp, fs, ft);
+  div_s(tmp, fs, ft);
+  trunc_l_s(tmp, tmp);
+  cvt_s_l(tmp, tmp);
+  mul_s(tmp, tmp, ft);
+  sub_s(fd, fs, tmp);
+}
+
+// make sure the three FP registers are different
+void MacroAssembler::rem_d(FloatRegister fd, FloatRegister fs, FloatRegister ft, FloatRegister tmp) {
+  assert_different_registers(tmp, fs, ft);
+  div_d(tmp, fs, ft);
+  trunc_l_d(tmp, tmp);
+  cvt_d_l(tmp, tmp);
+  mul_d(tmp, tmp, ft);
+  sub_d(fd, fs, tmp);
+}
+
+// Fast_Lock and Fast_Unlock used by C2
+
+// Because the transitions from emitted code to the runtime
+// monitorenter/exit helper stubs are so slow it's critical that
+// we inline both the stack-locking fast-path and the inflated fast path.
+//
+// See also: cmpFastLock and cmpFastUnlock.
+//
+// What follows is a specialized inline transliteration of the code
+// in slow_enter() and slow_exit().  If we're concerned about I$ bloat
+// another option would be to emit TrySlowEnter and TrySlowExit methods
+// at startup-time.  These methods would accept arguments as
+// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
+// indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
+// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
+// In practice, however, the # of lock sites is bounded and is usually small.
+// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
+// if the processor uses simple bimodal branch predictors keyed by EIP,
+// since the helper routines would be called from multiple synchronization
+// sites.
+//
+// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
+// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
+// to those specialized methods.  That'd give us a mostly platform-independent
+// implementation that the JITs could optimize and inline at their pleasure.
+// Done correctly, the only time we'd need to cross to native would be
+// to park() or unpark() threads.  We'd also need a few more unsafe operators
+// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
+// (b) explicit barriers or fence operations.
+//
+// TODO:
+//
+// * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
+//   This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
+//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
+//   the lock operators would typically be faster than reifying Self.
+//
+// * Ideally I'd define the primitives as:
+//   fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
+//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
+//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
+//   Instead, we're stuck with the rather awkward and brittle register assignments below.
+//   Furthermore the register assignments are overconstrained, possibly resulting in
+//   sub-optimal code near the synchronization site.
+//
+// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
+//   Alternately, use a better sp-proximity test.
+//
+// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
+//   Either one is sufficient to uniquely identify a thread.
+//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
+// +// * Intrinsify notify() and notifyAll() for the common cases where the +// object is locked by the calling thread but the waitlist is empty. +// avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). +// +// * use jccb and jmpb instead of jcc and jmp to improve code density. +// But beware of excessive branch density on AMD Opterons. +// +// * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success +// or failure of the fast-path. If the fast-path fails then we pass +// control to the slow-path, typically in C. In Fast_Lock and +// Fast_Unlock we often branch to DONE_LABEL, just to find that C2 +// will emit a conditional branch immediately after the node. +// So we have branches to branches and lots of ICC.ZF games. +// Instead, it might be better to have C2 pass a "FailureLabel" +// into Fast_Lock and Fast_Unlock. In the case of success, control +// will drop through the node. ICC.ZF is undefined at exit. +// In the case of failure, the node will branch directly to the +// FailureLabel + + +// obj: object to lock +// box: on-stack box address (displaced header location) - KILLED +// rax,: tmp -- KILLED +// scr: tmp -- KILLED +void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg) { + + // Ensure the register assignents are disjoint + guarantee (objReg != boxReg, "") ; + guarantee (objReg != tmpReg, "") ; + guarantee (objReg != scrReg, "") ; + guarantee (boxReg != tmpReg, "") ; + guarantee (boxReg != scrReg, "") ; + + + block_comment("FastLock"); + /* + move(AT, 0x0); + return; + */ + if (PrintBiasedLockingStatistics) { + push(tmpReg); + atomic_inc32((address)BiasedLocking::total_entry_count_addr(), 1, AT, tmpReg); + pop(tmpReg); + } + + if (EmitSync & 1) { + move(AT, 0x0); + return; + } else + if (EmitSync & 2) { + Label DONE_LABEL ; + if (UseBiasedLocking) { + // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument. + biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL); + } + + ld(tmpReg, Address(objReg, 0)) ; // fetch markword + ori(tmpReg, tmpReg, 0x1); + sd(tmpReg, Address(boxReg, 0)); // Anticipate successful CAS + + cmpxchg(boxReg, Address(objReg, 0), tmpReg); // Updates tmpReg + bne(AT, R0, DONE_LABEL); + delayed()->nop(); + + // Recursive locking + dsubu(tmpReg, tmpReg, SP); + li(AT, (7 - os::vm_page_size() )); + andr(tmpReg, tmpReg, AT); + sd(tmpReg, Address(boxReg, 0)); + bind(DONE_LABEL) ; + } else { + // Possible cases that we'll encounter in fast_lock + // ------------------------------------------------ + // * Inflated + // -- unlocked + // -- Locked + // = by self + // = by other + // * biased + // -- by Self + // -- by other + // * neutral + // * stack-locked + // -- by self + // = sp-proximity test hits + // = sp-proximity test generates false-negative + // -- by other + // + + Label IsInflated, DONE_LABEL, PopDone ; + + // TODO: optimize away redundant LDs of obj->mark and improve the markword triage + // order to reduce the number of conditional branches in the most common cases. + // Beware -- there's a subtle invariant that fetch of the markword + // at [FETCH], below, will never observe a biased encoding (*101b). + // If this invariant is not held we risk exclusion (safety) failure. + if (UseBiasedLocking && !UseOptoBiasInlining) { + biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL); + } + + ld(tmpReg, Address(objReg, 0)) ; //Fetch the markword of the object. 
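+    // (Recall the usual HotSpot markword encoding of the low lock bits,
+    //  assumed by the triage below: 01 -> unlocked/neutral, 00 -> stack-locked,
+    //  10 -> inflated monitor, 101 -> biased; the following test only needs
+    //  the monitor bit.)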
+ andi(AT, tmpReg, markOopDesc::monitor_value); + bne(AT, R0, IsInflated); // inflated vs stack-locked|neutral|bias + delayed()->nop(); + + // Attempt stack-locking ... + ori (tmpReg, tmpReg, markOopDesc::unlocked_value); + sd(tmpReg, Address(boxReg, 0)); // Anticipate successful CAS + //if (os::is_MP()) { + // sync(); + //} + + cmpxchg(boxReg, Address(objReg, 0), tmpReg); // Updates tmpReg + //AT == 1: unlocked + + if (PrintBiasedLockingStatistics) { + Label L; + beq(AT, R0, L); + delayed()->nop(); + push(T0); + push(T1); + atomic_inc32((address)BiasedLocking::fast_path_entry_count_addr(), 1, T0, T1); + pop(T1); + pop(T0); + bind(L); + } + bne(AT, R0, DONE_LABEL); + delayed()->nop(); + + // Recursive locking + // The object is stack-locked: markword contains stack pointer to BasicLock. + // Locked by current thread if difference with current SP is less than one page. + dsubu(tmpReg, tmpReg, SP); + li(AT, 7 - os::vm_page_size() ); + andr(tmpReg, tmpReg, AT); + sd(tmpReg, Address(boxReg, 0)); + if (PrintBiasedLockingStatistics) { + Label L; + // tmpReg == 0 => BiasedLocking::_fast_path_entry_count++ + bne(tmpReg, R0, L); + delayed()->nop(); + push(T0); + push(T1); + atomic_inc32((address)BiasedLocking::fast_path_entry_count_addr(), 1, T0, T1); + pop(T1); + pop(T0); + bind(L); + } + sltiu(AT, tmpReg, 1); /* AT = (tmpReg == 0) ? 1 : 0 */ + + b(DONE_LABEL) ; + delayed()->nop(); + + bind(IsInflated) ; + // The object's monitor m is unlocked iff m->owner == NULL, + // otherwise m->owner may contain a thread or a stack address. + + // TODO: someday avoid the ST-before-CAS penalty by + // relocating (deferring) the following ST. + // We should also think about trying a CAS without having + // fetched _owner. If the CAS is successful we may + // avoid an RTO->RTS upgrade on the $line. + // Without cast to int32_t a movptr will destroy r10 which is typically obj + li(AT, (int32_t)intptr_t(markOopDesc::unused_mark())); + sd(AT, Address(boxReg, 0)); + + move(boxReg, tmpReg) ; + ld(tmpReg, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; + // if (m->owner != 0) => AT = 0, goto slow path. + move(AT, R0); + bne(tmpReg, R0, DONE_LABEL); + delayed()->nop(); + +#ifndef OPT_THREAD + get_thread (TREG) ; +#endif + // It's inflated and appears unlocked + //if (os::is_MP()) { + // sync(); + //} + cmpxchg(TREG, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), tmpReg) ; + // Intentional fall-through into DONE_LABEL ... + + + // DONE_LABEL is a hot target - we'd really like to place it at the + // start of cache line by padding with NOPs. + // See the AMD and Intel software optimization manuals for the + // most efficient "long" NOP encodings. + // Unfortunately none of our alignment mechanisms suffice. + bind(DONE_LABEL); + + // At DONE_LABEL the AT is set as follows ... + // Fast_Unlock uses the same protocol. + // AT == 1 -> Success + // AT == 0 -> Failure - force control through the slow-path + + // Avoid branch-to-branch on AMD processors + // This appears to be superstition. + if (EmitSync & 32) nop() ; + + } +} + +// obj: object to unlock +// box: box address (displaced header location), killed. Must be EAX. +// rbx,: killed tmp; cannot be obj nor box. +// +// Some commentary on balanced locking: +// +// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites. +// Methods that don't have provably balanced locking are forced to run in the +// interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 
+// The interpreter provides two properties: +// I1: At return-time the interpreter automatically and quietly unlocks any +// objects acquired the current activation (frame). Recall that the +// interpreter maintains an on-stack list of locks currently held by +// a frame. +// I2: If a method attempts to unlock an object that is not held by the +// the frame the interpreter throws IMSX. +// +// Lets say A(), which has provably balanced locking, acquires O and then calls B(). +// B() doesn't have provably balanced locking so it runs in the interpreter. +// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O +// is still locked by A(). +// +// The only other source of unbalanced locking would be JNI. The "Java Native Interface: +// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter +// should not be unlocked by "normal" java-level locking and vice-versa. The specification +// doesn't specify what will occur if a program engages in such mixed-mode locking, however. + +void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { + + guarantee (objReg != boxReg, "") ; + guarantee (objReg != tmpReg, "") ; + guarantee (boxReg != tmpReg, "") ; + + + + block_comment("FastUnlock"); + + + if (EmitSync & 4) { + // Disable - inhibit all inlining. Force control through the slow-path + move(AT, 0x0); + return; + } else + if (EmitSync & 8) { + Label DONE_LABEL ; + if (UseBiasedLocking) { + biased_locking_exit(objReg, tmpReg, DONE_LABEL); + } + // classic stack-locking code ... + ld(tmpReg, Address(boxReg, 0)) ; + beq(tmpReg, R0, DONE_LABEL) ; + move(AT, 0x1); // delay slot + + cmpxchg(tmpReg, Address(objReg, 0), boxReg); // Uses EAX which is box + bind(DONE_LABEL); + } else { + Label DONE_LABEL, Stacked, CheckSucc, Inflated ; + + // Critically, the biased locking test must have precedence over + // and appear before the (box->dhw == 0) recursive stack-lock test. + if (UseBiasedLocking && !UseOptoBiasInlining) { + biased_locking_exit(objReg, tmpReg, DONE_LABEL); + } + + ld(AT, Address(boxReg, 0)) ; // Examine the displaced header + beq(AT, R0, DONE_LABEL) ; // 0 indicates recursive stack-lock + delayed()->daddiu(AT, R0, 0x1); + + ld(tmpReg, Address(objReg, 0)) ; // Examine the object's markword + andi(AT, tmpReg, markOopDesc::monitor_value) ; // Inflated? + beq(AT, R0, Stacked) ; // Inflated? + delayed()->nop(); + + bind(Inflated) ; + // It's inflated. + // Despite our balanced locking property we still check that m->_owner == Self + // as java routines or native JNI code called by this thread might + // have released the lock. + // Refer to the comments in synchronizer.cpp for how we might encode extra + // state in _succ so we can avoid fetching EntryList|cxq. + // + // I'd like to add more cases in fast_lock() and fast_unlock() -- + // such as recursive enter and exit -- but we have to be wary of + // I$ bloat, T$ effects and BP$ effects. + // + // If there's no contention try a 1-0 exit. That is, exit without + // a costly MEMBAR or CAS. See synchronizer.cpp for details on how + // we detect and recover from the race that the 1-0 exit admits. + // + // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier + // before it STs null into _owner, releasing the lock. Updates + // to data protected by the critical section must be visible before + // we drop the lock (and thus before any other thread could acquire + // the lock and observe the fields protected by the lock). 
+ // IA32's memory-model is SPO, so STs are ordered with respect to + // each other and there's no need for an explicit barrier (fence). + // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. +#ifndef OPT_THREAD + get_thread (TREG) ; +#endif + + // It's inflated + ld(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; + xorr(boxReg, boxReg, TREG); + + ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ; + orr(boxReg, boxReg, AT); + + move(AT, R0); + bne(boxReg, R0, DONE_LABEL); + delayed()->nop(); + + ld(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; + ld(AT, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; + orr(boxReg, boxReg, AT); + + move(AT, R0); + bne(boxReg, R0, DONE_LABEL); + delayed()->nop(); + + sync(); + sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; + move(AT, 0x1); + b(DONE_LABEL); + delayed()->nop(); + + bind (Stacked); + ld(tmpReg, Address(boxReg, 0)) ; + //if (os::is_MP()) { sync(); } + cmpxchg(tmpReg, Address(objReg, 0), boxReg); + + if (EmitSync & 65536) { + bind (CheckSucc); + } + + bind(DONE_LABEL); + + // Avoid branch to branch on AMD processors + if (EmitSync & 32768) { nop() ; } + } +} + +void MacroAssembler::align(int modulus) { + while (offset() % modulus != 0) nop(); +} + + +void MacroAssembler::verify_FPU(int stack_depth, const char* s) { + //Unimplemented(); +} + +#ifdef _LP64 +Register caller_saved_registers[] = {AT, V0, V1, A0, A1, A2, A3, A4, A5, A6, A7, T0, T1, T2, T3, T8, T9, GP, RA, FP}; + +/* FIXME: Jin: In MIPS64, F0~23 are all caller-saved registers */ +FloatRegister caller_saved_fpu_registers[] = {F0, F12, F13}; +#else +Register caller_saved_registers[] = {AT, V0, V1, A0, A1, A2, A3, T4, T5, T6, T7, T0, T1, T2, T3, T8, T9, GP, RA, FP}; + +Register caller_saved_fpu_registers[] = {}; +#endif + +//We preserve all caller-saved register +void MacroAssembler::pushad(){ + int i; + + /* Fixed-point registers */ + int len = sizeof(caller_saved_registers) / sizeof(caller_saved_registers[0]); + daddi(SP, SP, -1 * len * wordSize); + for (i = 0; i < len; i++) + { +#ifdef _LP64 + sd(caller_saved_registers[i], SP, (len - i - 1) * wordSize); +#else + sw(caller_saved_registers[i], SP, (len - i - 1) * wordSize); +#endif + } + + /* Floating-point registers */ + len = sizeof(caller_saved_fpu_registers) / sizeof(caller_saved_fpu_registers[0]); + daddi(SP, SP, -1 * len * wordSize); + for (i = 0; i < len; i++) + { +#ifdef _LP64 + sdc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize); +#else + swc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize); +#endif + } +}; + +void MacroAssembler::popad(){ + int i; + + /* Floating-point registers */ + int len = sizeof(caller_saved_fpu_registers) / sizeof(caller_saved_fpu_registers[0]); + for (i = 0; i < len; i++) + { +#ifdef _LP64 + ldc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize); +#else + lwc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize); +#endif + } + daddi(SP, SP, len * wordSize); + + /* Fixed-point registers */ + len = sizeof(caller_saved_registers) / sizeof(caller_saved_registers[0]); + for (i = 0; i < len; i++) + { +#ifdef _LP64 + ld(caller_saved_registers[i], SP, (len - i - 1) * wordSize); +#else + lw(caller_saved_registers[i], SP, (len - i - 1) * wordSize); +#endif + } + daddi(SP, SP, len * wordSize); +}; + +void MacroAssembler::push2(Register reg1, Register reg2) { +#ifdef _LP64 + daddi(SP, SP, -16); + sd(reg2, SP, 0); + sd(reg1, SP, 8); +#else + addi(SP, SP, -8); + 
+  sw(reg2, SP, 0);
+  sw(reg1, SP, 4);
+#endif
+}
+
+void MacroAssembler::pop2(Register reg1, Register reg2) {
+#ifdef _LP64
+  ld(reg1, SP, 0);
+  ld(reg2, SP, 8);
+  daddi(SP, SP, 16);
+#else
+  lw(reg1, SP, 0);
+  lw(reg2, SP, 4);
+  addi(SP, SP, 8);
+#endif
+}
+
+// for the UseCompressedOops option
+void MacroAssembler::load_klass(Register dst, Register src) {
+#ifdef _LP64
+  if(UseCompressedClassPointers){
+    lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
+    decode_klass_not_null(dst);
+  } else
+#endif
+    ld(dst, src, oopDesc::klass_offset_in_bytes());
+}
+
+void MacroAssembler::store_klass(Register dst, Register src) {
+#ifdef _LP64
+  if(UseCompressedClassPointers){
+    encode_klass_not_null(src);
+    sw(src, dst, oopDesc::klass_offset_in_bytes());
+  } else {
+#endif
+    sd(src, dst, oopDesc::klass_offset_in_bytes());
+#ifdef _LP64
+  }
+#endif
+}
+
+void MacroAssembler::load_prototype_header(Register dst, Register src) {
+  load_klass(dst, src);
+  ld(dst, Address(dst, Klass::prototype_header_offset()));
+}
+
+#ifdef _LP64
+void MacroAssembler::store_klass_gap(Register dst, Register src) {
+  if (UseCompressedClassPointers) {
+    sw(src, dst, oopDesc::klass_gap_offset_in_bytes());
+  }
+}
+
+void MacroAssembler::load_heap_oop(Register dst, Address src) {
+  if(UseCompressedOops){
+    lwu(dst, src);
+    decode_heap_oop(dst);
+  } else{
+    ld(dst, src);
+  }
+}
+
+void MacroAssembler::store_heap_oop(Address dst, Register src){
+  if(UseCompressedOops){
+    assert(!dst.uses(src), "not enough registers");
+    encode_heap_oop(src);
+    sw(src, dst);
+  } else{
+    sd(src, dst);
+  }
+}
+
+#ifdef ASSERT
+void MacroAssembler::verify_heapbase(const char* msg) {
+  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
+  assert (Universe::heap() != NULL, "java heap should be initialized");
+}
+#endif
+
+
+// Algorithm must match oop.inline.hpp encode_heap_oop.
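+// Roughly: narrow = (oop - narrow_oop_base) >> narrow_oop_shift, with NULL
+// mapping to NULL (handled by the movz below).  For example, assuming a base
+// of 0x2000000000 and a shift of 3, an oop at 0x2000000040 encodes to
+// (0x2000000040 - 0x2000000000) >> 3 = 0x8.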
+void MacroAssembler::encode_heap_oop(Register r) { +#ifdef ASSERT + verify_heapbase("MacroAssembler::encode_heap_oop:heap base corrupted?"); +#endif + verify_oop(r, "broken oop in encode_heap_oop"); + if (Universe::narrow_oop_base() == NULL) { + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + shr(r, LogMinObjAlignmentInBytes); + } + return; + } + + movz(r, S5_heapbase, r); + dsub(r, r, S5_heapbase); + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + shr(r, LogMinObjAlignmentInBytes); + } +} + +void MacroAssembler::encode_heap_oop(Register dst, Register src) { +#ifdef ASSERT + verify_heapbase("MacroAssembler::encode_heap_oop:heap base corrupted?"); +#endif + verify_oop(src, "broken oop in encode_heap_oop"); + if (Universe::narrow_oop_base() == NULL) { + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + dsrl(dst, src, LogMinObjAlignmentInBytes); + } else { + if (dst != src) move(dst, src); + } + } else { + if (dst == src) { + movz(dst, S5_heapbase, dst); + dsub(dst, dst, S5_heapbase); + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + shr(dst, LogMinObjAlignmentInBytes); + } + } else { + dsub(dst, src, S5_heapbase); + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + shr(dst, LogMinObjAlignmentInBytes); + } + movz(dst, R0, src); + } + } +} + +void MacroAssembler::encode_heap_oop_not_null(Register r) { + assert (UseCompressedOops, "should be compressed"); +#ifdef ASSERT + if (CheckCompressedOops) { + Label ok; + bne(r, R0, ok); + delayed()->nop(); + stop("null oop passed to encode_heap_oop_not_null"); + bind(ok); + } +#endif + verify_oop(r, "broken oop in encode_heap_oop_not_null"); + if (Universe::narrow_oop_base() != NULL) { + dsub(r, r, S5_heapbase); + } + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + shr(r, LogMinObjAlignmentInBytes); + } + +} + +void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { + assert (UseCompressedOops, "should be compressed"); +#ifdef ASSERT + if (CheckCompressedOops) { + Label ok; + bne(src, R0, ok); + delayed()->nop(); + stop("null oop passed to encode_heap_oop_not_null2"); + bind(ok); + } +#endif + verify_oop(src, "broken oop in encode_heap_oop_not_null2"); + + if (Universe::narrow_oop_base() != NULL) { + dsub(dst, src, S5_heapbase); + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + shr(dst, LogMinObjAlignmentInBytes); + } + } else { + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + dsrl(dst, src, LogMinObjAlignmentInBytes); + } else { + if (dst != src) move(dst, src); + } + } +} + +void MacroAssembler::decode_heap_oop(Register r) { +#ifdef ASSERT + verify_heapbase("MacroAssembler::decode_heap_oop corrupted?"); +#endif + if (Universe::narrow_oop_base() == NULL) { + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + shl(r, LogMinObjAlignmentInBytes); + } + } else { + move(AT, r); + if 
(Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + shl(r, LogMinObjAlignmentInBytes); + } + dadd(r, r, S5_heapbase); + movz(r, R0, AT); + } + verify_oop(r, "broken oop in decode_heap_oop"); +} + +void MacroAssembler::decode_heap_oop(Register dst, Register src) { +#ifdef ASSERT + verify_heapbase("MacroAssembler::decode_heap_oop corrupted?"); +#endif + if (Universe::narrow_oop_base() == NULL) { + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + if (dst != src) nop(); // DON'T DELETE THIS GUY. + dsll(dst, src, LogMinObjAlignmentInBytes); + } else { + if (dst != src) move(dst, src); + } + } else { + if (dst == src) { + move(AT, dst); + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + shl(dst, LogMinObjAlignmentInBytes); + } + dadd(dst, dst, S5_heapbase); + movz(dst, R0, AT); + } else { + if (Universe::narrow_oop_shift() != 0) { + assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + dsll(dst, src, LogMinObjAlignmentInBytes); + daddu(dst, dst, S5_heapbase); + } else { + daddu(dst, src, S5_heapbase); + } + movz(dst, R0, src); + } + } + verify_oop(dst, "broken oop in decode_heap_oop"); +} + +void MacroAssembler::decode_heap_oop_not_null(Register r) { + // Note: it will change flags + assert (UseCompressedOops, "should only be used for compressed headers"); + assert (Universe::heap() != NULL, "java heap should be initialized"); + // Cannot assert, unverified entry point counts instructions (see .ad file) + // vtableStubs also counts instructions in pd_code_size_limit. + // Also do not verify_oop as this is called by verify_oop. + if (Universe::narrow_oop_shift() != 0) { + assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + shl(r, LogMinObjAlignmentInBytes); + if (Universe::narrow_oop_base() != NULL) { + daddu(r, r, S5_heapbase); + } + } else { + assert (Universe::narrow_oop_base() == NULL, "sanity"); + } +} + +void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { + assert (UseCompressedOops, "should only be used for compressed headers"); + assert (Universe::heap() != NULL, "java heap should be initialized"); + + // Cannot assert, unverified entry point counts instructions (see .ad file) + // vtableStubs also counts instructions in pd_code_size_limit. + // Also do not verify_oop as this is called by verify_oop. 
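+  // Decode is the inverse of encode: oop = (narrow << narrow_oop_shift),
+  // plus narrow_oop_base when the base is non-NULL.  The not_null variants
+  // can skip the NULL check that decode_heap_oop performs with movz.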
+ //lea(dst, Address(S5_heapbase, src, Address::times_8, 0)); + if (Universe::narrow_oop_shift() != 0) { + assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); + if (LogMinObjAlignmentInBytes == Address::times_8) { + dsll(dst, src, LogMinObjAlignmentInBytes); + daddu(dst, dst, S5_heapbase); + } else { + dsll(dst, src, LogMinObjAlignmentInBytes); + if (Universe::narrow_oop_base() != NULL) { + daddu(dst, dst, S5_heapbase); + } + } + } else { + assert (Universe::narrow_oop_base() == NULL, "sanity"); + if (dst != src) { + move(dst, src); + } + } +} + +void MacroAssembler::encode_klass_not_null(Register r) { + if (Universe::narrow_klass_base() != NULL) { + assert(r != AT, "Encoding a klass in AT"); + set64(AT, (int64_t)Universe::narrow_klass_base()); + dsub(r, r, AT); + } + if (Universe::narrow_klass_shift() != 0) { + assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); + shr(r, LogKlassAlignmentInBytes); + } + // Not neccessary for MIPS at all. + //if (Universe::narrow_klass_base() != NULL) { + // reinit_heapbase(); + //} +} + +void MacroAssembler::encode_klass_not_null(Register dst, Register src) { + if (dst == src) { + encode_klass_not_null(src); + } else { + if (Universe::narrow_klass_base() != NULL) { + set64(dst, (int64_t)Universe::narrow_klass_base()); + dsub(dst, src, dst); + if (Universe::narrow_klass_shift() != 0) { + assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); + shr(dst, LogKlassAlignmentInBytes); + } + } else { + if (Universe::narrow_klass_shift() != 0) { + assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); + dsrl(dst, src, LogKlassAlignmentInBytes); + } else { + move(dst, src); + } + } + } +} + +// Function instr_size_for_decode_klass_not_null() counts the instructions +// generated by decode_klass_not_null(register r) and reinit_heapbase(), +// when (Universe::heap() != NULL). Hence, if the instructions they +// generate change, then this method needs to be updated. +int MacroAssembler::instr_size_for_decode_klass_not_null() { + assert (UseCompressedClassPointers, "only for compressed klass ptrs"); + if (Universe::narrow_klass_base() != NULL) { + // mov64 + addq + shlq? + mov64 (for reinit_heapbase()). + return (Universe::narrow_klass_shift() == 0 ? 4 * 9 : 4 * 10); + } else { + // longest load decode klass function, mov64, leaq + return (Universe::narrow_klass_shift() == 0 ? 4 * 0 : 4 * 1); + } +} + +void MacroAssembler::decode_klass_not_null(Register r) { + assert (UseCompressedClassPointers, "should only be used for compressed headers"); + assert(r != AT, "Decoding a klass in AT"); + // Cannot assert, unverified entry point counts instructions (see .ad file) + // vtableStubs also counts instructions in pd_code_size_limit. + // Also do not verify_oop as this is called by verify_oop. + if (Universe::narrow_klass_shift() != 0) { + assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); + shl(r, LogKlassAlignmentInBytes); + } + if (Universe::narrow_klass_base() != NULL) { + set64(AT, (int64_t)Universe::narrow_klass_base()); + daddu(r, r, AT); + //Not neccessary for MIPS at all. 
+ //reinit_heapbase(); + } +} + +void MacroAssembler::decode_klass_not_null(Register dst, Register src) { + assert (UseCompressedClassPointers, "should only be used for compressed headers"); + + if (dst == src) { + decode_klass_not_null(dst); + } else { + // Cannot assert, unverified entry point counts instructions (see .ad file) + // vtableStubs also counts instructions in pd_code_size_limit. + // Also do not verify_oop as this is called by verify_oop. + set64(dst, (int64_t)Universe::narrow_klass_base()); + if (Universe::narrow_klass_shift() != 0) { + assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); + assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?"); + dsll(AT, src, Address::times_8); + daddu(dst, dst, AT); + } else { + daddu(dst, src, dst); + } + } +} + +void MacroAssembler::incrementl(Register reg, int value) { + if (value == min_jint) { + move(AT, value); + LP64_ONLY(addu32(reg, reg, AT)) NOT_LP64(addu(reg, reg, AT)); + return; + } + if (value < 0) { decrementl(reg, -value); return; } + if (value == 0) { ; return; } + + if(Assembler::is_simm16(value)) { + NOT_LP64(addiu(reg, reg, value)); + LP64_ONLY(move(AT, value); addu32(reg, reg, AT)); + } else { + move(AT, value); + LP64_ONLY(addu32(reg, reg, AT)) NOT_LP64(addu(reg, reg, AT)); + } +} + +void MacroAssembler::decrementl(Register reg, int value) { + if (value == min_jint) { + move(AT, value); + LP64_ONLY(subu32(reg, reg, AT)) NOT_LP64(subu(reg, reg, AT)); + return; + } + if (value < 0) { incrementl(reg, -value); return; } + if (value == 0) { ; return; } + + if(Assembler::is_simm16(value)) { + NOT_LP64(addiu(reg, reg, -value)); + LP64_ONLY(move(AT, value); subu32(reg, reg, AT)); + } else { + move(AT, value); + LP64_ONLY(subu32(reg, reg, AT)) NOT_LP64(subu(reg, reg, AT)); + } +} + +void MacroAssembler::reinit_heapbase() { + if (UseCompressedOops || UseCompressedClassPointers) { + if (Universe::heap() != NULL) { + if (Universe::narrow_oop_base() == NULL) { + move(S5_heapbase, R0); + } else { + set64(S5_heapbase, (int64_t)Universe::narrow_ptrs_base()); + } + } else { + set64(S5_heapbase, (intptr_t)Universe::narrow_ptrs_base_addr()); + ld(S5_heapbase, S5_heapbase, 0); + } + } +} +#endif // _LP64 + +void MacroAssembler::check_klass_subtype(Register sub_klass, + Register super_klass, + Register temp_reg, + Label& L_success) { +//implement ind gen_subtype_check + Label L_failure; + check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); + check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); + bind(L_failure); +} + +SkipIfEqual::SkipIfEqual( + MacroAssembler* masm, const bool* flag_addr, bool value) { + _masm = masm; + _masm->li(AT, (address)flag_addr); + _masm->lb(AT,AT,0); + _masm->addi(AT,AT,-value); + _masm->beq(AT,R0,_label); + _masm->delayed()->nop(); +} +void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, + Register super_klass, + Register temp_reg, + Label* L_success, + Label* L_failure, + Label* L_slow_path, + RegisterOrConstant super_check_offset) { + assert_different_registers(sub_klass, super_klass, temp_reg); + bool must_load_sco = (super_check_offset.constant_or_zero() == -1); + if (super_check_offset.is_register()) { + assert_different_registers(sub_klass, super_klass, + super_check_offset.as_register()); + } else if (must_load_sco) { + assert(temp_reg != noreg, "supply either a temp or a register offset"); + } + + Label L_fallthrough; + int label_nulls = 0; + 
if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } + if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } + if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } + assert(label_nulls <= 1, "at most one NULL in the batch"); + + int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); + int sco_offset = in_bytes(Klass::super_check_offset_offset()); + // If the pointers are equal, we are done (e.g., String[] elements). + // This self-check enables sharing of secondary supertype arrays among + // non-primary types such as array-of-interface. Otherwise, each such + // type would need its own customized SSA. + // We move this check to the front of the fast path because many + // type checks are in fact trivially successful in this manner, + // so we get a nicely predicted branch right at the start of the check. + //cmpptr(sub_klass, super_klass); + //local_jcc(Assembler::equal, *L_success); + beq(sub_klass, super_klass, *L_success); + delayed()->nop(); + // Check the supertype display: + if (must_load_sco) { + // Positive movl does right thing on LP64. + lwu(temp_reg, super_klass, sco_offset); + super_check_offset = RegisterOrConstant(temp_reg); + } + dsll(AT, super_check_offset.register_or_noreg(), Address::times_1); + daddu(AT, sub_klass, AT); + ld(AT, AT, super_check_offset.constant_or_zero()*Address::times_1); + + // This check has worked decisively for primary supers. + // Secondary supers are sought in the super_cache ('super_cache_addr'). + // (Secondary supers are interfaces and very deeply nested subtypes.) + // This works in the same check above because of a tricky aliasing + // between the super_cache and the primary super display elements. + // (The 'super_check_addr' can address either, as the case requires.) + // Note that the cache is updated below if it does not help us find + // what we need immediately. + // So if it was a primary super, we can just fail immediately. + // Otherwise, it's the slow path for us (no success at this point). + + if (super_check_offset.is_register()) { + beq(super_klass, AT, *L_success); + delayed()->nop(); + addi(AT, super_check_offset.as_register(), -sc_offset); + if (L_failure == &L_fallthrough) { + beq(AT, R0, *L_slow_path); + delayed()->nop(); + } else { + bne(AT, R0, *L_failure); + delayed()->nop(); + b(*L_slow_path); + delayed()->nop(); + } + } else if (super_check_offset.as_constant() == sc_offset) { + // Need a slow path; fast failure is impossible. + if (L_slow_path == &L_fallthrough) { + beq(super_klass, AT, *L_success); + delayed()->nop(); + } else { + bne(super_klass, AT, *L_slow_path); + delayed()->nop(); + b(*L_success); + delayed()->nop(); + } + } else { + // No slow path; it's a fast decision. 
+    if (L_failure == &L_fallthrough) {
+      beq(super_klass, AT, *L_success);
+      delayed()->nop();
+    } else {
+      bne(super_klass, AT, *L_failure);
+      delayed()->nop();
+      b(*L_success);
+      delayed()->nop();
+    }
+  }
+
+  bind(L_fallthrough);
+
+}
+
+
+void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
+                                                   Register super_klass,
+                                                   Register temp_reg,
+                                                   Register temp2_reg,
+                                                   Label* L_success,
+                                                   Label* L_failure,
+                                                   bool set_cond_codes) {
+  assert_different_registers(sub_klass, super_klass, temp_reg);
+  if (temp2_reg != noreg)
+    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
+  else
+    temp2_reg = T9;
+#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
+
+  Label L_fallthrough;
+  int label_nulls = 0;
+  if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
+  if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
+  assert(label_nulls <= 1, "at most one NULL in the batch");
+
+  // a couple of useful fields in sub_klass:
+  int ss_offset = in_bytes(Klass::secondary_supers_offset());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  Address secondary_supers_addr(sub_klass, ss_offset);
+  Address super_cache_addr(     sub_klass, sc_offset);
+
+  // Do a linear scan of the secondary super-klass chain.
+  // This code is rarely used, so simplicity is a virtue here.
+  // The repne_scan instruction uses fixed registers, which we must spill.
+  // Don't worry too much about pre-existing connections with the input regs.
+
+#if 0
+  assert(sub_klass != T9, "killed reg"); // killed by mov(rax, super)
+  assert(sub_klass != T1, "killed reg"); // killed by lea(rcx, &pst_counter)
+#endif
+
+  // Get super_klass value into rax (even if it was in rdi or rcx).
+#ifndef PRODUCT
+  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
+  ExternalAddress pst_counter_addr((address) pst_counter);
+  NOT_LP64( incrementl(pst_counter_addr) );
+  //LP64_ONLY( lea(rcx, pst_counter_addr) );
+  //LP64_ONLY( incrementl(Address(rcx, 0)) );
+#endif //PRODUCT
+
+  // We will consult the secondary-super array.
+  ld(temp_reg, secondary_supers_addr);
+  // Load the array length.  (Positive movl does right thing on LP64.)
+  lw(temp2_reg, Address(temp_reg, Array<Klass*>::length_offset_in_bytes()));
+  // Skip to start of data.
+  daddiu(temp_reg, temp_reg, Array<Klass*>::base_offset_in_bytes());
+
+  // Scan RCX words at [RDI] for an occurrence of RAX.
+  // Set NZ/Z based on last compare.
+  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
+  // not change flags (only scas instruction which is repeated sets flags).
+  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
+
+  /* 2013/4/3 Jin: OpenJDK8 never compresses klass pointers in secondary-super array. */
+  Label Loop, subtype;
+  bind(Loop);
+  beq(temp2_reg, R0, *L_failure);
+  delayed()->nop();
+  ld(AT, temp_reg, 0);
+  beq(AT, super_klass, subtype);
+  delayed()->daddi(temp_reg, temp_reg, 1 * wordSize);
+  b(Loop);
+  delayed()->daddi(temp2_reg, temp2_reg, -1);
+
+  // Success.  Cache the super we found and proceed in triumph.
+  bind(subtype);
+  sd(super_klass, super_cache_addr);
+  if (L_success != &L_fallthrough) {
+    b(*L_success);
+    delayed()->nop();
+  }
+
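+  // (Note on the scan loop above: the pointer advance and counter decrement
+  // are folded into the branch delay slots via delayed()->daddi, so the loop
+  // body needs no separate update instructions.)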
+#undef IS_A_TEMP
+
+  bind(L_fallthrough);
+}
+
+void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
+  ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
+  sd(R0, Address(java_thread, JavaThread::vm_result_offset()));
+  verify_oop(oop_result, "broken oop in call_VM_base");
+}
+
+void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
+  ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
+  sd(R0, Address(java_thread, JavaThread::vm_result_2_offset()));
+}
+
+Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
+                                         int extra_slot_offset) {
+  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
+  int stackElementSize = Interpreter::stackElementSize;
+  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
+#ifdef ASSERT
+  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
+  assert(offset1 - offset == stackElementSize, "correct arithmetic");
+#endif
+  Register scale_reg = NOREG;
+  Address::ScaleFactor scale_factor = Address::no_scale;
+  if (arg_slot.is_constant()) {
+    offset += arg_slot.as_constant() * stackElementSize;
+  } else {
+    scale_reg    = arg_slot.as_register();
+    scale_factor = Address::times_8;
+  }
+  // 2014/07/31 Fu: We don't push RA on stack in prepare_invoke.
+  // offset += wordSize;   // return PC is on stack
+  if (scale_reg == NOREG) return Address(SP, offset);
+  else {
+    dsll(scale_reg, scale_reg, scale_factor);
+    daddu(scale_reg, SP, scale_reg);
+    return Address(scale_reg, offset);
+  }
+}
+
+SkipIfEqual::~SkipIfEqual() {
+  _masm->bind(_label);
+}
+
+void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
+  switch (size_in_bytes) {
+#ifndef _LP64
+  case 8:
+    assert(dst2 != noreg, "second dest register required");
+    lw(dst,  src);
+    lw(dst2, src.plus_disp(BytesPerInt));
+    break;
+#else
+  case 8: ld(dst, src); break;
+#endif
+  case 4: lw(dst, src); break;
+  case 2: is_signed ? lh(dst, src) : lhu(dst, src); break;
+  case 1: is_signed ? lb(dst, src) : lbu(dst, src); break;
+  default: ShouldNotReachHere();
+  }
+}
+
+void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
+  switch (size_in_bytes) {
+#ifndef _LP64
+  case 8:
+    assert(src2 != noreg, "second source register required");
+    sw(src,  dst);
+    sw(src2, dst.plus_disp(BytesPerInt));
+    break;
+#else
+  case 8: sd(src, dst); break;
+#endif
+  case 4: sw(src, dst); break;
+  case 2: sh(src, dst); break;
+  case 1: sb(src, dst); break;
+  default: ShouldNotReachHere();
+  }
+}
+
+// Look up the method for a megamorphic invokeinterface call.
+// The target method is determined by <intf_klass, itable_index>.
+// The receiver klass is in recv_klass.
+// On success, the result will be in method_result, and execution falls through.
+// On failure, execution transfers to the given label.
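+// (Sketch of the itable layout assumed here: past the embedded vtable, an
+// InstanceKlass carries a table of itableOffsetEntries -- one
+// <interface klass, offset> pair per implemented interface -- where the
+// offset locates the block of itableMethodEntries for that interface.)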
+void MacroAssembler::lookup_interface_method(Register recv_klass,
+                                             Register intf_klass,
+                                             RegisterOrConstant itable_index,
+                                             Register method_result,
+                                             Register scan_temp,
+                                             Label& L_no_such_interface) {
+  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
+  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
+         "caller must use same register for non-constant itable index as for method");
+
+  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
+  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
+  int itentry_off = itableMethodEntry::method_offset_in_bytes();
+  int scan_step   = itableOffsetEntry::size() * wordSize;
+  int vte_size    = vtableEntry::size() * wordSize;
+  Address::ScaleFactor times_vte_scale = Address::times_ptr;
+  assert(vte_size == wordSize, "else adjust times_vte_scale");
+
+  lw(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
+
+  // %%% Could store the aligned, prescaled offset in the klassoop.
+  dsll(scan_temp, scan_temp, times_vte_scale);
+  daddu(scan_temp, recv_klass, scan_temp);
+  daddiu(scan_temp, scan_temp, vtable_base);
+  if (HeapWordsPerLong > 1) {
+    // Round up to align_object_offset boundary
+    // see code for InstanceKlass::start_of_itable!
+    round_to(scan_temp, BytesPerLong);
+  }
+
+  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
+  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
+  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
+  if (itable_index.is_constant()) {
+    set64(AT, (int)itable_index.as_constant());
+    dsll(AT, AT, (int)Address::times_ptr);
+  } else {
+    dsll(AT, itable_index.as_register(), (int)Address::times_ptr);
+  }
+  daddu(AT, AT, recv_klass);
+  daddiu(recv_klass, AT, itentry_off);
+
+  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
+  //   if (scan->interface() == intf) {
+  //     result = (klass + scan->offset() + itable_index);
+  //   }
+  // }
+  Label search, found_method;
+
+  for (int peel = 1; peel >= 0; peel--) {
+    ld(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
+
+    if (peel) {
+      beq(intf_klass, method_result, found_method);
+      nop();
+    } else {
+      bne(intf_klass, method_result, search);
+      nop();
+      // (invert the test to fall through to found_method...)
+    }
+
+    if (!peel) break;
+
+    bind(search);
+
+    // Check that the previous entry is non-null.  A null entry means that
+    // the receiver class doesn't implement the interface, and wasn't the
+    // same as when the caller was compiled.
+    beq(method_result, R0, L_no_such_interface);
+    nop();
+    daddiu(scan_temp, scan_temp, scan_step);
+  }
+
+  bind(found_method);
+
+  // Got a hit.
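+  // At this point scan_temp points at the matching itableOffsetEntry; adding
+  // its offset field to the pre-biased recv_klass (original recv_klass +
+  // itable_index * wordSize + itentry_off) yields the itableMethodEntry
+  // holding the target Method*.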
+  lw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
+  //ld(method_result, Address(recv_klass, scan_temp, Address::times_1));
+  if(UseLoongsonISA) {
+    gsldx(method_result, recv_klass, scan_temp, 0);
+  } else {
+    daddu(AT, recv_klass, scan_temp);
+    ld(method_result, AT, 0);
+  }
+}
+
+
+// virtual method calling
+void MacroAssembler::lookup_virtual_method(Register recv_klass,
+                                           RegisterOrConstant vtable_index,
+                                           Register method_result) {
+  Register tmp = GP;
+  push(tmp);
+
+  if (vtable_index.is_constant()) {
+    assert_different_registers(recv_klass, method_result, tmp);
+  } else {
+    assert_different_registers(recv_klass, method_result, vtable_index.as_register(), tmp);
+  }
+  const int base = InstanceKlass::vtable_start_offset() * wordSize;
+  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
+  /*
+  Address vtable_entry_addr(recv_klass,
+                            vtable_index, Address::times_ptr,
+                            base + vtableEntry::method_offset_in_bytes());
+  */
+  if (vtable_index.is_constant()) {
+    set64(AT, vtable_index.as_constant());
+    dsll(AT, AT, (int)Address::times_ptr);
+  } else {
+    dsll(AT, vtable_index.as_register(), (int)Address::times_ptr);
+  }
+  set64(tmp, base + vtableEntry::method_offset_in_bytes());
+  daddu(tmp, tmp, AT);
+  daddu(tmp, tmp, recv_klass);
+  ld(method_result, tmp, 0);
+
+  pop(tmp);
+}
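+// Illustrative note (explanatory only, not emitted code): the final load
+// above computes
+//   method_result = *(recv_klass + vtable_start_offset * wordSize
+//                     + vtable_index * wordSize + method_offset_in_bytes),
+// i.e. the Method* slot of the selected vtableEntry.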