src/cpu/mips/vm/assembler_mips.cpp

Tue, 26 Jul 2016 11:15:09 +0800

author
fujie
date
Tue, 26 Jul 2016 11:15:09 +0800
changeset 38
f0e26f502a50
parent 31
f9d3579d1f72
child 41
d885f8d65c58
permissions
-rw-r--r--

Instruction decoding support: add movn and movz in MIPS disassembler.

     1 /*
     2  * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/assembler.hpp"
    28 #include "asm/assembler.inline.hpp"
    29 #include "gc_interface/collectedHeap.inline.hpp"
    30 #include "interpreter/interpreter.hpp"
    31 #include "memory/cardTableModRefBS.hpp"
    32 #include "memory/resourceArea.hpp"
    33 #include "prims/methodHandles.hpp"
    34 #include "runtime/biasedLocking.hpp"
    35 #include "runtime/interfaceSupport.hpp"
    36 #include "runtime/objectMonitor.hpp"
    37 #include "runtime/os.hpp"
    38 #include "runtime/sharedRuntime.hpp"
    39 #include "runtime/stubRoutines.hpp"
    40 #ifndef SERIALGC
    41 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
    42 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
    43 #include "gc_implementation/g1/heapRegion.hpp"
    44 #endif
    45 #ifdef PRODUCT
    46 #define BLOCK_COMMENT(str) /* nothing */
    47 #define STOP(error) stop(error)
    48 #else
    49 #define BLOCK_COMMENT(str) block_comment(str)
    50 #define STOP(error) block_comment(error); stop(error)
    51 #endif
    53 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Static scratch buffers used by save_registers()/restore_registers() and
// dumped by print(): one slot per integer register and one per float register.
intptr_t MacroAssembler::i[32] = {0};
float MacroAssembler::f[32] = {0.0};
    58 void MacroAssembler::print(outputStream *s) {
    59 	unsigned int k;
    60 	for(k=0; k<sizeof(i)/sizeof(i[0]); k++) {
    61 		s->print_cr("i%d = 0x%.16lx", k, i[k]);
    62 	}
    63 	s->cr();
    65 	for(k=0; k<sizeof(f)/sizeof(f[0]); k++) {
    66 		s->print_cr("f%d = %f", k, f[k]); 
    67 	}
    68 	s->cr();
    69 }
// Displacements used with the A0 base register in save/restore_registers.
// NOTE(review): i[] and f[] are static members, so &((MacroAssembler*)0)->i[k]
// yields the absolute address of the element (truncated to int), not a struct
// offset — confirm against the base-register convention of the callers.
int MacroAssembler::i_offset(unsigned int k) { return (intptr_t)&((MacroAssembler*)0)->i[k]; }
int MacroAssembler::f_offset(unsigned int k) { return (intptr_t)&((MacroAssembler*)0)->f[k]; }
    75 void MacroAssembler::save_registers(MacroAssembler *masm) {
    76 #define __ masm->
    77 	for(int k=0; k<32; k++) {
    78 		__ sw (as_Register(k), A0, i_offset(k));
    79 	}
    81 	for(int k=0; k<32; k++) {
    82 		__ swc1 (as_FloatRegister(k), A0, f_offset(k));
    83 	}
    84 #undef __
    85 }
    87 void MacroAssembler::restore_registers(MacroAssembler *masm) {
    88 #define __ masm->
    89 	for(int k=0; k<32; k++) {
    90 		__ lw (as_Register(k), A0, i_offset(k));
    91 	}
    93 	for(int k=0; k<32; k++) {
    94 		__ lwc1 (as_FloatRegister(k), A0, f_offset(k));
    95 	}
    96 #undef __
    97 }
// Implementation of AddressLiteral

// Wrap an absolute address together with the relocation information
// (derived from rtype) that describes it to the code cache.
AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) {
  _is_lval = false;  // plain address, not an lvalue literal
  _target = target;
  _rspec = rspec_from_rtype(rtype, target);
}
// Implementation of Address

//FIXME aoqi
//#ifdef _LP64
// NOTE(review): the first branch is compiled out (#if 0); it is the
// 32-bit-style implementation kept for reference only.
#if 0

Address Address::make_array(ArrayAddress adr) {
  // Not implementable on 64bit machines
  // Should have been handled higher up the call chain.
  ShouldNotReachHere();
  return Address();
}

// exceedingly dangerous constructor
Address::Address(int disp, address loc, relocInfo::relocType rtype) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = disp;
  switch (rtype) {
    case relocInfo::external_word_type:
      _rspec = external_word_Relocation::spec(loc);
      break;
    case relocInfo::internal_word_type:
      _rspec = internal_word_Relocation::spec(loc);
      break;
    case relocInfo::runtime_call_type:
      // HMM
      _rspec = runtime_call_Relocation::spec();
      break;
    case relocInfo::poll_type:
    case relocInfo::poll_return_type:
      _rspec = Relocation::spec_simple(rtype);
      break;
    case relocInfo::none:
      break;
    default:
      ShouldNotReachHere();
  }
}
#else // LP64

// Build an Address from an ArrayAddress: the array base address becomes the
// displacement (its relocation carried over on _rspec) and the index
// register/scale come from the index Address, which must carry no disp.
Address Address::make_array(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(index._base, index._index, index._scale, (intptr_t) base.target());
  array._rspec = base._rspec;
  return array;
}

// exceedingly dangerous constructor
// Stores a raw absolute address directly in _disp (no base/index register),
// with the supplied relocation attached.
Address::Address(address loc, RelocationHolder spec) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = (intptr_t) loc;
  _rspec = spec;
}

#endif // _LP64
   171 /*
   172 // Convert the raw encoding form into the form expected by the constructor for
   173 // Address.  An index of 4 (rsp) corresponds to having no index, so convert
   174 // that to noreg for the Address constructor.
   175 Address Address::make_raw(int base, int index, int scale, int disp) {
   176   bool valid_index = index != rsp->encoding();
   177   if (valid_index) {
   178     Address madr(as_Register(base), as_Register(index), (Address::ScaleFactor)scale, in_ByteSize(disp));
   179     return madr;
   180   } else {
   181     Address madr(as_Register(base), noreg, Address::no_scale, in_ByteSize(disp));
   182     return madr;
   183   }
   184 }
   185 */
   187 // Implementation of Assembler
   188 const char *Assembler::ops_name[] = {
   189 	"special",  "regimm",   "j",      "jal",    "beq",      "bne",      "blez",   "bgtz",
   190 	"addi",     "addiu",    "slti",   "sltiu",  "andi",     "ori",      "xori",   "lui",
   191 	"cop0",     "cop1",     "cop2",   "cop3",   "beql",     "bnel",     "bleql",  "bgtzl",
   192 	"daddi",    "daddiu",   "ldl",    "ldr",    "",         "",         "",       "",
   193 	"lb",       "lh",       "lwl",    "lw",     "lbu",      "lhu",      "lwr",    "lwu",
   194 	"sb",       "sh",       "swl",    "sw",     "sdl",      "sdr",      "swr",    "cache",
   195 	"ll",       "lwc1",     "",       "",       "lld",      "ldc1",     "",       "ld",
   196 	"sc",       "swc1",     "",       "",       "scd",      "sdc1",     "",       "sd"
   197 };
   199 const char* Assembler::special_name[] = {
   200 	"sll",      "",         "srl",      "sra",      "sllv",     "",         "srlv",     "srav",
   201 	"jr",       "jalr",     "movz",     "movn",     "syscall",  "break",    "",         "sync",
   202 	"mfhi",     "mthi",     "mflo",     "mtlo",     "dsll",     "",         "dsrl",     "dsra",
   203 	"mult",     "multu",    "div",      "divu",     "dmult",    "dmultu",   "ddiv",     "ddivu",
   204 	"add",      "addu",     "sub",      "subu",     "and",      "or",       "xor",      "nor",
   205 	"",         "",         "slt",      "sltu",     "dadd",     "daddu",    "dsub",     "dsubu",
   206 	"tge",      "tgeu",     "tlt",      "tltu",     "teq",      "",         "tne",      "",
   207 	"dsll",     "",         "dsrl",     "dsra",     "dsll32",   "",         "dsrl32",   "dsra32"
   208 };
// REGIMM-opcode mnemonics, indexed by the 5-bit rt field.
// NOTE(review): only rt codes 0..19 are covered (20 initializers); indexing
// with a larger rt value reads past the table — verify callers' range.
const char* Assembler::regimm_name[] = {
	"bltz",     "bgez",     "bltzl",    "bgezl",    "",         "",         "",         "",
	"tgei",     "tgeiu",    "tlti",     "tltiu",    "teqi",     "",         "tnei",     "",
	"bltzal",   "bgezal",   "bltzall",  "bgezall"
};
// COP1 (FPU) funct-field mnemonics for codes 0..15; the operand-format
// suffix (.s/.d/.w/.l) is not part of these names.
const char* Assembler::float_name[] = {
	"add",			"sub",			"mul",			"div",			"sqrt",			"abs",			"mov",			"neg",
	"round.l",	"trunc.l",	"ceil.l",		"floor.l",	"round.w",  "trunc.w",	"ceil.w",		"floor.w"
};
   221 //misleading name, print only branch/jump instruction 
   222 void Assembler::print_instruction(int inst) {
   223 	const char *s;
   224 	switch( opcode(inst) ) {
   225 	default:
   226 		s = ops_name[opcode(inst)];
   227 		break;
   228 	case special_op:
   229 		s = special_name[special(inst)];
   230 		break;
   231 	case regimm_op:
   232 		s = special_name[rt(inst)];
   233 		break;
   234 	}
   236 	::tty->print("%s", s);
   237 }
   239 void MacroAssembler::pd_patch_instruction(address branch, address target) {
   240   jint& stub_inst = *(jint*) branch;
   242 /* *
   243 	move(AT, RA); // dadd
   244 	emit_long(insn_ORRI(regimm_op, 0, bgezal_op, 1));
   245 	nop();
   246         lui(T9, 0); // to be patched
   247         ori(T9, 0);
   248 	daddu(T9, T9, RA);
   249 	move(RA, AT);
   250 	jr(T9);
   251  */
   252   if(special(stub_inst) == dadd_op) {
   253     jint *pc = (jint *)branch;
   255     assert(opcode(pc[3]) == lui_op
   256           && opcode(pc[4]) == ori_op
   257           && special(pc[5]) == daddu_op, "Not a branch label patch");
   258     if(!(opcode(pc[3]) == lui_op
   259           && opcode(pc[4]) == ori_op
   260           && special(pc[5]) == daddu_op)) { tty->print_cr("Not a branch label patch"); }
   262     int offset = target - branch;
   263     if (!is_simm16(offset))
   264     {
   265       pc[3] = (pc[3] & 0xffff0000) | high16(offset - 12);
   266       pc[4] = (pc[4] & 0xffff0000) | low16(offset - 12);
   267     }
   268     else
   269     {
   270       /* revert to "beq + nop" */
   271       CodeBuffer cb(branch, 4 * 10);
   272       MacroAssembler masm(&cb);
   273 #define __ masm.
   274       __ b(target);
   275       __ nop();
   276       __ nop();
   277       __ nop();
   278       __ nop();
   279       __ nop();
   280       __ nop();
   281       __ nop();
   282     }
   283     return;
   284   }
   286 #ifndef PRODUCT
   287   if (!is_simm16((target - branch - 4) >> 2))
   288   {
   289     tty->print_cr("Illegal patching: target=0x%lx", target);
   290     int *p = (int *)branch;
   291     for (int i = -10; i < 10; i++)
   292     {
   293        tty->print("0x%lx, ", p[i]);
   294     }
   295     tty->print_cr("");
   296   }
   297 #endif
   299   stub_inst = patched_branch(target - branch, stub_inst, 0);
   300 }
   302 //without check, maybe fixed
   303 int Assembler::patched_branch(int dest_pos, int inst, int inst_pos) {
   304 	int v = (dest_pos - inst_pos - 4)>>2;
   305 	switch(opcode(inst)) {
   306 	case j_op:
   307 	case jal_op:
   308 		assert(false, "should not use j/jal here");
   309 		break;
   310 	default:
   311 		assert(is_simm16(v), "must be simm16");
   312 #ifndef PRODUCT
   313 		if(!is_simm16(v))
   314 		{ 
   315 			tty->print_cr("must be simm16");
   316 			tty->print_cr("Inst: %lx", inst);
   317 		}
   318 #endif
   320 		v = low16(v);
   321 		inst &= 0xffff0000;
   322 		break;
   323 	}
   325 	return inst | v;
   326 }
   328 int Assembler::branch_destination(int inst, int pos) {
   329 	int off;
   331 	switch(opcode(inst)) {
   332 	case j_op:
   333 	case jal_op:
   334 		assert(false, "should not use j/jal here");
   335 		break;
   336 	default:
   337 		off = expand(low16(inst), 15);
   338 		break;
   339 	}
   341 	return off ? pos + 4 + (off<<2) : 0;
   342 }
// Byte value used to fill unused gaps in the code buffer.
int AbstractAssembler::code_fill_byte() {
	  return 0x00;                  // illegal instruction 0x00000000
}
// Now the Assembler instruction (identical for 32/64 bits)

// Address-taking convenience forms: each unpacks an Address into its
// (base register, displacement) pair and forwards to the 3-operand
// primitive of the same name.  The displacement is passed through as-is,
// so it must fit the instruction's 16-bit immediate field.

void Assembler::lb(Register rt, Address src) {
	lb(rt, src.base(), src.disp());
}

void Assembler::lbu(Register rt, Address src) {
	lbu(rt, src.base(), src.disp());
}

void Assembler::ld(Register rt, Address src){
	ld(rt, src.base(), src.disp());
}

void Assembler::ldl(Register rt, Address src){
	ldl(rt, src.base(), src.disp());
}

void Assembler::ldr(Register rt, Address src){
	ldr(rt, src.base(), src.disp());
}

void Assembler::lh(Register rt, Address src){
	lh(rt, src.base(), src.disp());
}

void Assembler::lhu(Register rt, Address src){
	lhu(rt, src.base(), src.disp());
}

void Assembler::ll(Register rt, Address src){
	ll(rt, src.base(), src.disp());
}

void Assembler::lld(Register rt, Address src){
	lld(rt, src.base(), src.disp());
}

void Assembler::lw(Register rt, Address src){
	lw(rt, src.base(), src.disp());
}

// Load effective address: computes base+disp into rt via add-immediate.
// NOTE(review): no multi-instruction fallback — disp must be simm16.
void Assembler::lea(Register rt, Address src) {
#ifdef _LP64
  daddi(rt, src.base(), src.disp());
#else
  addi(rt, src.base(), src.disp());
#endif
}

void Assembler::lwl(Register rt, Address src){
	lwl(rt, src.base(), src.disp());
}

void Assembler::lwr(Register rt, Address src){
	lwr(rt, src.base(), src.disp());
}

void Assembler::lwu(Register rt, Address src){
	lwu(rt, src.base(), src.disp());
}

void Assembler::sb(Register rt, Address dst) {
	sb(rt, dst.base(), dst.disp());
}

void Assembler::sc(Register rt, Address dst) {
	sc(rt, dst.base(), dst.disp());
}

void Assembler::scd(Register rt, Address dst) {
	scd(rt, dst.base(), dst.disp());
}

void Assembler::sd(Register rt, Address dst) {
	sd(rt, dst.base(), dst.disp());
}

void Assembler::sdl(Register rt, Address dst) {
	sdl(rt, dst.base(), dst.disp());
}

void Assembler::sdr(Register rt, Address dst) {
	sdr(rt, dst.base(), dst.disp());
}

void Assembler::sh(Register rt, Address dst) {
	sh(rt, dst.base(), dst.disp());
}

void Assembler::sw(Register rt, Address dst) {
	sw(rt, dst.base(), dst.disp());
}

void Assembler::swl(Register rt, Address dst) {
	swl(rt, dst.base(), dst.disp());
}

void Assembler::swr(Register rt, Address dst) {
	swr(rt, dst.base(), dst.disp());
}

void Assembler::lwc1(FloatRegister rt, Address src) {
	lwc1(rt, src.base(), src.disp());
}

void Assembler::ldc1(FloatRegister rt, Address src) {
	ldc1(rt, src.base(), src.disp());
}

void Assembler::swc1(FloatRegister rt, Address dst) {
	swc1(rt, dst.base(), dst.disp());
}

void Assembler::sdc1(FloatRegister rt, Address dst) {
	sdc1(rt, dst.base(), dst.disp());
}
// PC-region jump (J): encodes a 26-bit word index within the 256MB region
// of the delay-slot instruction.
// NOTE(review): 'entry' is assumed to lie in the same 256MB region as
// pc()+4 — the masking below only retains the region bits; confirm callers
// guarantee this.
void Assembler::j(address entry) {
	int dest = ((intptr_t)entry - (((intptr_t)pc() + 4) & 0xf0000000))>>2;
	emit_long((j_op<<26) | dest); 
	has_delay_slot(); 
}

// Same as j() but linking (JAL): the return address is placed in RA by
// the hardware.
void Assembler::jal(address entry) {
	int dest = ((intptr_t)entry - (((intptr_t)pc() + 4) & 0xf0000000))>>2;
	emit_long((jal_op<<26) | dest); 
	has_delay_slot(); 
}
   477 void MacroAssembler::beq_far(Register rs, Register rt, address entry)
   478 {
   479   u_char * cur_pc = pc();
   481   /* Jin: Near/Far jump */
   482   if(is_simm16((entry - pc() - 4) / 4))
   483   {
   484     Assembler::beq(rs, rt, offset(entry));
   485   }
   486   else
   487   {
   488     Label not_jump;
   489     bne(rs, rt, not_jump);
   490     delayed()->nop();
   492     b_far(entry); 
   493     delayed()->nop();
   495     bind(not_jump);
   496     has_delay_slot();
   497   }
   498 }
// beq to a Label over an arbitrary distance.  A bound label lets us pick
// near/far now; for an unbound label the final distance is unknown, so we
// pessimistically emit the inverted-branch + far-jump sequence.
void MacroAssembler::beq_far(Register rs, Register rt, Label& L)
{
  if (L.is_bound()) {
    beq_far(rs, rt, target(L));
  } else {
    u_char * cur_pc = pc();
    Label not_jump;
    // Inverted condition skips the unconditional far branch.
    bne(rs, rt, not_jump);
    delayed()->nop();

    b_far(L); 
    delayed()->nop();

    bind(not_jump);
    has_delay_slot();
  }
}
   518 void MacroAssembler::bne_far(Register rs, Register rt, address entry)
   519 {
   520   u_char * cur_pc = pc();
   522   /* Jin: Near/Far jump */
   523   if(is_simm16((entry - pc() - 4) / 4))
   524   {
   525     Assembler::bne(rs, rt, offset(entry));
   526   }
   527   else
   528   {
   529     Label not_jump;
   530     beq(rs, rt, not_jump);
   531     delayed()->nop();
   533     b_far(entry); 
   534     delayed()->nop();
   536     bind(not_jump);
   537     has_delay_slot();
   538   }
   539 }
// bne to a Label over an arbitrary distance.  A bound label lets us pick
// near/far now; for an unbound label the final distance is unknown, so we
// pessimistically emit the inverted-branch + far-jump sequence.
void MacroAssembler::bne_far(Register rs, Register rt, Label& L)
{
  if (L.is_bound()) {
    bne_far(rs, rt, target(L));
  } else {
    u_char * cur_pc = pc();
    Label not_jump;
    // Inverted condition skips the unconditional far branch.
    beq(rs, rt, not_jump);
    delayed()->nop();

    b_far(L); 
    delayed()->nop();

    bind(not_jump);
    has_delay_slot();
  }
}
// Unconditional branch to a Label over an arbitrary distance.  For an
// unbound label we emit the fixed 8-instruction stub shown in the
// disassembly below; pd_patch_instruction() recognizes it (it starts with
// dadd/move) and later patches the lui/ori immediates with the real
// displacement.  bgezal with rt=zero deposits the link address in RA, AT
// preserves the caller's RA across the sequence.
void MacroAssembler::b_far(Label& L)
{
  if (L.is_bound()) {
    b_far(target(L));
  } else {
	volatile address dest = target(L);
/*
MacroAssembler::pd_patch_instruction branch=55651ed514, target=55651ef6d8
   0x00000055651ed514: dadd at, ra, zero
   0x00000055651ed518: [4110001]bgezal zero, 0x00000055651ed520

   0x00000055651ed51c: sll zero, zero, 0
   0x00000055651ed520: lui t9, 0x0
   0x00000055651ed524: ori t9, t9, 0x21b8
   0x00000055651ed528: daddu t9, t9, ra
   0x00000055651ed52c: dadd ra, at, zero
   0x00000055651ed530: jr t9
   0x00000055651ed534: sll zero, zero, 0
*/
	move(AT, RA);
	emit_long(insn_ORRI(regimm_op, 0, bgezal_op, 1));
	nop();
        lui(T9, 0); // to be patched
        ori(T9, T9, 0);
	daddu(T9, T9, RA);
	move(RA, AT);
	jr(T9);
  }
}

// Unconditional branch to a known address: a short branch when the word
// displacement fits simm16, otherwise capture the PC in RA (bgezal with
// rt=zero), add the li32-materialized displacement, and jump through T9.
void MacroAssembler::b_far(address entry)
{ 
	u_char * cur_pc = pc();

	/* Jin: Near/Far jump */
	if(is_simm16((entry - pc() - 4) / 4))
	{
		b(offset(entry));
	}
	else
	{
		/* address must be bounded */
		move(AT, RA);
	 	emit_long(insn_ORRI(regimm_op, 0, bgezal_op, 1));
		nop();
		li32(T9, entry - pc());
		daddu(T9, T9, RA);
		move(RA, AT);
		jr(T9);
	}
}
// Implementation of MacroAssembler

// First all the versions that have distinct versions depending on 32/64 bit
// Unless the difference is trivial (1 line or so).

//#ifndef _LP64

// 32bit versions

// Register-indexed load/store helpers: form the effective address
// base+offset in AT, then forward to the (value, disp, base) form with a
// zero displacement.  NOTE(review): all four clobber AT.
void MacroAssembler::ld_ptr(Register rt, Register offset, Register base) {
  addu_long(AT, base, offset);
  ld_ptr(rt, 0, AT);
}

void MacroAssembler::st_ptr(Register rt, Register offset, Register base) {
  addu_long(AT, base, offset);
  st_ptr(rt, 0, AT);
}

void MacroAssembler::ld_long(Register rt, Register offset, Register base) {
  addu_long(AT, base, offset);
  ld_long(rt, 0, AT);
}

void MacroAssembler::st_long(Register rt, Register offset, Register base) {
  addu_long(AT, base, offset);
  st_long(rt, 0, AT);
}

// Convert an AddressLiteral (absolute address + relocation) to an Address.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

// Convert an ArrayAddress to an Address via Address::make_array.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}

// tmp_reg1 and tmp_reg2 should be saved outside of atomic_inc32 (caller saved).
// Atomically add 'inc' to the 32-bit counter at counter_addr using an ll/sc
// retry loop: sc leaves 0 in tmp_reg2 when the reservation was lost, which
// branches back to 'again'.  'inc' must fit addi's simm16 field.
void MacroAssembler::atomic_inc32(address counter_addr, int inc, Register tmp_reg1, Register tmp_reg2) {
  Label again;

  bind(again);
  sync();
  li(tmp_reg1, counter_addr);
  ll(tmp_reg2, tmp_reg1, 0);
  addi(tmp_reg2, tmp_reg2, inc);
  sc(tmp_reg2, tmp_reg1, 0);
  beq(tmp_reg2, R0, again);
  delayed()->nop();
}
// Emit the biased-locking fast path for monitorenter.
//
// On success control falls through to 'done'; if a CAS loses, control goes
// to *slow_case (when non-NULL).  Falls through to 'cas_label' (bound at
// the end) when the object is not bias-enabled, so the caller's CAS-based
// locking follows this code.  Returns the code offset of the mark-word
// load, for implicit null-check bookkeeping.
//
// Registers: swap_reg and tmp_reg are scratch; if tmp_reg is noreg, T9 is
// pushed/popped around each use instead.  AT is clobbered throughout.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  bool need_tmp_reg = false;
  if (tmp_reg == noreg) {
    need_tmp_reg = true;
    tmp_reg = T9;
  }
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, AT);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ld_ptr(swap_reg, mark_addr);
  }

  if (need_tmp_reg) {
    push(tmp_reg);
  }
  // AT := biased_lock_pattern - (mark & biased_lock_mask); zero iff biased.
  move(tmp_reg, swap_reg);
  andi(tmp_reg, tmp_reg, markOopDesc::biased_lock_mask_in_place);
#ifdef _LP64
  daddi(AT, R0, markOopDesc::biased_lock_pattern);
  dsub(AT, AT, tmp_reg);
#else
  addi(AT, R0, markOopDesc::biased_lock_pattern);
  sub(AT, AT, tmp_reg);
#endif
  if (need_tmp_reg) {
    pop(tmp_reg);
  }

  bne(AT, R0, cas_label);
  delayed()->nop();


  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  // Note that because there is no current thread register on MIPS we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  st_ptr(swap_reg, saved_mark_addr);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  // swap_reg := (mark ^ prototype ^ thread) & ~age; zero iff biased to us
  // with a current epoch.
  load_prototype_header(tmp_reg, obj_reg);
  xorr(tmp_reg, tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorr(swap_reg, swap_reg, tmp_reg);

  move(AT, ~((int) markOopDesc::age_mask_in_place));
  andr(swap_reg, swap_reg, AT);

  if (PrintBiasedLockingStatistics) {
    Label L;
    bne(swap_reg, R0, L);
    delayed()->nop();
    atomic_inc32((address)BiasedLocking::biased_lock_entry_count_addr(), 1, AT, tmp_reg);
    bind(L);
  }
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  beq(swap_reg, R0, done);
  delayed()->nop();
  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.

  move(AT, markOopDesc::biased_lock_mask_in_place);
  andr(AT, swap_reg, AT);
  bne(AT, R0, try_revoke_bias);
  delayed()->nop();
  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.

  move(AT, markOopDesc::epoch_mask_in_place);
  andr(AT,swap_reg, AT);
  bne(AT, R0, try_rebias);
  delayed()->nop();
  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.

  ld_ptr(swap_reg, saved_mark_addr);

  move(AT, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
  andr(swap_reg, swap_reg, AT);

  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  orr(tmp_reg, tmp_reg, swap_reg);
  //if (os::is_MP()) {
  // lock();
  //}
  cmpxchg(tmp_reg, Address(obj_reg, 0), swap_reg);
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (PrintBiasedLockingStatistics) {
    Label L;
    bne(AT, R0, L);
    delayed()->nop();
    push(tmp_reg);
    push(A0);
    atomic_inc32((address)BiasedLocking::anonymously_biased_lock_entry_count_addr(), 1, A0, tmp_reg);
    pop(A0);
    pop(tmp_reg);
    bind(L);
  }
  if (slow_case != NULL) {
    beq_far(AT, R0, *slow_case);
    delayed()->nop();
  }
  b(done);
  delayed()->nop();

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  load_prototype_header(tmp_reg, obj_reg);
  get_thread(swap_reg);
  orr(tmp_reg, tmp_reg, swap_reg);
  ld_ptr(swap_reg, saved_mark_addr);

  // if (os::is_MP()) {
  //  lock();
  //}
  cmpxchg(tmp_reg, Address(obj_reg, 0), swap_reg);
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (PrintBiasedLockingStatistics) {
    Label L;
    bne(AT, R0, L);
    delayed()->nop();
    push(AT);
    push(tmp_reg);
    atomic_inc32((address)BiasedLocking::rebiased_lock_entry_count_addr(), 1, AT, tmp_reg);
    pop(tmp_reg);
    pop(AT);
    bind(L);
  }
  if (slow_case != NULL) {
    beq_far(AT, R0, *slow_case);
    delayed()->nop();
  }

  b(done);
  delayed()->nop();
  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  ld_ptr(swap_reg, saved_mark_addr);

  if (need_tmp_reg) {
    push(tmp_reg);
  }
  load_prototype_header(tmp_reg, obj_reg);
  //if (os::is_MP()) {
  // lock();
  //}    
  cmpxchg(tmp_reg, Address(obj_reg, 0), swap_reg);
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (PrintBiasedLockingStatistics) {
    Label L;
    bne(AT, R0, L);
    delayed()->nop();
    push(AT);
    push(tmp_reg);
    atomic_inc32((address)BiasedLocking::revoked_lock_entry_count_addr(), 1, AT, tmp_reg);
    pop(tmp_reg);
    pop(AT);
    bind(L);
  }

  bind(cas_label);
  return null_check_offset;
}
// Emit the biased-locking check for monitorexit: if the mark word's low
// bits equal biased_lock_pattern, the object is biased and unlock is a
// no-op — branch to 'done'.  Clobbers temp_reg and AT.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
#ifdef _LP64
  ld(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  daddi(AT, R0, markOopDesc::biased_lock_pattern);
#else
  lw(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  addi(AT, R0, markOopDesc::biased_lock_pattern);
#endif

  beq(AT, temp_reg, done);
  delayed()->nop();
}
   937 // NOTE: we don't increment the SP after the call like the x86 version does; maybe this is a problem, FIXME. 
   938 // by yjl 6/27/2005 
   939 // The stack pointer adjustment is needed; see InterpreterMacroAssembler::super_call_VM_leaf.
   940 // by yjl 7/11/2005
   941 // This method handles the stack adjustment itself, so callers no longer need to reserve stack space for the arguments.
   942 // by yjl 8/1/2005
       // Call a VM-leaf runtime routine while keeping SP 16-byte aligned.
       // If SP is misaligned (low 4 bits non-zero) it is temporarily biased by -8
       // around the call and restored afterwards; otherwise the call is emitted
       // directly. Label L = already-aligned path, label E = common exit.
   943 void MacroAssembler::call_VM_leaf_base(address entry_point,
   944     int number_of_arguments) {
   945   //call(RuntimeAddress(entry_point));
   946   //increment(rsp, number_of_arguments * wordSize);
   947   Label L, E;
   949   assert(number_of_arguments <= 4, "just check");
       // Test the low 4 bits of SP to detect 16-byte misalignment.
   951   andi(AT, SP, 0xf);
   952   beq(AT, R0, L);
   953   delayed()->nop();
   954   daddi(SP, SP, -8);
   955   {
   956 	call(entry_point, relocInfo::runtime_call_type);
   957 	delayed()->nop();
   958   }
   959   daddi(SP, SP, 8);
   960   b(E);
   961   delayed()->nop();
   963   bind(L);
   964   {
   965 	call(entry_point, relocInfo::runtime_call_type);
   966 	delayed()->nop();
   967   }
   968   bind(E);
   969 }
       // Unconditional jump to an absolute address: materialize the target in T9
       // (the MIPS ABI call-target register) and jump through it. Note that the
       // jr delay slot is NOT filled here; callers emit delayed()->nop() (see the
       // forward_exception_entry jump in call_VM_base).
   972 void MacroAssembler::jmp(address entry) {
   973   li48(T9, (long)entry);
   974   jr(T9);
   975 }
       // Jump to an absolute address, recording relocation info when the type is
       // neither runtime_call nor none. The relocated form brackets the emission
       // in an InstructionMark so the relocation covers the li48/jr sequence.
   977 void MacroAssembler::jmp(address entry, relocInfo::relocType rtype) {
   978   switch (rtype) {
   979     case relocInfo::runtime_call_type:
   980     case relocInfo::none:
   981       jmp(entry);
   982       break;
   983     default:
   984       {
   985 	InstructionMark im(this);
   986 	relocate(rtype);
   987 	li48(T9, (long)entry);
   988 	jr(T9);
   989       }
   990       break;
   991   }
   992 }
       // Emit a call to an absolute address through T9, since MIPS C/C++ calling
       // conventions assume T9 holds the callee's entry point.
       // NOTE(review): this listing appears to be missing the lines after jalr
       // (original lines 1005-1006, presumably the delay-slot handling and the
       // closing brace) — confirm against the repository.
   994 void MacroAssembler::call(address entry) {
   995 // c/c++ code assume T9 is entry point, so we just always move entry to t9
   996 // maybe there is some more graceful method to handle this. FIXME 
   997 // by yjl 6/27/2005
   998 // For more info, see class NativeCall.
   999 #ifndef _LP64
  1000   move(T9, (int)entry);
  1001 #else
  1002   li48(T9, (long)entry);
  1003 #endif
  1004   jalr(T9);
       // Call an absolute address, recording relocation info for types other than
       // runtime_call/none (mirrors jmp(address, relocType) above).
       // NOTE(review): the listing is missing braces around the default case
       // (original lines 1014, 1018, 1020-1022) — confirm against the repository.
  1007 void MacroAssembler::call(address entry, relocInfo::relocType rtype) {
  1008   switch (rtype) {
  1009     case relocInfo::runtime_call_type:
  1010     case relocInfo::none:
  1011       call(entry);
  1012       break;
  1013     default:
  1015 	InstructionMark im(this);
  1016 	relocate(rtype);
  1017 	call(entry);
  1019       break;
       // Call an absolute address with relocation taken from a RelocationHolder;
       // dispatches on rh.type() exactly like the relocType overload above.
       // NOTE(review): the listing is missing the opening/closing braces
       // (original lines 1024, 1031, 1035, 1037-1039) — confirm against the repository.
  1023 void MacroAssembler::call(address entry, RelocationHolder& rh)
  1025   switch (rh.type()) {
  1026     case relocInfo::runtime_call_type:
  1027     case relocInfo::none:
  1028       call(entry);
  1029       break;
  1030     default:
  1032 	InstructionMark im(this);
  1033 	relocate(rh);
  1034 	call(entry);
  1036       break;
       // Emit an inline-cache call: load the cached-klass sentinel
       // (Universe::non_oop_word()) into IC_Klass, record a virtual-call
       // relocation at the current pc, then call 'entry' through T9 with the
       // delay slot filled by a nop.
  1040 void MacroAssembler::ic_call(address entry) {
  1041 	RelocationHolder rh = virtual_call_Relocation::spec(pc());
  1042 	li64(IC_Klass, (long)Universe::non_oop_word());
  1043 	assert(entry != NULL, "call most probably wrong");
  1044 	InstructionMark im(this);
  1045 	relocate(rh);
  1046 	li48(T9, (long)entry);
  1047 	jalr(T9);
  1048 	delayed()->nop();
       // Normalize a C truth value: leave r unchanged (0) if it is zero,
       // otherwise set it to 1.
  1051 void MacroAssembler::c2bool(Register r) {
  1052   Label L;
  1053   Assembler::beq(r, R0, L);
  1054   delayed()->nop();
  1055   move(r, 1);
  1056   bind(L);
  1059 #ifndef PRODUCT
  1060 extern "C" void findpc(intptr_t x);
  1061 #endif
       // Runtime debug helper invoked from generated code (32-bit register dump).
       // The x86 register parameter names (rdi, rsi, ...) are leftovers from the
       // x86 port this file was derived from. Fakes an _thread_in_vm state so
       // ttyLocker works, optionally pops a message box and prints the registers,
       // then restores the previous thread state.
       // NOTE(review): the inner 'thread'/'saved_state' locals at lines 1069-1071
       // shadow the outer ones and redo the same transition — looks redundant.
  1063 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  1064   // In order to get locks to work, we need to fake a in_VM state
  1065   JavaThread* thread = JavaThread::current();
  1066   JavaThreadState saved_state = thread->thread_state();
  1067   thread->set_thread_state(_thread_in_vm);
  1068   if (ShowMessageBoxOnError) {
  1069     JavaThread* thread = JavaThread::current();
  1070     JavaThreadState saved_state = thread->thread_state();
  1071     thread->set_thread_state(_thread_in_vm);
  1072     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
  1073       ttyLocker ttyl;
  1074       BytecodeCounter::print();
  1076     // To see where a verify_oop failed, get $ebx+40/X for this frame.
  1077     // This is the value of eip which points to where verify_oop will return.
  1078     if (os::message_box(msg, "Execution stopped, print registers?")) {
  1079       ttyLocker ttyl;
  1080       tty->print_cr("eip = 0x%08x", eip);
  1081 #ifndef PRODUCT
  1082       tty->cr();
  1083       findpc(eip);
  1084       tty->cr();
  1085 #endif
  1086       tty->print_cr("rax, = 0x%08x", rax);
  1087       tty->print_cr("rbx, = 0x%08x", rbx);
  1088       tty->print_cr("rcx = 0x%08x", rcx);
  1089       tty->print_cr("rdx = 0x%08x", rdx);
  1090       tty->print_cr("rdi = 0x%08x", rdi);
  1091       tty->print_cr("rsi = 0x%08x", rsi);
  1092       tty->print_cr("rbp, = 0x%08x", rbp);
  1093       tty->print_cr("rsp = 0x%08x", rsp);
  1094       BREAKPOINT;
  1096   } else {
  1097     ttyLocker ttyl;
  1098     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
  1099     assert(false, "DEBUG MESSAGE");
  1101   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
       // Runtime debug helper: print an "EXECUTION STOPPED" message (and bytecode
       // counters when enabled) under a faked _thread_in_vm state, or just print
       // the message when ShowMessageBoxOnError is off. Called from stop()/warn().
  1104 void MacroAssembler::debug(char* msg/*, RegistersForDebugging* regs*/) {
  1105   if ( ShowMessageBoxOnError ) {
  1106     JavaThreadState saved_state = JavaThread::current()->thread_state();
  1107     JavaThread::current()->set_thread_state(_thread_in_vm);
  1109       // In order to get locks work, we need to fake a in_VM state
  1110       ttyLocker ttyl;
  1111       ::tty->print_cr("EXECUTION STOPPED: %s\n", msg);
  1112       if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
  1113 	BytecodeCounter::print();
  1116       //			if (os::message_box(msg, "Execution stopped, print registers?"))
  1117       //				regs->print(::tty);
  1119     ThreadStateTransition::transition(JavaThread::current(), _thread_in_vm, saved_state);
  1121   else
  1122     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
       // Emit code that halts execution with a message: pass 'msg' in A0, call
       // MacroAssembler::debug, then emit a breakpoint trap (brk 17).
       // On 32-bit builds one word of argument space is reserved around the call.
  1126 void MacroAssembler::stop(const char* msg) {
  1127   li(A0, (long)msg);
  1128 #ifndef _LP64
  1129   //reserver space for argument. added by yjl 7/10/2005
  1130   addiu(SP, SP, - 1 * wordSize);
  1131 #endif
  1132   call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
  1133   delayed()->nop();
  1134 #ifndef _LP64
  1135   //restore space for argument
  1136   addiu(SP, SP, 1 * wordSize);
  1137 #endif
  1138   brk(17);
       // Emit code that prints a warning message and continues: save all
       // registers (pushad), call MacroAssembler::debug with 'msg' in A0, then
       // restore. The 32-bit path additionally spills/reloads A0 and reserves
       // argument space on the stack around the call.
  1141 void MacroAssembler::warn(const char* msg) {
  1142 #ifdef _LP64
  1143   pushad();
  1144   li(A0, (long)msg);
  1145   call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
  1146   delayed()->nop();
  1147   popad();
  1148 #else
  1149   pushad();
  1150   addi(SP, SP, -4);
  1151   sw(A0, SP, -1 * wordSize);
  1152   li(A0, (long)msg);
  1153   addi(SP, SP, -1 * wordSize);
  1154   call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
  1155   delayed()->nop();
  1156   addi(SP, SP, 1 * wordSize);
  1157   lw(A0, SP, -1 * wordSize);
  1158   addi(SP, SP, 4);
  1159   popad();
  1160 #endif
       // Emit debug code that prints a general register's name, value and the
       // current emission pc via SharedRuntime::print_reg_with_pc. All registers
       // are saved with pushad(); SP and A0 need special handling because pushad
       // changed SP (23 words) and A0 is clobbered by loading the name pointer.
       // The large commented-out section below is an older two-call variant.
  1163 void MacroAssembler::print_reg(Register reg) {
  1164 /*
  1165 char *s = getenv("PRINT_REG");
  1166 if (s == NULL)
  1167   return;
  1168 if (strcmp(s, "1") != 0)
  1169   return;
  1170 */
  1171   void * cur_pc = pc();
  1172   pushad();
  1173   NOT_LP64(push(FP);)
  1175   li(A0, (long)reg->name());
  1176   if (reg == SP)
  1177     addiu(A1, SP, wordSize * 23); //23 registers saved in pushad()
  1178   else if (reg == A0)
  1179     ld(A1, SP, wordSize * 19); //A0 has been modified by li(A0, (long)reg->name()). Ugly Code!
  1180   else
  1181     move(A1, reg);
  1182   li(A2, (long)cur_pc);
  1183   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_reg_with_pc),relocInfo::runtime_call_type);
  1184   delayed()->nop();
  1185   NOT_LP64(pop(FP);)
  1186   popad();
  1188 /*
  1189   pushad();
  1190 #ifdef _LP64
  1191   if (reg == SP)
  1192     addiu(A0, SP, wordSize * 23); //23 registers saved in pushad()
  1193   else
  1194     move(A0, reg);
  1195   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_long),relocInfo::runtime_call_type);
  1196   delayed()->nop();
  1197 #else 
  1198   push(FP);
  1199   move(A0, reg);
  1200   dsrl32(A1, reg, 0);
  1201   //call(CAST_FROM_FN_PTR(address, SharedRuntime::print_int),relocInfo::runtime_call_type);
  1202   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_long),relocInfo::runtime_call_type);
  1203   delayed()->nop();
  1204   pop(FP);
  1205 #endif
  1206   popad();
  1207   pushad();
  1208   NOT_LP64(push(FP);)
  1209   char b[50];
  1210   sprintf((char *)b, " pc: %p\n",cur_pc);
  1211   li(A0, (long)(char *)b);
  1212   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_str),relocInfo::runtime_call_type);
  1213   delayed()->nop();
  1214   NOT_LP64(pop(FP);)
  1215   popad();
  1216 */
       // Emit debug code that prints a floating-point register: first the name
       // (SharedRuntime::print_str), then the value as a double — the value call
       // aligns SP to StackAlignmentInBytes around SharedRuntime::print_double,
       // using FP to save/restore the original SP. The #if 0 tail is a disabled
       // pc-printing variant.
  1219 void MacroAssembler::print_reg(FloatRegister reg) {
  1220   void * cur_pc = pc();
  1221   pushad();
  1222   NOT_LP64(push(FP);)
  1223   li(A0, (long)reg->name());
  1224   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_str),relocInfo::runtime_call_type);
  1225   delayed()->nop();
  1226   NOT_LP64(pop(FP);)
  1227   popad();
  1229   pushad();
  1230   NOT_LP64(push(FP);)
  1231 #if 1
  1232   move(FP, SP);
  1233   move(AT, -(StackAlignmentInBytes));	
  1234   andr(SP , SP , AT);
  1235   mov_d(F12, reg);
  1236   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_double),relocInfo::runtime_call_type);
  1237   delayed()->nop();
  1238   move(SP, FP);
  1239 #else
  1240   mov_s(F12, reg);
  1241   //call(CAST_FROM_FN_PTR(address, SharedRuntime::print_float),relocInfo::runtime_call_type);
  1242   //delayed()->nop();
  1243 #endif
  1244   NOT_LP64(pop(FP);)
  1245   popad();
  1247 #if 0
  1248   pushad();
  1249   NOT_LP64(push(FP);)
  1250   char* b = new char[50];
  1251   sprintf(b, " pc: %p\n", cur_pc);
  1252   li(A0, (long)b);
  1253   call(CAST_FROM_FN_PTR(address, SharedRuntime::print_str),relocInfo::runtime_call_type);
  1254   delayed()->nop();
  1255   NOT_LP64(pop(FP);)
  1256   popad();
  1257 #endif
       // Add an immediate to a register, emitting nothing for imm == 0.
       // Uses a single (d)addiu when imm fits in a signed 16-bit field,
       // otherwise materializes imm in AT and emits a register-register add.
  1260 void MacroAssembler::increment(Register reg, int imm) {
  1261   if (!imm) return;
  1262   if (is_simm16(imm)) {
  1263 #ifdef _LP64
  1264     daddiu(reg, reg, imm);
  1265 #else
  1266     addiu(reg, reg, imm);
  1267 #endif
  1268   } else {
  1269     move(AT, imm);
  1270 #ifdef _LP64
  1271     daddu(reg, reg, AT);
  1272 #else
  1273     addu(reg, reg, AT);
  1274 #endif
       // Subtract an immediate from a register by delegating to increment(-imm).
  1278 void MacroAssembler::decrement(Register reg, int imm) {
  1279 	increment(reg, -imm);
       // call_VM with no Java arguments: delegate to call_VM_helper.
  1283 void MacroAssembler::call_VM(Register oop_result,
  1284                              address entry_point,
  1285                              bool check_exceptions) {
  1286   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
       // call_VM with one argument: place it in A1 (A0 is reserved for the
       // JavaThread, set up later in call_VM_base), then delegate.
  1289 void MacroAssembler::call_VM(Register oop_result,
  1290                              address entry_point,
  1291                              Register arg_1,
  1292                              bool check_exceptions) {
  1293   if (arg_1!=A1) move(A1, arg_1);
  1294   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
       // call_VM with two arguments in A1/A2. The asserts catch the case where a
       // later argument register was already clobbered by an earlier move.
  1297 void MacroAssembler::call_VM(Register oop_result,
  1298                              address entry_point,
  1299                              Register arg_1,
  1300                              Register arg_2,
  1301                              bool check_exceptions) {
  1302   if (arg_1!=A1) move(A1, arg_1);
  1303   if (arg_2!=A2) move(A2, arg_2); 
  1304   assert(arg_2 != A1, "smashed argument");
  1305   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
       // call_VM with three arguments in A1/A2/A3; asserts guard against an
       // argument register being overwritten by a preceding move.
  1308 void MacroAssembler::call_VM(Register oop_result,
  1309                              address entry_point,
  1310                              Register arg_1,
  1311                              Register arg_2,
  1312                              Register arg_3,
  1313                              bool check_exceptions) {
  1314   if (arg_1!=A1) move(A1, arg_1);
  1315   if (arg_2!=A2) move(A2, arg_2); assert(arg_2 != A1, "smashed argument");
  1316   if (arg_3!=A3) move(A3, arg_3); assert(arg_3 != A1 && arg_3 != A2, "smashed argument");
  1317   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
       // call_VM with an explicit last_java_sp: delegate straight to
       // call_VM_base with no thread register specified (NOREG).
  1320 void MacroAssembler::call_VM(Register oop_result,
  1321                              Register last_java_sp,
  1322                              address entry_point,
  1323                              int number_of_arguments,
  1324                              bool check_exceptions) {
  1325   call_VM_base(oop_result, NOREG, last_java_sp, entry_point, number_of_arguments, check_exceptions);
       // call_VM (explicit last_java_sp) with one argument in A1.
  1328 void MacroAssembler::call_VM(Register oop_result,
  1329                              Register last_java_sp,
  1330                              address entry_point,
  1331                              Register arg_1,
  1332                              bool check_exceptions) {
  1333   if (arg_1 != A1) move(A1, arg_1);
  1334   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
       // call_VM (explicit last_java_sp) with two arguments in A1/A2;
       // asserts guard against clobbering an argument register.
  1337 void MacroAssembler::call_VM(Register oop_result,
  1338                              Register last_java_sp,
  1339                              address entry_point,
  1340                              Register arg_1,
  1341                              Register arg_2,
  1342                              bool check_exceptions) {
  1343   if (arg_1 != A1) move(A1, arg_1);
  1344   if (arg_2 != A2) move(A2, arg_2); assert(arg_2 != A1, "smashed argument");
  1345   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
       // call_VM (explicit last_java_sp) with three arguments in A1/A2/A3;
       // asserts guard against clobbering an argument register.
  1348 void MacroAssembler::call_VM(Register oop_result,
  1349                              Register last_java_sp,
  1350                              address entry_point,
  1351                              Register arg_1,
  1352                              Register arg_2,
  1353                              Register arg_3,
  1354                              bool check_exceptions) {
  1355   if (arg_1 != A1) move(A1, arg_1);
  1356   if (arg_2 != A2) move(A2, arg_2); assert(arg_2 != A1, "smashed argument");
  1357   if (arg_3 != A3) move(A3, arg_3); assert(arg_3 != A1 && arg_3 != A2, "smashed argument");
  1358   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
       // Core call-into-VM sequence: set the last Java frame, pass the current
       // JavaThread in A0, call entry_point, re-derive the thread register,
       // restore SP from last_Java_sp, clear the anchor, and then optionally
       // forward a pending exception and fetch/clear the oop result.
       // NOTE(review): the assert message at line 1384 is a copy-paste of the
       // one above — it should describe the <= 4 upper bound. The "edi" in the
       // stop() message at 1409 is an x86 leftover.
  1361 void MacroAssembler::call_VM_base(Register oop_result,
  1362                                   Register java_thread,
  1363                                   Register last_java_sp,
  1364                                   address  entry_point,
  1365                                   int      number_of_arguments,
  1366 				  bool     check_exceptions) {
  1368   address before_call_pc;
  1369   // determine java_thread register
  1370   if (!java_thread->is_valid()) {
  1371 #ifndef OPT_THREAD
  1372     java_thread = T2;
  1373     get_thread(java_thread);
  1374 #else
  1375     java_thread = TREG;
  1376 #endif
  1378   // determine last_java_sp register
  1379   if (!last_java_sp->is_valid()) {
  1380     last_java_sp = SP;
  1382   // debugging support
  1383   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  1384   assert(number_of_arguments <= 4   , "cannot have negative number of arguments");
  1385   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  1386   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
  1388   assert(last_java_sp != FP, "this code doesn't work for last_java_sp == fp, which currently can't portably work anyway since C2 doesn't save ebp");
  1390   // set last Java frame before call
  1391   before_call_pc = (address)pc();
  1392   set_last_Java_frame(java_thread, last_java_sp, FP, before_call_pc);
  1394   // do the call
  1395   move(A0, java_thread);
  1396   call(entry_point, relocInfo::runtime_call_type);
  1397   delayed()->nop();
  1399   // restore the thread (cannot use the pushed argument since arguments
  1400   // may be overwritten by C code generated by an optimizing compiler);
  1401   // however can use the register value directly if it is callee saved.
  1402 #ifndef OPT_THREAD
  1403   if (java_thread >=S0 && java_thread <=S7) {
  1404 #ifdef ASSERT
  1405     { Label L;
  1406       get_thread(AT);
  1407       beq(java_thread, AT, L);
  1408       delayed()->nop();
  1409       stop("MacroAssembler::call_VM_base: edi not callee saved?");
  1410       bind(L);
  1412 #endif
  1413   } else {
  1414     get_thread(java_thread);
  1416 #endif
  1418   // discard thread and arguments
  1419   ld_ptr(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset())); 
  1420   // reset last Java frame
  1421   reset_last_Java_frame(java_thread, false, true);
  1423   check_and_handle_popframe(java_thread);
  1424   check_and_handle_earlyret(java_thread);
  1425   if (check_exceptions) {
  1426     // check for pending exceptions (java_thread is set upon return)
  1427     Label L;
  1428 #ifdef _LP64
  1429     ld(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  1430 #else
  1431     lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  1432 #endif
  1433     beq(AT, R0, L);
  1434     delayed()->nop();
  1435     li(AT, before_call_pc);
  1436     push(AT);
  1437     jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  1438     delayed()->nop();
  1439     bind(L);
  1442   // get oop result if there is one and reset the value in the thread
  1443   if (oop_result->is_valid()) {
  1444 #ifdef _LP64
  1445     ld(oop_result, java_thread, in_bytes(JavaThread::vm_result_offset()));
  1446     sd(R0, java_thread, in_bytes(JavaThread::vm_result_offset()));
  1447 #else
  1448     lw(oop_result, java_thread, in_bytes(JavaThread::vm_result_offset()));
  1449     sw(R0, java_thread, in_bytes(JavaThread::vm_result_offset()));
  1450 #endif
  1451     verify_oop(oop_result);
       // Snapshot the pre-call SP in V0 (used as last_java_sp), reserve argument
       // space on 32-bit builds, align SP to StackAlignmentInBytes, then call
       // call_VM_base.
       // NOTE(review): 'daddi' (a 64-bit instruction) is used inside
       // '#ifndef _LP64'; the 32-bit build would be expected to use addi —
       // confirm whether the 32-bit path is ever built.
  1455 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  1457   move(V0, SP);
  1458   //we also reserve space for java_thread here
  1459 #ifndef _LP64
  1460   daddi(SP, SP, (1 + number_of_arguments) * (- wordSize));
  1461 #endif
  1462   move(AT, -(StackAlignmentInBytes));
  1463   andr(SP, SP, AT);
  1464   call_VM_base(oop_result, NOREG, V0, entry_point, number_of_arguments, check_exceptions);
       // call_VM_leaf with pre-loaded argument registers: delegate to the base.
  1468 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  1469   call_VM_leaf_base(entry_point, number_of_arguments);
       // call_VM_leaf with one argument in A0.
  1472 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  1473   if (arg_0 != A0) move(A0, arg_0);
  1474   call_VM_leaf(entry_point, 1);
       // call_VM_leaf with two arguments in A0/A1; asserts guard against an
       // argument register having been clobbered by an earlier move.
  1477 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  1478   if (arg_0 != A0) move(A0, arg_0);
  1479   if (arg_1 != A1) move(A1, arg_1); assert(arg_1 != A0, "smashed argument");
  1480   call_VM_leaf(entry_point, 2);
       // call_VM_leaf with three arguments in A0/A1/A2.
  1483 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  1484   if (arg_0 != A0) move(A0, arg_0);
  1485   if (arg_1 != A1) move(A1, arg_1); assert(arg_1 != A0, "smashed argument");
  1486   if (arg_2 != A2) move(A2, arg_2); assert(arg_2 != A0 && arg_2 != A1, "smashed argument");
  1487   call_VM_leaf(entry_point, 3);
       // super_call_VM_leaf with no arguments: call the base directly
       // (bypassing any subclass override of call_VM_leaf).
  1489 void MacroAssembler::super_call_VM_leaf(address entry_point) {
  1490 	MacroAssembler::call_VM_leaf_base(entry_point, 0);
       // super_call_VM_leaf with one argument in A0 (note: parameter is named
       // arg_1 but is the first C argument).
  1494 void MacroAssembler::super_call_VM_leaf(address entry_point,
  1495                                                    Register arg_1) {
  1496   if (arg_1 != A0) move(A0, arg_1);
  1497   MacroAssembler::call_VM_leaf_base(entry_point, 1);
       // super_call_VM_leaf with two arguments in A0/A1.
  1501 void MacroAssembler::super_call_VM_leaf(address entry_point,
  1502                                                    Register arg_1,
  1503                                                    Register arg_2) {
  1504   if (arg_1 != A0) move(A0, arg_1);
  1505   if (arg_2 != A1) move(A1, arg_2); assert(arg_2 != A0, "smashed argument");
  1506   MacroAssembler::call_VM_leaf_base(entry_point, 2);
       // super_call_VM_leaf with three arguments in A0/A1/A2.
  1508 void MacroAssembler::super_call_VM_leaf(address entry_point,
  1509                                                    Register arg_1,
  1510                                                    Register arg_2,
  1511                                                    Register arg_3) {
  1512   if (arg_1 != A0) move(A0, arg_1);
  1513   if (arg_2 != A1) move(A1, arg_2); assert(arg_2 != A0, "smashed argument");
  1514   if (arg_3 != A2) move(A2, arg_3); assert(arg_3 != A0 && arg_3 != A1, "smashed argument");
  1515   MacroAssembler::call_VM_leaf_base(entry_point, 3);
       // Early-return hook called from call_VM_base; no body is visible in this
       // listing (presumably an empty stub on MIPS — confirm against the repository).
  1518 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
       // Pop-frame hook called from call_VM_base; no body is visible in this
       // listing (presumably an empty stub on MIPS — confirm against the repository).
  1521 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
       // Emit an explicit null check when the eventual access offset is too large
       // for the implicit-null-check machinery: load one word from [reg+0] into
       // AT so a NULL reg faults immediately. Otherwise emit nothing and rely on
       // the OS fault from the later access.
  1524 void MacroAssembler::null_check(Register reg, int offset) {
  1525   if (needs_explicit_null_check(offset)) {
  1526     // provoke OS NULL exception if reg = NULL by
  1527     // accessing M[reg] w/o changing any (non-CC) registers
  1528     // NOTE: cmpl is plenty here to provoke a segv
  1529     lw(AT, reg, 0);
  1530 /* Jin
  1531     nop();	
  1532     nop();
  1533     nop();
  1534 */
  1535     // Note: should probably use testl(rax, Address(reg, 0));
  1536     //       may be shorter code (however, this version of
  1537     //       testl needs to be implemented first)
  1538   } else {
  1539     // nothing to do, (later) access of M[reg + offset]
  1540     // will provoke OS NULL exception if reg = NULL
       // Standard frame prologue: push RA and FP, then make FP the new frame base.
  1544 void MacroAssembler::enter() {
  1545   push2(RA, FP);
  1546   move(FP, SP);
       // Standard frame epilogue (inverse of enter()): restore SP to just above
       // the saved pair, then reload RA and FP from below the new SP.
  1549 void MacroAssembler::leave() {
  1550 #ifndef _LP64
  1551   //move(SP, FP);
  1552   //pop2(FP, RA);
  1553   addi(SP, FP, 2 * wordSize);
  1554   lw(RA, SP, - 1 * wordSize);
  1555   lw(FP, SP, - 2 * wordSize);
  1556 #else
  1557   daddi(SP, FP, 2 * wordSize);
  1558   ld(RA, SP, - 1 * wordSize);
  1559   ld(FP, SP, - 2 * wordSize);
  1560 #endif
  1562 /*
  1563 void MacroAssembler::os_breakpoint() {
  1564   // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  1565   // (e.g., MSVC can't call ps() otherwise)
  1566   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
  1568 */
       // Clear the JavaThread frame anchor: zero last_Java_sp (mandatory) and,
       // when requested, last_Java_fp / last_Java_pc. If no thread register is
       // supplied, obtain it (T1 via get_thread, or TREG under OPT_THREAD).
  1569 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
  1570   // determine java_thread register
  1571   if (!java_thread->is_valid()) {
  1572 #ifndef OPT_THREAD
  1573     java_thread = T1;
  1574     get_thread(java_thread);
  1575 #else
  1576     java_thread = TREG;
  1577 #endif
  1579   // we must set sp to zero to clear frame
  1580   st_ptr(R0, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  1581   // must clear fp, so that compiled frames are not confused; it is possible
  1582   // that we need it only for debugging
  1583   if(clear_fp)	
  1584     st_ptr(R0, java_thread, in_bytes(JavaThread::last_Java_fp_offset()));
  1586   if (clear_pc)
  1587     st_ptr(R0, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
       // Thread-implicit variant: clear the frame anchor fields on the current
       // thread (TREG, fetched with get_thread unless OPT_THREAD provides it).
  1590 void MacroAssembler::reset_last_Java_frame(bool clear_fp,
  1591                                            bool clear_pc) {
  1592   Register thread = TREG;
  1593 #ifndef OPT_THREAD
  1594   get_thread(thread);
  1595 #endif
  1596   // we must set sp to zero to clear frame
  1597   sd(R0, Address(thread, JavaThread::last_Java_sp_offset()));
  1598   // must clear fp, so that compiled frames are not confused; it is
  1599   // possible that we need it only for debugging
  1600   if (clear_fp) {
  1601     sd(R0, Address(thread, JavaThread::last_Java_fp_offset()));
  1604   if (clear_pc) {
  1605     sd(R0, Address(thread, JavaThread::last_Java_pc_offset()));
  1609 // Write serialization page so VM thread can do a pseudo remote membar.
  1610 // We use the current thread pointer to calculate a thread specific
  1611 // offset to write to within the page. This minimizes bus traffic
  1612 // due to cache line collision.
       // Compute a thread-specific offset (thread pointer shifted by the
       // serialize-page shift, masked to an int slot within the page) and store
       // to the memory-serialize page at that offset, so the VM thread's
       // page-protection flip acts as a pseudo remote membar.
  1613 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  1614   move(tmp, thread);
  1615   srl(tmp, tmp,os::get_serialize_page_shift_count());
  1616   move(AT, (os::vm_page_size() - sizeof(int))); 
  1617   andr(tmp, tmp,AT);
  1618   sw(tmp,Address(tmp, (intptr_t)os::get_memory_serialize_page()));
  1621 // Calls to C land
  1622 //
  1623 // When entering C land, the fp and sp of the last Java frame have to be recorded
  1624 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
  1625 // has to be reset to 0. This is required to allow proper stack traversal.
       // Record the last Java frame in the thread anchor: store fp (if valid)
       // and pc (if non-NULL, with an internal_pc relocation), then sp LAST —
       // writing last_Java_sp publishes the anchor. Invalid java_thread /
       // last_java_sp default to the current thread / SP.
  1626 void MacroAssembler::set_last_Java_frame(Register java_thread,
  1627                                          Register last_java_sp,
  1628                                          Register last_java_fp,
  1629                                          address  last_java_pc) {
  1630   // determine java_thread register
  1631   if (!java_thread->is_valid()) {
  1632 #ifndef OPT_THREAD
  1633     java_thread = T2;
  1634     get_thread(java_thread);
  1635 #else
  1636     java_thread = TREG;
  1637 #endif
  1639   // determine last_java_sp register
  1640   if (!last_java_sp->is_valid()) {
  1641     last_java_sp = SP;
  1644   // last_java_fp is optional
  1646   if (last_java_fp->is_valid()) {
  1647     st_ptr(last_java_fp, java_thread, in_bytes(JavaThread::last_Java_fp_offset()));
  1650   // last_java_pc is optional
  1652   if (last_java_pc != NULL) {
  1653     relocate(relocInfo::internal_pc_type);
  1654     li48(AT, (long)last_java_pc);
  1655     st_ptr(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
  1657   st_ptr(last_java_sp, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
       // Thread-implicit variant of set_last_Java_frame: same ordering (fp, pc,
       // then sp last) against the current thread obtained via TREG/get_thread.
       // The pc is stored through the frame-anchor offset and is NOT relocated
       // here, unlike the java_thread overload above.
  1660 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
  1661                                          Register last_java_fp,
  1662                                          address  last_java_pc) {
  1663   // determine last_java_sp register
  1664   if (!last_java_sp->is_valid()) {
  1665     last_java_sp = SP; 
  1668   Register thread = TREG;
  1669 #ifndef OPT_THREAD
  1670   get_thread(thread);
  1671 #endif
  1672   // last_java_fp is optional
  1673   if (last_java_fp->is_valid()) {
  1674     sd(last_java_fp, Address(thread, JavaThread::last_Java_fp_offset()));
  1677   // last_java_pc is optional
  1678   if (last_java_pc != NULL) {
  1679     Address java_pc(thread,
  1680                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
  1681     li(AT, (intptr_t)(last_java_pc));
  1682     sd(AT, java_pc);
  1685   sd(last_java_sp, Address(thread, JavaThread::last_Java_sp_offset()));
  1688 //////////////////////////////////////////////////////////////////////////////////
  1689 #ifndef SERIALGC
       // G1 SATB pre-write barrier. The entire body is commented-out x86 code
       // carried over from the x86 port, so on MIPS this currently emits
       // NOTHING — a no-op stub. TODO(review): implement the MIPS barrier or
       // guard G1 selection for this port.
  1691 void MacroAssembler::g1_write_barrier_pre(Register obj,
  1692 #ifndef _LP64
  1693                                           Register thread,
  1694 #endif
  1695                                           Register tmp,
  1696                                           Register tmp2,
  1697                                           bool tosca_live) {
  1698 /*  LP64_ONLY(Register thread = r15_thread;)
  1699   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
  1700                                        PtrQueue::byte_offset_of_active()));
  1702   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
  1703                                        PtrQueue::byte_offset_of_index()));
  1704   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
  1705                                        PtrQueue::byte_offset_of_buf()));
  1708   Label done;
  1709   Label runtime;
  1711   // if (!marking_in_progress) goto done;
  1712   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
  1713     cmpl(in_progress, 0);
  1714   } else {
  1715     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
  1716     cmpb(in_progress, 0);
  1718   jcc(Assembler::equal, done);
  1720   // if (x.f == NULL) goto done;
  1721   cmpptr(Address(obj, 0), NULL_WORD);
  1722   jcc(Assembler::equal, done);
  1724   // Can we store original value in the thread's buffer?
  1726   LP64_ONLY(movslq(tmp, index);)
  1727   movptr(tmp2, Address(obj, 0));
  1728 #ifdef _LP64
  1729   cmpq(tmp, 0);
  1730 #else
  1731   cmpl(index, 0);
  1732 #endif
  1733   jcc(Assembler::equal, runtime);
  1734 #ifdef _LP64
  1735   subq(tmp, wordSize);
  1736   movl(index, tmp);
  1737   addq(tmp, buffer);
  1738 #else
  1739   subl(index, wordSize);
  1740   movl(tmp, buffer);
  1741   addl(tmp, index);
  1742 #endif
  1743   movptr(Address(tmp, 0), tmp2);
  1744   jmp(done);
  1745   bind(runtime);
  1746   // save the live input values
  1747   if(tosca_live) push(rax);
  1748   push(obj);
  1749 #ifdef _LP64
  1750   movq(c_rarg0, Address(obj, 0));
  1751   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), c_rarg0, r15_thread);
  1752 #else
  1753   push(thread);
  1754   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), tmp2, thread);
  1755   pop(thread);
  1756 #endif
  1757   pop(obj);
  1758   if(tosca_live) pop(rax);
  1759   bind(done);
  1760 */
       // G1 post-write barrier (card marking + dirty-card enqueue). As with the
       // pre-barrier above, the whole body is commented-out x86 code, so this
       // currently emits NOTHING on MIPS — a no-op stub. TODO(review): implement
       // or guard G1 selection for this port.
  1763 void MacroAssembler::g1_write_barrier_post(Register store_addr,
  1764                                            Register new_val,
  1765 #ifndef _LP64
  1766                                            Register thread,
  1767 #endif
  1768                                            Register tmp,
  1769                                            Register tmp2) {
  1771   /*LP64_ONLY(Register thread = r15_thread;)
  1772   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
  1773                                        PtrQueue::byte_offset_of_index()));
  1774   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
  1775                                        PtrQueue::byte_offset_of_buf()));
  1776   BarrierSet* bs = Universe::heap()->barrier_set();
  1777   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  1778   Label done;
  1779   Label runtime;
  1781   // Does store cross heap regions?
  1783   movptr(tmp, store_addr);
  1784   xorptr(tmp, new_val);
  1785   shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
  1786   jcc(Assembler::equal, done);
  1788   // crosses regions, storing NULL?
  1790   cmpptr(new_val, (int32_t) NULL_WORD);
  1791   jcc(Assembler::equal, done);
  1793   // storing region crossing non-NULL, is card already dirty?
  1795   ExternalAddress cardtable((address) ct->byte_map_base);
  1796   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
  1797 #ifdef _LP64
  1798   const Register card_addr = tmp;
  1800   movq(card_addr, store_addr);
  1801   shrq(card_addr, CardTableModRefBS::card_shift);
  1803   lea(tmp2, cardtable);
  1805   // get the address of the card
  1806   addq(card_addr, tmp2);
  1807 #else
  1808   const Register card_index = tmp;
  1810   movl(card_index, store_addr);
  1811   shrl(card_index, CardTableModRefBS::card_shift);
  1813   Address index(noreg, card_index, Address::times_1);
  1814   const Register card_addr = tmp;
  1815   lea(card_addr, as_Address(ArrayAddress(cardtable, index)));
  1816 #endif
  1817   cmpb(Address(card_addr, 0), 0);
  1818   jcc(Assembler::equal, done);
  1820   // storing a region crossing, non-NULL oop, card is clean.
  1821   // dirty card and log.
  1823   movb(Address(card_addr, 0), 0);
  1825   cmpl(queue_index, 0);
  1826   jcc(Assembler::equal, runtime);
  1827   subl(queue_index, wordSize);
  1828   movptr(tmp2, buffer);
  1829 #ifdef _LP64
  1830   movslq(rscratch1, queue_index);
  1831   addq(tmp2, rscratch1);
  1832   movq(Address(tmp2, 0), card_addr);
  1833 #else
  1834   addl(tmp2, queue_index);
  1835   movl(Address(tmp2, 0), card_index);
  1836 #endif
  1837   jmp(done);
  1839   bind(runtime);
  1840   // save the live input values
  1841   push(store_addr);
  1842   push(new_val);
  1843 #ifdef _LP64
  1844   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
  1845 #else
  1846   push(thread);
  1847   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  1848   pop(thread);
  1849 #endif
  1850   pop(new_val);
  1851   pop(store_addr);
  1853   bind(done);
  1854 */
  1857 #endif // SERIALGC
  1858 //////////////////////////////////////////////////////////////////////////////////
// Card-table store check for the oop held in obj.
// NOTE: the content of register obj is destroyed afterwards
// (part 1 shifts it down to a card index in place).
void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj. The content of
  // register obj is destroyed afterwards.
  store_check_part_1(obj);
  store_check_part_2(obj);
}
// Store check overload taking the destination address; dst is unused here,
// only the oop value in obj matters for the card table (obj is destroyed).
void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}
// split the store check operation so that other instructions can be scheduled inbetween
// Part 1: turn the oop in obj into its card-table index (obj >>= card_shift).
// Destroys obj.
void MacroAssembler::store_check_part_1(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
#ifdef _LP64
  dsrl(obj, obj, CardTableModRefBS::card_shift);
#else
  shr(obj, CardTableModRefBS::card_shift);
#endif
}
// Part 2: dirty the card for the (already shifted) card index in obj.
// Computes AT = byte_map_base + card_index and stores a zero byte there.
// Clobbers AT.
void MacroAssembler::store_check_part_2(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  li(AT, (long)ct->byte_map_base);
#ifdef _LP64
  dadd(AT, AT, obj);
#else
  add(AT, AT, obj);
#endif
  sb(R0, AT, 0);  // dirty value is 0 here
}
  1898 /*
  1899 void MacroAssembler::subptr(Register dst, int32_t imm32) {
  1900   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
  1903 void MacroAssembler::subptr(Register dst, Register src) {
  1904   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
  1907 void MacroAssembler::test32(Register src1, AddressLiteral src2) {
  1908   // src2 must be rval
  1910   if (reachable(src2)) {
  1911     testl(src1, as_Address(src2));
  1912   } else {
  1913     lea(rscratch1, src2);
  1914     testl(src1, Address(rscratch1, 0));
  1918 // C++ bool manipulation
  1919 void MacroAssembler::testbool(Register dst) {
  1920   if(sizeof(bool) == 1)
  1921     testb(dst, 0xff);
  1922   else if(sizeof(bool) == 2) {
  1923     // testw implementation needed for two byte bools
  1924     ShouldNotReachHere();
  1925   } else if(sizeof(bool) == 4)
  1926     testl(dst, dst);
  1927   else
  1928     // unsupported
  1929     ShouldNotReachHere();
  1932 void MacroAssembler::testptr(Register dst, Register src) {
  1933   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
  1937 */
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Bump-allocates from the current thread's TLAB:
//   obj = tlab_top; end = obj + size; if (end > tlab_end) goto slow_case;
//   tlab_top = end;
// Size is con_size_in_bytes when var_size_in_bytes == NOREG, else the register.
// Clobbers t1, t2 and AT; branches far to slow_case on overflow.
void MacroAssembler::tlab_allocate(Register obj, Register var_size_in_bytes, int con_size_in_bytes,
                                   Register t1, Register t2, Label& slow_case) {
  assert_different_registers(obj, var_size_in_bytes, t1, t2, AT);

  Register end = t2;
#ifndef OPT_THREAD
  Register thread = t1;
  get_thread(thread);
#else
  Register thread = TREG;
#endif
  verify_tlab(t1, t2);//blows t1&t2

  // obj = current TLAB top; the new object starts there
  ld_ptr(obj, thread, in_bytes(JavaThread::tlab_top_offset()));

  if (var_size_in_bytes == NOREG) {
    // constant size must fit the 16-bit signed immediate of addi
    assert(is_simm16(con_size_in_bytes), "fixme by moving imm to a register first");
    addi(end, obj, con_size_in_bytes);
  } else {
    add(end, obj, var_size_in_bytes);
  }

  // slow case if the new end passes the TLAB end
  ld_ptr(AT, thread, in_bytes(JavaThread::tlab_end_offset()));
  sltu(AT, AT, end);          // AT = (tlab_end < end)
  bne_far(AT, R0, slow_case);
  delayed()->nop();

  // update the tlab top pointer
  st_ptr(end, thread, in_bytes(JavaThread::tlab_top_offset()));

  // recover var_size_in_bytes if necessary
  /*if (var_size_in_bytes == end) {
    sub(var_size_in_bytes, end, obj);
    }*/
  verify_tlab(t1, t2);
}
  1981 // Defines obj, preserves var_size_in_bytes
  1982 void MacroAssembler::eden_allocate(Register obj, Register var_size_in_bytes, int con_size_in_bytes,
  1983 		Register t1, Register t2, Label& slow_case) {
  1984   assert_different_registers(obj, var_size_in_bytes, t1, AT);
  1985   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) { //by yyq
  1986     // No allocation in the shared eden.
  1987     b_far(slow_case);
  1988     delayed()->nop();
  1989   } else {
  1991 #ifndef _LP64
  1992     Address heap_top(t1, Assembler::split_low((intptr_t)Universe::heap()->top_addr()));
  1993     lui(t1, split_high((intptr_t)Universe::heap()->top_addr()));
  1994 #else
  1995     Address heap_top(t1);
  1996     li(t1, (long)Universe::heap()->top_addr());
  1997 #endif
  1998     ld_ptr(obj, heap_top);
  2000     Register end = t2;
  2001     Label retry;
  2003     bind(retry);
  2004     if (var_size_in_bytes == NOREG) {
  2005     // i dont think we need move con_size_in_bytes to a register first.
  2006     // by yjl 8/17/2005
  2007       assert(is_simm16(con_size_in_bytes), "fixme by moving imm to a register first");
  2008       addi(end, obj, con_size_in_bytes);
  2009     } else {
  2010       add(end, obj, var_size_in_bytes);
  2012     // if end < obj then we wrapped around => object too long => slow case
  2013     sltu(AT, end, obj);
  2014     bne_far(AT, R0, slow_case);
  2015     delayed()->nop();
  2017     //lui(AT, split_high((int)Universe::heap()->end_addr()));
  2018     //lw(AT, AT, split_low((int)Universe::heap()->end_addr()));
  2019     li(AT, (long)Universe::heap()->end_addr());
  2020     sltu(AT, AT, end);
  2021     bne_far(AT, R0, slow_case);
  2022     delayed()->nop();
  2023     // Compare obj with the top addr, and if still equal, store the new top addr in
  2024     // end at the address of the top addr pointer. Sets ZF if was equal, and clears
  2025     // it otherwise. Use lock prefix for atomicity on MPs.
  2026     if (os::is_MP()) {
  2027     	///lock();
  2030     // if someone beat us on the allocation, try again, otherwise continue
  2031     cmpxchg(end, heap_top, obj);
  2032     beq_far(AT, R0, retry);    //by yyq
  2033     delayed()->nop();
// Refill the current thread's TLAB from eden (or discard it and fall through
// to try_eden / slow_case). Emits code that either:
//   - retains the TLAB (free space still above the refill-waste limit),
//     bumps the waste limit and branches to try_eden, or
//   - discards the TLAB: fills the leftover space with a filler int[] and
//     allocates a fresh TLAB from eden, then branches back to retry.
// Fixed register plan: top=T0, t1=T1, t2=T9, t3=T3, thread_reg=T8.
void MacroAssembler::tlab_refill(Label& retry, Label& try_eden, Label& slow_case) {
  Register top = T0;
  Register t1  = T1;
  /* Jin: tlab_refill() is called in
       [c1_Runtime1_mips.cpp] Runtime1::generate_code_for(new_type_array_id);
     In generate_code_for(), T2 has been assigned as a register(length), which is used
     after calling tlab_refill();
     Therefore, tlab_refill() should not use T2.

     Backtrace that exposed the T2 clobber:
     Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException
        at java.lang.System.arraycopy(Native Method)
        at java.util.Arrays.copyOf(Arrays.java:2799)	<-- alloc_array
        at sun.misc.Resource.getBytes(Resource.java:117)
        at java.net.URLClassLoader.defineClass(URLClassLoader.java:273)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:205)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:321)
   */
  Register t2  = T9;
  Register t3  = T3;
  Register thread_reg = T8;
  Label do_refill, discard_tlab;
  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) { //by yyq
    // No allocation in the shared eden.
    b(slow_case);
    delayed()->nop();
  }

  get_thread(thread_reg);

  ld_ptr(top, thread_reg, in_bytes(JavaThread::tlab_top_offset()));
  ld_ptr(t1, thread_reg, in_bytes(JavaThread::tlab_end_offset()));

  // calculate amount of free space (in heap words)
  sub(t1, t1, top);
  shr(t1, LogHeapWordSize);

  // Retain tlab and allocate object in shared space if
  // the amount free in the tlab is too large to discard.
  ld_ptr(t2, thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset()));
  slt(AT, t2, t1);
  beq(AT, R0, discard_tlab);
  delayed()->nop();

  // Retain
#ifndef _LP64
  move(AT, ThreadLocalAllocBuffer::refill_waste_limit_increment());
#else
  li(AT, ThreadLocalAllocBuffer::refill_waste_limit_increment());
#endif
  add(t2, t2, AT);
  st_ptr(t2, thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset()));

  if (TLABStats) {
    // increment number of slow_allocations
    lw(AT, thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset()));
    addiu(AT, AT, 1);
    sw(AT, thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset()));
  }
  b(try_eden);
  delayed()->nop();

  bind(discard_tlab);
  if (TLABStats) {
    // increment number of refills
    lw(AT, thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset()));
    addi(AT, AT, 1);
    sw(AT, thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset()));
    // accumulate wastage -- t1 is amount free in tlab
    lw(AT, thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset()));
    add(AT, AT, t1);
    sw(AT, thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset()));
  }

  // if tlab is currently allocated (top or end != null) then
  // fill [top, end + alignment_reserve) with array object
  beq(top, R0, do_refill);
  delayed()->nop();

  // set up the mark word of the filler array
  li(AT, (long)markOopDesc::prototype()->copy_set_hash(0x2));
  st_ptr(AT, top, oopDesc::mark_offset_in_bytes());

  // set the length to the remaining space (converted to jint elements)
  addi(t1, t1, - typeArrayOopDesc::header_size(T_INT));
  addi(t1, t1, ThreadLocalAllocBuffer::alignment_reserve());
  shl(t1, log2_intptr(HeapWordSize/sizeof(jint)));
  sw(t1, top, arrayOopDesc::length_offset_in_bytes());

  // set klass to intArrayKlass
#ifndef _LP64
  lui(AT, split_high((intptr_t)Universe::intArrayKlassObj_addr()));
  lw(t1, AT, split_low((intptr_t)Universe::intArrayKlassObj_addr()));
#else
  li(AT, (intptr_t)Universe::intArrayKlassObj_addr());
  ld_ptr(t1, AT, 0);
#endif
  //st_ptr(t1, top, oopDesc::klass_offset_in_bytes());
  store_klass(top, t1);

  // refill the tlab with an eden allocation
  bind(do_refill);
  ld_ptr(t1, thread_reg, in_bytes(JavaThread::tlab_size_offset()));
  shl(t1, LogHeapWordSize);
  // add object_size ??
  eden_allocate(top, t1, 0, t2, t3, slow_case);

  // Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
  if (UseTLAB) {
    Label ok;
    assert_different_registers(thread_reg, t1);
    ld_ptr(AT, thread_reg, in_bytes(JavaThread::tlab_size_offset()));
    shl(AT, LogHeapWordSize);
    beq(AT, t1, ok);
    delayed()->nop();
    stop("assert(t1 != tlab size)");
    should_not_reach_here();

    bind(ok);
  }
#endif
  // install the new TLAB [top, top + size - alignment_reserve)
  st_ptr(top, thread_reg, in_bytes(JavaThread::tlab_start_offset()));
  st_ptr(top, thread_reg, in_bytes(JavaThread::tlab_top_offset()));
  add(top, top, t1);
  addi(top, top, - ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  st_ptr(top, thread_reg, in_bytes(JavaThread::tlab_end_offset()));
  verify_tlab(t1, t2);
  b(retry);
  delayed()->nop();
}
  2174 static const double     pi_4 =  0.7853981633974483;
// The x86 version is too clumsy; I don't think we need that fuss here. Maybe I'm wrong -- FIXME.
  2177 // must get argument(a double) in F12/F13
  2178 //void MacroAssembler::trigfunc(char trig, bool preserve_cpu_regs, int num_fpu_regs_in_use) {
  2179 //We need to preseve the register which maybe modified during the Call @Jerome
  2180 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  2181 //save all modified register here
  2182 //	if (preserve_cpu_regs) {
  2183 //	}
  2184 //FIXME, in the disassembly of tirgfunc, only used V0,V1,T9, SP,RA,so we ony save V0,V1,T9 
  2185 	pushad();
  2186 //we should preserve the stack space before we call
  2187 	addi(SP, SP, -wordSize * 2);
  2188         switch (trig){
  2189 		case 's' :
  2190               		call( CAST_FROM_FN_PTR(address, SharedRuntime::dsin), relocInfo::runtime_call_type );
  2191 			delayed()->nop();
  2192 			break;
  2193 		case 'c':	
  2194 			call( CAST_FROM_FN_PTR(address, SharedRuntime::dcos), relocInfo::runtime_call_type );
  2195 			delayed()->nop();
  2196 			break;
  2197 		case 't':
  2198 			call( CAST_FROM_FN_PTR(address, SharedRuntime::dtan), relocInfo::runtime_call_type );
  2199 			delayed()->nop();
  2200 			break;
  2201 		default:assert (false, "bad intrinsic");
  2202 		break;
  2206 	addi(SP, SP, wordSize * 2);
  2207 	popad();
  2208 //	if (preserve_cpu_regs) {
  2209 //	}
  2211 /*
  2213 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
  2214   ucomisd(dst, as_Address(src));
  2217 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
  2218   ucomiss(dst, as_Address(src));
  2221 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
  2222   if (reachable(src)) {
  2223     xorpd(dst, as_Address(src));
  2224   } else {
  2225     lea(rscratch1, src);
  2226     xorpd(dst, Address(rscratch1, 0));
  2230 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
  2231   if (reachable(src)) {
  2232     xorps(dst, as_Address(src));
  2233   } else {
  2234     lea(rscratch1, src);
  2235     xorps(dst, Address(rscratch1, 0));
  2238  */
#ifdef _LP64
// Load an arbitrary 64-bit immediate into rd, choosing the shortest of
// four sequences depending on the value's range (1-2, 3, 4, or 6 insns).
void MacroAssembler::li(Register rd, long imm) {
  if (imm <= max_jint && imm >= min_jint) {
    // fits a sign-extended 32-bit immediate
    li32(rd, (int)imm);
  } else if (julong(imm) <= 0xFFFFFFFF) {
    // unsigned 32-bit value with bit 31 set:
    // lui sign-extends, so we can't use that.
    assert_not_delayed();
    ori(rd, R0, julong(imm) >> 16);
    dsll(rd, rd, 16);
    ori(rd, rd, split_low(imm));
  } else if ((imm > 0) && is_simm16(imm >> 32)) {
    /* A 48-bit address */
    li48(rd, imm);
  } else {
    li64(rd, imm);
  }
}
#else
// 32-bit VM: long is 32 bits wide, delegate directly.
void MacroAssembler::li(Register rd, long imm) {
  li32(rd, (int)imm);
}
#endif
  2265 void MacroAssembler::li32(Register reg, int imm) {
  2266   if (is_simm16(imm)) {
  2267     /* Jin: for imm < 0, we should use addi instead of addiu.
  2269      *  java.lang.StringCoding$StringDecoder.decode(jobject, jint, jint)
  2271      *  78 move [int:-1|I] [a0|I]
  2272      *    : daddi a0, zero, 0xffffffff  (correct)
  2273      *    : daddiu a0, zero, 0xffffffff (incorrect)
  2274      */
  2275     if (imm >= 0)
  2276       addiu(reg, R0, imm);
  2277     else
  2278       addi(reg, R0, imm);
  2279   } else {
  2280     lui(reg, split_low(imm >> 16));
  2281     if (split_low(imm))
  2282       ori(reg, reg, split_low(imm));
  2286 #ifdef _LP64
// Load a full 64-bit immediate with the fixed 6-instruction sequence
// lui/ori/dsll/ori/dsll/ori, 16 bits at a time from the top down.
void MacroAssembler::li64(Register rd, long imm) {
  assert_not_delayed();
  lui(rd, imm >> 48);
  ori(rd, rd, split_low(imm >> 32));
  dsll(rd, rd, 16);
  ori(rd, rd, split_low(imm >> 16));
  dsll(rd, rd, 16);
  ori(rd, rd, split_low(imm));
}
// Load a 48-bit value (bits 47..32 must fit a signed 16-bit immediate,
// asserted below) with the fixed 4-instruction sequence lui/ori/dsll/ori.
void MacroAssembler::li48(Register rd, long imm) {
  assert(is_simm16(imm >> 32), "Not a 48-bit address");
  lui(rd, imm >> 32);
  ori(rd, rd, split_low(imm >> 16));
  dsll(rd, rd, 16);
  ori(rd, rd, split_low(imm));
}
  2304 #endif
// NOTE: i dont push eax as i486.
// the x86 save eax for it use eax as the jump register
// Emit a call to the verify_oop subroutine for the oop in reg.
// Builds a "verify_oop: <reg>: <msg>" string (A0) and passes the oop in A1;
// the subroutine entry is loaded indirectly through its address cell so the
// stub can be generated after this code. No-op unless -XX:+VerifyOops.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  /*
     if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  char* b = new char[strlen(s) + 50];
  sprintf(b, "verify_oop: %s: %s", reg->name(), s);
  push(rax);                          // save rax,
  push(reg);                          // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
   */
  if (!VerifyOops) return;
  const char * b = NULL;
  stringStream ss;
  ss.print("verify_oop: %s: %s", reg->name(), s);
  b = code_string(ss.as_string());
#ifdef _LP64
  // save/restore everything around the call
  pushad();
  move(A1, reg);
  li(A0, (long)b);
  li(AT, (long)StubRoutines::verify_oop_subroutine_entry_address());
  ld(T9, AT, 0);   // indirect: load the stub entry at run time
  jalr(T9);
  delayed()->nop();
  popad();
#else
  // Pass register number to verify_oop_subroutine.
  // Manually spill the registers the call sequence clobbers; note the loads
  // below mirror the stores (offset k stored at -(k+1)*wordSize before the
  // SP adjust equals offset (6-k)*wordSize after it).
  sw(T0, SP, - wordSize);
  sw(T1, SP, - 2*wordSize);
  sw(RA, SP, - 3*wordSize);
  sw(A0, SP ,- 4*wordSize);
  sw(A1, SP ,- 5*wordSize);
  sw(AT, SP ,- 6*wordSize);
  sw(T9, SP ,- 7*wordSize);
  addiu(SP, SP, - 7 * wordSize);
  move(A1, reg);
  li(A0, (long)b);
  // call indirectly to solve generation ordering problem
  li(AT, (long)StubRoutines::verify_oop_subroutine_entry_address());
  lw(T9, AT, 0);
  jalr(T9);
  delayed()->nop();
  lw(T0, SP, 6* wordSize);
  lw(T1, SP, 5* wordSize);
  lw(RA, SP, 4* wordSize);
  lw(A0, SP, 3* wordSize);
  lw(A1, SP, 2* wordSize);
  lw(AT, SP, 1* wordSize);
  lw(T9, SP, 0* wordSize);
  addiu(SP, SP, 7 * wordSize);
#endif
}
// Like verify_oop(Register), but the oop is loaded from a memory address.
// The oop is loaded into A1 *before* SP is moved because addr may be
// SP-relative. Emits a single nop when VerifyOops is off (keeps code size
// stable for patching call sites).
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) {
    nop();
    return;
  }
  // Pass register number to verify_oop_subroutine
  const char * b = NULL;
  stringStream ss;
  ss.print("verify_oop_addr: %s",  s);
  b = code_string(ss.as_string());

  st_ptr(T0, SP, - wordSize);
  st_ptr(T1, SP, - 2*wordSize);
  st_ptr(RA, SP, - 3*wordSize);
  st_ptr(A0, SP, - 4*wordSize);
  st_ptr(A1, SP, - 5*wordSize);
  st_ptr(AT, SP, - 6*wordSize);
  st_ptr(T9, SP, - 7*wordSize);
  ld_ptr(A1, addr);   // addr may use SP, so load from it before change SP
  addiu(SP, SP, - 7 * wordSize);

  li(A0, (long)b);
  // call indirectly to solve generation ordering problem
  li(AT, (long)StubRoutines::verify_oop_subroutine_entry_address());
  ld_ptr(T9, AT, 0);
  jalr(T9);
  delayed()->nop();
  ld_ptr(T0, SP, 6* wordSize);
  ld_ptr(T1, SP, 5* wordSize);
  ld_ptr(RA, SP, 4* wordSize);
  ld_ptr(A0, SP, 3* wordSize);
  ld_ptr(A1, SP, 2* wordSize);
  ld_ptr(AT, SP, 1* wordSize);
  ld_ptr(T9, SP, 0* wordSize);
  addiu(SP, SP, 7 * wordSize);
}
// used registers :  T0, T1
// Body of the shared oop-verification subroutine called by verify_oop /
// verify_oop_addr. Checks the oop's address pattern and that its klass is
// non-NULL; on failure calls MacroAssembler::debug with the message in A0.
void MacroAssembler::verify_oop_subroutine() {
  // RA: return address
  // A0: char* error message
  // A1: oop   object to verify

  Label exit, error;
  // increment the StubRoutines verify_oop counter (32-bit slot)
  li(T0, (long)StubRoutines::verify_oop_count_addr());
  lw(AT, T0, 0);
#ifdef _LP64
//FIXME, aoqi: rewrite addi, addu, etc in 64bits mode.
  daddi(AT, AT, 1);
#else
  addi(AT, AT, 1);
#endif
  sw(AT, T0, 0);

  // make sure object is 'reasonable'
  beq(A1, R0, exit);         // if obj is NULL it is ok
  delayed()->nop();

  // Check if the oop is in the right area of memory:
  // (oop & oop_mask) must equal oop_bits.
  //const int oop_mask = Universe::verify_oop_mask();
  //const int oop_bits = Universe::verify_oop_bits();
  const uintptr_t oop_mask = Universe::verify_oop_mask();
  const uintptr_t oop_bits = Universe::verify_oop_bits();
  li(AT, oop_mask);
  andr(T0, A1, AT);
  li(AT, oop_bits);
  bne(T0, AT, error);
  delayed()->nop();

  // make sure klass is 'reasonable'
  // reinit_heapbase: needed before load_klass with compressed oops
  reinit_heapbase();
  load_klass(T0, A1);
  beq(T0, R0, error);                        // if klass is NULL it is broken
  delayed()->nop();
  #if 0
  //FIXME:wuhui.  (disabled klass address-pattern checks, kept for reference)
  // Check if the klass is in the right area of memory
  //const int klass_mask = Universe::verify_klass_mask();
  //const int klass_bits = Universe::verify_klass_bits();
  const uintptr_t klass_mask = Universe::verify_klass_mask();
  const uintptr_t klass_bits = Universe::verify_klass_bits();

  li(AT, klass_mask);
  andr(T1, T0, AT);
  li(AT, klass_bits);
  bne(T1, AT, error);
  delayed()->nop();
  // make sure klass' klass is 'reasonable'
  load_klass(T0, T0);
  beq(T0, R0, error);  // if klass' klass is NULL it is broken
  delayed()->nop();

  li(AT, klass_mask);
  andr(T1, T0, AT);
  li(AT, klass_bits);
  bne(T1, AT, error);
  delayed()->nop();     // if klass not in right area of memory it is broken too.
#endif
  // return if everything seems ok
  bind(exit);

  jr(RA);
  delayed()->nop();

  // handle errors: save state and call the C debug routine
  bind(error);
  pushad();
#ifndef _LP64
  addi(SP, SP, (-1) * wordSize);
#endif
  call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
  delayed()->nop();
#ifndef _LP64
  addiu(SP, SP, 1 * wordSize);
#endif
  popad();
  jr(RA);
  delayed()->nop();
}
// Debug-only sanity check of the current thread's TLAB invariants
// (start <= top <= end). Emits nothing unless ASSERT build with
// -XX:+UseTLAB -XX:+VerifyOops. Blows t1, t2 and AT.
void MacroAssembler::verify_tlab(Register t1, Register t2) {
#ifdef ASSERT
  assert_different_registers(t1, t2, AT);
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    get_thread(t1);

    // check top >= start
    ld_ptr(t2, t1, in_bytes(JavaThread::tlab_top_offset()));
    ld_ptr(AT, t1, in_bytes(JavaThread::tlab_start_offset()));
    sltu(AT, t2, AT);       // AT = (top < start)
    beq(AT, R0, next);
    delayed()->nop();

    stop("assert(top >= start)");

    // check top <= end
    bind(next);
    ld_ptr(AT, t1, in_bytes(JavaThread::tlab_end_offset()));
    sltu(AT, AT, t2);       // AT = (end < top)
    beq(AT, R0, ok);
    delayed()->nop();

    stop("assert(top <= end)");

    bind(ok);

    /* x86 original, kept for reference:
       Label next, ok;
       Register t1 = rsi;
       Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

       push(t1);
       NOT_LP64(push(thread_reg));
       NOT_LP64(get_thread(thread_reg));

       movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
       cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
       jcc(Assembler::aboveEqual, next);
       stop("assert(top >= start)");
       should_not_reach_here();

       bind(next);
       movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
       cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
       jcc(Assembler::aboveEqual, ok);
       stop("assert(top <= end)");
       should_not_reach_here();

       bind(ok);
       NOT_LP64(pop(thread_reg));
       pop(t1);
     */
  }
#endif
}
// If *delayed_value_addr is already known (non-zero), return it (+offset)
// as a constant; otherwise return the register form so the value can be
// resolved later.
 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                       Register tmp,
                                                       int offset) {
   intptr_t value = *delayed_value_addr;
   if (value != 0)
   return RegisterOrConstant(value + offset);
   AddressLiteral a(delayed_value_addr);
   // load indirectly to solve generation ordering problem
   //movptr(tmp, ExternalAddress((address) delayed_value_addr));
   //ld(tmp, a);
   // NOTE(review): the indirect load above is commented out, so on this path
   // 'tmp' is returned (and possibly offset-adjusted) without ever being
   // written here — looks like a latent bug; confirm against callers.
  /* #ifdef ASSERT
   { Label L;
     testptr(tmp, tmp);
     if (WizardMode) {
            jcc(Assembler::notZero, L);
            char* buf = new char[40];
            sprintf(buf, "DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
            STOP(buf);
                      } else {
            jccb(Assembler::notZero, L);
            hlt();
     }
     bind(L);
   }
   #endif*/
   if (offset != 0)
     daddi(tmp,tmp, offset);

   return RegisterOrConstant(tmp);
 }
// Byte-swap a halfword in reg, producing a sign-extended result
// (jshort byte reversal). Clobbers AT.
// NOTE(review): AT keeps reg's bits above bit 8 after the srl; this assumes
// the caller passes a properly sign-extended halfword — confirm call sites.
void MacroAssembler::hswap(Register reg) {
  //andi(reg, reg, 0xffff);
  srl(AT, reg, 8);     // AT: old high byte now in the low byte position
  sll(reg, reg, 24);   // old low byte moved to bits 31..24
  sra(reg, reg, 16);   // arithmetic shift back => sign-extended
  orr(reg, reg, AT);   // merge the two bytes
}
// Byte-swap a halfword in reg, producing a zero-extended result
// (jchar byte reversal). Clobbers AT.
void MacroAssembler::huswap(Register reg) {
#ifdef _LP64
  dsrl(AT, reg, 8);    // AT: old high byte in low position
  dsll(reg, reg, 24);  // old low byte up to bits 31..24
  dsrl(reg, reg, 16);  // back down, zero-filled
  orr(reg, reg, AT);
  andi(reg, reg, 0xffff);  // explicit zero-extension of the result
#else
  //andi(reg, reg, 0xffff);
  // NOTE(review): unlike the LP64 path there is no final andi here, so the
  // 32-bit result is only clean if reg held a zero-extended halfword on entry.
  srl(AT, reg, 8);
  sll(reg, reg, 24);
  srl(reg, reg, 16);
  orr(reg, reg, AT);
#endif
}
// something funny to do this will only one more register AT
// by yjl 6/29/2005
// Reverse the 4 bytes of a 32-bit word using only AT as scratch:
// bytes (1 2 3 4) become (4 3 2 1) via an xor-exchange of the two
// middle-adjacent bytes after an initial rotate.
void MacroAssembler::swap(Register reg) {
	srl(AT, reg, 8);
	sll(reg, reg, 24);
	orr(reg, reg, AT);
	//reg : 4 1 2 3
	srl(AT, AT, 16);
	xorr(AT, AT, reg);
	andi(AT, AT, 0xff);
	//AT : 0 0 0 1^3
	xorr(reg, reg, AT);
	//reg : 4 1 2 1
	sll(AT, AT, 16);
	xorr(reg, reg, AT);
	//reg : 4 3 2 1
}
  2620 #ifdef _LP64
  2622 /* do 32-bit CAS using MIPS64 lld/scd
  2624   Jin: cas_int should only compare 32-bits of the memory value.
  2625        However, lld/scd will do 64-bit operation, which violates the intention of cas_int.
       To simulate a 32-bit atomic operation, the value loaded with LLD should be split into
       two halves, and only the low 32 bits are compared. If they are equal, the low 32 bits of newval,
       plus the high 32 bits of the memory value, are stored together with SCD.
  2630 Example:
  2632       double d = 3.1415926;
  2633       System.err.println("hello" + d);
  2635   sun.misc.FloatingDecimal$1.<init>()
  2637    `- java.util.concurrent.atomic.AtomicInteger::compareAndSet()
  2639   38 cas_int [a7a7|J] [a0|I] [a6|I]   
  2640 // a0: 0xffffffffe8ea9f63 pc: 0x55647f3354
  2641 // a6: 0x4ab325aa
  2643 again:
  2644    0x00000055647f3c5c: lld at, 0x0(a7)                          ; 64-bit load, "0xe8ea9f63"
  2646    0x00000055647f3c60: sll t9, at, 0                            ; t9: low-32 bits (sign extended)
  2647    0x00000055647f3c64: dsrl32 t8, at, 0                         ; t8: high-32 bits
  2648    0x00000055647f3c68: dsll32 t8, t8, 0
  2649    0x00000055647f3c6c: bne t9, a0, 0x00000055647f3c9c           ; goto nequal
  2650    0x00000055647f3c70: sll zero, zero, 0
  2652    0x00000055647f3c74: ori v1, zero, 0xffffffff                 ; v1: low-32 bits of newval (sign unextended)
  2653    0x00000055647f3c78: dsll v1, v1, 16                          ; v1 = a6 & 0xFFFFFFFF;
  2654    0x00000055647f3c7c: ori v1, v1, 0xffffffff
  2655    0x00000055647f3c80: and v1, a6, v1 
  2656    0x00000055647f3c84: or at, t8, v1 
  2657    0x00000055647f3c88: scd at, 0x0(a7)
  2658    0x00000055647f3c8c: beq at, zero, 0x00000055647f3c5c         ; goto again
  2659    0x00000055647f3c90: sll zero, zero, 0
  2660    0x00000055647f3c94: beq zero, zero, 0x00000055647f45ac       ; goto done
  2661    0x00000055647f3c98: sll zero, zero, 0
  2662 nequal:
  2663    0x00000055647f45a4: dadd a0, t9, zero
  2664    0x00000055647f45a8: dadd at, zero, zero
  2665 done:
  2666 */
// 32-bit atomic compare-and-exchange:
//   if (*dest == c_reg) { *dest = x_reg; AT = 1; }
//   else                { c_reg = *dest; AT = 0; }
// Implemented with ll/sc (retries on spurious sc failure); AT acts as the
// success flag for callers. The #if 0 block is the older lld/scd-based
// emulation kept for reference only.
void MacroAssembler::cmpxchg32(Register x_reg, Address dest, Register c_reg) {
#if 0
  Label done, again, nequal;

  bind(again);

  sync();
  lld(AT, dest);

  /* T9:  32 bits, sign extended
   * V1: low 32 bits, sign unextended
   * T8: high 32 bits (may be another variables's space)
   */
  sll(T9, AT, 0);	// Use 32-bit sll to extend bit 31
  dsrl32(T8, AT, 0);
  dsll32(T8, T8, 0);

  bne(T9, c_reg, nequal);
  delayed()->nop();

  ori(V1, R0, 0xFFFF);
  dsll(V1, V1, 16);
  ori(V1, V1, 0xFFFF);
  andr(V1, x_reg, V1);
  orr(AT, T8, V1);
  scd(AT, dest);
  beq(AT, R0, again);
  delayed()->nop();
  b(done);
  delayed()->nop();

  // not xchged
  bind(nequal);
  move(c_reg, T9);
  move(AT, R0);

  bind(done);
#else

  /* 2012/11/11 Jin: MIPS64 can use ll/sc for 32-bit atomic memory access */
  Label done, again, nequal;

  bind(again);

  sync();
  ll(AT, dest);               // AT = *dest (linked)
  bne(AT, c_reg, nequal);
  delayed()->nop();

  move(AT, x_reg);
  sc(AT, dest);               // AT = 1 on success, 0 on lost reservation
  beq(AT, R0, again);
  delayed()->nop();
  b(done);
  delayed()->nop();

  // not xchged: return the observed value in c_reg, AT = 0
  bind(nequal);
  sync();
  move(c_reg, AT);
  move(AT, R0);

  bind(done);
#endif
}
  2732 #endif	// cmpxchg32
// Pointer-width atomic compare-and-exchange (lld/scd on 64-bit, ll/sc on
// 32-bit):
//   if (*dest == c_reg) { *dest = x_reg; AT = 1; }
//   else                { c_reg = *dest; AT = 0; }
// AT is the success flag; the loop retries on spurious sc failure.
void MacroAssembler::cmpxchg(Register x_reg, Address dest, Register c_reg) {
  Label done, again, nequal;

  bind(again);
#ifdef _LP64
  sync();
  lld(AT, dest);
#else
  sync();
  ll(AT, dest);
#endif
  bne(AT, c_reg, nequal);
  delayed()->nop();

  move(AT, x_reg);
#ifdef _LP64
  scd(AT, dest);
#else
  sc(AT, dest);
#endif
  beq(AT, R0, again);        // lost the reservation: retry
  delayed()->nop();
  b(done);
  delayed()->nop();

  // not xchged: return the observed value in c_reg, AT = 0
  bind(nequal);
  sync();
  move(c_reg, AT);
  move(AT, R0);

  bind(done);
}
// 64-bit compare-and-exchange of a jlong passed as two 32-bit halves.
// Packs {Hi,Lo} pairs into single 64-bit values in the Lo registers, then
// does an lld/scd loop. On failure c_regLo receives the observed value and
// AT = 0; on success AT = 1.
// WARNING: destroys x_regHi, c_regHi and repurposes x_regLo/c_regLo as the
// packed 64-bit operands (the dsll32/dsrl32 sequences below overwrite them).
void MacroAssembler::cmpxchg8(Register x_regLo, Register x_regHi, Address dest, Register c_regLo, Register c_regHi) {
	Label done, again, nequal;

	// x_reg = (x_regHi << 32) | (unsigned)x_regLo
	Register x_reg = x_regLo;
	dsll32(x_regHi, x_regHi, 0);
	dsll32(x_regLo, x_regLo, 0);
	dsrl32(x_regLo, x_regLo, 0);
	orr(x_reg, x_regLo, x_regHi);

	// c_reg = (c_regHi << 32) | (unsigned)c_regLo
	Register c_reg = c_regLo;
	dsll32(c_regHi, c_regHi, 0);
	dsll32(c_regLo, c_regLo, 0);
	dsrl32(c_regLo, c_regLo, 0);
	orr(c_reg, c_regLo, c_regHi);

	bind(again);

	sync();
	lld(AT, dest);
	bne(AT, c_reg, nequal);
	delayed()->nop();

	//move(AT, x_reg);
	dadd(AT, x_reg, R0);
	scd(AT, dest);
	beq(AT, R0, again);     // lost the reservation: retry
	delayed()->nop();
	b(done);
	delayed()->nop();

	// not xchged: return the observed value, AT = 0
	bind(nequal);
	sync();
	//move(c_reg, AT);
	//move(AT, R0);
	dadd(c_reg, AT, R0);
	dadd(AT, R0, R0);
	bind(done);
}
// Single-precision floating-point remainder: fd = fs - trunc(fs / ft) * ft.
// tmp is a scratch FPU register and is clobbered; it must not alias fs or ft
// (fd may alias any of the others, since it is written last).
void MacroAssembler::rem_s(FloatRegister fd, FloatRegister fs, FloatRegister ft, FloatRegister tmp) {
  assert_different_registers(tmp, fs, ft);
  div_s(tmp, fs, ft);     // tmp = fs / ft
  trunc_l_s(tmp, tmp);    // truncate the quotient toward zero (to 64-bit integer)
  cvt_s_l(tmp, tmp);      // convert back to single precision
  mul_s(tmp, tmp, ft);    // tmp = trunc(fs / ft) * ft
  sub_s(fd, fs, tmp);     // fd = fs - tmp
}
// Double-precision floating-point remainder: fd = fs - trunc(fs / ft) * ft.
// tmp is a scratch FPU register and is clobbered; it must not alias fs or ft
// (fd may alias any of the others, since it is written last).
void MacroAssembler::rem_d(FloatRegister fd, FloatRegister fs, FloatRegister ft, FloatRegister tmp) {
  assert_different_registers(tmp, fs, ft);
  div_d(tmp, fs, ft);     // tmp = fs / ft
  trunc_l_d(tmp, tmp);    // truncate the quotient toward zero (to 64-bit integer)
  cvt_d_l(tmp, tmp);      // convert back to double precision
  mul_d(tmp, tmp, ft);    // tmp = trunc(fs / ft) * ft
  sub_d(fd, fs, tmp);     // fd = fs - tmp
}
  2828 // Fast_Lock and Fast_Unlock used by C2
  2830 // Because the transitions from emitted code to the runtime
  2831 // monitorenter/exit helper stubs are so slow it's critical that
  2832 // we inline both the stack-locking fast-path and the inflated fast path.
  2833 //
  2834 // See also: cmpFastLock and cmpFastUnlock.
  2835 //
  2836 // What follows is a specialized inline transliteration of the code
  2837 // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
  2838 // another option would be to emit TrySlowEnter and TrySlowExit methods
  2839 // at startup-time.  These methods would accept arguments as
  2840 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
  2841 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
  2842 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
  2843 // In practice, however, the # of lock sites is bounded and is usually small.
  2844 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
  2845 // if the processor uses simple bimodal branch predictors keyed by EIP
  2846 // Since the helper routines would be called from multiple synchronization
  2847 // sites.
  2848 //
  2849 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
  2850 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
  2851 // to those specialized methods.  That'd give us a mostly platform-independent
  2852 // implementation that the JITs could optimize and inline at their pleasure.
   2853 // Done correctly, the only time we'd need to cross to native code would be
  2854 // to park() or unpark() threads.  We'd also need a few more unsafe operators
  2855 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
  2856 // (b) explicit barriers or fence operations.
  2857 //
  2858 // TODO:
  2859 //
  2860 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
  2861 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
  2862 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
  2863 //    the lock operators would typically be faster than reifying Self.
  2864 //
  2865 // *  Ideally I'd define the primitives as:
  2866 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
  2867 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
  2868 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
  2869 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
  2870 //    Furthermore the register assignments are overconstrained, possibly resulting in
  2871 //    sub-optimal code near the synchronization site.
  2872 //
  2873 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
  2874 //    Alternately, use a better sp-proximity test.
  2875 //
  2876 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
  2877 //    Either one is sufficient to uniquely identify a thread.
  2878 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
  2879 //
  2880 // *  Intrinsify notify() and notifyAll() for the common cases where the
  2881 //    object is locked by the calling thread but the waitlist is empty.
  2882 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
  2883 //
  2884 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
  2885 //    But beware of excessive branch density on AMD Opterons.
  2886 //
  2887 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
  2888 //    or failure of the fast-path.  If the fast-path fails then we pass
  2889 //    control to the slow-path, typically in C.  In Fast_Lock and
  2890 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
  2891 //    will emit a conditional branch immediately after the node.
  2892 //    So we have branches to branches and lots of ICC.ZF games.
  2893 //    Instead, it might be better to have C2 pass a "FailureLabel"
  2894 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
  2895 //    will drop through the node.  ICC.ZF is undefined at exit.
  2896 //    In the case of failure, the node will branch directly to the
  2897 //    FailureLabel
// Emit the C2 fast-path monitor-enter sequence (see cmpFastLock).
//
// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// tmp: scratch -- KILLED (forced to T8 below)
// scr: scratch -- KILLED (forced to S7 below)
//
// Result protocol (MIPS analogue of the x86 ICC.ZF convention): on exit
// AT != 0 means the fast path acquired the lock; AT == 0 means the caller
// must branch to the slow path.
void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg) {
  // The passed-in scratch registers are overridden with fixed ones.
  tmpReg = T8;
  scrReg = S7;

  // Ensure the register assignments are disjoint
  guarantee (objReg != boxReg, "") ;
  guarantee (objReg != tmpReg, "") ;
  guarantee (objReg != scrReg, "") ;
  guarantee (boxReg != tmpReg, "") ;
  guarantee (boxReg != scrReg, "") ;

  block_comment("FastLock");
  /*
     __ move(AT, 0x0);
     return;
     */
  if (PrintBiasedLockingStatistics) {
    // Bump the global monitor-enter counter; tmpReg is saved/restored
    // because atomic_inc32 uses it as a scratch register.
    push(tmpReg);
    atomic_inc32((address)BiasedLocking::total_entry_count_addr(), 1, AT, tmpReg);
    pop(tmpReg);
  }

  if (EmitSync & 1) {
    // set box->dhw = unused_mark (3)
    // Force all sync thru slow-path: slow_enter() and slow_exit()
    move (AT, (int32_t)intptr_t(markOopDesc::unused_mark()));
    sd(AT, Address(boxReg, 0));
    move (AT, (int32_t)0) ;   // Eflags.ZF = 0 (AT == 0 -> slow path)
  } else
    if (EmitSync & 2) {
      Label DONE_LABEL ;
      if (UseBiasedLocking) {
        // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
        biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL);
      }

      ld(tmpReg, Address(objReg, 0)) ;          // fetch markword
      ori(tmpReg, tmpReg, 0x1);                 // set the "unlocked" bit
      sd(tmpReg, Address(boxReg, 0));           // Anticipate successful CAS

      cmpxchg(boxReg, Address(objReg, 0), tmpReg);          // Updates tmpReg
      bne(AT, R0, DONE_LABEL);                  // CAS succeeded -> locked, AT == 1
      delayed()->nop();

      // Recursive locking: the markword holds a stack address; do the
      // sp-proximity test and store the result (0 == recursive) in the box.
      dsubu(tmpReg, tmpReg, SP);
      li(AT, (7 - os::vm_page_size() ));
      andr(tmpReg, tmpReg, AT);
      sd(tmpReg, Address(boxReg, 0));
      bind(DONE_LABEL) ;
    } else {
      // Possible cases that we'll encounter in fast_lock
      // ------------------------------------------------
      // * Inflated
      //    -- unlocked
      //    -- Locked
      //       = by self
      //       = by other
      // * biased
      //    -- by Self
      //    -- by other
      // * neutral
      // * stack-locked
      //    -- by self
      //       = sp-proximity test hits
      //       = sp-proximity test generates false-negative
      //    -- by other
      //
      Label IsInflated, DONE_LABEL, PopDone ;   // NOTE(review): PopDone appears unused in this port

      // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
      // order to reduce the number of conditional branches in the most common cases.
      // Beware -- there's a subtle invariant that fetch of the markword
      // at [FETCH], below, will never observe a biased encoding (*101b).
      // If this invariant is not held we risk exclusion (safety) failure.
      if (UseBiasedLocking && !UseOptoBiasInlining) {
        biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL);
      }

      ld(tmpReg, Address(objReg, 0)) ;         //Fetch the markword of the object.
      andi(AT, tmpReg, 0x02);                  //If AT == 0x02 ==> the object is inflated, will not use the fast lock method.
      bne(AT, R0, IsInflated);                      // Inflated v (Stack-locked or neutral)
      delayed()->nop();

      // Attempt stack-locking ...
      ori (tmpReg, tmpReg, 0x1);
      sd(tmpReg, Address(boxReg, 0));          // Anticipate successful CAS

      cmpxchg(boxReg, Address(objReg, 0), tmpReg);           // Updates tmpReg

      if (PrintBiasedLockingStatistics) {
        Label L;
        // AT != 0 (CAS succeeded) => count one fast-path entry.
        beq(AT, R0, L);
        delayed()->nop();
        push(T0);
        push(T1);
        atomic_inc32((address)BiasedLocking::fast_path_entry_count_addr(), 1, T0, T1);
        pop(T1);
        pop(T0);
        bind(L);
      }
      bne(AT, R0, DONE_LABEL);                 // stack-lock acquired
      delayed()->nop();

      // Recursive locking
      dsubu(tmpReg, tmpReg, SP);
      li(AT, 7 - os::vm_page_size() );
      andr(tmpReg, tmpReg, AT);
      sd(tmpReg, Address(boxReg, 0));
      if (PrintBiasedLockingStatistics) {
        Label L;
        // tmpReg == 0 => BiasedLocking::_fast_path_entry_count++
        bne(tmpReg, R0, L);
        delayed()->nop();
        push(T0);
        push(T1);
        atomic_inc32((address)BiasedLocking::fast_path_entry_count_addr(), 1, T0, T1);
        pop(T1);
        pop(T0);
        bind(L);
      }
      sltiu(AT, tmpReg, 1); /* AT = (tmpReg == 0) ? 1 : 0 -- success iff recursive */

      b(DONE_LABEL) ;
      delayed()->nop();

      bind(IsInflated) ;

      // TODO: someday avoid the ST-before-CAS penalty by
      // relocating (deferring) the following ST.
      // We should also think about trying a CAS without having
      // fetched _owner.  If the CAS is successful we may
      // avoid an RTO->RTS upgrade on the $line.
      // Without cast to int32_t a movptr will destroy r10 which is typically obj
      li(AT, (int32_t)intptr_t(markOopDesc::unused_mark()));
      sd(AT, Address(boxReg, 0));

      move(boxReg, tmpReg) ;                   // boxReg := monitor pointer (from markword)
      ld(tmpReg, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
      sltiu(AT, tmpReg, 1);  /* Jin: AT = !tmpReg; */
      bne(tmpReg, R0, DONE_LABEL);             // monitor already owned -> fail (AT == 0)
      delayed()->nop();

      // Monitor appears unowned: try to CAS this thread (TREG) into _owner;
      // cmpxchg leaves the success flag in AT.
      cmpxchg(TREG, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), tmpReg) ;
      // Intentional fall-through into DONE_LABEL ...

      // DONE_LABEL is a hot target - we'd really like to place it at the
      // start of cache line by padding with NOPs.
      // See the AMD and Intel software optimization manuals for the
      // most efficient "long" NOP encodings.
      // Unfortunately none of our alignment mechanisms suffice.
      bind(DONE_LABEL);

      // Avoid branch-to-branch on AMD processors
      // This appears to be superstition.
      if (EmitSync & 32) nop() ;

      // At DONE_LABEL the icc ZFlag is set as follows ...
      // Fast_Unlock uses the same protocol.
      // ZFlag == 1 -> Success
      // ZFlag == 0 -> Failure - force control through the slow-path
    }
}
  3073 // obj: object to unlock
  3074 // box: box address (displaced header location), killed.  Must be EAX.
  3075 // rbx,: killed tmp; cannot be obj nor box.
  3076 //
  3077 // Some commentary on balanced locking:
  3078 //
  3079 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
  3080 // Methods that don't have provably balanced locking are forced to run in the
  3081 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
  3082 // The interpreter provides two properties:
  3083 // I1:  At return-time the interpreter automatically and quietly unlocks any
  3084 //      objects acquired the current activation (frame).  Recall that the
  3085 //      interpreter maintains an on-stack list of locks currently held by
  3086 //      a frame.
  3087 // I2:  If a method attempts to unlock an object that is not held by the
   3088 //      frame, the interpreter throws IMSX.
  3089 //
  3090 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
  3091 // B() doesn't have provably balanced locking so it runs in the interpreter.
  3092 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
  3093 // is still locked by A().
  3094 //
  3095 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
  3096 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
  3097 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
  3098 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Emit the C2 fast-path monitor-exit sequence (see cmpFastUnlock).
//
// obj: object to unlock
// box: on-stack box address (displaced header location) -- KILLED
// tmp: scratch -- KILLED (forced to T8 below)
//
// Result protocol mirrors fast_lock: AT != 0 on exit means the unlock
// succeeded on the fast path; AT == 0 forces the slow path.
void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  // The passed-in scratch register is overridden with a fixed one.
  tmpReg = T8;

  guarantee (objReg != boxReg, "") ;
  guarantee (objReg != tmpReg, "") ;
  guarantee (boxReg != tmpReg, "") ;

  block_comment("FastUnlock");

  /*
     move(AT, 0x0);
     return;
     */

  if (EmitSync & 4) {
    // Disable - inhibit all inlining.  Force control through the slow-path
    move(AT, R0);
  } else
    if (EmitSync & 8) {
      Label DONE_LABEL ;
      if (UseBiasedLocking) {
        biased_locking_exit(objReg, tmpReg, DONE_LABEL);
      }
      // classic stack-locking code ...
      ld(tmpReg, Address(boxReg, 0)) ;
      beq(tmpReg, R0, DONE_LABEL) ;   // displaced header == 0 -> recursive unlock, done
      move(AT, 0x1);  // delay slot

      cmpxchg(tmpReg, Address(objReg, 0), boxReg);          // Uses EAX which is box
      bind(DONE_LABEL);
    } else {
      Label DONE_LABEL, Stacked, CheckSucc, Inflated ;

      // Critically, the biased locking test must have precedence over
      // and appear before the (box->dhw == 0) recursive stack-lock test.
      if (UseBiasedLocking && !UseOptoBiasInlining) {
        biased_locking_exit(objReg, tmpReg, DONE_LABEL);
      }

      ld(tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
      ld(AT, Address(boxReg, 0)) ;            // Examine the displaced header
      beq(AT, R0, DONE_LABEL) ;      // 0 indicates recursive stack-lock
      //move(AT, 0x1);
      //delayed()->nop();
      delayed()->daddiu(AT, R0, 0x1);         // delay slot: AT := 1 (success if taken)

      andi(AT, tmpReg, markOopDesc::monitor_value) ;                     // Inflated?
      beq(AT, R0, Stacked) ;                     // Inflated?
      delayed()->nop();

      bind(Inflated) ;
      // It's inflated.
      // Despite our balanced locking property we still check that m->_owner == Self
      // as java routines or native JNI code called by this thread might
      // have released the lock.
      // Refer to the comments in synchronizer.cpp for how we might encode extra
      // state in _succ so we can avoid fetching EntryList|cxq.
      //
      // I'd like to add more cases in fast_lock() and fast_unlock() --
      // such as recursive enter and exit -- but we have to be wary of
      // I$ bloat, T$ effects and BP$ effects.
      //
      // If there's no contention try a 1-0 exit.  That is, exit without
      // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
      // we detect and recover from the race that the 1-0 exit admits.
      //
      // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
      // before it STs null into _owner, releasing the lock.  Updates
      // to data protected by the critical section must be visible before
      // we drop the lock (and thus before any other thread could acquire
      // the lock and observe the fields protected by the lock).
      // IA32's memory-model is SPO, so STs are ordered with respect to
      // each other and there's no need for an explicit barrier (fence).
      // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifdef OPT_THREAD
      move(boxReg, TREG);
#else
      get_thread (boxReg) ;
#endif

#ifndef _LP64

      // Note that we could employ various encoding schemes to reduce
      // the number of loads below (currently 4) to just 2 or 3.
      // Refer to the comments in synchronizer.cpp.
      // In practice the chain of fetches doesn't seem to impact performance, however.
      if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
        // Attempt to reduce branch density - AMD's branch predictor.
        // Fold (owner != self) | recursions | EntryList | cxq into boxReg
        // and take the slow path if any of them is non-zero.
        ld(AT, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
        xorr(boxReg, boxReg, AT);

        ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
        orr(boxReg, boxReg, AT);

        ld(AT, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
        orr(boxReg, boxReg, AT);

        ld(AT, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
        orr(boxReg, boxReg, AT);

        bne(boxReg, R0, DONE_LABEL);
        move(AT, R0);   /* delay slot */

        sw(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  // 1-0 exit: clear _owner
        b(DONE_LABEL);
        move(AT, 0x1);  /* delay slot */
      } else {
        ld(AT, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
        xorr(boxReg, boxReg, AT);

        ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
        orr(boxReg, boxReg, AT);

        bne(boxReg, R0, DONE_LABEL);   // not owner, or recursive -> slow path
        move(AT, R0);   /* delay slot */

        ld(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
        ld(AT, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
        orr(boxReg, boxReg, AT);

        bne(boxReg, R0, CheckSucc);    // waiters exist -> succession check
        move(AT, R0);   /* delay slot */

        sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  // 1-0 exit
        b(DONE_LABEL);
        move(AT, 0x1);  /* delay slot */
      }

      // The Following code fragment (EmitSync & 65536) improves the performance of
      // contended applications and contended synchronization microbenchmarks.
      // Unfortunately the emission of the code - even though not executed - causes regressions
      // in scimark and jetstream, evidently because of $ effects.  Replacing the code
      // with an equal number of never-executed NOPs results in the same regression.
      // We leave it off by default.
      if ((EmitSync & 65536) != 0) {
        Label LSuccess, LGoSlowPath ;

        bind(CheckSucc) ;

        // Optional pre-test ... it's safe to elide this
        if ((EmitSync & 16) == 0) {
          ld(AT, Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2)) ;
          beq(AT, R0, LGoSlowPath);
          delayed()->nop();
        }

        // We have a classic Dekker-style idiom:
        //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
        // There are a number of ways to implement the barrier:
        // (1) lock:andl &m->_owner, 0
        //     is fast, but mask doesn't currently support the "ANDL M,IMM32" form.
        //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
        //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
        // (2) If supported, an explicit MFENCE is appealing.
        //     In older IA32 processors MFENCE is slower than lock:add or xchg
        //     particularly if the write-buffer is full as might be the case if
        //     if stores closely precede the fence or fence-equivalent instruction.
        //     In more modern implementations MFENCE appears faster, however.
        // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
        //     The $lines underlying the top-of-stack should be in M-state.
        //     The locked add instruction is serializing, of course.
        // (4) Use xchg, which is serializing
        //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
        // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
        //     The integer condition codes will tell us if succ was 0.
        //     Since _succ and _owner should reside in the same $line and
        //     we just stored into _owner, it's likely that the $line
        //     remains in M-state for the lock:orl.
        //
        // We currently use (3), although it's likely that switching to (2)
        // is correct for the future.

        sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;

        // Ratify _succ remains non-null
        ld(AT, Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2)) ;
        bne(AT, R0, LSuccess);
        delayed()->nop();               /* delay slot */
        /*
           masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
           masm.jccb  (Assembler::notZero, LSuccess) ;
           */

        move(boxReg, R0) ;                  // box is really EAX

        cmpxchg(SP, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
        beq(AT, R0, LSuccess);
        delayed()->nop();

        // Since we're low on registers we installed rsp as a placeholding in _owner.
        // Now install Self over rsp.  This is safe as we're transitioning from
        // non-null to non=null
        get_thread (boxReg) ;
        sd(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
        // Intentional fall-through into LGoSlowPath ...

        bind(LGoSlowPath) ;
        ori(boxReg, boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
        b(DONE_LABEL) ;
        move(AT, R0) ;  /* delay slot */

        bind(LSuccess) ;
        move(boxReg, R0) ;                 // set ICC.ZF=1 to indicate success
        b(DONE_LABEL) ;
        move(AT, 0x1) ; /* delay slot */
      }

      bind (Stacked) ;
      // It's not inflated and it's not recursively stack-locked and it's not biased.
      // It must be stack-locked.
      // Try to reset the header to displaced header.
      // The "box" value on the stack is stable, so we can reload
      // and be assured we observe the same value as above.
      ld(tmpReg, Address(boxReg, 0)) ;

      cmpxchg(tmpReg, Address(objReg, 0), boxReg); // Uses EAX which is box
      // Intention fall-thru into DONE_LABEL

      // DONE_LABEL is a hot target - we'd really like to place it at the
      // start of cache line by padding with NOPs.
      // See the AMD and Intel software optimization manuals for the
      // most efficient "long" NOP encodings.
      // Unfortunately none of our alignment mechanisms suffice.
      if ((EmitSync & 65536) == 0) {
        bind (CheckSucc) ;
      }
#else // _LP64
      // It's inflated
      ld(AT, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
      xorr(boxReg, boxReg, AT);          // boxReg == 0 iff _owner == Self

      ld(AT, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
      orr(boxReg, boxReg, AT);           // fold in recursion count

      move(AT, R0);
      bne(boxReg, R0, DONE_LABEL);       // not owner, or recursive -> slow path (AT == 0)
      delayed()->nop();

      ld(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
      ld(AT, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
      orr(boxReg, boxReg, AT);           // any waiters on EntryList|cxq?

      move(AT, R0);
      bne(boxReg, R0, CheckSucc);        // waiters exist -> succession check
      delayed()->nop();

      sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  // 1-0 exit: clear _owner
      move(AT, 0x1);
      b(DONE_LABEL);
      delayed()->nop();

      if ((EmitSync & 65536) == 0) {
        Label LSuccess, LGoSlowPath ;
        bind (CheckSucc);
        ld(AT, Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2)) ;
        beq(AT, R0, LGoSlowPath);        // no successor -> must wake someone: slow path
        delayed()->nop();

        // I'd much rather use lock:andl m->_owner, 0 as it's faster than the
        // the explicit ST;MEMBAR combination, but masm doesn't currently support
        // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc
        // are all faster when the write buffer is populated.
        sd(R0, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
        if (os::is_MP()) {
          // lock ();
          //addl (Address(rsp, 0), 0); //?
        }
        ld(AT, Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2)) ;
        bne(AT, R0, LSuccess);           // _succ still non-null -> successor will retry
        delayed()->nop();

        move(boxReg, R0) ;                  // box is really EAX
        //if (os::is_MP()) { lock(); }
        cmpxchg(SP, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
        beq(AT, R0, LSuccess);           // someone else took the lock -> they handle succession
        delayed()->nop();
        // Intentional fall-through into slow-path

        bind  (LGoSlowPath);
        ori(boxReg, boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
        move(AT, R0);
        b(DONE_LABEL) ;
        delayed()->nop();

        bind  (LSuccess);
        move(boxReg, R0) ;                 // set ICC.ZF=1 to indicate success
        move(AT, 0x1) ;
        b(DONE_LABEL) ;
        delayed()->nop();
      }

      bind  (Stacked);
      // Stack-locked case: restore the displaced header with a CAS.
      ld(tmpReg, Address(boxReg, 0)) ;
      //if (os::is_MP()) { lock(); }
      cmpxchg(tmpReg, Address(objReg, 0), boxReg); // Uses EAX which is box

      if (EmitSync & 65536) {
        bind (CheckSucc);
      }
#endif

      bind(DONE_LABEL);

      // Avoid branch to branch on AMD processors
      if (EmitSync & 32768) { nop() ; }
    }
}
  3415 class ControlWord {
  3416 				public:
  3417 								int32_t _value;
  3419   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
  3420   int  precision_control() const       { return  (_value >>  8) & 3      ; }
  3421   bool precision() const               { return ((_value >>  5) & 1) != 0; }
  3422   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
  3423   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
  3424   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
  3425   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
  3426   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
  3428   void print() const {
  3429     // rounding control
  3430     const char* rc;
  3431     switch (rounding_control()) {
  3432       case 0: rc = "round near"; break;
  3433       case 1: rc = "round down"; break;
  3434       case 2: rc = "round up  "; break;
  3435       case 3: rc = "chop      "; break;
  3436     };
  3437     // precision control
  3438     const char* pc;
  3439     switch (precision_control()) {
  3440       case 0: pc = "24 bits "; break;
  3441       case 1: pc = "reserved"; break;
  3442       case 2: pc = "53 bits "; break;
  3443       case 3: pc = "64 bits "; break;
  3444     };
  3445     // flags
  3446     char f[9];
  3447     f[0] = ' ';
  3448     f[1] = ' ';
  3449     f[2] = (precision   ()) ? 'P' : 'p';
  3450     f[3] = (underflow   ()) ? 'U' : 'u';
  3451     f[4] = (overflow    ()) ? 'O' : 'o';
  3452     f[5] = (zero_divide ()) ? 'Z' : 'z';
  3453     f[6] = (denormalized()) ? 'D' : 'd';
  3454     f[7] = (invalid     ()) ? 'I' : 'i';
  3455     f[8] = '\x0';
  3456     // output
  3457     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  3460 };
  3462 class StatusWord {
  3463  public:
  3464   int32_t _value;
  3466   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
  3467   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
  3468   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
  3469   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
  3470   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
  3471   int  top() const                     { return  (_value >> 11) & 7      ; }
  3472   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
  3473   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
  3474   bool precision() const               { return ((_value >>  5) & 1) != 0; }
  3475   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
  3476   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
  3477   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
  3478   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
  3479   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
  3481   void print() const {
  3482     // condition codes
  3483     char c[5];
  3484     c[0] = (C3()) ? '3' : '-';
  3485     c[1] = (C2()) ? '2' : '-';
  3486     c[2] = (C1()) ? '1' : '-';
  3487     c[3] = (C0()) ? '0' : '-';
  3488     c[4] = '\x0';
  3489     // flags
  3490     char f[9];
  3491     f[0] = (error_status()) ? 'E' : '-';
  3492     f[1] = (stack_fault ()) ? 'S' : '-';
  3493     f[2] = (precision   ()) ? 'P' : '-';
  3494     f[3] = (underflow   ()) ? 'U' : '-';
  3495     f[4] = (overflow    ()) ? 'O' : '-';
  3496     f[5] = (zero_divide ()) ? 'Z' : '-';
  3497     f[6] = (denormalized()) ? 'D' : '-';
  3498     f[7] = (invalid     ()) ? 'I' : '-';
  3499     f[8] = '\x0';
  3500     // output
  3501     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
  3504 };
  3506 class TagWord {
  3507  public:
  3508   int32_t _value;
  3510   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
  3512   void print() const {
  3513     printf("%04x", _value & 0xFFFF);
  3516 };
  3518 class FPU_Register {
  3519  public:
  3520   int32_t _m0;
  3521   int32_t _m1;
  3522   int16_t _ex;
  3524   bool is_indefinite() const           {
  3525     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
  3528   void print() const {
  3529     char  sign = (_ex < 0) ? '-' : '+';
  3530     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
  3531     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
  3532   };
  3534 };
  3536 class FPU_State {
  3537  public:
  3538   enum {
  3539     register_size       = 10,
  3540     number_of_registers =  8,
  3541     register_mask       =  7
  3542   };
  3544   ControlWord  _control_word;
  3545   StatusWord   _status_word;
  3546   TagWord      _tag_word;
  3547   int32_t      _error_offset;
  3548   int32_t      _error_selector;
  3549   int32_t      _data_offset;
  3550   int32_t      _data_selector;
  3551   int8_t       _register[register_size * number_of_registers];
  3553   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  3554   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
  3556   const char* tag_as_string(int tag) const {
  3557     switch (tag) {
  3558       case 0: return "valid";
  3559       case 1: return "zero";
  3560       case 2: return "special";
  3561       case 3: return "empty";
  3563     ShouldNotReachHere();
  3564     return NULL;
  3567   void print() const {
  3568     // print computation registers
  3569     { int t = _status_word.top();
  3570       for (int i = 0; i < number_of_registers; i++) {
  3571         int j = (i - t) & register_mask;
  3572         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
  3573         st(j)->print();
  3574         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
  3577     printf("\n");
  3578     // print control registers
  3579     printf("ctrl = "); _control_word.print(); printf("\n");
  3580     printf("stat = "); _status_word .print(); printf("\n");
  3581     printf("tags = "); _tag_word    .print(); printf("\n");
  3584 };
  3586 class Flag_Register {
  3587  public:
  3588   int32_t _value;
  3590   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
  3591   bool direction() const               { return ((_value >> 10) & 1) != 0; }
  3592   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
  3593   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
  3594   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
  3595   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
  3596   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
  3598   void print() const {
  3599     // flags
  3600     char f[8];
  3601     f[0] = (overflow       ()) ? 'O' : '-';
  3602     f[1] = (direction      ()) ? 'D' : '-';
  3603     f[2] = (sign           ()) ? 'S' : '-';
  3604     f[3] = (zero           ()) ? 'Z' : '-';
  3605     f[4] = (auxiliary_carry()) ? 'A' : '-';
  3606     f[5] = (parity         ()) ? 'P' : '-';
  3607     f[6] = (carry          ()) ? 'C' : '-';
  3608     f[7] = '\x0';
  3609     // output
  3610     printf("%08x  flags = %s", _value, f);
  3613 };
  3615 class IU_Register {
  3616  public:
  3617   int32_t _value;
  3619   void print() const {
  3620     printf("%08x  %11d", _value, _value);
  3623 };
  3625 class IU_State {
  3626  public:
  3627   Flag_Register _eflags;
  3628   IU_Register   _rdi;
  3629   IU_Register   _rsi;
  3630   IU_Register   _rbp;
  3631   IU_Register   _rsp;
  3632   IU_Register   _rbx;
  3633   IU_Register   _rdx;
  3634   IU_Register   _rcx;
  3635   IU_Register   _rax;
  3637   void print() const {
  3638     // computation registers
  3639     printf("rax,  = "); _rax.print(); printf("\n");
  3640     printf("rbx,  = "); _rbx.print(); printf("\n");
  3641     printf("rcx  = "); _rcx.print(); printf("\n");
  3642     printf("rdx  = "); _rdx.print(); printf("\n");
  3643     printf("rdi  = "); _rdi.print(); printf("\n");
  3644     printf("rsi  = "); _rsi.print(); printf("\n");
  3645     printf("rbp,  = "); _rbp.print(); printf("\n");
  3646     printf("rsp  = "); _rsp.print(); printf("\n");
  3647     printf("\n");
  3648     // control registers
  3649     printf("flgs = "); _eflags.print(); printf("\n");
  3651 };
  3654 class CPU_State {
  3655  public:
  3656   FPU_State _fpu_state;
  3657   IU_State  _iu_state;
  3659   void print() const {
  3660     printf("--------------------------------------------------\n");
  3661     _iu_state .print();
  3662     printf("\n");
  3663     _fpu_state.print();
  3664     printf("--------------------------------------------------\n");
  3667 };
  3670 /*
  3671 static void _print_CPU_state(CPU_State* state) {
  3672   state->print();
  3673 };
  3675 void MacroAssembler::print_CPU_state() {
  3676   push_CPU_state();
  3677   push(rsp);                // pass CPU state
  3678   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  3679   addptr(rsp, wordSize);       // discard argument
  3680   pop_CPU_state();
  3682 */
  3684 void MacroAssembler::align(int modulus) {
  3685 	while (offset() % modulus != 0) nop();
#if 0
// Dead code inherited from the x86 port, kept for reference only (compiled
// out).  Verifies that the x87 tag stack is contiguous and matches the
// expected depth; not applicable to MIPS.
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  static int counter = 0;
  FPU_State* fs = &state->_fpu_state;
  counter++;
  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
  if( stack_depth < 0 ) {
    if( fs->tag_for_st(7) != 3 ) {
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    return true;                // All other stack states do not matter
  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
         "bad FPU control word");
  // compute stack depth
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
  int d = i;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  // check if computed stack depth corresponds to expected stack depth
  if (stack_depth < 0) {
    // expected stack depth is -stack_depth or less
    if (d > -stack_depth) {
      // too many elements on the stack
      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
  } else {
    // expected stack depth is stack_depth
    if (d != stack_depth) {
      // wrong stack depth
      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
  // everything is cool
  return true;
#endif
  3747 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  3748 	//FIXME aoqi
  3749 	// %%%%% need to implement this
  3750 	//Unimplemented();
  3751 	/*
  3752 	if (!VerifyFPU) return;
  3753   push_CPU_state();
  3754   push(rsp);                // pass CPU state
  3755   ExternalAddress msg((address) s);
  3756   // pass message string s
  3757   pushptr(msg.addr());
  3758   push(stack_depth);        // pass stack depth
  3759   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  3760   addptr(rsp, 3 * wordSize);   // discard arguments
  3761   // check for error
  3762   { Label L;
  3763     testl(rax, rax);
  3764     jcc(Assembler::notZero, L);
  3765     int3();                  // break if error condition
  3766     bind(L);
  3768   pop_CPU_state();
  3769 	*/
  3772 #ifdef _LP64
  3773 Register caller_saved_registers[] = {AT, V0, V1, A0, A1, A2, A3, A4, A5, A6, A7, T0, T1, T2, T3, T8, T9, GP, RA, FP};
  3775 /* FIXME: Jin: In MIPS64, F0~23 are all caller-saved registers */
  3776 FloatRegister caller_saved_fpu_registers[] = {F0, F12, F13};
  3777 #else
  3778 Register caller_saved_registers[] = {AT, V0, V1, A0, A1, A2, A3, T4, T5, T6, T7, T0, T1, T2, T3, T8, T9, GP, RA, FP};
  3780 Register caller_saved_fpu_registers[] = {};
  3781 #endif
  3783 //We preserve all caller-saved register
  3784 void  MacroAssembler::pushad(){
  3785   int i;
  3787   /* Fixed-point registers */
  3788   int len = sizeof(caller_saved_registers) / sizeof(caller_saved_registers[0]);
  3789   daddi(SP, SP, -1 * len * wordSize);
  3790   for (i = 0; i < len; i++)
  3792 #ifdef _LP64
  3793     sd(caller_saved_registers[i], SP, (len - i - 1) * wordSize);
  3794 #else
  3795     sw(caller_saved_registers[i], SP, (len - i - 1) * wordSize);
  3796 #endif
  3799   /* Floating-point registers */
  3800   len = sizeof(caller_saved_fpu_registers) / sizeof(caller_saved_fpu_registers[0]);
  3801   daddi(SP, SP, -1 * len * wordSize);
  3802   for (i = 0; i < len; i++)
  3804 #ifdef _LP64
  3805     sdc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize);
  3806 #else
  3807     swc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize);
  3808 #endif
  3810 };
  3812 void  MacroAssembler::popad(){
  3813   int i;
  3815   /* Floating-point registers */
  3816   int len = sizeof(caller_saved_fpu_registers) / sizeof(caller_saved_fpu_registers[0]);
  3817   for (i = 0; i < len; i++)
  3819 #ifdef _LP64
  3820     ldc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize);
  3821 #else
  3822     lwc1(caller_saved_fpu_registers[i], SP, (len - i - 1) * wordSize);
  3823 #endif
  3825   daddi(SP, SP, len * wordSize);
  3827   /* Fixed-point registers */
  3828   len = sizeof(caller_saved_registers) / sizeof(caller_saved_registers[0]);
  3829   for (i = 0; i < len; i++)
  3831 #ifdef _LP64
  3832     ld(caller_saved_registers[i], SP, (len - i - 1) * wordSize);
  3833 #else
  3834     lw(caller_saved_registers[i], SP, (len - i - 1) * wordSize);
  3835 #endif
  3837   daddi(SP, SP, len * wordSize);
  3838 };
  3840 void MacroAssembler::push2(Register reg1, Register reg2) {
  3841 #ifdef _LP64
  3842   daddi(SP, SP, -16);
  3843   sd(reg2, SP, 0);
  3844   sd(reg1, SP, 8);
  3845 #else
  3846   addi(SP, SP, -8);
  3847   sw(reg2, SP, 0);
  3848   sw(reg1, SP, 4);
  3849 #endif
  3852 void MacroAssembler::pop2(Register reg1, Register reg2) {
  3853 #ifdef _LP64
  3854   ld(reg1, SP, 0);
  3855   ld(reg2, SP, 8);
  3856   daddi(SP, SP, 16);
  3857 #else
  3858   lw(reg1, SP, 0);
  3859   lw(reg2, SP, 4);
  3860   addi(SP, SP, 8);
  3861 #endif
  3864 //for UseCompressedOops Option
  3865 void MacroAssembler::load_klass(Register dst, Register src) {
  3866 #ifdef _LP64
  3867     if(UseCompressedClassPointers){
  3868         lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  3869 		decode_klass_not_null(dst);
  3870     } else 
  3871 #endif
  3872         ld(dst, src, oopDesc::klass_offset_in_bytes());
  3875 void MacroAssembler::store_klass(Register dst, Register src) {
  3876 #ifdef _LP64
  3877     if(UseCompressedClassPointers){
  3878 		encode_klass_not_null(src);
  3879 		sw(src, dst, oopDesc::klass_offset_in_bytes());
  3880     } else {
  3881 #endif 
  3882 		sd(src, dst, oopDesc::klass_offset_in_bytes());
  3886 void MacroAssembler::load_prototype_header(Register dst, Register src) {
  3887   load_klass(dst, src);
  3888   ld(dst, Address(dst, Klass::prototype_header_offset()));
  3891 #ifdef _LP64
  3892 void MacroAssembler::store_klass_gap(Register dst, Register src) {
  3893   if (UseCompressedClassPointers) {
  3894     sw(src, dst, oopDesc::klass_gap_offset_in_bytes());
  3898 void MacroAssembler::load_heap_oop(Register dst, Address src) {
  3899     if(UseCompressedOops){
  3900 	lwu(dst, src); 
  3901 	decode_heap_oop(dst);
  3902     } else{
  3903 	ld(dst, src); 
  3907 void MacroAssembler::store_heap_oop(Address dst, Register src){
  3908     if(UseCompressedOops){
  3909        assert(!dst.uses(src), "not enough registers");
  3910        encode_heap_oop(src); 
  3911        sw(src, dst);
  3912     } else{
  3913        sd(src, dst);
  3917 #ifdef ASSERT
  3918 void MacroAssembler::verify_heapbase(const char* msg) {
  3919   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  3920   assert (Universe::heap() != NULL, "java heap should be initialized");
  3921 /*  if (CheckCompressedOops) {
  3922     Label ok;
  3923     push(rscratch1); // cmpptr trashes rscratch1
  3924     cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
  3925     jcc(Assembler::equal, ok);
  3926     STOP(msg);
  3927     bind(ok);
  3928     pop(rscratch1);
  3929   }*/
  3931 #endif
  3934 // Algorithm must match oop.inline.hpp encode_heap_oop.
  3935 void MacroAssembler::encode_heap_oop(Register r) {
  3936 #ifdef ASSERT
  3937   verify_heapbase("MacroAssembler::encode_heap_oop:heap base corrupted?");
  3938 #endif
  3939   verify_oop(r, "broken oop in encode_heap_oop");
  3940   if (Universe::narrow_oop_base() == NULL) {
  3941     if (Universe::narrow_oop_shift() != 0) { 
  3942       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
  3943       shr(r, LogMinObjAlignmentInBytes);
  3945     return;
  3948     Label done;
  3949     beq(r, R0, done);
  3950     delayed()->nop();
  3951     dsub(r, r, S5_heapbase);
  3952     shr(r, LogMinObjAlignmentInBytes);
  3953     bind(done);
  3956 void MacroAssembler::encode_heap_oop_not_null(Register r) {
  3957     assert (UseCompressedOops, "should be compressed");
  3958 #ifdef ASSERT
  3959     if (CheckCompressedOops) {
  3960 	Label ok;
  3961 	bne(r, R0, ok);
  3962 	delayed()->nop();
  3963 	stop("null oop passed to encode_heap_oop_not_null");
  3964 	bind(ok);
  3966 #endif
  3967 	verify_oop(r, "broken oop in encode_heap_oop_not_null");
  3968 	if (Universe::narrow_oop_base() != NULL) {
  3969 		dsub(r, r, S5_heapbase);
  3971 	if (Universe::narrow_oop_shift() != 0) {
  3972 		assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
  3973 		shr(r, LogMinObjAlignmentInBytes);
  3978 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
  3979     assert (UseCompressedOops, "should be compressed");
  3980 #ifdef ASSERT
  3981     if (CheckCompressedOops) {
  3982 	Label ok;
  3983 	bne(src, R0, ok);
  3984 	delayed()->nop();
  3985 	stop("null oop passed to encode_heap_oop_not_null2");
  3986 	bind(ok);
  3988 #endif
  3989     verify_oop(src, "broken oop in encode_heap_oop_not_null2");
  3990     if (dst != src) {
  3991 	move(dst, src);
  3994 	if (Universe::narrow_oop_base() != NULL) {
  3995 		dsub(dst, dst, S5_heapbase);
  3997 	if (Universe::narrow_oop_shift() != 0) {
  3998 		assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
  3999 		shr(dst, LogMinObjAlignmentInBytes);
  4004 void  MacroAssembler::decode_heap_oop(Register r) {
  4005 #ifdef ASSERT
  4006   verify_heapbase("MacroAssembler::decode_heap_oop corrupted?");
  4007 #endif
  4008   if (Universe::narrow_oop_base() == NULL) {
  4009     if (Universe::narrow_oop_shift() != 0) {
  4010       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
  4011       shl(r, LogMinObjAlignmentInBytes);
  4013   } else {
  4014     Label done;
  4015     shl(r, LogMinObjAlignmentInBytes);
  4016     beq(r, R0, done);
  4017     delayed()->nop();
  4018     dadd(r, r, S5_heapbase);
  4019     bind(done);
  4021   verify_oop(r, "broken oop in decode_heap_oop");
  4024 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  4025   // Note: it will change flags
  4026   assert (UseCompressedOops, "should only be used for compressed headers");
  4027   assert (Universe::heap() != NULL, "java heap should be initialized");
  4028   // Cannot assert, unverified entry point counts instructions (see .ad file)
  4029   // vtableStubs also counts instructions in pd_code_size_limit.
  4030   // Also do not verify_oop as this is called by verify_oop.
  4031   if (Universe::narrow_oop_shift() != 0) {
  4032     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
  4033     shl(r, LogMinObjAlignmentInBytes);
  4034     if (Universe::narrow_oop_base() != NULL) {
  4035       dadd(r, r, S5_heapbase);
  4037   } else {
  4038     assert (Universe::narrow_oop_base() == NULL, "sanity");
  4042 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  4043   assert (UseCompressedOops, "should only be used for compressed headers");
  4044   assert (Universe::heap() != NULL, "java heap should be initialized");
  4046   // Cannot assert, unverified entry point counts instructions (see .ad file)
  4047   // vtableStubs also counts instructions in pd_code_size_limit.
  4048   // Also do not verify_oop as this is called by verify_oop.
  4049   //lea(dst, Address(S5_heapbase, src, Address::times_8, 0));
  4050   if (Universe::narrow_oop_shift() != 0) {
  4051     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
  4052     if (LogMinObjAlignmentInBytes == Address::times_8) {
  4053       dsll(dst, src, LogMinObjAlignmentInBytes);
  4054       dadd(dst, dst, S5_heapbase);
  4055     } else {
  4056       if (dst != src) {
  4057         move(dst, src);
  4059       shl(dst, LogMinObjAlignmentInBytes);
  4060       if (Universe::narrow_oop_base() != NULL) {
  4061         dadd(dst, dst, S5_heapbase);
  4064   } else {
  4065     assert (Universe::narrow_oop_base() == NULL, "sanity");
  4066     if (dst != src) {
  4067       move(dst, src);
  4072 void MacroAssembler::encode_klass_not_null(Register r) {
  4073   if (Universe::narrow_klass_base() != NULL) {
  4074     // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
  4075     assert(r != S5_heapbase, "Encoding a klass in r12");
  4076     li48(S5_heapbase, (int64_t)Universe::narrow_klass_base());
  4077     dsub(r, r, S5_heapbase);
  4079   if (Universe::narrow_klass_shift() != 0) {
  4080     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
  4081     shr(r, LogKlassAlignmentInBytes);
  4083   if (Universe::narrow_klass_base() != NULL) {
  4084     reinit_heapbase();
  4088 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  4089   if (dst == src) {
  4090     encode_klass_not_null(src);
  4091   } else {
  4092     if (Universe::narrow_klass_base() != NULL) {
  4093       li48(dst, (int64_t)Universe::narrow_klass_base());
  4094       dsub(dst, src, dst);
  4095     } else {
  4096       move(dst, src);
  4098     if (Universe::narrow_klass_shift() != 0) {
  4099       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
  4100       shr(dst, LogKlassAlignmentInBytes);
  4105 // Function instr_size_for_decode_klass_not_null() counts the instructions
  4106 // generated by decode_klass_not_null(register r) and reinit_heapbase(),
  4107 // when (Universe::heap() != NULL).  Hence, if the instructions they
  4108 // generate change, then this method needs to be updated.
  4109 int MacroAssembler::instr_size_for_decode_klass_not_null() {
  4110   assert (UseCompressedClassPointers, "only for compressed klass ptrs");
  4111   if (Universe::narrow_klass_base() != NULL) {
  4112     // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
  4113     return (Universe::narrow_klass_shift() == 0 ? 4 * 9 : 4 * 10);
  4114   } else {
  4115     // longest load decode klass function, mov64, leaq
  4116     return (Universe::narrow_klass_shift() == 0 ? 4 * 0 : 4 * 1);
  4120 void  MacroAssembler::decode_klass_not_null(Register r) { 
  4121   // Note: it will change flags
  4122   assert (UseCompressedClassPointers, "should only be used for compressed headers");
  4123   assert(r != S5_heapbase, "Decoding a klass in r12");
  4124   // Cannot assert, unverified entry point counts instructions (see .ad file)
  4125   // vtableStubs also counts instructions in pd_code_size_limit.
  4126   // Also do not verify_oop as this is called by verify_oop.
  4127   if (Universe::narrow_klass_shift() != 0) { 
  4128     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
  4129     shl(r, LogKlassAlignmentInBytes);
  4131   if (Universe::narrow_klass_base() != NULL) {
  4132     li48(S5_heapbase, (int64_t)Universe::narrow_klass_base());
  4133     dadd(r, r, S5_heapbase);
  4134     reinit_heapbase();
  4138 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  4139   assert (UseCompressedClassPointers, "should only be used for compressed headers");
  4141   if (dst == src) {
  4142     decode_klass_not_null(dst);
  4143   } else {
  4144     // Cannot assert, unverified entry point counts instructions (see .ad file)
  4145     // vtableStubs also counts instructions in pd_code_size_limit.
  4146     // Also do not verify_oop as this is called by verify_oop.
  4147     li48(S5_heapbase, (int64_t)Universe::narrow_klass_base());
  4148     if (Universe::narrow_klass_shift() != 0) {
  4149       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
  4150       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
  4151       dsll(dst, src, Address::times_8);
  4152       dadd(dst, dst, S5_heapbase);
  4153     } else {
  4154       dadd(dst, src, S5_heapbase);
  4156     reinit_heapbase();
  4160 /*
  4161 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  4162   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  4163   int oop_index = oop_recorder()->find_index(obj);
  4164   RelocationHolder rspec = oop_Relocation::spec(oop_index);
  4165   mov_literal32(dst, oop_index, rspec, narrow_oop_operand);
  4167 */
  4169 void MacroAssembler::incrementl(Register reg, int value) {
  4170   if (value == min_jint) {
  4171      move(AT, value);
  4172      LP64_ONLY(addu32(reg, reg, AT)) NOT_LP64(addu(reg, reg, AT));
  4173      return; 
  4175   if (value <  0) { decrementl(reg, -value); return; }
  4176   if (value == 0) {                        ; return; }
  4178   if(Assembler::is_simm16(value)) {
  4179      NOT_LP64(addiu(reg, reg, value));
  4180      LP64_ONLY(move(AT, value); addu32(reg, reg, AT));
  4181   } else {
  4182      move(AT, value);
  4183      LP64_ONLY(addu32(reg, reg, AT)) NOT_LP64(addu(reg, reg, AT));
  4187 void MacroAssembler::decrementl(Register reg, int value) {
  4188   if (value == min_jint) {
  4189      move(AT, value);
  4190      LP64_ONLY(subu32(reg, reg, AT)) NOT_LP64(subu(reg, reg, AT));
  4191      return;
  4193   if (value <  0) { incrementl(reg, -value); return; }
  4194   if (value == 0) {                        ; return; }
  4196   if(Assembler::is_simm16(value)) {
  4197      NOT_LP64(addiu(reg, reg, -value));
  4198      LP64_ONLY(move(AT, value); subu32(reg, reg, AT));
  4199   } else {
  4200      move(AT, value);
  4201      LP64_ONLY(subu32(reg, reg, AT)) NOT_LP64(subu(reg, reg, AT));
  4205 void MacroAssembler::reinit_heapbase() {
  4206   if (UseCompressedOops || UseCompressedClassPointers) {
  4207     if (Universe::heap() != NULL) {
  4208       if (Universe::narrow_oop_base() == NULL) {
  4209         move(S5_heapbase, R0);
  4210       } else {
  4211         li48(S5_heapbase, (int64_t)Universe::narrow_ptrs_base());
  4213     } else {
  4214       li48(S5_heapbase, (intptr_t)Universe::narrow_ptrs_base_addr());
  4215       ld(S5_heapbase, S5_heapbase, 0);
  4219 #endif // _LP64
  4221 void MacroAssembler::check_klass_subtype(Register sub_klass,
  4222                            Register super_klass,
  4223                            Register temp_reg,
  4224                            Label& L_success) {
  4225 //implement ind   gen_subtype_check
  4226   Label L_failure;
  4227   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  4228   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  4229   bind(L_failure);
  4232 SkipIfEqual::SkipIfEqual(
  4233     MacroAssembler* masm, const bool* flag_addr, bool value) {
  4234   _masm = masm;
  4235   _masm->li(AT, (address)flag_addr);
  4236   _masm->lb(AT,AT,0);
  4237   _masm->addi(AT,AT,-value);
  4238   _masm->beq(AT,R0,_label);
  4239   _masm->delayed()->nop();
// Fast path of the klass subtype check.  Branches to *L_success on a proven
// subtype, *L_failure on a proven non-subtype, and *L_slow_path when only
// the secondary-supers scan can decide; a NULL label means "fall through".
// Clobbers AT (and temp_reg when the super_check_offset must be loaded).
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  // A constant of -1 means the offset is unknown and must be loaded below.
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  // NULL labels are redirected to the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  //cmpptr(sub_klass, super_klass);
  //local_jcc(Assembler::equal, *L_success);
  beq(sub_klass, super_klass, *L_success);
  delayed()->nop();
  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
	lwu(temp_reg, super_klass, sco_offset);
    super_check_offset = RegisterOrConstant(temp_reg);
  // AT <- sub_klass->[super_check_offset], the displayed supertype.
  dsll(AT, super_check_offset.register_or_noreg(), Address::times_1);
  daddu(AT, sub_klass, AT);
  ld(AT, AT, super_check_offset.constant_or_zero()*Address::times_1);
  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).
  if (super_check_offset.is_register()) {
	beq(super_klass, AT, *L_success);
	delayed()->nop();
	addi(AT, super_check_offset.as_register(), -sc_offset);
    if (L_failure == &L_fallthrough) {
	  beq(AT, R0, *L_slow_path);
	  delayed()->nop();
    } else {
	  bne(AT, R0, *L_failure);
	  delayed()->nop();
	  b(*L_slow_path);
	  delayed()->nop();
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
		beq(super_klass, AT, *L_success);
		delayed()->nop();
    } else {
		bne(super_klass, AT, *L_slow_path);
		delayed()->nop();
		b(*L_success);
		delayed()->nop();
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
		beq(super_klass, AT, *L_success);
		delayed()->nop();
    } else {
		bne(super_klass, AT, *L_failure);
		delayed()->nop();
		b(*L_success);
		delayed()->nop();
  bind(L_fallthrough);
  4340 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
  4341                                                    Register super_klass,
  4342                                                    Register temp_reg,
  4343                                                    Register temp2_reg,
  4344                                                    Label* L_success,
  4345                                                    Label* L_failure,
  4346                                                    bool set_cond_codes) {
  4347   assert_different_registers(sub_klass, super_klass, temp_reg);
  4348   if (temp2_reg != noreg)
  4349     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
  4350   else
  4351     temp2_reg = T9;
  4352 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
  4354   Label L_fallthrough;
  4355   int label_nulls = 0;
  4356   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  4357   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  4358   assert(label_nulls <= 1, "at most one NULL in the batch");
  4360   // a couple of useful fields in sub_klass:
  4361   int ss_offset = in_bytes(Klass::secondary_supers_offset());
  4362   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  4363   Address secondary_supers_addr(sub_klass, ss_offset);
  4364   Address super_cache_addr(     sub_klass, sc_offset);
  4366   // Do a linear scan of the secondary super-klass chain.
  4367   // This code is rarely used, so simplicity is a virtue here.
  4368   // The repne_scan instruction uses fixed registers, which we must spill.
  4369   // Don't worry too much about pre-existing connections with the input regs.
  4371 #if 0
  4372   assert(sub_klass != T9, "killed reg"); // killed by mov(rax, super)
  4373   assert(sub_klass != T1, "killed reg"); // killed by lea(rcx, &pst_counter)
  4374 #endif
  4376   // Get super_klass value into rax (even if it was in rdi or rcx).
  4377 /*
  4378   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  4379   if (super_klass != rax || UseCompressedOops) {
  4380     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
  4381     mov(rax, super_klass);
  4383   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  4384   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
  4385 */
  4386 #ifndef PRODUCT
  4387   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  4388   ExternalAddress pst_counter_addr((address) pst_counter);
  4389   NOT_LP64(  incrementl(pst_counter_addr) );
  4390   //LP64_ONLY( lea(rcx, pst_counter_addr) );
  4391   //LP64_ONLY( incrementl(Address(rcx, 0)) );
  4392 #endif //PRODUCT
  4394   // We will consult the secondary-super array.
  4395   ld(temp_reg, secondary_supers_addr);
  4396   // Load the array length.  (Positive movl does right thing on LP64.)
  4397   lw(temp2_reg, Address(temp_reg, Array<Klass*>::length_offset_in_bytes()));
  4398   // Skip to start of data.
  4399   daddiu(temp_reg, temp_reg, Array<Klass*>::base_offset_in_bytes());
  4401   // Scan RCX words at [RDI] for an occurrence of RAX.
  4402   // Set NZ/Z based on last compare.
  4403   // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  4404   // not change flags (only scas instruction which is repeated sets flags).
  4405   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
  4407   /* 2013/4/3 Jin: OpenJDK8 never compresses klass pointers in secondary-super array. */
  4408   Label Loop, subtype;
  4409   bind(Loop);
  4410   beq(temp2_reg, R0, *L_failure);
  4411   delayed()->nop();
  4412   ld(AT, temp_reg, 0);
  4413   beq(AT, super_klass, subtype);
  4414   delayed()->daddi(temp_reg, temp_reg, 1 * wordSize);
  4415   b(Loop);
  4416   delayed()->daddi(temp2_reg, temp2_reg, -1); 
  4418   bind(subtype);
  4419   sd(super_klass, super_cache_addr);
  4420   if (L_success != &L_fallthrough) {
  4421 	  b(*L_success);
  4422 	  delayed()->nop();
  4425 /*
  4426   if (set_cond_codes) {
  4427     // Special hack for the AD files:  rdi is guaranteed non-zero.
  4428     assert(!pushed_rdi, "rdi must be left non-NULL");
  4429     // Also, the condition codes are properly set Z/NZ on succeed/failure.
  4431 */
  4432   // Success.  Cache the super we found and proceed in triumph.
  4433 #undef IS_A_TEMP
  4435   bind(L_fallthrough);
  4437 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  4438   ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  4439   sd(R0, Address(java_thread, JavaThread::vm_result_offset()));
  4440   verify_oop(oop_result, "broken oop in call_VM_base");
  4443 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  4444   ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  4445   sd(R0, Address(java_thread, JavaThread::vm_result_2_offset()));
  4448 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
  4449                                          int extra_slot_offset) {
  4450   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  4451   int stackElementSize = Interpreter::stackElementSize;
  4452   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
  4453 #ifdef ASSERT
  4454   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  4455   assert(offset1 - offset == stackElementSize, "correct arithmetic");
  4456 #endif
  4457   Register             scale_reg    = NOREG;
  4458   Address::ScaleFactor scale_factor = Address::no_scale;
  4459   if (arg_slot.is_constant()) {
  4460     offset += arg_slot.as_constant() * stackElementSize;
  4461   } else {
  4462     scale_reg    = arg_slot.as_register();
  4463     scale_factor = Address::times_8;
  4465   // 2014/07/31 Fu: We don't push RA on stack in prepare_invoke.
  4466   //  offset += wordSize;           // return PC is on stack
  4467   if(scale_reg==NOREG) return Address(SP, offset);
  4468   else {
  4469 	dsll(scale_reg, scale_reg, scale_factor);
  4470 	daddu(scale_reg, SP, scale_reg);
  4471 	return Address(scale_reg, offset);
  4475 SkipIfEqual::~SkipIfEqual() {
  4476   _masm->bind(_label);
  4479 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  4480   switch (size_in_bytes) {
  4481 #ifndef _LP64
  4482   case  8:
  4483     assert(dst2 != noreg, "second dest register required");
  4484     lw(dst,  src);
  4485     lw(dst2, src.plus_disp(BytesPerInt));
  4486     break;
  4487 #else
  4488   case  8:  ld(dst, src); break;
  4489 #endif
  4490   case  4:  lw(dst, src); break;
  4491   case  2:  is_signed ? lh(dst, src) : lhu(dst, src); break;
  4492   case  1:  is_signed ? lb( dst, src) : lbu( dst, src); break;
  4493   default:  ShouldNotReachHere();
  4497 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  4498   switch (size_in_bytes) {
  4499 #ifndef _LP64
  4500   case  8:
  4501     assert(src2 != noreg, "second source register required");
  4502     sw(src, dst);
  4503     sw(src2, dst.plus_disp(BytesPerInt));
  4504     break;
  4505 #else
  4506   case  8:  sd(src, dst); break;
  4507 #endif
  4508   case  4:  sw(src, dst); break;
  4509   case  2:  sh(src, dst); break;
  4510   case  1:  sb(src, dst); break;
  4511   default:  ShouldNotReachHere();
  4515 // Look up the method for a megamorphic invokeinterface call.
  4516 // The target method is determined by <intf_klass, itable_index>.
  4517 // The receiver klass is in recv_klass.
  4518 // On success, the result will be in method_result, and execution falls through.
  4519 // On failure, execution transfers to the given label.
  4520 void MacroAssembler::lookup_interface_method(Register recv_klass,
  4521                                              Register intf_klass,
  4522                                              RegisterOrConstant itable_index,
  4523                                              Register method_result,
  4524                                              Register scan_temp,
  4525                                              Label& L_no_such_interface) {
  4526   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  4527   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
  4528          "caller must use same register for non-constant itable index as for method");
  4530   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  4531   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  4532   int itentry_off = itableMethodEntry::method_offset_in_bytes();
  4533   int scan_step   = itableOffsetEntry::size() * wordSize;
  4534   int vte_size    = vtableEntry::size() * wordSize;
  4535   Address::ScaleFactor times_vte_scale = Address::times_ptr;
  4536   assert(vte_size == wordSize, "else adjust times_vte_scale");
  4538   lw(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
  4540   // %%% Could store the aligned, prescaled offset in the klassoop.
  4541 //  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  4542   dsll(scan_temp, scan_temp, times_vte_scale);
  4543   daddu(scan_temp, recv_klass, scan_temp);
  4544   daddiu(scan_temp, scan_temp, vtable_base);
  4545   if (HeapWordsPerLong > 1) {
  4546     // Round up to align_object_offset boundary
  4547     // see code for InstanceKlass::start_of_itable!
  4548     round_to(scan_temp, BytesPerLong);
  4551   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  4552   assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  4553 //  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  4554   if (itable_index.is_constant()) {
  4555     li48(AT, (int)itable_index.is_constant());
  4556     dsll(AT, AT, (int)Address::times_ptr);
  4557   } else {
  4558     dsll(AT, itable_index.as_register(), (int)Address::times_ptr);
  4560   daddu(AT, AT, recv_klass);
  4561   daddiu(recv_klass, AT, itentry_off);
  4563   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  4564   //   if (scan->interface() == intf) {
  4565   //     result = (klass + scan->offset() + itable_index);
  4566   //   }
  4567   // }
  4568   Label search, found_method;
  4570   for (int peel = 1; peel >= 0; peel--) {
  4571     ld(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
  4573     if (peel) {
  4574       beq(intf_klass, method_result, found_method);
  4575       nop();
  4576     } else {
  4577       bne(intf_klass, method_result, search);
  4578       nop();
  4579       // (invert the test to fall through to found_method...)
  4582     if (!peel)  break;
  4584     bind(search);
  4586     // Check that the previous entry is non-null.  A null entry means that
  4587     // the receiver class doesn't implement the interface, and wasn't the
  4588     // same as when the caller was compiled.
  4589     beq(method_result, R0, L_no_such_interface);
  4590     nop();
  4591     daddiu(scan_temp, scan_temp, scan_step);
  4594   bind(found_method);
  4596   // Got a hit.
  4597   lw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  4598   ld(method_result, Address(recv_klass, scan_temp, Address::times_1));
  4602 // virtual method calling
  4603 void MacroAssembler::lookup_virtual_method(Register recv_klass,
  4604                                            RegisterOrConstant vtable_index,
  4605                                            Register method_result) {
  4606   Register tmp = GP;
  4607   push(tmp);
  4609   if (vtable_index.is_constant()) {
  4610     assert_different_registers(recv_klass, method_result, tmp);
  4611   } else {
  4612     assert_different_registers(recv_klass, method_result, vtable_index.as_register(), tmp);
  4614   const int base = InstanceKlass::vtable_start_offset() * wordSize;
  4615   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
  4616 /*
  4617   Address vtable_entry_addr(recv_klass,
  4618                             vtable_index, Address::times_ptr,
  4619                             base + vtableEntry::method_offset_in_bytes());
  4620 */
  4621   if (vtable_index.is_constant()) {
  4622     li48(AT, vtable_index.as_constant());
  4623     dsll(AT, AT, (int)Address::times_ptr);
  4624   } else {
  4625     dsll(AT, vtable_index.as_register(), (int)Address::times_ptr);
  4627   li48(tmp, base + vtableEntry::method_offset_in_bytes());
  4628   daddu(tmp, tmp, AT);
  4629   daddu(tmp, tmp, recv_klass);
  4630   ld(method_result, tmp, 0);
  4632   pop(tmp);

mercurial