src/cpu/mips/vm/stubGenerator_mips_64.cpp

author      jiangshaofeng
date        Tue, 20 Sep 2016 11:48:21 +0800
changeset   117:89e1dfe996be
parent      103:58408aa75fba
child       118:bf4b1d1988a6
permissions -rw-r--r--
#4537 Rewrite generate_disjoint_byte_copy
Eliminated unaligned accesses and optimized the copy algorithm, in the same way as changeset 114.
The number of unaligned accesses does not increase, and the change has passed the SPECjvm2008 test.
About 20% speedup on the test program below.
The test program:

public class ByteCopyTest {
    public static void main(String args[]) {
        int count = 100000;
        char[] A = new char[count];
        char[] B = new char[count];
        for (int i = 0; i < count; i++) {
            A[i] = (char)(i % 26 + 97);
        }
        long startTime = System.nanoTime();
        System.arraycopy(A, 0, B, 0, count);
        long endTime = System.nanoTime();
        System.out.println(endTime - startTime);
    }
}
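
Compile and run with the standard JDK tools: javac ByteCopyTest.java && java ByteCopyTest
(the program prints the elapsed copy time in nanoseconds).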

/*
 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_mips.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
//#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
//#define a__ ((Assembler*)_masm)->

//#ifdef PRODUCT
//#define BLOCK_COMMENT(str) /* nothing */
//#else
//#define BLOCK_COMMENT(str) __ block_comment(str)
//#endif

//#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions

// Stub Code definitions
static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc = thread->saved_exception_pc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine:  return garbage from the load
  // therefore, compute npc
  //address npc = Assembler::locate_next_instruction(pc);
  address npc = (address)((unsigned long)pc + sizeof(unsigned long));

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}
class StubGenerator: public StubCodeGenerator {
 private:

  // ABI mips n64
  // This figure is not the MIPS ABI; it is the layout used when calling
  // Java from C.
  // Call stubs are used to call Java from C.
  //
  //    [ return_from_Java     ]
  //    [ argument word n-1    ] <--- sp
  //      ...
  //    [ argument word 0      ]
  //      ...
  //-10 [ S6                   ]
  // -9 [ S5                   ]
  // -8 [ S4                   ]
  // -7 [ S3                   ]
  // -6 [ S0                   ]
  // -5 [ TSR(S2)              ]
  // -4 [ LVP(S7)              ]
  // -3 [ BCP(S1)              ]
  // -2 [ saved fp             ] <--- fp_after_call
  // -1 [ return address       ]
  //  0 [ ptr. to call wrapper ] <--- a0 (old sp --> fp)
  //  1 [ result               ] <--- a1
  //  2 [ result_type          ] <--- a2
  //  3 [ method               ] <--- a3
  //  4 [ entry_point          ] <--- a4
  //  5 [ parameters           ] <--- a5
  //  6 [ parameter_size       ] <--- a6
  //  7 [ thread               ] <--- a7
  //
  // _LP64: n64 does not save the incoming argument registers on the stack:
  //
  //    [ return_from_Java     ]
  //    [ argument word n-1    ] <--- sp
  //      ...
  //    [ argument word 0      ]
  //      ...
  //-14 [ thread               ]
  //-13 [ result_type          ] <--- a2
  //-12 [ result               ] <--- a1
  //-11 [ ptr. to call wrapper ] <--- a0
  //-10 [ S6                   ]
  // -9 [ S5                   ]
  // -8 [ S4                   ]
  // -7 [ S3                   ]
  // -6 [ S0                   ]
  // -5 [ TSR(S2)              ]
  // -4 [ LVP(S7)              ]
  // -3 [ BCP(S1)              ]
  // -2 [ saved fp             ] <--- fp_after_call
  // -1 [ return address       ]
  //  0 [                      ] <--- old sp
  /*
   * 2014/01/16 Fu: Find the right place in the call_stub for GP.
   * GP will point to the starting point of Interpreter::dispatch_table(itos).
   * It should be saved/restored before/after Java calls.
   */
  enum call_stub_layout {
    RA_off          = -1,
    FP_off          = -2,
    BCP_off         = -3,
    LVP_off         = -4,
    TSR_off         = -5,
    S1_off          = -6,
    S3_off          = -7,
    S4_off          = -8,
    S5_off          = -9,
    S6_off          = -10,
    result_off      = -11,
    result_type_off = -12,
    thread_off      = -13,
    total_off       = thread_off - 3,
    GP_off          = -16,
  };
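  // A worked example of the layout above (added for illustration, not in the
  // original source): with wordSize == 8, RA is saved at
  // SP + RA_off * wordSize = SP - 8, FP at SP - 16, thread at SP - 104 and
  // GP at SP - 128; the stub below claims total_off * wordSize = -16 * 8
  // = -128 bytes of frame.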
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!

    // stub code
    // save ra and fp
    __ sd(RA, SP, RA_off * wordSize);
    __ sd(FP, SP, FP_off * wordSize);
    __ sd(BCP, SP, BCP_off * wordSize);
    __ sd(LVP, SP, LVP_off * wordSize);
    __ sd(GP, SP, GP_off * wordSize);
    __ sd(TSR, SP, TSR_off * wordSize);
    __ sd(S1, SP, S1_off * wordSize);
    __ sd(S3, SP, S3_off * wordSize);
    __ sd(S4, SP, S4_off * wordSize);
    __ sd(S5, SP, S5_off * wordSize);
    __ sd(S6, SP, S6_off * wordSize);

    __ li48(GP, (long)Interpreter::dispatch_table(itos));

    // I think 14 is the max gap between argument and callee saved register
    __ daddi(FP, SP, (-2) * wordSize);
    __ daddi(SP, SP, total_off * wordSize);
    //FIXME, aoqi. find a suitable place to save A1 & A2.
    /*
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, 3 * wordSize);
    __ sd(A2, FP, 4 * wordSize);
    __ sd(A3, FP, 5 * wordSize);
    __ sd(A4, FP, 6 * wordSize);
    __ sd(A5, FP, 7 * wordSize);
    __ sd(A6, FP, 8 * wordSize);
    __ sd(A7, FP, 9 * wordSize);
    */
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, result_off * wordSize);
    __ sd(A2, FP, result_type_off * wordSize);
    __ sd(A7, FP, thread_off * wordSize);

#ifdef OPT_THREAD
    //__ get_thread(TREG);
    __ move(TREG, A7);
    //__ ld(TREG, FP, thread_off * wordSize);
#endif
    //add for compressedoops
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
      __ beq(AT, R0, L);
      __ delayed()->nop();
      /* FIXME: I do not know how to realize stop in mips arch, do it in the future */
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    // A5: parameter
    // A6: parameter_size
    // T0: parameter_size_tmp(--)
    // T2: offset(++)
    // T3: tmp
    Label parameters_done;
    // judge if the parameter_size equals 0
    __ beq(A6, R0, parameters_done);
    __ delayed()->nop();
    __ dsll(AT, A6, Interpreter::logStackElementSize);
    __ dsub(SP, SP, AT);
    __ move(AT, -StackAlignmentInBytes);
    __ andr(SP, SP, AT);
    // Copy Java parameters in reverse order (receiver last)
    // Note that the argument order is inverted in the process
    // source is edx[ecx: N-1..0]
    // dest   is esp[ebx: 0..N-1]
    Label loop;
    __ move(T0, A6);
    __ move(T2, R0);
    __ bind(loop);

    // get parameter
    __ dsll(T3, T0, LogBytesPerWord);
    __ dadd(T3, T3, A5);
    __ ld(AT, T3, -wordSize);
    __ dsll(T3, T2, LogBytesPerWord);
    __ dadd(T3, T3, SP);
    __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
    __ daddi(T2, T2, 1);
    __ daddi(T0, T0, -1);
    __ bne(T0, R0, loop);
    __ delayed()->nop();
    // advance to next parameter

    // call Java function
    __ bind(parameters_done);

    // receiver in V0, methodOop in Rmethod
    __ move(Rmethod, A3);
    __ move(Rsender, SP);             //set sender sp
    __ jalr(A4);
    __ delayed()->nop();
    return_address = __ pc();

    Label common_return;
    __ bind(common_return);

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ ld(T0, FP, result_off * wordSize);       // result --> T0
    Label is_long, is_float, is_double, exit;
    __ ld(T2, FP, result_type_off * wordSize);  // result_type --> T2
    __ daddi(T3, T2, (-1) * T_LONG);
    __ beq(T3, R0, is_long);
    __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
    __ beq(T3, R0, is_float);
    __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
    __ beq(T3, R0, is_double);
    __ delayed()->nop();

    // handle T_INT case
    __ sd(V0, T0, 0 * wordSize);
    __ bind(exit);

    // restore
    __ daddi(SP, FP, 2 * wordSize);
    __ ld(RA, SP, RA_off * wordSize);
    __ ld(FP, SP, FP_off * wordSize);
    __ ld(BCP, SP, BCP_off * wordSize);
    __ ld(LVP, SP, LVP_off * wordSize);
    __ ld(GP, SP, GP_off * wordSize);
    __ ld(TSR, SP, TSR_off * wordSize);

    __ ld(S1, SP, S1_off * wordSize);
    __ ld(S3, SP, S3_off * wordSize);
    __ ld(S4, SP, S4_off * wordSize);
    __ ld(S5, SP, S5_off * wordSize);
    __ ld(S6, SP, S6_off * wordSize);

    // return
    __ jr(RA);
    __ delayed()->nop();

    // handle return types different from T_INT
    __ bind(is_long);
    __ sd(V0, T0, 0 * wordSize);
    //__ sd(V1, T0, 1 * wordSize);
    //__ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_float);
    __ swc1(F0, T0, 0 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_double);
    __ sdc1(F0, T0, 0 * wordSize);
    //__ sdc1(F1, T0, 1 * wordSize);
    //__ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    //FIXME, 1.6 mips version add operation of fpu here
    StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
    __ b(common_return);
    __ delayed()->nop();
    return start;
  }
  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // the stack pointer.
  //
  // V0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    Register thread = TREG;

    // get thread directly
#ifndef OPT_THREAD
    __ ld(thread, FP, thread_off * wordSize);
#endif

#ifdef ASSERT
    // verify that threads correspond
    { Label L;
      __ get_thread(T8);
      __ beq(T8, thread, L);
      __ delayed()->nop();
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(V0);
    __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ li(AT, (long)__FILE__);
    __ sd(AT, thread, in_bytes(Thread::exception_file_offset()));
    __ li(AT, (long)__LINE__);
    __ sd(AT, thread, in_bytes(Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
    __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
    __ delayed()->nop();

    return start;
  }
  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // V0: exception
  // V1: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    Register thread = TREG;
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into Java
    // (interpreted or compiled) code; i.e., the return address becomes the
    // throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack but
    // the exception handler will reset the stack pointer -> ignore them.
    // A potential result in registers can be ignored as well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    { Label L;
      __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
      __ bne(AT, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into T9
    __ ld(A1, SP, 0);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
    __ move(T9, V0);
    __ pop(V1);

#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ bne(V0, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // V0: exception
    // T9: exception handler
    // V1: throwing pc
    __ verify_oop(V0);
    __ jr(T9);
    __ delayed()->nop();

    return start;
  }
  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). It is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp  (FP, 0);
    const Address older_fp(V0, 0);
    address start = __ pc();
    __ enter();
    __ lw(V0, old_fp);   // caller's fp
    __ lw(V0, older_fp); // the frame for ps()
    __ leave();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // The following routine generates a subroutine to throw an
  // asynchronous UnknownError when an unsafe access gets a fault that
  // could not be reasonably prevented by the programmer.  (Example:
  // SIGBUS/OBJERR.)
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();
    __ pushad();                      // push registers
    //  Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
    __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
    __ delayed()->nop();
    __ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord);
    __ popad();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3 (layout from the x86 version):
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();
    __ reinit_heapbase();
    __ verify_oop_subroutine();
    address end = __ pc();
    return start;
  }
  //
  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //     A0    -  array1
  //     A1    -  array2
  //     A2    -  element count
  //
  //  (The x86 original noted that it could only use %eax, %ecx and %edx;
  //  here T9 is used as the temp register.)
  //
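  // A C-level sketch of the test below (illustrative only, not generated
  // code): branch to no_overlap_target when
  //
  //   to <= from || to > from + (count << log2_elem_size) - elem_size
  //
  // holds, i.e. when a forward (low-to-high) copy cannot clobber source
  // elements that have not been read yet.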
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    int elem_size = 1 << log2_elem_size;
    Address::ScaleFactor sf = Address::times_1;

    switch (log2_elem_size) {
      case 0: sf = Address::times_1; break;
      case 1: sf = Address::times_2; break;
      case 2: sf = Address::times_4; break;
      case 3: sf = Address::times_8; break;
    }

    __ dsll(AT, A2, sf);
    __ dadd(AT, AT, A0);
    __ lea(T9, Address(AT, -elem_size));
    __ dsub(AT, A1, A0);
    __ blez(AT, no_overlap_target);
    __ delayed()->nop();
    __ dsub(AT, A1, T9);
    __ bgtz(AT, no_overlap_target);
    __ delayed()->nop();

    // 2016/05/10 aoqi: If A0 = 0xf... and A1 = 0x0..., then go to no_overlap_target
    Label L;
    __ bgez(A0, L);
    __ delayed()->nop();
    __ bgtz(A1, no_overlap_target);
    __ delayed()->nop();
    __ bind(L);
  }
  //
  //  Generate store check for array
  //
  //  Input:
  //     T0    -  starting address (edi in the x86 original)
  //     T1    -  element count    (ecx in the x86 original)
  //
  //  The 2 input registers are overwritten
  //

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
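  // A sketch of what the loop below computes, in C (illustrative only):
  //
  //   jbyte* base  = ct->byte_map_base;
  //   size_t first = start_addr     >> CardTableModRefBS::card_shift;
  //   size_t last  = last_elem_addr >> CardTableModRefBS::card_shift;
  //   for (intptr_t i = last - first; i >= 0; i--)
  //     base[first + i] = 0;   // 0 marks the card dirty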
  void array_store_check() {
    BarrierSet* bs = Universe::heap()->barrier_set();
    assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
    CardTableModRefBS* ct = (CardTableModRefBS*)bs;
    assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
    Label l_0;

    __ dsll(AT, T1, TIMES_OOP);
    __ dadd(AT, T0, AT);
    __ daddiu(T1, AT, -BytesPerHeapOop);

    __ shr(T0, CardTableModRefBS::card_shift);
    __ shr(T1, CardTableModRefBS::card_shift);

    __ dsub(T1, T1, T0);   // end --> cards count
    __ bind(l_0);

    __ li48(AT, (long)ct->byte_map_base);
    __ dadd(AT, AT, T0);
    __ dadd(AT, AT, T1);
    __ sync();
    __ sb(R0, AT, 0);
    //__ daddi(T1, T1, -4);
    __ daddi(T1, T1, -1);
    __ bgez(T1, l_0);
    __ delayed()->nop();
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
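  // The staged strategy below, sketched in C (illustrative only; the label
  // names in the comments match the labels in the generated code):
  //
  //   if (count > 9 && same_alignment(from, to)) {
  //     if ((uintptr_t)from & 1) copy 1 byte;    // align to 2 bytes (l_10)
  //     if ((uintptr_t)from & 3) copy 2 bytes;   // align to 4 bytes (l_2)
  //     if ((uintptr_t)from & 7) copy 4 bytes;   // align to 8 bytes (l_7)
  //     while (count >= 8) copy 8 bytes;         // aligned doublewords (l_8)
  //   }
  //   // then the 4-byte (l_3), 2-byte (l_5) and 1-byte (l_11) tails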
  address generate_disjoint_byte_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);
    __ move(tmp2, A1);
    __ move(tmp3, A2);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11;
    Label l_debug;

    // don't try anything fancy if arrays don't have many elements
    __ daddi(AT, tmp3, -9);
    __ blez(AT, l_9);
    __ delayed()->nop();

    if (!aligned) {
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_9); // if arrays don't have the same alignment mod 2, do 1 element copy
      __ delayed()->nop();

      __ andi(AT, tmp1, 1);
      __ beq(AT, R0, l_10); // copy 1 element if necessary to align to 2 bytes
      __ delayed()->nop();

      __ lb(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 1);
      __ sb(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 1);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_10);

      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 2 elements copy
      __ delayed()->nop();

      // At this point it is guaranteed that both, from and to, have the same alignment mod 4.

      // Copy 2 elements if necessary to align to 4 bytes.
      __ andi(AT, tmp1, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -2);
      __ bind(l_2);

      // At this point the positions of both, from and to, are at least 4 byte aligned.

      // Copy 8 elements (one doubleword) at a time.
      // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 4 bytes at a time, either from or to will be unaligned
      __ delayed()->nop();

      // Copy 4 elements if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -4);
      __ sw(AT, tmp2, 0);
      { // FasterArrayCopy
        __ daddi(tmp1, tmp1, 4);
        __ daddi(tmp2, tmp2, 4);
      }
    }

    __ bind(l_7);

    // Copy 8 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.

    { // FasterArrayCopy
      __ daddi(AT, tmp3, -7);
      __ blez(AT, l_6); // fall back to 4 bytes at a time if fewer than 8 elements remain
      __ delayed()->nop();

      __ bind(l_8);
      // For Loongson, there is 128-bit memory access. TODO
      __ ld(AT, tmp1, 0);
      __ sd(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -8);
      __ daddi(AT, tmp3, -8);
      __ bgez(AT, l_8);
      __ delayed()->nop();
    }
    __ bind(l_6);

    // copy 4 bytes at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -3);
      __ blez(AT, l_1);
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);
      __ bgez(AT, l_3);
      __ delayed()->nop();
    }

    // copy 2 bytes at a time
    __ bind(l_1);
    {
      __ daddi(AT, tmp3, -1);
      __ blez(AT, l_9);
      __ delayed()->nop();

      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(AT, tmp3, -2);
      __ bgez(AT, l_5);
      __ delayed()->nop();
    }

    // copy 1 byte at a time
    __ bind(l_9);
    __ beq(R0, tmp3, l_4);
    __ delayed()->nop();

    {
      __ bind(l_11);
      __ lb(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -1);
      __ sb(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 1);
      __ daddi(tmp2, tmp2, 1);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_11);
      __ delayed()->nop();
    }

    __ bind(l_4);
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   A0   - source array address
  //   A1   - destination array address
  //   A2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
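  // Because the regions may overlap with to > from (the disjoint case was
  // already dispatched by array_overlap_test), the copy below walks from
  // high addresses to low, memmove-style. A sketch (illustrative only):
  //
  //   while (count--) to[count] = from[count];
  //
  // with the middle portion done 4 bytes at a time once both high ends
  // are 4-byte aligned.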
  address generate_conjoint_byte_copy(bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
    Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;

    address nooverlap_target = aligned ?
            StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
            StubRoutines::jbyte_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 0);

    const Register from      = A0;   // source array address
    const Register to        = A1;   // destination array address
    const Register count     = A2;   // elements count
    const Register end_from  = T3;   // source array end address
    const Register end_to    = T0;   // destination array end address
    const Register end_count = T1;   // remaining (not yet copied) byte count

    __ push(end_from);
    __ push(end_to);
    __ push(end_count);
    __ push(T8);

    // copy from high to low
    __ move(end_count, count);
    __ dadd(end_from, from, end_count);
    __ dadd(end_to, to, end_count);

    // 2016/05/08 aoqi: If end_from and end_to have different alignments, a byte-by-byte copy is performed.
    __ andi(AT, end_from, 3);
    __ andi(T8, end_to, 3);
    __ bne(AT, T8, l_copy_byte);
    __ delayed()->nop();

    // First deal with the unaligned data at the top.
    __ bind(l_unaligned);
    __ beq(end_count, R0, l_exit);
    __ delayed()->nop();

    __ andi(AT, end_from, 3);
    __ bne(AT, R0, l_from_unaligned);
    __ delayed()->nop();

    __ andi(AT, end_to, 3);
    __ beq(AT, R0, l_4_bytes_aligned);
    __ delayed()->nop();

    __ bind(l_from_unaligned);
    __ lb(AT, end_from, -1);
    __ sb(AT, end_to, -1);
    __ daddi(end_from, end_from, -1);
    __ daddi(end_to, end_to, -1);
    __ daddi(end_count, end_count, -1);
    __ b(l_unaligned);
    __ delayed()->nop();

    // now end_to, end_from point to 4-byte aligned high-ends
    //     end_count contains byte count that is not copied.
    // copy 4 bytes at a time
    __ bind(l_4_bytes_aligned);

    __ move(T8, end_count);
    __ daddi(AT, end_count, -3);
    __ blez(AT, l_copy_suffix);
    __ delayed()->nop();

    //__ andi(T8, T8, 3);
    __ lea(end_from, Address(end_from, -4));
    __ lea(end_to, Address(end_to, -4));

    __ dsrl(end_count, end_count, 2);
    __ align(16);
    __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes
    __ lw(AT, end_from, 0);
    __ sw(AT, end_to, 0);
    __ addi(end_from, end_from, -4);
    __ addi(end_to, end_to, -4);
    __ addi(end_count, end_count, -1);
    __ bne(end_count, R0, l_copy_4_bytes_loop);
    __ delayed()->nop();

    __ b(l_copy_suffix);
    __ delayed()->nop();
    // copy dwords aligned or not with repeat move
    // l_copy_suffix
    // copy suffix (0-3 bytes)
    __ bind(l_copy_suffix);
    __ andi(T8, T8, 3);
    __ beq(T8, R0, l_exit);
    __ delayed()->nop();
    __ addi(end_from, end_from, 3);
    __ addi(end_to, end_to, 3);
    __ bind(l_copy_suffix_loop);
    __ lb(AT, end_from, 0);
    __ sb(AT, end_to, 0);
    __ addi(end_from, end_from, -1);
    __ addi(end_to, end_to, -1);
    __ addi(T8, T8, -1);
    __ bne(T8, R0, l_copy_suffix_loop);
    __ delayed()->nop();

    __ bind(l_copy_byte);
    __ beq(end_count, R0, l_exit);
    __ delayed()->nop();
    __ lb(AT, end_from, -1);
    __ sb(AT, end_to, -1);
    __ daddi(end_from, end_from, -1);
    __ daddi(end_to, end_to, -1);
    __ daddi(end_count, end_count, -1);
    __ b(l_copy_byte);
    __ delayed()->nop();

    __ bind(l_exit);
    __ pop(T8);
    __ pop(end_count);
    __ pop(end_to);
    __ pop(end_from);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Generate stub for disjoint short copy.  If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  A0
  //      to:    A1
  //  elm.count: A2 treated as signed
  //  one element: 2 bytes
  //
  // Strategy for aligned==true:
  //
  //  If length <= 9:
  //     1. copy 1 element at a time (l_5)
  //
  //  If length > 9:
  //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  //     3. copy last element if one was left in step 2. (l_1)
  //
  //
  // Strategy for aligned==false:
  //
  //  If length <= 9: same as aligned==true case
  //
  //  If length > 9:
  //     1. continue with step 7. if the alignment of from and to mod 4
  //        is different.
  //     2. align from and to to 4 bytes by copying 1 element if necessary
  //     3. at l_2 from and to are 4 byte aligned; continue with
  //        6. if they cannot be aligned to 8 bytes because they have
  //        got different alignment mod 8.
  //     4. at this point we know that both, from and to, have the same
  //        alignment mod 8, now copy one element if necessary to get
  //        8 byte alignment of from and to.
  //     5. copy 4 elements at a time until less than 4 elements are
  //        left; depending on step 3. all load/stores are aligned.
  //     6. copy 2 elements at a time until less than 2 elements are
  //        left. (l_6)
  //     7. copy 1 element at a time. (l_5)
  //     8. copy last element if one was left in step 6. (l_1)
  //
  //  TODO:
  //
  //  1. use loongson 128-bit load/store
  //  2. use loop unrolling optimization when len is big enough, for example if len > 0x2000:
  //    __ bind(l_x);
  //    __ ld(AT, tmp1, 0);
  //    __ ld(tmp, tmp1, 8);
  //    __ sd(AT, tmp2, 0);
  //    __ sd(tmp, tmp2, 8);
  //    __ ld(AT, tmp1, 16);
  //    __ ld(tmp, tmp1, 24);
  //    __ sd(AT, tmp2, 16);
  //    __ sd(tmp, tmp2, 24);
  //    __ daddi(tmp1, tmp1, 32);
  //    __ daddi(tmp2, tmp2, 32);
  //    __ daddi(tmp3, tmp3, -16);
  //    __ daddi(AT, tmp3, -16);
  //    __ bgez(AT, l_x);
  //    __ delayed()->nop();
  //
  address generate_disjoint_short_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);
    __ move(tmp2, A1);
    __ move(tmp3, A2);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
    Label l_debug;
    // don't try anything fancy if arrays don't have many elements
    __ daddi(AT, tmp3, -9);
    __ blez(AT, l_1);
    __ delayed()->nop();

    if (!aligned) {
      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
      __ delayed()->nop();

      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
      __ delayed()->nop();

      // At this point it is guaranteed that both, from and to, have the same alignment mod 4.

      // Copy 1 element if necessary to align to 4 bytes.
      __ andi(AT, A0, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_2);

      // At this point the positions of both, from and to, are at least 4 byte aligned.

      // Copy 4 elements at a time.
      // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned
      __ delayed()->nop();

      // Copy a 2-element word if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sw(AT, tmp2, 0);
      { // FasterArrayCopy
        __ daddi(tmp1, tmp1, 4);
        __ daddi(tmp2, tmp2, 4);
      }
    }

    __ bind(l_7);

    // Copy 4 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.

    { // FasterArrayCopy
      __ daddi(AT, tmp3, -15);
      __ blez(AT, l_6); // copy 2 at a time if fewer than 16 elements remain
      __ delayed()->nop();

      __ bind(l_8);
      // For Loongson, there is 128-bit memory access. TODO
      __ ld(AT, tmp1, 0);
      __ sd(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);
      __ bgez(AT, l_8);
      __ delayed()->nop();
    }
    __ bind(l_6);

    // copy 2 elements at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -1);
      __ blez(AT, l_1);
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
      __ daddi(tmp3, tmp3, -2);
      __ daddi(AT, tmp3, -2);
      __ bgez(AT, l_3);
      __ delayed()->nop();
    }

    // do single element copy (16 bit), can this happen?
    __ bind(l_1);
    __ beq(R0, tmp3, l_4);
    __ delayed()->nop();

    { // FasterArrayCopy
      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -1);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_5);
      __ delayed()->nop();
    }
    __ bind(l_4);
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    __ bind(l_debug);
    __ stop("generate_disjoint_short_copy should not reach here");
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
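  // A sketch of the backward copy below (illustrative only). It works
  // from the top of the arrays, moving one dword (two shorts) per
  // iteration, then copies a single-short suffix if count is odd:
  //
  //   for (i = count / 2; i > 0; i--) copy 4 bytes, top down;
  //   if (count & 1)                  copy the remaining short;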
  address generate_conjoint_short_copy(bool aligned, const char *name) {
    Label l_1, l_2, l_3, l_4, l_5;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target = aligned ?
            StubRoutines::arrayof_jshort_disjoint_arraycopy() :
            StubRoutines::jshort_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 1);

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    /*
       __ pushl(esi);
       __ movl(ecx, Address(esp, 4+12));      // count
       __ pushl(edi);
       __ movl(esi, Address(esp, 8+ 4));      // from
       __ movl(edi, Address(esp, 8+ 8));      // to
     */
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // copy dwords from high to low
    // __ leal(esi, Address(esi, ecx, Address::times_2, -4)); // from + count*2 - 4
    __ sll(AT, T1, Address::times_2);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -4));
    //__ std();
    //__ leal(edi, Address(edi, ecx, Address::times_2, -4)); // to + count*2 - 4
    __ sll(AT, T1, Address::times_2);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -4));
    //  __ movl(eax, ecx);
    __ move(T8, T1);
    __ bind(l_1);
    //   __ sarl(ecx, 1);              // dword count
    __ sra(T1, T1, 1);
    //__ jcc(Assembler::equal, l_4);                   // no dwords to move
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    /*    __ cmpl(ecx, 32);
          __ jcc(Assembler::above, l_3);                   // > 32 dwords
    // copy dwords with loop
    __ subl(edi, esi);
     */
    __ align(16);
    __ bind(l_2);
    //__ movl(edx, Address(esi));
    __ lw(AT, T3, 0);
    //__ movl(Address(edi, esi, Address::times_1), edx);
    __ sw(AT, T0, 0);
    //__ subl(esi, 4);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    //__ decl(ecx);
    __ addi(T1, T1, -1);
    //  __ jcc(Assembler::notEqual, l_2);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();
    //  __ addl(edi, esi);
    // __ jmp(l_4);
    __ b(l_4);
    __ delayed()->nop();
    // copy dwords with repeat move
    __ bind(l_3);
    //   __ rep_movl();
    __ bind(l_4);
    //  __ andl(eax, 1);              // suffix count
    __ andi(T8, T8, 1);              // suffix count
    //__ jcc(Assembler::equal, l_5);                   // no suffix
    __ beq(T8, R0, l_5);
    __ delayed()->nop();
    // copy suffix
    //   __ movw(edx, Address(esi, 2));
    __ lh(AT, T3, 2);
    //  __ movw(Address(edi, 2), edx);
    __ sh(AT, T0, 2);
    __ bind(l_5);
    //    __ cld();
    //    __ popl(edi);
    //    __ popl(esi);
    //   __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
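  // A sketch of the disjoint int/oop copy below (illustrative only): a plain
  // forward loop, one 32-bit element per iteration,
  //
  //   for (i = 0; i < count; i++) to[i] = from[i];
  //
  // followed, when is_oop, by array_store_check() to dirty the card table
  // entries covering the destination range.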
  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_2, l_3, l_4, l_stchk;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    /*
       __ pushl(esi);
       __ movl(ecx, Address(esp, 4+12));      // count
       __ pushl(edi);
       __ movl(esi, Address(esp, 8+ 4));      // from
       __ movl(edi, Address(esp, 8+ 8));      // to
     */
    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // __ cmpl(ecx, 32);
    // __ jcc(Assembler::belowEqual, l_2);                   // <= 32 dwords
    // __ rep_movl();
    __ b(l_2);
    __ delayed()->nop();
    if (is_oop) {
      //  __ jmp(l_stchk);
      __ b(l_stchk);
      __ delayed()->nop();
    }
    //    __ popl(edi);
    //   __ popl(esi);
    //  __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();

    __ bind(l_2);
    //  __ subl(edi, esi);
    //  __ testl(ecx, ecx);
    // __ jcc(Assembler::zero, l_4);
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    __ align(16);
    __ bind(l_3);
    //__ movl(edx, Address(esi));
    __ lw(AT, T3, 0);
    // __ movl(Address(edi, esi, Address::times_1), edx);
    __ sw(AT, T0, 0);
    // __ addl(esi, 4);
    __ addi(T3, T3, 4);
    __ addi(T0, T0, 4);
    //   __ decl(ecx);
    __ addi(T1, T1, -1);
    //    __ jcc(Assembler::notEqual, l_3);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();
    if (is_oop) {
      __ bind(l_stchk);
      //      __ movl(edi, Address(esp, 8+ 8));
      //     __ movl(ecx, Address(esp, 8+ 12));
      __ move(T0, A1);
      __ move(T1, A2);
      array_store_check();
    }
    __ bind(l_4);
    //    __ popl(edi);
    //   __ popl(esi);
    //  __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_2, l_3, l_4, l_stchk;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target;

    if (is_oop) {
      nooverlap_target = aligned ?
              StubRoutines::arrayof_oop_disjoint_arraycopy() :
              StubRoutines::oop_disjoint_arraycopy();
    } else {
      nooverlap_target = aligned ?
              StubRoutines::arrayof_jint_disjoint_arraycopy() :
              StubRoutines::jint_disjoint_arraycopy();
    }

    array_overlap_test(nooverlap_target, 2);

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    /*
       __ pushl(esi);
       __ movl(ecx, Address(esp, 4+12));      // count
       __ pushl(edi);
       __ movl(esi, Address(esp, 8+ 4));      // from
       __ movl(edi, Address(esp, 8+ 8));      // to
     */
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    //__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
    __ sll(AT, T1, Address::times_4);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -4));
    //__ std();
    //__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
    __ sll(AT, T1, Address::times_4);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -4));

    //    __ cmpl(ecx, 32);
    //   __ jcc(Assembler::above, l_3);                   // > 32 dwords
    //  __ testl(ecx, ecx);
    //__ jcc(Assembler::zero, l_4);
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    // __ subl(edi, esi);
    __ align(16);
    __ bind(l_2);
    // __ movl(edx, Address(esi));
    __ lw(AT, T3, 0);
    // __ movl(Address(esi, edi, Address::times_1), edx);
    __ sw(AT, T0, 0);
    // __ subl(esi, 4);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    //   __ decl(ecx);
    __ addi(T1, T1, -1);
    //__ jcc(Assembler::notEqual, l_2);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();
    if (is_oop) {
      // __ jmp(l_stchk);
      __ b(l_stchk);
      __ delayed()->nop();
    }
    __ bind(l_4);
    //      __ cld();
    //     __ popl(edi);
    //    __ popl(esi);
    //   __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    __ bind(l_3);
    //   __ rep_movl();
    if (is_oop) {
      __ bind(l_stchk);
      //  __ movl(edi, Address(esp, 8+ 8));
      __ move(T0, A1);
      // __ movl(ecx, Address(esp, 8+ 12));
      __ move(T1, A2);
      array_store_check();
    }
    //    __ cld();
    //   __ popl(edi);
    //   __ popl(esi);
    //  __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_2, l_3, l_4, l_stchk;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // __ cmpl(ecx, 32);
    // __ jcc(Assembler::belowEqual, l_2);                   // <= 32 dwords
    // __ rep_movl();
    __ b(l_2);
    __ delayed()->nop();
    if (is_oop) {
      //  __ jmp(l_stchk);
      __ b(l_stchk);
      __ delayed()->nop();
    }
    //    __ popl(edi);
    //   __ popl(esi);
    //  __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();

    __ bind(l_2);
    //  __ subl(edi, esi);
    //  __ testl(ecx, ecx);
    // __ jcc(Assembler::zero, l_4);
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    __ align(16);
    __ bind(l_3);
    //__ movl(edx, Address(esi));
    __ ld(AT, T3, 0);
    // __ movl(Address(edi, esi, Address::times_1), edx);
    __ sd(AT, T0, 0);
    // __ addl(esi, 4);
    __ addi(T3, T3, 8);
    __ addi(T0, T0, 8);
    //   __ decl(ecx);
    __ addi(T1, T1, -1);
    //    __ jcc(Assembler::notEqual, l_3);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();
    if (is_oop) {
      __ bind(l_stchk);
      //      __ movl(edi, Address(esp, 8+ 8));
      //     __ movl(ecx, Address(esp, 8+ 12));
      __ move(T0, A1);
      __ move(T1, A2);
      array_store_check();
    }
    __ bind(l_4);
    //    __ popl(edi);
    //   __ popl(esi);
    //  __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  1587   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1588 		Label l_2, l_3, l_4, l_stchk;
  1589 		StubCodeMark mark(this, "StubRoutines", name);
  1590 		__ align(CodeEntryAlignment);
  1591 		address start = __ pc();
  1592 		address nooverlap_target;
  1594 		if (is_oop) {
  1595 			nooverlap_target = aligned ?
  1596 							StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1597 							StubRoutines::oop_disjoint_arraycopy();
  1598 		} else {
  1599 			nooverlap_target = aligned ?
  1600 							StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1601 							StubRoutines::jlong_disjoint_arraycopy();
  1602 		}
  1604 		array_overlap_test(nooverlap_target, 3);
  1606 		__ push(T3);
  1607 		__ push(T0);
  1608 		__ push(T1);
  1609 		__ push(T8);
  1611 		__ move(T1, A2);  
  1612 		__ move(T3, A0); 
  1613 		__ move(T0, A1);
  1615 		//__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
  1616 		__ sll(AT, T1, Address::times_8); 
  1617 		__ add(AT, T3, AT); 
  1618 		__ lea(T3 , Address(AT, -8)); 
  1619 		//__ std();
  1620 		//__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
  1621 		__ sll(AT, T1, Address::times_8); 
  1622 		__ add(AT, T0, AT); 
  1623 		__ lea(T0 , Address(AT, -8)); 
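       		// T3/T0 now address the last element of each range,
       		// base + (count << 3) - 8, as the backward loop below expects.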
  1625 		//    __ cmpl(ecx, 32);
  1626 		//   __ jcc(Assembler::above, l_3);                   // > 32 dwords
  1627 		//  __ testl(ecx, ecx);
  1628 		//__ jcc(Assembler::zero, l_4);
  1629 		__ beq(T1, R0, l_4); 
  1630 		__ delayed()->nop();  
  1631 		// __ subl(edi, esi);
  1632 		__ align(16);
  1633 		__ bind(l_2);
  1634 		// __ movl(edx, Address(esi));
  1635 		__ ld(AT, T3, 0);   
  1636 		// __ movl(Address(esi, edi, Address::times_1), edx);
  1637 		__ sd(AT, T0, 0); 
  1638 		// __ subl(esi, 4);
  1639 		__ addi(T3, T3, -8); 
  1640 		__ addi(T0, T0, -8); 
  1641 		//   __ decl(ecx);
  1642 		__ addi(T1, T1, -1); 
  1643 		//__ jcc(Assembler::notEqual, l_2);
  1644 		__ bne(T1, R0, l_2);  
  1645 		__ delayed()->nop(); 
  1646 		if (is_oop) {
  1647 			// __ jmp(l_stchk);
  1648 			__ b( l_stchk); 
  1649 			__ delayed()->nop();
  1650 		}
  1651 		__ bind(l_4);
  1652 		//      __ cld();
  1653 		//     __ popl(edi);
  1654 		//    __ popl(esi);
  1655 		//   __ ret(0);
  1656 		__ pop(T8); 
  1657 		__ pop(T1); 
  1658 		__ pop(T0); 
  1659 		__ pop(T3); 
  1660 		__ jr(RA); 
  1661 		__ delayed()->nop(); 
  1662 		__ bind(l_3);
  1663 		//   __ rep_movl();
  1664 		if (is_oop) {
  1665 			__ bind(l_stchk);
  1666 			//  __ movl(edi, Address(esp, 8+ 8));
  1667 			__ move(T0, A1);  
  1668 			// __ movl(ecx, Address(esp, 8+ 12));
  1669 			__ move(T1, A2);  
  1670 			array_store_check();
  1671 		}
  1672 		//    __ cld();
  1673 		//   __ popl(edi);
  1674 		//   __ popl(esi);
  1675 		//  __ ret(0);
  1676 		__ pop(T8);	
  1677 		__ pop(T1);	
  1678 		__ pop(T0);	
  1679 		__ pop(T3);	
  1680 		__ jr(RA);	
  1681 		__ delayed()->nop(); 
  1682 		return start;
  1683   }
  1684 #if 0
  1685   // Arguments:
  1686   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  1687   //             ignored
  1688   //   is_oop  - true => oop array, so generate store check code
  1689   //   name    - stub name string
  1690   //
  1691   // Inputs:
  1692   //   c_rarg0   - source array address
  1693   //   c_rarg1   - destination array address
  1694   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1695   //
  1696   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1697     __ align(CodeEntryAlignment);
  1698     StubCodeMark mark(this, "StubRoutines", name);
  1699     address start = __ pc();
  1701     Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
  1702     const Register from        = rdi;  // source array address
  1703     const Register to          = rsi;  // destination array address
  1704     const Register qword_count = rdx;  // elements count
  1705     const Register saved_count = rcx;
  1707     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1708     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
  1710     address disjoint_copy_entry = NULL;
  1711     if (is_oop) {
  1712       assert(!UseCompressedOops, "shouldn't be called for compressed oops");
  1713       disjoint_copy_entry = disjoint_oop_copy_entry;
  1714       oop_copy_entry  = __ pc();
  1715       array_overlap_test(disjoint_oop_copy_entry, Address::times_8);
  1716     } else {
  1717       disjoint_copy_entry = disjoint_long_copy_entry;
  1718       long_copy_entry = __ pc();
  1719       array_overlap_test(disjoint_long_copy_entry, Address::times_8);
  1720     }
  1721     BLOCK_COMMENT("Entry:");
  1722     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1724     array_overlap_test(disjoint_copy_entry, Address::times_8);
  1725     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
  1726                       // r9 and r10 may be used to save non-volatile registers
  1728     // 'from', 'to' and 'qword_count' are now valid
  1730     if (is_oop) {
  1731       // Save to and count for store barrier
  1732       __ movptr(saved_count, qword_count);
  1733       // No registers are destroyed by this call
  1734       gen_write_ref_array_pre_barrier(to, saved_count);
  1735     }
  1737     __ jmp(L_copy_32_bytes);
  1739     // Copy trailing qwords
  1740   __ BIND(L_copy_8_bytes);
  1741     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
  1742     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
  1743     __ decrement(qword_count);
  1744     __ jcc(Assembler::notZero, L_copy_8_bytes);
  1746     if (is_oop) {
  1747       __ jmp(L_exit);
  1748     } else {
  1749       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
  1750       restore_arg_regs();
  1751       __ xorptr(rax, rax); // return 0
  1752       __ leave(); // required for proper stackwalking of RuntimeStub frame
  1753       __ ret(0);
  1754     }
  1756     // Copy in 32-byte chunks
  1757     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
  1759     if (is_oop) {
  1760     __ BIND(L_exit);
  1761       __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
  1762       gen_write_ref_array_post_barrier(to, rcx, rax);
  1763       inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
  1764     } else {
  1765       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
  1766     }
  1767     restore_arg_regs();
  1768     __ xorptr(rax, rax); // return 0
  1769     __ leave(); // required for proper stackwalking of RuntimeStub frame
  1770     __ ret(0);
  1772     return start;
  1773   }
  1776   // Helper for generating a dynamic type check.
  1777   // Smashes no registers.
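         //
         // Logically the code below performs (a sketch in C; 'sco' is the
         // super_check_offset and sc_offset the secondary-super-cache offset):
         //
         //   if (sub_klass == super_klass)                            success;
         //   if (*(Klass**)((address)sub_klass + sco) == super_klass) success;
         //   if (sco != sc_offset)                                    failure;
         //   // else linear-scan sub_klass->secondary_supers, caching a hit
         //   // in sub_klass->secondary_super_cache.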
  1778   void generate_type_check(Register sub_klass,
  1779                            Register super_check_offset,
  1780                            Register super_klass,
  1781                            Label& L_success) {
  1782     assert_different_registers(sub_klass, super_check_offset, super_klass);
  1784     BLOCK_COMMENT("type_check:");
  1786     Label L_miss;
  1788     // a couple of useful fields in sub_klass:
  1789     int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
  1790                      Klass::secondary_supers_offset_in_bytes());
  1791     int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
  1792                      Klass::secondary_super_cache_offset_in_bytes());
  1793     Address secondary_supers_addr(sub_klass, ss_offset);
  1794     Address super_cache_addr(     sub_klass, sc_offset);
  1796     // if the pointers are equal, we are done (e.g., String[] elements)
  1797     __ cmpptr(super_klass, sub_klass);
  1798     __ jcc(Assembler::equal, L_success);
  1800     // check the supertype display:
  1801     Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  1802     __ cmpptr(super_klass, super_check_addr); // test the super type
  1803     __ jcc(Assembler::equal, L_success);
  1805     // if it was a primary super, we can just fail immediately
  1806     __ cmpl(super_check_offset, sc_offset);
  1807     __ jcc(Assembler::notEqual, L_miss);
  1809     // Now do a linear scan of the secondary super-klass chain.
  1810     // The repne_scan instruction uses fixed registers, which we must spill.
  1811     // (We need a couple more temps in any case.)
  1812     // This code is rarely used, so simplicity is a virtue here.
  1813     inc_counter_np(SharedRuntime::_partial_subtype_ctr);
  1814     {
  1815       __ push(rax);
  1816       __ push(rcx);
  1817       __ push(rdi);
  1818       assert_different_registers(sub_klass, super_klass, rax, rcx, rdi);
  1820       __ movptr(rdi, secondary_supers_addr);
  1821       // Load the array length.
  1822       __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
  1823       // Skip to start of data.
  1824       __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
  1825       // Scan rcx words at [rdi] for occurrence of rax
  1826       // Set NZ/Z based on last compare
  1827       __ movptr(rax, super_klass);
  1828       if (UseCompressedOops) {
  1829         // Compare against compressed form.  Don't need to uncompress because
  1830         // looks like orig rax is restored in popq below.
  1831         __ encode_heap_oop(rax);
  1832         __ repne_scanl();
  1833       } else {
  1834         __ repne_scan();
  1835       }
  1837       // Unspill the temp. registers:
  1838       __ pop(rdi);
  1839       __ pop(rcx);
  1840       __ pop(rax);
  1841     }
  1842       __ jcc(Assembler::notEqual, L_miss);
  1845     // Success.  Cache the super we found and proceed in triumph.
  1846     __ movptr(super_cache_addr, super_klass); // note: rax is dead
  1847     __ jmp(L_success);
  1849     // Fall through on failure!
  1850     __ BIND(L_miss);
  1851   }
  1853   //
  1854   //  Generate checkcasting array copy stub
  1855   //
  1856   //  Input:
  1857   //    c_rarg0   - source array address
  1858   //    c_rarg1   - destination array address
  1859   //    c_rarg2   - element count, treated as ssize_t, can be zero
  1860   //    c_rarg3   - size_t ckoff (super_check_offset)
  1861   // not Win64
  1862   //    c_rarg4   - oop ckval (super_klass)
  1863   // Win64
  1864   //    rsp+40    - oop ckval (super_klass)
  1865   //
  1866   //  Output:
  1867   //    rax ==  0  -  success
  1868   //    rax == -1^K - failure, where K is partial transfer count
  1869   //
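         //  Conceptually the stub performs (a sketch; the real loop below is
         //  rotated and drives a negative count up toward zero):
         //
         //    for (i = 0; i < length; i++) {
         //      oop e = from[i];
         //      if (e != NULL && !klass_of(e)->is_subtype_of(ckval)) break;
         //      to[i] = e;
         //    }
         //    return (i == length) ? 0 : ~i;   // 0, or -1^K on partial copy
         //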
  1870   address generate_checkcast_copy(const char *name) {
  1872     Label L_load_element, L_store_element, L_do_card_marks, L_done;
  1874     // Input registers (after setup_arg_regs)
  1875     const Register from        = rdi;   // source array address
  1876     const Register to          = rsi;   // destination array address
  1877     const Register length      = rdx;   // elements count
  1878     const Register ckoff       = rcx;   // super_check_offset
  1879     const Register ckval       = r8;    // super_klass
  1881     // Registers used as temps (r13, r14 are save-on-entry)
  1882     const Register end_from    = from;  // source array end address
  1883     const Register end_to      = r13;   // destination array end address
  1884     const Register count       = rdx;   // -(count_remaining)
  1885     const Register r14_length  = r14;   // saved copy of length
  1886     // End pointers are inclusive, and if length is not zero they point
  1887     // to the last unit copied:  end_to[0] := end_from[0]
  1889     const Register rax_oop    = rax;    // actual oop copied
  1890     const Register r11_klass  = r11;    // oop._klass
  1892     //---------------------------------------------------------------
  1893     // Assembler stub will be used for this call to arraycopy
  1894     // if the two arrays are subtypes of Object[] but the
  1895     // destination array type is not equal to or a supertype
  1896     // of the source type.  Each element must be separately
  1897     // checked.
  1899     __ align(CodeEntryAlignment);
  1900     StubCodeMark mark(this, "StubRoutines", name);
  1901     address start = __ pc();
  1903     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1905     checkcast_copy_entry  = __ pc();
  1906     BLOCK_COMMENT("Entry:");
  1908 #ifdef ASSERT
  1909     // caller guarantees that the arrays really are different
  1910     // otherwise, we would have to make conjoint checks
  1911     { Label L;
  1912       array_overlap_test(L, TIMES_OOP);
  1913       __ stop("checkcast_copy within a single array");
  1914       __ bind(L);
  1915     }
  1916 #endif //ASSERT
  1918     // allocate spill slots for r13, r14
  1919     enum {
  1920       saved_r13_offset,
  1921       saved_r14_offset,
  1922       saved_rbp_offset,
  1923       saved_rip_offset,
  1924       saved_rarg0_offset
  1925     };
  1926     __ subptr(rsp, saved_rbp_offset * wordSize);
  1927     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
  1928     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
  1929     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
  1930                        // ckoff => rcx, ckval => r8
  1931                        // r9 and r10 may be used to save non-volatile registers
  1932 #ifdef _WIN64
  1933     // last argument (#4) is on stack on Win64
  1934     const int ckval_offset = saved_rarg0_offset + 4;
  1935     __ movptr(ckval, Address(rsp, ckval_offset * wordSize));
  1936 #endif
  1938     // check that int operands are properly extended to size_t
  1939     assert_clean_int(length, rax);
  1940     assert_clean_int(ckoff, rax);
  1942 #ifdef ASSERT
  1943     BLOCK_COMMENT("assert consistent ckoff/ckval");
  1944     // The ckoff and ckval must be mutually consistent,
  1945     // even though caller generates both.
  1946     { Label L;
  1947       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
  1948                         Klass::super_check_offset_offset_in_bytes());
  1949       __ cmpl(ckoff, Address(ckval, sco_offset));
  1950       __ jcc(Assembler::equal, L);
  1951       __ stop("super_check_offset inconsistent");
  1952       __ bind(L);
  1953     }
  1954 #endif //ASSERT
  1956     // Loop-invariant addresses.  They are exclusive end pointers.
  1957     Address end_from_addr(from, length, TIMES_OOP, 0);
  1958     Address   end_to_addr(to,   length, TIMES_OOP, 0);
  1959     // Loop-variant addresses.  They assume post-incremented count < 0.
  1960     Address from_element_addr(end_from, count, TIMES_OOP, 0);
  1961     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
  1963     gen_write_ref_array_pre_barrier(to, count);
  1965     // Copy from low to high addresses, indexed from the end of each array.
  1966     __ lea(end_from, end_from_addr);
  1967     __ lea(end_to,   end_to_addr);
  1968     __ movptr(r14_length, length);        // save a copy of the length
  1969     assert(length == count, "");          // else fix next line:
  1970     __ negptr(count);                     // negate and test the length
  1971     __ jcc(Assembler::notZero, L_load_element);
  1973     // Empty array:  Nothing to do.
  1974     __ xorptr(rax, rax);                  // return 0 on (trivial) success
  1975     __ jmp(L_done);
  1977     // ======== begin loop ========
  1978     // (Loop is rotated; its entry is L_load_element.)
  1979     // Loop control:
  1980     //   for (count = -count; count != 0; count++)
  1981     // Base pointers src, dst are biased by 8*(count-1), to the last element.
  1982     __ align(16);
  1984     __ BIND(L_store_element);
  1985     __ store_heap_oop(rax_oop, to_element_addr);  // store the oop
  1986     __ sync();
  1987     __ increment(count);               // increment the count toward zero
  1988     __ jcc(Assembler::zero, L_do_card_marks);
  1990     // ======== loop entry is here ========
  1991     __ BIND(L_load_element);
  1992     __ load_heap_oop(rax_oop, from_element_addr); // load the oop
  1993     __ testptr(rax_oop, rax_oop);
  1994     __ jcc(Assembler::zero, L_store_element);
  1996     __ load_klass(r11_klass, rax_oop);// query the object klass
  1997     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
  1998     // ======== end loop ========
  2000     // It was a real error; we must depend on the caller to finish the job.
  2001     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
  2002     // Emit GC store barriers for the oops we have copied (r14 + rdx),
  2003     // and report their number to the caller.
  2004     assert_different_registers(rax, r14_length, count, to, end_to, rcx);
  2005     __ lea(end_to, to_element_addr);
  2006     gen_write_ref_array_post_barrier(to, end_to, rscratch1);
  2007     __ movptr(rax, r14_length);           // original oops
  2008     __ addptr(rax, count);                // K = (original - remaining) oops
  2009     __ notptr(rax);                       // report (-1^K) to caller
  2010     __ jmp(L_done);
  2012     // Come here on success only.
  2013     __ BIND(L_do_card_marks);
  2014     __ addptr(end_to, -wordSize);         // make an inclusive end pointer
  2015     gen_write_ref_array_post_barrier(to, end_to, rscratch1);
  2016     __ xorptr(rax, rax);                  // return 0 on success
  2018     // Common exit point (success or failure).
  2019     __ BIND(L_done);
  2020     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
  2021     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
  2022     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
  2023     restore_arg_regs();
  2024     __ leave(); // required for proper stackwalking of RuntimeStub frame
  2025     __ ret(0);
  2027     return start;
  2028   }
  2030   //
  2031   //  Generate 'unsafe' array copy stub
  2032   //  Though just as safe as the other stubs, it takes an unscaled
  2033   //  size_t argument instead of an element count.
  2034   //
  2035   //  Input:
  2036   //    c_rarg0   - source array address
  2037   //    c_rarg1   - destination array address
  2038   //    c_rarg2   - byte count, treated as ssize_t, can be zero
  2039   //
  2040   // Examines the alignment of the operands and dispatches
  2041   // to a long, int, short, or byte copy loop.
  2042   //
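         // The dispatch ORs the three operands and tests the low bits; in C
         // terms (a sketch, labels illustrative):
         //
         //   uintptr_t bits = from | to | size;
         //   if ((bits & (BytesPerLong  - 1)) == 0) goto long_copy;
         //   if ((bits & (BytesPerInt   - 1)) == 0) goto int_copy;
         //   if ((bits & (BytesPerShort - 1)) == 0) goto short_copy;
         //   goto byte_copy;
         //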
  2043   address generate_unsafe_copy(const char *name) {
  2045     Label L_long_aligned, L_int_aligned, L_short_aligned;
  2047     // Input registers (before setup_arg_regs)
  2048     const Register from        = c_rarg0;  // source array address
  2049     const Register to          = c_rarg1;  // destination array address
  2050     const Register size        = c_rarg2;  // byte count (size_t)
  2052     // Register used as a temp
  2053     const Register bits        = rax;      // test copy of low bits
  2055     __ align(CodeEntryAlignment);
  2056     StubCodeMark mark(this, "StubRoutines", name);
  2057     address start = __ pc();
  2059     __ enter(); // required for proper stackwalking of RuntimeStub frame
  2061     // bump this on entry, not on exit:
  2062     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
  2064     __ mov(bits, from);
  2065     __ orptr(bits, to);
  2066     __ orptr(bits, size);
  2068     __ testb(bits, BytesPerLong-1);
  2069     __ jccb(Assembler::zero, L_long_aligned);
  2071     __ testb(bits, BytesPerInt-1);
  2072     __ jccb(Assembler::zero, L_int_aligned);
  2074     __ testb(bits, BytesPerShort-1);
  2075     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
  2077     __ BIND(L_short_aligned);
  2078     __ shrptr(size, LogBytesPerShort); // size => short_count
  2079     __ jump(RuntimeAddress(short_copy_entry));
  2081     __ BIND(L_int_aligned);
  2082     __ shrptr(size, LogBytesPerInt); // size => int_count
  2083     __ jump(RuntimeAddress(int_copy_entry));
  2085     __ BIND(L_long_aligned);
  2086     __ shrptr(size, LogBytesPerLong); // size => qword_count
  2087     __ jump(RuntimeAddress(long_copy_entry));
  2089     return start;
  2090   }
  2092   // Perform range checks on the proposed arraycopy.
  2093   // Kills temp, but nothing else.
  2094   // Also, clean the sign bits of src_pos and dst_pos.
  2095   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
  2096                               Register src_pos, // source position (c_rarg1)
  2097                               Register dst,     // destination array oop (c_rarg2)
  2098                               Register dst_pos, // destination position (c_rarg3)
  2099                               Register length,
  2100                               Register temp,
  2101                               Label& L_failed) {
  2102     BLOCK_COMMENT("arraycopy_range_checks:");
  2104     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
  2105     __ movl(temp, length);
  2106     __ addl(temp, src_pos);             // src_pos + length
  2107     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
  2108     __ jcc(Assembler::above, L_failed);
  2110     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
  2111     __ movl(temp, length);
  2112     __ addl(temp, dst_pos);             // dst_pos + length
  2113     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
  2114     __ jcc(Assembler::above, L_failed);
  2116     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  2117     // Move with sign extension can be used since they are positive.
  2118     __ movslq(src_pos, src_pos);
  2119     __ movslq(dst_pos, dst_pos);
  2121     BLOCK_COMMENT("arraycopy_range_checks done");
  2122   }
  2124   //
  2125   //  Generate generic array copy stubs
  2126   //
  2127   //  Input:
  2128   //    c_rarg0    -  src oop
  2129   //    c_rarg1    -  src_pos (32-bits)
  2130   //    c_rarg2    -  dst oop
  2131   //    c_rarg3    -  dst_pos (32-bits)
  2132   // not Win64
  2133   //    c_rarg4    -  element count (32-bits)
  2134   // Win64
  2135   //    rsp+40     -  element count (32-bits)
  2136   //
  2137   //  Output:
  2138   //    rax ==  0  -  success
  2139   //    rax == -1^K - failure, where K is partial transfer count
  2140   //
  2141   address generate_generic_copy(const char *name) {
  2143     Label L_failed, L_failed_0, L_objArray;
  2144     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
  2146     // Input registers
  2147     const Register src        = c_rarg0;  // source array oop
  2148     const Register src_pos    = c_rarg1;  // source position
  2149     const Register dst        = c_rarg2;  // destination array oop
  2150     const Register dst_pos    = c_rarg3;  // destination position
  2151     // elements count is on stack on Win64
  2152 #ifdef _WIN64
  2153 #define C_RARG4 Address(rsp, 6 * wordSize)
  2154 #else
  2155 #define C_RARG4 c_rarg4
  2156 #endif
  2158     { int modulus = CodeEntryAlignment;
  2159       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
  2160       int advance = target - (__ offset() % modulus);
  2161       if (advance < 0)  advance += modulus;
  2162       if (advance > 0)  __ nop(advance);
  2163     }
  2164     StubCodeMark mark(this, "StubRoutines", name);
  2166     // Short-hop target to L_failed.  Makes for denser prologue code.
  2167     __ BIND(L_failed_0);
  2168     __ jmp(L_failed);
  2169     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
  2171     __ align(CodeEntryAlignment);
  2172     address start = __ pc();
  2174     __ enter(); // required for proper stackwalking of RuntimeStub frame
  2176     // bump this on entry, not on exit:
  2177     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
  2179     //-----------------------------------------------------------------------
  2180     // Assembler stub will be used for this call to arraycopy
  2181     // if the following conditions are met:
  2182     //
  2183     // (1) src and dst must not be null.
  2184     // (2) src_pos must not be negative.
  2185     // (3) dst_pos must not be negative.
  2186     // (4) length  must not be negative.
  2187     // (5) src klass and dst klass should be the same and not NULL.
  2188     // (6) src and dst should be arrays.
  2189     // (7) src_pos + length must not exceed length of src.
  2190     // (8) dst_pos + length must not exceed length of dst.
  2191     //
  2193     //  if (src == NULL) return -1;
  2194     __ testptr(src, src);         // src oop
  2195     size_t j1off = __ offset();
  2196     __ jccb(Assembler::zero, L_failed_0);
  2198     //  if (src_pos < 0) return -1;
  2199     __ testl(src_pos, src_pos); // src_pos (32-bits)
  2200     __ jccb(Assembler::negative, L_failed_0);
  2202     //  if (dst == NULL) return -1;
  2203     __ testptr(dst, dst);         // dst oop
  2204     __ jccb(Assembler::zero, L_failed_0);
  2206     //  if (dst_pos < 0) return -1;
  2207     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  2208     size_t j4off = __ offset();
  2209     __ jccb(Assembler::negative, L_failed_0);
  2211     // The first four tests are very dense code,
  2212     // but not quite dense enough to put four
  2213     // jumps in a 16-byte instruction fetch buffer.
  2214     // That's good, because some branch predictors
  2215     // do not like jumps so close together.
  2216     // Make sure of this.
  2217     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
  2219     // registers used as temp
  2220     const Register r11_length    = r11; // elements count to copy
  2221     const Register r10_src_klass = r10; // array klass
  2222     const Register r9_dst_klass  = r9;  // dest array klass
  2224     //  if (length < 0) return -1;
  2225     __ movl(r11_length, C_RARG4);       // length (elements count, 32-bits value)
  2226     __ testl(r11_length, r11_length);
  2227     __ jccb(Assembler::negative, L_failed_0);
  2229     __ load_klass(r10_src_klass, src);
  2230 #ifdef ASSERT
  2231     //  assert(src->klass() != NULL);
  2232     BLOCK_COMMENT("assert klasses not null");
  2233     { Label L1, L2;
  2234       __ testptr(r10_src_klass, r10_src_klass);
  2235       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
  2236       __ bind(L1);
  2237       __ stop("broken null klass");
  2238       __ bind(L2);
  2239       __ load_klass(r9_dst_klass, dst);
  2240       __ cmpq(r9_dst_klass, 0);
  2241       __ jcc(Assembler::equal, L1);     // this would be broken also
  2242       BLOCK_COMMENT("assert done");
  2243     }
  2244 #endif
  2246     // Load layout helper (32-bits)
  2247     //
  2248     //  |array_tag|     | header_size | element_type |     |log2_element_size|
  2249     // 32        30    24            16              8     2                 0
  2250     //
  2251     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  2252     //
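           // Decoded in C terms (a sketch using the shift/mask names below):
           //
           //   tag   = lh >> Klass::_lh_array_tag_shift;
           //   hsize = (lh >> Klass::_lh_header_size_shift)
           //           & Klass::_lh_header_size_mask;           // array_offset
           //   l2es  = lh & Klass::_lh_log2_element_size_mask;  // log2(elsize)
           //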
  2254     int lh_offset = klassOopDesc::header_size() * HeapWordSize +
  2255                     Klass::layout_helper_offset_in_bytes();
  2257     const Register rax_lh = rax;  // layout helper
  2259     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
  2261     // Handle objArrays completely differently...
  2262     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  2263     __ cmpl(rax_lh, objArray_lh);
  2264     __ jcc(Assembler::equal, L_objArray);
  2266     //  if (src->klass() != dst->klass()) return -1;
  2267     __ load_klass(r9_dst_klass, dst);
  2268     __ cmpq(r10_src_klass, r9_dst_klass);
  2269     __ jcc(Assembler::notEqual, L_failed);
  2271     //  if (!src->is_Array()) return -1;
  2272     __ cmpl(rax_lh, Klass::_lh_neutral_value);
  2273     __ jcc(Assembler::greaterEqual, L_failed);
  2275     // At this point, it is known to be a typeArray (array_tag 0x3).
  2276 #ifdef ASSERT
  2277     { Label L;
  2278       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
  2279       __ jcc(Assembler::greaterEqual, L);
  2280       __ stop("must be a primitive array");
  2281       __ bind(L);
  2282     }
  2283 #endif
  2285     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2286                            r10, L_failed);
  2288     // typeArrayKlass
  2289     //
  2290     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  2291     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  2292     //
  2294     const Register r10_offset = r10;    // array offset
  2295     const Register rax_elsize = rax_lh; // element size
  2297     __ movl(r10_offset, rax_lh);
  2298     __ shrl(r10_offset, Klass::_lh_header_size_shift);
  2299     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
  2300     __ addptr(src, r10_offset);           // src array offset
  2301     __ addptr(dst, r10_offset);           // dst array offset
  2302     BLOCK_COMMENT("choose copy loop based on element size");
  2303     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
  2305     // next registers should be set before the jump to corresponding stub
  2306     const Register from     = c_rarg0;  // source array address
  2307     const Register to       = c_rarg1;  // destination array address
  2308     const Register count    = c_rarg2;  // elements count
  2310     // 'from', 'to', 'count' registers should be set in such order
  2311     // since they are the same as 'src', 'src_pos', 'dst'.
  2313   __ BIND(L_copy_bytes);
  2314     __ cmpl(rax_elsize, 0);
  2315     __ jccb(Assembler::notEqual, L_copy_shorts);
  2316     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
  2317     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
  2318     __ movl2ptr(count, r11_length); // length
  2319     __ jump(RuntimeAddress(byte_copy_entry));
  2321   __ BIND(L_copy_shorts);
  2322     __ cmpl(rax_elsize, LogBytesPerShort);
  2323     __ jccb(Assembler::notEqual, L_copy_ints);
  2324     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
  2325     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
  2326     __ movl2ptr(count, r11_length); // length
  2327     __ jump(RuntimeAddress(short_copy_entry));
  2329   __ BIND(L_copy_ints);
  2330     __ cmpl(rax_elsize, LogBytesPerInt);
  2331     __ jccb(Assembler::notEqual, L_copy_longs);
  2332     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
  2333     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
  2334     __ movl2ptr(count, r11_length); // length
  2335     __ jump(RuntimeAddress(int_copy_entry));
  2337   __ BIND(L_copy_longs);
  2338 #ifdef ASSERT
  2339     { Label L;
  2340       __ cmpl(rax_elsize, LogBytesPerLong);
  2341       __ jcc(Assembler::equal, L);
  2342       __ stop("must be long copy, but elsize is wrong");
  2343       __ bind(L);
  2344     }
  2345 #endif
  2346     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
  2347     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
  2348     __ movl2ptr(count, r11_length); // length
  2349     __ jump(RuntimeAddress(long_copy_entry));
  2351     // objArrayKlass
  2352   __ BIND(L_objArray);
  2353     // live at this point:  r10_src_klass, src[_pos], dst[_pos]
  2355     Label L_plain_copy, L_checkcast_copy;
  2356     //  test array classes for subtyping
  2357     __ load_klass(r9_dst_klass, dst);
  2358     __ cmpq(r10_src_klass, r9_dst_klass); // usual case is exact equality
  2359     __ jcc(Assembler::notEqual, L_checkcast_copy);
  2361     // Identically typed arrays can be copied without element-wise checks.
  2362     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2363                            r10, L_failed);
  2365     __ lea(from, Address(src, src_pos, TIMES_OOP,
  2366                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  2367     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
  2368                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  2369     __ movl2ptr(count, r11_length); // length
  2370   __ BIND(L_plain_copy);
  2371     __ jump(RuntimeAddress(oop_copy_entry));
  2373   __ BIND(L_checkcast_copy);
  2374     // live at this point:  r10_src_klass, !r11_length
  2376       // assert(r11_length == C_RARG4); // will reload from here
  2377       Register r11_dst_klass = r11;
  2378       __ load_klass(r11_dst_klass, dst);
  2380       // Before looking at dst.length, make sure dst is also an objArray.
  2381       __ cmpl(Address(r11_dst_klass, lh_offset), objArray_lh);
  2382       __ jcc(Assembler::notEqual, L_failed);
  2384       // It is safe to examine both src.length and dst.length.
  2385 #ifndef _WIN64
  2386       arraycopy_range_checks(src, src_pos, dst, dst_pos, C_RARG4,
  2387                              rax, L_failed);
  2388 #else
  2389       __ movl(r11_length, C_RARG4);     // reload
  2390       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2391                              rax, L_failed);
  2392       __ load_klass(r11_dst_klass, dst); // reload
  2393 #endif
  2395       // Marshal the base address arguments now, freeing registers.
  2396       __ lea(from, Address(src, src_pos, TIMES_OOP,
  2397                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
  2398       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
  2399                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
  2400       __ movl(count, C_RARG4);          // length (reloaded)
  2401       Register sco_temp = c_rarg3;      // this register is free now
  2402       assert_different_registers(from, to, count, sco_temp,
  2403                                  r11_dst_klass, r10_src_klass);
  2404       assert_clean_int(count, sco_temp);
  2406       // Generate the type check.
  2407       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
  2408                         Klass::super_check_offset_offset_in_bytes());
  2409       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
  2410       assert_clean_int(sco_temp, rax);
  2411       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
  2413       // Fetch destination element klass from the objArrayKlass header.
  2414       int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
  2415                        objArrayKlass::element_klass_offset_in_bytes());
  2416       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
  2417       __ movl(sco_temp,      Address(r11_dst_klass, sco_offset));
  2418       assert_clean_int(sco_temp, rax);
  2420       // the checkcast_copy loop needs two extra arguments:
  2421       assert(c_rarg3 == sco_temp, "#3 already in place");
  2422       __ movptr(C_RARG4, r11_dst_klass);  // dst.klass.element_klass
  2423       __ jump(RuntimeAddress(checkcast_copy_entry));
  2424     }
  2426   __ BIND(L_failed);
  2427     __ xorptr(rax, rax);
  2428     __ notptr(rax); // return -1
  2429     __ leave();   // required for proper stackwalking of RuntimeStub frame
  2430     __ ret(0);
  2432     return start;
  2433   }
  2435 #undef length_arg
  2436 #endif
  2438 //FIXME
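         // Plain 8-byte element copy (jlong).  The loop is entered at l_2 and
         // decrements before testing, i.e. in C terms (a sketch):
         //
         //   while (--count >= 0)
         //     *to++ = *from++;              // one 64-bit ld/sd pair each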
  2439   address generate_disjoint_long_copy(bool aligned, const char *name) {
  2440 	  Label l_1, l_2;
  2441 	  StubCodeMark mark(this, "StubRoutines", name);
  2442 	  __ align(CodeEntryAlignment);
  2443 	  address start = __ pc();
  2445 	  //      __ movl(ecx, Address(esp, 4+8));       // count
  2446 	  //     __ movl(eax, Address(esp, 4+0));       // from
  2447 	  //    __ movl(edx, Address(esp, 4+4));       // to
  2448 	  __ move(T1, A2);  
  2449 	  __ move(T3, A0); 
  2450 	  __ move(T0, A1);
  2451 	  __ push(T3); 
  2452 	  __ push(T0);
  2453 	  __ push(T1);
  2454 	  //__ subl(edx, eax);
  2455 	  //__ jmp(l_2);
  2456 	  __ b(l_2);  
  2457 	  __ delayed()->nop();   
  2458 	  __ align(16);
  2459 	  __ bind(l_1);
  2460 	  //   if (VM_Version::supports_mmx()) {
  2461 	  //     __ movq(mmx0, Address(eax));
  2462 	  //     __ movq(Address(eax, edx, Address::times_1), mmx0);
  2463 	  //   } else {
  2464 	  //   __ fild_d(Address(eax));
  2465 	  __ ld(AT, T3, 0);   
  2466 	  // __ fistp_d(Address(eax, edx, Address::times_1));
  2467 	  __ sd (AT, T0, 0); 
  2468 	  //   }
  2469 	  //   __ addl(eax, 8);
  2470 	  __ addi(T3, T3, 8); 
  2471 	  __ addi(T0, T0, 8); 
  2472 	  __ bind(l_2);
  2473 	  //    __ decl(ecx);
  2474 	  __ addi(T1, T1, -1); 
  2475 	  //    __ jcc(Assembler::greaterEqual, l_1);
  2476 	  __ bgez(T1, l_1);    
  2477 	  __ delayed()->nop(); 
  2478 	  //  if (VM_Version::supports_mmx()) {
  2479 	  //    __ emms();
  2480 	  //  }
  2481 	  //  __ ret(0);
  2482 	  __ pop(T1); 
  2483 	  __ pop(T0); 
  2484 	  __ pop(T3); 
  2485 	  __ jr(RA); 
  2486 	  __ delayed()->nop(); 
  2487 	  return start;
  2488   }
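         // 8-byte conjoint copy (jlong): tail-branches to the disjoint stub
         // when the ranges cannot overlap, otherwise copies backwards from
         // the last element; in C terms (a sketch):
         //
         //   while (--count >= 0)
         //     to[count] = from[count];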
  2491   address generate_conjoint_long_copy(bool aligned, const char *name) {
  2492 	  Label l_1, l_2;
  2493 	  StubCodeMark mark(this, "StubRoutines", name);
  2494 	  __ align(CodeEntryAlignment);
  2495 	  address start = __ pc();
  2496 	  address nooverlap_target = aligned ?
  2497 		  StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  2498 		  StubRoutines::jlong_disjoint_arraycopy();
  2499 	  array_overlap_test(nooverlap_target, 3);
  2501 	  __ push(T3); 
  2502 	  __ push(T0); 
  2503 	  __ push(T1); 
  2505 		/*      __ movl(ecx, Address(esp, 4+8));       // count
  2506 						__ movl(eax, Address(esp, 4+0));       // from
  2507 						__ movl(edx, Address(esp, 4+4));       // to
  2508 						__ jmp(l_2);
  2510 		 */
  2511 	  __ move(T1, A2);  
  2512 	  __ move(T3, A0); 
  2513 	  __ move(T0, A1);
  2514 	  __ sll(AT, T1, Address::times_8); 
  2515 	  __ add(AT, T3, AT); 
  2516 	  __ lea(T3 , Address(AT, -8)); 
  2517 	  __ sll(AT, T1, Address::times_8); 
  2518 	  __ add(AT, T0, AT); 
  2519 	  __ lea(T0 , Address(AT, -8)); 
  2523 	  __ b(l_2); 
  2524 	  __ delayed()->nop(); 
  2525 	  __ align(16);
  2526 		__ bind(l_1);
  2527 		/*      if (VM_Version::supports_mmx()) {
  2528 						__ movq(mmx0, Address(eax, ecx, Address::times_8));
  2529 						__ movq(Address(edx, ecx,Address::times_8), mmx0);
  2530 						} else {
  2531 						__ fild_d(Address(eax, ecx, Address::times_8));
  2532 						__ fistp_d(Address(edx, ecx,Address::times_8));
  2534 		 */    
  2535 		__ ld(AT, T3, 0);   
  2536 		__ sd (AT, T0, 0); 
  2537 	  __ addi(T3, T3, -8); 
  2538 	  __ addi(T0, T0, -8); 
  2539 	  __ bind(l_2);
  2540 	  //	    __ decl(ecx);
  2541 	  __ addi(T1, T1, -1); 
  2542 	  //__ jcc(Assembler::greaterEqual, l_1);
  2543 	  __ bgez(T1, l_1); 
  2544 	  __ delayed()->nop(); 
  2545 	  //      if (VM_Version::supports_mmx()) {
  2546 	  //      __ emms();
  2547 	  //   }
  2548 	  //  __ ret(0);
  2549 	  __ pop(T1); 
  2550 	  __ pop(T0); 
  2551 	  __ pop(T3); 
  2552 	  __ jr(RA); 
  2553 	  __ delayed()->nop();  
  2554 	  return start;
  2555   }
  2557   void generate_arraycopy_stubs() {
  2558     if (UseCompressedOops) {
  2559       StubRoutines::_oop_disjoint_arraycopy    = generate_disjoint_int_oop_copy(false, true, "oop_disjoint_arraycopy");
  2560       StubRoutines::_oop_arraycopy             = generate_conjoint_int_oop_copy(false, true, "oop_arraycopy");
  2561     } else {
  2562       StubRoutines::_oop_disjoint_arraycopy    = generate_disjoint_long_oop_copy(false, true, "oop_disjoint_arraycopy");
  2563       StubRoutines::_oop_arraycopy             = generate_conjoint_long_oop_copy(false, true, "oop_arraycopy");
  2564     }
  2566     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  2567     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  2568     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
  2569     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  2570     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
  2572     //  if (VM_Version::supports_mmx())
  2573     //if (false)
  2574     // StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_mmx_copy_aligned("arrayof_jshort_disjoint_arraycopy");
  2575     // else
  2576     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
  2577     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(true, false, "arrayof_jint_disjoint_arraycopy");
  2578     //StubRoutines::_arrayof_oop_disjoint_arraycopy   = generate_disjoint_int_oop_copy(true, true, "arrayof_oop_disjoint_arraycopy");
  2579     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
  2581     StubRoutines::_jbyte_arraycopy  = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  2582     StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
  2583     StubRoutines::_jint_arraycopy   = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
  2584     StubRoutines::_jlong_arraycopy  = generate_conjoint_long_copy(false, "jlong_arraycopy");
  2586     StubRoutines::_arrayof_jbyte_arraycopy  = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
  2587     StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
  2588     StubRoutines::_arrayof_jint_arraycopy   = generate_conjoint_int_oop_copy(true, false, "arrayof_jint_arraycopy");
  2589     //StubRoutines::_arrayof_oop_arraycopy    = generate_conjoint_int_oop_copy(true, true, "arrayof_oop_arraycopy");
  2590     StubRoutines::_arrayof_jlong_arraycopy  = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
  2592     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  2593     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
  2594   }
  2596 // Wang: added a function to implement SafeFetch32 and SafeFetchN.
  2597   void generate_safefetch(const char* name, int size, address* entry,
  2598                           address* fault_pc, address* continuation_pc) {
  2599     // safefetch signatures:
  2600     //   int      SafeFetch32(int*      adr, int      errValue);
  2601     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  2602     //
  2603     // arguments:
  2604     //   A0 = adr
  2605     //   A1 = errValue
  2606     //
  2607     // result:
  2608   //   V0 = *adr or errValue
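           //
           // Intended use is e.g. (a sketch):  int v = SafeFetch32(adr, -1);
           // if the load at *fault_pc traps, the VM's fault handler resumes
           // at *continuation_pc with A1 still holding errValue, so V0 comes
           // back as errValue instead of *adr.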
  2610     StubCodeMark mark(this, "StubRoutines", name);
  2612     // Entry point, pc or function descriptor.
  2613     *entry = __ pc();
  2615     // Load *adr into A1, may fault.
  2616     *fault_pc = __ pc();
  2617     switch (size) {
  2618       case 4:
  2619         // int32_t
  2620         __ lw(A1, A0, 0); 
  2621         break;
  2622       case 8:
  2623         // int64_t
  2624         __ ld(A1, A0, 0); 
  2625         break;
  2626       default:
  2627         ShouldNotReachHere();
  2628     }
  2630     // return errValue or *adr
  2631     *continuation_pc = __ pc();
  2632     __ addu(V0, A1, R0);
  2633     __ jr(RA);
  2634     __ delayed()->nop();
  2635   }
  2638 #undef __
  2639 #define __ masm->
  2641   // Continuation point for throwing of implicit exceptions that are
  2642   // not handled in the current activation. Fabricates an exception
  2643   // oop and initiates normal exception dispatching in this
  2644   // frame. Since we need to preserve callee-saved values (currently
  2645   // only for C2, but done for C1 as well) we need a callee-saved oop
  2646   // map and therefore have to make these stubs into RuntimeStubs
  2647   // rather than BufferBlobs.  If the compiler needs all registers to
  2648   // be preserved between the fault point and the exception handler
  2649   // then it must assume responsibility for that in
  2650   // AbstractCompiler::continuation_for_implicit_null_exception or
  2651   // continuation_for_implicit_division_by_zero_exception. All other
  2652   // implicit exceptions (e.g., NullPointerException or
  2653   // AbstractMethodError on entry) are either at call sites or
  2654   // otherwise assume that stack unwinding will be initiated, so
  2655   // caller saved registers were assumed volatile in the compiler.
  2656   address generate_throw_exception(const char* name,
  2657                                    address runtime_entry,
  2658                                    bool restore_saved_exception_pc) {
  2659     // Information about frame layout at time of blocking runtime call.
  2660     // Note that we only have to preserve callee-saved registers since
  2661     // the compilers are responsible for supplying a continuation point
  2662     // if they expect all registers to be preserved.
  2663 //#define aoqi_test
  2664 #ifdef aoqi_test
  2665 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2666 #endif
  2667 		enum layout {
  2668 			thread_off,    // last_java_sp                
  2669 			S7_off,        // callee saved register      sp + 1
  2670 			S6_off,        // callee saved register      sp + 2
  2671 			S5_off,        // callee saved register      sp + 3
  2672 			S4_off,        // callee saved register      sp + 4
  2673 			S3_off,        // callee saved register      sp + 5
  2674 			S2_off,        // callee saved register      sp + 6
  2675 			S1_off,        // callee saved register      sp + 7
  2676 			S0_off,        // callee saved register      sp + 8
  2677 			FP_off,
  2678 			ret_address,
  2679 			framesize
  2680 		};
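       		// So the stub frame is, from SP upwards (a sketch):
       		// [ thread | S7 .. S0 | FP | return address ].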
  2682 		int insts_size = 2048;
  2683 		int locs_size  = 32;
  2685 		//  CodeBuffer* code     = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false, 
  2686 		//  NULL, NULL, NULL, false, NULL, name, false);
  2687 		CodeBuffer code (name , insts_size, locs_size);
  2688 #ifdef aoqi_test
  2689 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2690 #endif
  2691 		OopMapSet* oop_maps  = new OopMapSet();
  2692 #ifdef aoqi_test
  2693 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2694 #endif
  2695 		MacroAssembler* masm = new MacroAssembler(&code);
  2696 #ifdef aoqi_test
  2697 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2698 #endif
  2700 		address start = __ pc();
  2701     	//__ stop("generate_throw_exception");
  2702 		/*
  2703 			 __ move(AT, (int)&jerome1 );
  2704 			 __ sw(SP, AT, 0); 	
  2705 			 __ move(AT, (int)&jerome2 );
  2706 			 __ sw(FP, AT, 0); 	
  2707 			 __ move(AT, (int)&jerome3 );
  2708 			 __ sw(RA, AT, 0); 	
  2709 			 __ move(AT, (int)&jerome4 );
  2710 			 __ sw(R0, AT, 0); 	
  2711 			 __ move(AT, (int)&jerome5 );
  2712 			 __ sw(R0, AT, 0); 	
  2713 			 __ move(AT, (int)&jerome6 );
  2714 			 __ sw(R0, AT, 0); 	
  2715 			 __ move(AT, (int)&jerome7 );
  2716 			 __ sw(R0, AT, 0); 	
  2717 			 __ move(AT, (int)&jerome10 );
  2718 			 __ sw(R0, AT, 0); 	
  2720 			 __ pushad();
  2722 		//__ enter();
  2723 		__ call(CAST_FROM_FN_PTR(address, SharedRuntime::print_call_statistics), 
  2724 		relocInfo::runtime_call_type);
  2725 		__ delayed()->nop();
  2727 		//__ leave();
  2728 		__ popad();
  2730 		 */
  2732 		// This is an inlined and slightly modified version of call_VM
  2733 		// which has the ability to fetch the return PC out of
  2734 		// thread-local storage and also sets up last_Java_sp slightly
  2735 		// differently than the real call_VM
  2736 #ifndef OPT_THREAD	
  2737 		Register java_thread = TREG;
  2738 		__ get_thread(java_thread);
  2739 #else
  2740 		Register java_thread = TREG;
  2741 #endif
  2742 #ifdef aoqi_test
  2743 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2744 #endif
  2745 		if (restore_saved_exception_pc) {
  2746 			__ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset()));
  2747 		}
  2749 		__ enter(); // required for proper stackwalking of RuntimeStub frame
  2751 		__ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
  2752 		__ sd(S0, SP, S0_off * wordSize);
  2753 		__ sd(S1, SP, S1_off * wordSize);
  2754 		__ sd(S2, SP, S2_off * wordSize);
  2755 		__ sd(S3, SP, S3_off * wordSize);
  2756 		__ sd(S4, SP, S4_off * wordSize);
  2757 		__ sd(S5, SP, S5_off * wordSize);
  2758 		__ sd(S6, SP, S6_off * wordSize);
  2759 		__ sd(S7, SP, S7_off * wordSize);
  2761 		int frame_complete = __ pc() - start;
  2762 		// push java thread (becomes first argument of C function)
  2763 		__ sd(java_thread, SP, thread_off * wordSize);
  2764 		if (java_thread!=A0)
  2765 			__ move(A0, java_thread);
  2767 		// Set up last_Java_sp and last_Java_fp
  2768 		__ set_last_Java_frame(java_thread, SP, FP, NULL);
  2769 		__ relocate(relocInfo::internal_pc_type);
  2770 		{
  2771 			intptr_t save_pc = (intptr_t)__ pc() + NativeMovConstReg::instruction_size + NativeCall::return_address_offset + 4;
  2772 			__ li48(AT, save_pc);
  2773 		}
  2774 		__ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset())); 
  2776 		// Call runtime
  2777 		__ call(runtime_entry);
  2778 		__ delayed()->nop();
  2779 		// Generate oop map
  2780 		OopMap* map =  new OopMap(framesize, 0);        
  2781 		oop_maps->add_gc_map(__ offset(),  map);
  2783 		// restore the thread (cannot use the pushed argument since arguments
  2784 		// may be overwritten by C code generated by an optimizing compiler);
  2785 		// however can use the register value directly if it is callee saved.
  2786 #ifndef OPT_THREAD
  2787 		__ get_thread(java_thread);
  2788 #endif
  2790 		__ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  2791 		//  __ reset_last_Java_frame(java_thread, true);
  2792 		__ reset_last_Java_frame(java_thread, true, true);
  2794 		// Restore callee save registers.  This must be done after resetting the Java frame
  2795 		__ ld(S0, SP, S0_off * wordSize);
  2796 		__ ld(S1, SP, S1_off * wordSize);
  2797 		__ ld(S2, SP, S2_off * wordSize);
  2798 		__ ld(S3, SP, S3_off * wordSize);
  2799 		__ ld(S4, SP, S4_off * wordSize);
  2800 		__ ld(S5, SP, S5_off * wordSize);
  2801 		__ ld(S6, SP, S6_off * wordSize);
  2802 		__ ld(S7, SP, S7_off * wordSize);
  2804 		// discard arguments
  2805 		__ addi(SP, SP, (framesize-2) * wordSize); // epilog
  2806 		//	__ leave(); // required for proper stackwalking of RuntimeStub frame
  2807 		__ addi(SP, FP, wordSize);
  2808 		__ ld(FP, SP, -1*wordSize);
  2809 		// check for pending exceptions
  2810 #ifdef ASSERT
  2811 		Label L;
  2812 		__ lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  2813 		__ bne(AT, R0, L);
  2814 		__ delayed()->nop();
  2815 		__ should_not_reach_here();
  2816 		__ bind(L);
  2817 #endif //ASSERT
  2818 		__ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  2819 		__ delayed()->nop();
  2820 #ifdef aoqi_test
  2821 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2822 #endif
  2823 		RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete,
  2824 										framesize, oop_maps, false);
  2825 #ifdef aoqi_test
  2826 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2827 #endif
  2828 		return stub->entry_point();
  2829   }
  2831   // Initialization
  2832   void generate_initial() {
  2833 /*
  2834 		// Generates all stubs and initializes the entry points
  2836     // This platform-specific stub is needed by generate_call_stub()
  2837     StubRoutines::mips::_mxcsr_std        = generate_fp_mask("mxcsr_std",        0x0000000000001F80);
  2839     // entry points that exist in all platforms Note: This is code
  2840     // that could be shared among different platforms - however the
  2841     // benefit seems to be smaller than the disadvantage of having a
  2842     // much more complicated generator structure. See also comment in
  2843     // stubRoutines.hpp.
  2845     StubRoutines::_forward_exception_entry = generate_forward_exception();
  2847     StubRoutines::_call_stub_entry =
  2848       generate_call_stub(StubRoutines::_call_stub_return_address);
  2850     // is referenced by megamorphic call
  2851     StubRoutines::_catch_exception_entry = generate_catch_exception();
  2853     // atomic calls
  2854     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
  2855     StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
  2856     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
  2857     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  2858     StubRoutines::_atomic_add_entry          = generate_atomic_add();
  2859     StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
  2860     StubRoutines::_fence_entry               = generate_orderaccess_fence();
  2862     StubRoutines::_handler_for_unsafe_access_entry =
  2863       generate_handler_for_unsafe_access();
  2865     // platform dependent
  2866     StubRoutines::mips::_get_previous_fp_entry = generate_get_previous_fp();
  2868     StubRoutines::mips::_verify_mxcsr_entry    = generate_verify_mxcsr();
  2869 */
  2870 		// Generates all stubs and initializes the entry points
  2872 		//-------------------------------------------------------------
  2873 		//-----------------------------------------------------------
  2874 		// entry points that exist in all platforms
  2875 		// Note: This is code that could be shared among different platforms - however the benefit seems to be smaller 
  2876 		// than the disadvantage of having a much more complicated generator structure. 
  2877 		// See also comment in stubRoutines.hpp.
  2878 		StubRoutines::_forward_exception_entry = generate_forward_exception();    
  2879 		StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
  2880 		// is referenced by megamorphic call    
  2881 		StubRoutines::_catch_exception_entry = generate_catch_exception();    
  2883 		StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
  2885 		// platform dependent
  2886 		StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
  2887   }
  2889   void generate_all() {
  2890 #ifdef aoqi_test
  2891 tty->print_cr("%s:%d", __func__, __LINE__);
  2892 #endif
  2893     // Generates all stubs and initializes the entry points
  2895     // These entry points require SharedInfo::stack0 to be set up in
  2896     // non-core builds and need to be relocatable, so they each
  2897     // fabricate a RuntimeStub internally.
    /*
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError),
                               false);

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError),
                               false);

    StubRoutines::_throw_ArithmeticException_entry =
      generate_throw_exception("ArithmeticException throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_ArithmeticException),
                               true);

    StubRoutines::_throw_NullPointerException_entry =
      generate_throw_exception("NullPointerException throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException),
                               true);

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call),
                               false);

    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_StackOverflowError),
                               false);

    // entry points that are platform specific
    StubRoutines::mips::_f2i_fixup = generate_f2i_fixup();
    StubRoutines::mips::_f2l_fixup = generate_f2l_fixup();
    StubRoutines::mips::_d2i_fixup = generate_d2i_fixup();
    StubRoutines::mips::_d2l_fixup = generate_d2l_fixup();

    StubRoutines::mips::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
    StubRoutines::mips::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
    StubRoutines::mips::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
    StubRoutines::mips::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();
    */
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
    StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
//  StubRoutines::_throw_ArithmeticException_entry = generate_throw_exception("ArithmeticException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException), true);
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
//  StubRoutines::_throw_NullPointerException_entry = generate_throw_exception("NullPointerException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
    StubRoutines::_throw_NullPointerException_at_call_entry = generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
    StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
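    // The ArithmeticException and NullPointerException throw entries above
    // are left disabled, so those StubRoutines slots remain NULL on this
    // port; implicit divide-by-zero and null faults are presumably recovered
    // through the platform signal handler instead.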
    //------------------------------------------------------------------------
    // entry points that are platform specific

    // support for verify_oop (must happen after universe_init)
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
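    // The verify_oop subroutine is only exercised under -XX:+VerifyOops:
    // generated code calls it around oop loads/stores to sanity-check that
    // a value really looks like a valid oop - a debugging aid, not a
    // correctness requirement in product builds.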
#ifndef CORE
    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
#endif // !CORE
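    // generate_arraycopy_stubs() fills in the per-type StubRoutines entry
    // points (e.g. _jbyte_arraycopy / _jbyte_disjoint_arraycopy and the
    // other element widths, per the shared StubRoutines declarations) that
    // the compilers call directly for System.arraycopy.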
    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
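    // Contract sketch for SafeFetch32(adr, errValue) and SafeFetchN: the
    // stub performs a single load from a possibly unmapped address; if the
    // load at *_fault_pc traps, the platform signal handler resumes
    // execution at *_continuation_pc, which returns errValue instead of
    // crashing. Used by error reporting and other code that must probe
    // memory safely.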
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration
/*
address StubGenerator::disjoint_byte_copy_entry  = NULL;
address StubGenerator::disjoint_short_copy_entry = NULL;
address StubGenerator::disjoint_int_copy_entry   = NULL;
address StubGenerator::disjoint_long_copy_entry  = NULL;
address StubGenerator::disjoint_oop_copy_entry   = NULL;

address StubGenerator::byte_copy_entry  = NULL;
address StubGenerator::short_copy_entry = NULL;
address StubGenerator::int_copy_entry   = NULL;
address StubGenerator::long_copy_entry  = NULL;
address StubGenerator::oop_copy_entry   = NULL;

address StubGenerator::checkcast_copy_entry = NULL;
*/
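// StubGenerator_generate is the external entry point; per the usual HotSpot
// init sequence it is invoked twice from the stubRoutines init glue outside
// this file: once early with all == false (initial stubs such as the call
// stub) and once after universe_init with all == true (the full set).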
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
