src/cpu/mips/vm/stubGenerator_mips_64.cpp

Tue, 26 Jul 2016 17:06:17 +0800

author
fujie
date
Tue, 26 Jul 2016 17:06:17 +0800
changeset 41
d885f8d65c58
parent 35
a2cbf57bd9f3
child 103
58408aa75fba
permissions
-rw-r--r--

Add multiply word to GPR instruction (mul) in MIPS assembler.

     1 /*
     2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/macroAssembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "interpreter/interpreter.hpp"
    30 #include "nativeInst_mips.hpp"
    31 #include "oops/instanceOop.hpp"
    32 #include "oops/method.hpp"
    33 #include "oops/objArrayKlass.hpp"
    34 #include "oops/oop.inline.hpp"
    35 #include "prims/methodHandles.hpp"
    36 #include "runtime/frame.inline.hpp"
    37 #include "runtime/handles.inline.hpp"
    38 #include "runtime/sharedRuntime.hpp"
    39 #include "runtime/stubCodeGenerator.hpp"
    40 #include "runtime/stubRoutines.hpp"
    41 #include "runtime/thread.inline.hpp"
    42 #include "utilities/top.hpp"
    43 #ifdef COMPILER2
    44 #include "opto/runtime.hpp"
    45 #endif
    48 // Declaration and definition of StubGenerator (no .hpp file).
    49 // For a more detailed description of the stub routine structure
    50 // see the comment in stubRoutines.hpp
    // Assembler shorthand: "__ insn(...)" expands to "_masm->insn(...)".
    52 #define __ _masm->
    53 //#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
    54 //#define a__ ((Assembler*)_masm)->
    56 //#ifdef PRODUCT
    57 //#define BLOCK_COMMENT(str) /* nothing */
    58 //#else
    59 //#define BLOCK_COMMENT(str) __ block_comment(str)
    60 //#endif
    62 //#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
    // NOTE(review): MXCSR is an x86 SSE register name, so this mask looks like a
    // leftover from the x86 stub generator. No use of it is visible in this part
    // of the file -- confirm it is unused before removing it.
    63 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
    65 // Stub Code definitions
    67 static address handle_unsafe_access() {
    68   JavaThread* thread = JavaThread::current();
    69   address pc = thread->saved_exception_pc();
    70   // pc is the instruction which we must emulate
    71   // doing a no-op is fine:  return garbage from the load
    72   // therefore, compute npc
    73   //address npc = Assembler::locate_next_instruction(pc);
    74 	address npc = (address)((unsigned long)pc + sizeof(unsigned long));
    76   // request an async exception
    77   thread->set_pending_unsafe_access_error();
    79   // return address of next instruction to execute
    80   return npc;
    81 }
    83 class StubGenerator: public StubCodeGenerator {
    84  private:
    86   // ABI mips n64
    87   // This figure is not the MIPS ABI; it shows the frame layout used when calling Java from C.
    88   // Call stubs are used to call Java from C
    89   //
    90   //    [ return_from_Java     ]
    91   //    [ argument word n-1    ] <--- sp
    92   //      ...
    93   //    [ argument word 0      ]
    94   //      ...
    95   //-10 [ S6     	       ]
    96   // -9 [ S5		       ] 
    97   // -8 [ S4		       ]
    98   // -7 [ S3                   ]
    99   // -6 [ S0  		       ]
   100   // -5 [ TSR(S2)	       ]
   101   // -4 [ LVP(S7)              ]
   102   // -3 [ BCP(S1)              ]
   103   // -2 [ saved fp             ] <--- fp_after_call
   104   // -1 [ return address       ] 
   105   //  0 [ ptr. to call wrapper ] <--- a0 (old sp -->)fp
   106   //  1 [ result               ] <--- a1
   107   //  2 [ result_type          ] <--- a2
   108   //  3 [ method               ] <--- a3
   109   //  4 [ entry_point          ] <--- a4
   110   //  5 [ parameters           ] <--- a5
   111   //  6 [ parameter_size       ] <--- a6
   112   //  7 [ thread               ] <--- a7
   114   //
   115   // _LP64: n64 does not save paras in sp.
   116   //
   117   //    [ return_from_Java     ]
   118   //    [ argument word n-1    ] <--- sp
   119   //      ...
   120   //    [ argument word 0      ]
   121   //      ...
   122   //-14 [ thread               ]
   123   //-13 [ result_type          ] <--- a2
   124   //-12 [ result               ] <--- a1
   125   //-11 [ ptr. to call wrapper ] <--- a0
   126   //-10 [ S6     	       ]
   127   // -9 [ S5		       ] 
   128   // -8 [ S4		       ]
   129   // -7 [ S3                   ]
   130   // -6 [ S0  		       ]
   131   // -5 [ TSR(S2)	       ]
   132   // -4 [ LVP(S7)              ]
   133   // -3 [ BCP(S1)              ]
   134   // -2 [ saved fp             ] <--- fp_after_call
   135   // -1 [ return address       ] 
   136   //  0 [        	       ] <--- old sp
   137   /*
   138    * 2014/01/16 Fu: Find a right place in the call_stub for GP.
   139    * GP will point to the starting point of Interpreter::dispatch_table(itos). 
   140    * It should be saved/restored before/after Java calls. 
   141    *
   142    */
   // Word offsets of the slots written by generate_call_stub().
   //
   // The register-save offsets (RA_off .. S6_off and GP_off) are applied to the
   // incoming SP, before SP is adjusted. result_off, result_type_off and
   // thread_off are applied to the new FP, which generate_call_stub() sets to
   // (incoming SP - 2 words). total_off is the number of words by which SP is
   // moved down.
   enum call_stub_layout {
     // saved registers, relative to the incoming SP
     RA_off          = -1,
     FP_off          = -2,
     BCP_off         = -3,
     LVP_off         = -4,
     TSR_off         = -5,
     S1_off          = -6,
     S3_off          = -7,
     S4_off          = -8,
     S5_off          = -9,
     S6_off          = -10,

     // saved call arguments, relative to the new FP
     result_off      = -11,
     result_type_off = -12,
     thread_off      = -13,

     // frame size in words (negative: the stack grows down)
     total_off       = thread_off - 3,

     // GP save slot, relative to the incoming SP
     GP_off          = -16,
   };
   161   address generate_call_stub(address& return_address) {
   163     StubCodeMark mark(this, "StubRoutines", "call_stub");
   164     address start = __ pc();
   166     // same as in generate_catch_exception()!
   168     // stub code
   169     // save ra and fp
   170     __ sd(RA, SP, RA_off * wordSize);
   171     __ sd(FP, SP, FP_off * wordSize);
   172     __ sd(BCP, SP, BCP_off * wordSize);
   173     __ sd(LVP, SP, LVP_off * wordSize);
   174     __ sd(GP, SP, GP_off * wordSize);
   175     __ sd(TSR, SP, TSR_off * wordSize);
   176     __ sd(S1, SP, S1_off * wordSize);
   177     __ sd(S3, SP, S3_off * wordSize);
   178     __ sd(S4, SP, S4_off * wordSize);
   179     __ sd(S5, SP, S5_off * wordSize);
   180     __ sd(S6, SP, S6_off * wordSize);
   183     __ li48(GP, (long)Interpreter::dispatch_table(itos));
   185     // I think 14 is the max gap between argument and callee saved register
   186     __ daddi(FP, SP, (-2) * wordSize);
   187     __ daddi(SP, SP, total_off * wordSize);
   188 //FIXME, aoqi. find a suitable place to save A1 & A2.
   189     /*
   190     __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
   191     __ sd(A1, FP, 3 * wordSize);
   192     __ sd(A2, FP, 4 * wordSize);
   193     __ sd(A3, FP, 5 * wordSize);
   194     __ sd(A4, FP, 6 * wordSize);
   195     __ sd(A5, FP, 7 * wordSize);
   196     __ sd(A6, FP, 8 * wordSize);
   197     __ sd(A7, FP, 9 * wordSize);
   198     */
   199     __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
   200     __ sd(A1, FP, result_off * wordSize);
   201     __ sd(A2, FP, result_type_off * wordSize);
   202     __ sd(A7, FP, thread_off * wordSize);
   204 #ifdef OPT_THREAD
   205     //__ get_thread(TREG);
   206     __ move(TREG, A7);
   208     //__ ld(TREG, FP, thread_off * wordSize);
   209 #endif
   210     //add for compressedoops
   211     __ reinit_heapbase();
   213 #ifdef ASSERT
   214     // make sure we have no pending exceptions
   215     { 
   216       Label L;
   217     	__ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
   218     	__ beq(AT, R0, L); 
   219     	__ delayed()->nop();
   220     	/* FIXME: I do not know how to realize stop in mips arch, do it in the future */
   221     	__ stop("StubRoutines::call_stub: entered with pending exception");
   222     	__ bind(L);
   223     }
   224 #endif
   226     // pass parameters if any
   227     // A5: parameter
   228     // A6: parameter_size
   229     // T0: parameter_size_tmp(--)
   230     // T2: offset(++)
   231     // T3: tmp
   232     Label parameters_done;
   233     // judge if the parameter_size equals 0
   234     __ beq(A6, R0, parameters_done);
   235     __ delayed()->nop();
   236     __ dsll(AT, A6, Interpreter::logStackElementSize);
   237     __ dsub(SP, SP, AT); 
   238     __ move(AT, -StackAlignmentInBytes); 
   239     __ andr(SP, SP , AT); 
   240     // Copy Java parameters in reverse order (receiver last)
   241     // Note that the argument order is inverted in the process
   242     // source is edx[ecx: N-1..0]
   243     // dest   is esp[ebx: 0..N-1]
   244     Label loop;
   245     __ move(T0, A6);
   246     __ move(T2, R0);
   247     __ bind(loop);
   249     // get parameter
   250     __ dsll(T3, T0, LogBytesPerWord);   
   251     __ dadd(T3, T3, A5);	    
   252     __ ld(AT, T3,  -wordSize);
   253     __ dsll(T3, T2, LogBytesPerWord); 
   254     __ dadd(T3, T3, SP); 
   255     __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
   256     __ daddi(T2, T2, 1); 
   257     __ daddi(T0, T0, -1); 
   258     __ bne(T0, R0, loop);
   259     __ delayed()->nop();
   260     // advance to next parameter
   262     // call Java function
   263     __ bind(parameters_done);
   265     // receiver in V0, methodOop in Rmethod
   267     __ move(Rmethod, A3);
   268     __ move(Rsender, SP);             //set sender sp
   269     __ jalr(A4);
   270     __ delayed()->nop();
   271     return_address = __ pc();
   273     Label common_return;
   274     __ bind(common_return);
   276     // store result depending on type
   277     // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
   278     __ ld(T0, FP, result_off * wordSize); 	// result --> T0
   279     Label is_long, is_float, is_double, exit;
   280     __ ld(T2, FP, result_type_off * wordSize);	// result_type --> T2
   281     __ daddi(T3, T2, (-1) * T_LONG);
   282     __ beq(T3, R0, is_long);
   283     __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
   284     __ beq(T3, R0, is_float);
   285     __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
   286     __ beq(T3, R0, is_double);
   287     __ delayed()->nop();
   289     // handle T_INT case
   290     __ sd(V0, T0, 0 * wordSize);
   291     __ bind(exit);
   293     // restore 
   294     __ daddi(SP, FP, 2 * wordSize );
   295     __ ld(RA, SP, RA_off * wordSize);
   296     __ ld(FP, SP, FP_off * wordSize);
   297     __ ld(BCP, SP, BCP_off * wordSize);
   298     __ ld(LVP, SP, LVP_off * wordSize);
   299     __ ld(GP, SP, GP_off * wordSize);
   300     __ ld(TSR, SP, TSR_off * wordSize);
   302     __ ld(S1, SP, S1_off * wordSize);
   303     __ ld(S3, SP, S3_off * wordSize);
   304     __ ld(S4, SP, S4_off * wordSize);
   305     __ ld(S5, SP, S5_off * wordSize);
   306     __ ld(S6, SP, S6_off * wordSize);
   308     // return
   309     __ jr(RA);
   310     __ delayed()->nop();
   312     // handle return types different from T_INT
   313     __ bind(is_long);
   314     __ sd(V0, T0, 0 * wordSize);
   315     //__ sd(V1, T0, 1 * wordSize);
   316     //__ sd(R0, T0, 1 * wordSize);
   317     __ b(exit);
   318     __ delayed()->nop();
   320     __ bind(is_float);
   321     __ swc1(F0, T0, 0 * wordSize);
   322     __ b(exit);
   323     __ delayed()->nop();
   325     __ bind(is_double);
   326     __ sdc1(F0, T0, 0 * wordSize);
   327     //__ sdc1(F1, T0, 1 * wordSize);
   328     //__ sd(R0, T0, 1 * wordSize);
   329     __ b(exit);
   330     __ delayed()->nop();
   331     //FIXME, 1.6 mips version add operation of fpu here
   332     StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
   333     __ b(common_return);
   334     __ delayed()->nop(); 
   335     return start;
   336   }
   338   // Return point for a Java call if there's an exception thrown in
   339   // Java code.  The exception is caught and transformed into a
   340   // pending exception stored in JavaThread that can be tested from
   341   // within the VM.
   342   //
   343   // Note: Usually the parameters are removed by the callee. In case
   344   // of an exception crossing an activation frame boundary, that is
   345   // not the case if the callee is compiled code => need to setup the
   346   // rsp.
   347   //
   348   // rax: exception oop
   350   address generate_catch_exception() {
   351     StubCodeMark mark(this, "StubRoutines", "catch_exception");
   352     address start = __ pc();
   354     Register thread = TREG;
   356     // get thread directly
   357 #ifndef OPT_THREAD
   358     __ ld(thread, FP, thread_off * wordSize);
   359 #endif
   361 #ifdef ASSERT
   362     // verify that threads correspond
   363     { Label L;
   364       __ get_thread(T8);
   365       __ beq(T8, thread, L);
   366       __ delayed()->nop();
   367       __ stop("StubRoutines::catch_exception: threads must correspond");
   368       __ bind(L);
   369     }
   370 #endif
   371     // set pending exception
   372     __ verify_oop(V0);
   373     __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
   374     __ li(AT, (long)__FILE__);
   375     __ sd(AT, thread, in_bytes(Thread::exception_file_offset   ()));
   376     __ li(AT, (long)__LINE__);
   377     __ sd(AT, thread, in_bytes(Thread::exception_line_offset   ()));
   379     // complete return to VM
   380     assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
   381     __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
   382     __ delayed()->nop();
   384     return start;
   385   }
   387   // Continuation point for runtime calls returning with a pending
   388   // exception.  The pending exception check happened in the runtime
   389   // or native call stub.  The pending exception in Thread is
   390   // converted into a Java-level exception.
   391   //
   392   // Contract with Java-level exception handlers:
   393   // rax: exception
   394   // rdx: throwing pc
   395   //
   396   // NOTE: At entry of this stub, exception-pc must be on stack !!
   398   address generate_forward_exception() {
   399     StubCodeMark mark(this, "StubRoutines", "forward exception");
   400     //Register thread = TREG;
   401     Register thread = TREG;
   402     address start = __ pc();
   404     // Upon entry, the sp points to the return address returning into Java
   405     // (interpreted or compiled) code; i.e., the return address becomes the
   406     // throwing pc.
   407     //
   408     // Arguments pushed before the runtime call are still on the stack but
   409     // the exception handler will reset the stack pointer -> ignore them.
   410     // A potential result in registers can be ignored as well.
   412 #ifdef ASSERT
   413     // make sure this code is only executed if there is a pending exception
   414 #ifndef OPT_THREAD
   415     __ get_thread(thread);
   416 #endif
   417     { Label L;
   418       __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
   419       __ bne(AT, R0, L);
   420       __ delayed()->nop();
   421       __ stop("StubRoutines::forward exception: no pending exception (1)");
   422       __ bind(L);
   423     }
   424 #endif
   426     // compute exception handler into T9
   427     __ ld(A1, SP, 0);
   428     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
   429     __ move(T9, V0);
   430     __ pop(V1);
   432 #ifndef OPT_THREAD
   433     __ get_thread(thread);
   434 #endif
   435     __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
   436     __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));
   438 #ifdef ASSERT
   439     // make sure exception is set
   440     { Label L;
   441       __ bne(V0, R0, L);
   442       __ delayed()->nop();
   443       __ stop("StubRoutines::forward exception: no pending exception (2)");
   444       __ bind(L);
   445     }
   446 #endif
   448     // continue at exception handler (return address removed)
   449     // V0: exception
   450     // T9: exception handler
   451     // V1: throwing pc
   452     __ verify_oop(V0);
   453     __ jr(T9);
   454     __ delayed()->nop();
   456     return start;
   457   }
   459   // Support for intptr_t get_previous_fp()
   460   //
   461   // This routine is used to find the previous frame pointer for the
   462   // caller (current_frame_guess). This is used as part of debugging
   463   // ps() is seemingly lost trying to find frames.
   464   // This code assumes that the caller (current_frame_guess) has a frame.
   465   address generate_get_previous_fp() {
   466     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
   467     const Address old_fp       (FP,  0);
   468     const Address older_fp       (V0,  0);
   469     address start = __ pc();
   470     __ enter();    
   471     __ lw(V0, old_fp); // callers fp
   472     __ lw(V0, older_fp); // the frame for ps()
   473     __ leave();
   474     __ jr(RA);
   475     __ delayed()->nop();
   476     return start;
   477   }
   478   // The following routine generates a subroutine to throw an
   479   // asynchronous UnknownError when an unsafe access gets a fault that
   480   // could not be reasonably prevented by the programmer.  (Example:
   481   // SIGBUS/OBJERR.)
   482   address generate_handler_for_unsafe_access() {
   483 		StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
   484 		address start = __ pc();
   485 		__ pushad();                      // push registers
   486 		//  Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
   487 		__ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
   488 		__ delayed()->nop(); 
   489 		__ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord); 
   490 		__ popad();
   491 		__ jr(RA);
   492 		__ delayed()->nop();  
   493 		return start;
   494   }
   496   // Non-destructive plausibility checks for oops
   497   //
   498   // Arguments:
   499   //    all args on stack!
   500   //
   501   // Stack after saving c_rarg3:
   502   //    [tos + 0]: saved c_rarg3
   503   //    [tos + 1]: saved c_rarg2
   504   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
   505   //    [tos + 3]: saved flags
   506   //    [tos + 4]: return address
   507   //  * [tos + 5]: error message (char*)
   508   //  * [tos + 6]: object to verify (oop)
   509   //  * [tos + 7]: saved rax - saved by caller and bashed
   510   //  * = popped on exit
   511   address generate_verify_oop() {
   512 	  StubCodeMark mark(this, "StubRoutines", "verify_oop");
   513 	  address start = __ pc();
   514 	  __ reinit_heapbase();
   515 	  __ verify_oop_subroutine(); 
   516     address end = __ pc();
   517 	  return start;
   518   }
   520   //
   521   //  Generate overlap test for array copy stubs
   522   //
   523   //  Input:
   524   //     A0    -  array1
   525   //     A1    -  array2
   526   //     A2    -  element count
   527   //
   528   //  Note: this code can only use %eax, %ecx, and %edx
   529   //
   531  // use T9 as temp 
   532   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
   533     int elem_size = 1 << log2_elem_size;
   534     Address::ScaleFactor sf = Address::times_1;
   536     switch (log2_elem_size) {
   537       case 0: sf = Address::times_1; break;
   538       case 1: sf = Address::times_2; break;
   539       case 2: sf = Address::times_4; break;
   540       case 3: sf = Address::times_8; break;
   541     }
   543     __ dsll(AT, A2, sf);
   544     __ dadd(AT, AT, A0); 
   545     __ lea(T9, Address(AT, -elem_size)); 
   546     __ dsub(AT, A1, A0); 
   547     __ blez(AT, no_overlap_target); 
   548     __ delayed()->nop(); 
   549     __ dsub(AT, A1, T9); 
   550     __ bgtz(AT, no_overlap_target); 
   551     __ delayed()->nop(); 
   553     // 2016/05/10 aoqi: If A0 = 0xf... and A1 = 0x0..., than goto no_overlap_target 
   554     Label L;
   555     __ bgez(A0, L);
   556     __ delayed()->nop(); 
   557     __ bgtz(A1, no_overlap_target);
   558     __ delayed()->nop(); 
   559     __ bind(L);
   561   }
   563   //
   564   //  Generate store check for array
   565   //
   566   //  Input:
   567   //     %edi    -  starting address
   568   //     %ecx    -  element count
   569   //
   570   //  The 2 input registers are overwritten
   571   //
   573   //
   574   //  Generate store check for array
   575   //
   576   //  Input:
   577   //     T0    -  starting address(edi)
   578   //     T1    -  element count  (ecx)
   579   //
   580   //  The 2 input registers are overwritten
   581   //
   583 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
   585 	void array_store_check() {
   586 		BarrierSet* bs = Universe::heap()->barrier_set();
   587 		assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
   588 		CardTableModRefBS* ct = (CardTableModRefBS*)bs;
   589 		assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
   590 		Label l_0;
   592 		__ dsll(AT, T1, TIMES_OOP);
   593 		__ dadd(AT, T0, AT); 
   594 		__ daddiu(T1, AT, - BytesPerHeapOop);
   596 		__ shr(T0, CardTableModRefBS::card_shift); 
   597 		__ shr(T1, CardTableModRefBS::card_shift);
   599 		__ dsub(T1, T1, T0);   // end --> cards count
   600 		__ bind(l_0);
   602 		__ li48(AT, (long)ct->byte_map_base); 
   603 		__ dadd(AT, AT, T0); 
   604 		__ dadd(AT, AT, T1); 
   605 		__ sb(R0, AT, 0);
   606 		//__ daddi(T1, T1, -4);  
   607 		__ daddi(T1, T1, - 1);
   608 		__ bgez(T1, l_0);
   609 		__ delayed()->nop(); 
   610 	}
   612   // Arguments:
   613   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   614   //             ignored
   615   //   name    - stub name string
   616   //
   617   // Inputs:
   618   //   c_rarg0   - source array address
   619   //   c_rarg1   - destination array address
   620   //   c_rarg2   - element count, treated as ssize_t, can be zero
   621   //
   622   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
   623   // we let the hardware handle it.  The one to eight bytes within words,
   624   // dwords or qwords that span cache line boundaries will still be loaded
   625   // and stored atomically.
   626   //
   627   // Side Effects:
   628   //   disjoint_byte_copy_entry is set to the no-overlap entry point
   629   //   used by generate_conjoint_byte_copy().
   630   //
   631 	address generate_disjoint_byte_copy(bool aligned, const char *name) {
   632 	  StubCodeMark mark(this, "StubRoutines", name);
   633 	  __ align(CodeEntryAlignment);
   634 	  address start = __ pc();
   635 	  Label l_0, l_1, l_2, l_3, l_4, l_5, l_6;
   637 	  __ push(T3);
   638 	  __ push(T0);
   639 	  __ push(T1);
   640 	  __ push(T8);
   641 	  __ move(T3, A0); 
   642 	  __ move(T0, A1);
   643 	  __ move(T1, A2);  
   644 	  __ move(T8, T1);             // original count in T1
   645 	  __ daddi(AT, T1, -3); 
   646 	  __ blez(AT, l_4);  
   647 	  __ delayed()->nop();	
   648 	  if (!aligned) {
   649           //TODO: copy 8 bytes at one time
   650 	    // 2016/5/8 Jin: only when src and dest has the same alignment can we do lw/sw */
   651 	    __ andi(AT, T3, 3); 
   652 	    __ andi(T9, T0, 3); 
   653 	    __ bne(AT, T9, l_5); 
   654 	    __ delayed()->nop();	
   656 	    // align source address at dword address boundary
   657 	    __ move(T1, 4); 
   658 	    __ sub(T1, T1, T3); 
   659 	    __ andi(T1, T1, 3); 
   660 	    __ beq(T1, R0, l_1); 
   661 	    __ delayed()->nop();	
   662 	    __ sub(T8,T8,T1); 
   663 	    __ bind(l_0);
   664 	    __ lb(AT, T3, 0); 
   665 	    __ sb(AT, T0, 0); 
   666 	    __ addi(T3, T3, 1); 
   667 	    __ addi(T0, T0, 1); 
   668 	    __ addi(T1 ,T1, -1);  
   669 	    __ bne(T1, R0, l_0); 
   670 	    __ delayed()->nop(); 
   671 	    __ bind(l_1);
   672 	    __ move(T1, T8); 
   673 	  }
   674 	  __ shr(T1, 2); 
   675 	  __ beq(T1, R0, l_4);     // no dwords to move
   676 	  __ delayed()->nop(); 
   677 	  // copy aligned dwords
   678 	  __ bind(l_2);
   679 	  __ align(16);
   680 	  __ bind(l_3);
   681 	  __ lw(AT, T3, 0);   
   682 	  __ sw(AT, T0, 0 ); 
   683 	  __ addi(T3, T3, 4); 
   684 	  __ addi(T0, T0, 4); 
   685 	  __ addi(T1, T1, -1); 
   686 	  __ bne(T1, R0, l_3); 
   687 	  __ delayed()->nop(); 
   688 	  __ bind(l_4);
   689 	  __ move(T1, T8); 
   690 	  __ andi(T1, T1, 3); 
   691 	  __ beq(T1, R0, l_6);  
   692 	  __ delayed()->nop(); 
   693 	  // copy suffix
   694 	  __ bind(l_5);
   695 	  __ lb(AT, T3, 0); 
   696 	  __ sb(AT, T0, 0); 
   697 	  __ addi(T3, T3, 1);  
   698 	  __ addi(T0, T0, 1);  
   699 	  __ addi(T1, T1, -1); 
   700 	  __ bne(T1, R0, l_5 ); 
   701 	  __ delayed()->nop(); 
   702 	  __ bind(l_6);
   703 	  __ pop(T8); 
   704 	  __ pop(T1); 
   705 	  __ pop(T0); 
   706 	  __ pop(T3); 
   707 	  __ jr(RA); 
   708 	  __ delayed()->nop(); 
   709 	  return start;
   710   }
   712   // Arguments:
   713   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   714   //             ignored
   715   //   name    - stub name string
   716   //
   717   // Inputs:
   718   //   A0   - source array address
   719   //   A1   - destination array address
   720   //   A2   - element count, treated as ssize_t, can be zero
   721   //
   722   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
   723   // we let the hardware handle it.  The one to eight bytes within words,
   724   // dwords or qwords that span cache line boundaries will still be loaded
   725   // and stored atomically.
   726   //
   727   address generate_conjoint_byte_copy(bool aligned, const char *name) {
   728     __ align(CodeEntryAlignment);
   729     StubCodeMark mark(this, "StubRoutines", name);
   730     address start = __ pc();
   732     Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
   733     Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;
   735     address nooverlap_target = aligned ?
   736 	    StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
   737 	    StubRoutines::jbyte_disjoint_arraycopy();
   739     array_overlap_test(nooverlap_target, 0);
   741     const Register from      = A0;   // source array address
   742     const Register to        = A1;   // destination array address
   743     const Register count     = A2;   // elements count
   744     const Register end_from  = T3;   // source array end address
   745     const Register end_to    = T0;   // destination array end address
   746     const Register end_count = T1;   // destination array end address
   748     __ push(end_from);	
   749     __ push(end_to);	
   750     __ push(end_count);	
   751     __ push(T8);	
   753     // copy from high to low
   754     __ move(end_count, count);  
   755     __ dadd(end_from, from, end_count);  
   756     __ dadd(end_to, to, end_count);  
   758     // 2016/05/08 aoqi: If end_from and end_to has differante alignment, unaligned copy is performed.
   759     __ andi(AT, end_from, 3); 
   760     __ andi(T8, end_to, 3); 
   761     __ bne(AT, T8, l_copy_byte); 
   762     __ delayed()->nop();	
   764     // First deal with the unaligned data at the top.
   765     __ bind(l_unaligned);
   766     __ beq(end_count, R0, l_exit); 
   767     __ delayed()->nop(); 
   769     __ andi(AT, end_from, 3);    
   770     __ bne(AT, R0, l_from_unaligned); 
   771     __ delayed()->nop(); 
   773     __ andi(AT, end_to, 3);    
   774     __ beq(AT, R0, l_4_bytes_aligned); 
   775     __ delayed()->nop(); 
   777     __ bind(l_from_unaligned);
   778     __ lb(AT, end_from, -1);   
   779     __ sb(AT, end_to, -1); 
   780     __ daddi(end_from, end_from, -1); 
   781     __ daddi(end_to, end_to, -1); 
   782     __ daddi(end_count, end_count, -1); 
   783     __ b(l_unaligned); 
   784     __ delayed()->nop(); 
   786     // now end_to, end_from point to 4-byte aligned high-ends
   787     //     end_count contains byte count that is not copied.
   788     // copy 4 bytes at a time
   789     __ bind(l_4_bytes_aligned);
   791     __ move(T8, end_count); 
   792     __ daddi(AT, end_count, -3); 
   793     __ blez(AT, l_copy_suffix); 
   794     __ delayed()->nop();	
   796     //__ andi(T8, T8, 3); 
   797     __ lea(end_from, Address(end_from, -4));
   798     __ lea(end_to, Address(end_to, -4));
   800     __ dsrl(end_count, end_count, 2); 
   801     __ align(16);
   802     __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes
   803     __ lw(AT, end_from, 0);   
   804     __ sw(AT, end_to, 0); 
   805     __ addi(end_from, end_from, -4);    
   806     __ addi(end_to, end_to, -4);    
   807     __ addi(end_count, end_count, -1);  
   808     __ bne(end_count, R0, l_copy_4_bytes_loop); 
   809     __ delayed()->nop(); 
   811     __ b(l_copy_suffix);  
   812     __ delayed()->nop(); 
   813     // copy dwords aligned or not with repeat move
   814     // l_copy_suffix
   815     // copy suffix (0-3 bytes)
   816     __ bind(l_copy_suffix); 
   817     __ andi(T8, T8, 3); 
   818     __ beq(T8, R0, l_exit); 
   819     __ delayed()->nop(); 
   820     __ addi(end_from, end_from, 3); 
   821     __ addi(end_to, end_to, 3); 
   822     __ bind(l_copy_suffix_loop);
   823     __ lb(AT, end_from, 0);  
   824     __ sb(AT, end_to, 0); 
   825     __ addi(end_from, end_from, -1);  
   826     __ addi(end_to, end_to, -1);  
   827     __ addi(T8, T8, -1); 
   828     __ bne(T8, R0, l_copy_suffix_loop); 
   829     __ delayed()->nop(); 
   831     __ bind(l_copy_byte);
   832     __ beq(end_count, R0, l_exit); 
   833     __ delayed()->nop(); 
   834     __ lb(AT, end_from, -1);   
   835     __ sb(AT, end_to, -1); 
   836     __ daddi(end_from, end_from, -1); 
   837     __ daddi(end_to, end_to, -1); 
   838     __ daddi(end_count, end_count, -1); 
   839     __ b(l_copy_byte); 
   840     __ delayed()->nop(); 
   842     __ bind(l_exit);
   843     __ pop(T8);	
   844     __ pop(end_count);	
   845     __ pop(end_to);	
   846     __ pop(end_from);	
   847     __ jr(RA); 
   848     __ delayed()->nop(); 
   849     return start;
   850   }
   852   // Generate stub for disjoint short copy.  If "aligned" is true, the
   853   // "from" and "to" addresses are assumed to be heapword aligned.
   854   //
   855   // Arguments for generated stub:
   856   //      from:  A0
   857   //      to:    A1
   858   //  elm.count: A2 treated as signed
   859   //  one element: 2 bytes
   860   //
   861   // Strategy for aligned==true:
   862   //
   863   //  If length <= 9:
   864   //     1. copy 1 elements at a time (l_5)
   865   //
   866   //  If length > 9:
   867   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
   868   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
   869   //     3. copy last element if one was left in step 2. (l_1)
   870   //
   871   //
   872   // Strategy for aligned==false:
   873   //
   874   //  If length <= 9: same as aligned==true case
   875   //
   876   //  If length > 9:
   877   //     1. continue with step 7. if the alignment of from and to mod 4
   878   //        is different.
   879   //     2. align from and to to 4 bytes by copying 1 element if necessary
   880   //     3. at l_2 from and to are 4 byte aligned; continue with
   881   //        6. if they cannot be aligned to 8 bytes because they have
   882   //        got different alignment mod 8.
   883   //     4. at this point we know that both, from and to, have the same
   884   //        alignment mod 8, now copy one element if necessary to get
   885   //        8 byte alignment of from and to.
   886   //     5. copy 4 elements at a time until less than 4 elements are
   887   //        left; depending on step 3. all load/stores are aligned.
   888   //     6. copy 2 elements at a time until less than 2 elements are
   889   //        left. (l_6)
   890   //     7. copy 1 element at a time. (l_5)
   891   //     8. copy last element if one was left in step 6. (l_1)
   892   //
   893   //  TODO:
   894   //
   895   //  1. use loongson 128-bit load/store
   896   //  2. use loop unrolling optimization when len is big enough, for example if len > 0x2000:
   897   //    __ bind(l_x);
   898   //    __ ld(AT, tmp1, 0);
   899   //    __ ld(tmp, tmp1, 8);
   900   //    __ sd(AT, tmp2, 0);
   901   //    __ sd(tmp, tmp2, 8);
   902   //    __ ld(AT, tmp1, 16);
   903   //    __ ld(tmp, tmp1, 24);
   904   //    __ sd(AT, tmp2, 16);
   905   //    __ sd(tmp, tmp2, 24);
   906   //    __ daddi(tmp1, tmp1, 32);
   907   //    __ daddi(tmp2, tmp2, 32);
   908   //    __ daddi(tmp3, tmp3, -16);
   909   //    __ daddi(AT, tmp3, -16);
   910   //    __ bgez(AT, l_x);
   911   //    __ delayed()->nop();
   912   //
   913   address generate_disjoint_short_copy(bool aligned, const char * name) {
   914     StubCodeMark mark(this, "StubRoutines", name);
   915     __ align(CodeEntryAlignment);
   917     Register tmp1 = T0;
   918     Register tmp2 = T1;
   919     Register tmp3 = T3;
   921     address start = __ pc();
   923     __ push(tmp1);
   924     __ push(tmp2);
   925     __ push(tmp3);
   926     __ move(tmp1, A0);
   927     __ move(tmp2, A1);
   928     __ move(tmp3, A2);
   930     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
   931     Label l_debug;
   932     // don't try anything fancy if arrays don't have many elements
   933     __ daddi(AT, tmp3, -9);
   934     __ blez(AT, l_1);
   935     __ delayed()->nop();
   937     if (!aligned) {
   938       __ xorr(AT, A0, A1);
   939       __ andi(AT, AT, 1);
   940       __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
   941       __ delayed()->nop();
   943       __ xorr(AT, A0, A1);
   944       __ andi(AT, AT, 3);
   945       __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
   946       __ delayed()->nop();
   948       // At this point it is guaranteed that both, from and to have the same alignment mod 4.
   950       // Copy 1 element if necessary to align to 4 bytes.
   951       __ andi(AT, A0, 3);
   952       __ beq(AT, R0, l_2);
   953       __ delayed()->nop();
   955       __ lhu(AT, tmp1, 0);
   956       __ daddi(tmp1, tmp1, 2);
   957       __ sh(AT, tmp2, 0);
   958       __ daddi(tmp2, tmp2, 2);
   959       __ daddi(tmp3, tmp3, -1);
   960       __ bind(l_2);
   962       // At this point the positions of both, from and to, are at least 4 byte aligned.
   964       // Copy 4 elements at a time.
   965       // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
   966       __ xorr(AT, tmp1, tmp2);
   967       __ andi(AT, AT, 7);
   968       __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned
   969       __ delayed()->nop();
   971       // Copy a 2-element word if necessary to align to 8 bytes.
   972       __ andi(AT, tmp1, 7);
   973       __ beq(AT, R0, l_7);
   974       __ delayed()->nop();
   976       __ lw(AT, tmp1, 0);
   977       __ daddi(tmp3, tmp3, -2);
   978       __ sw(AT, tmp2, 0);
   979       { // FasterArrayCopy
   980         __ daddi(tmp1, tmp1, 4);
   981         __ daddi(tmp2, tmp2, 4);
   982       }
   983     }
   985     __ bind(l_7);
   987     // Copy 4 elements at a time; either the loads or the stores can
   988     // be unaligned if aligned == false.
   990     { // FasterArrayCopy
   991       __ daddi(AT, tmp3, -15);
   992       __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain
   993       __ delayed()->nop();
   995       __ bind(l_8);
   996       // For Loongson, there is 128-bit memory access. TODO
   997       __ ld(AT, tmp1, 0);
   998       __ sd(AT, tmp2, 0);
   999       __ daddi(tmp1, tmp1, 8);
  1000       __ daddi(tmp2, tmp2, 8);
  1001       __ daddi(tmp3, tmp3, -4);
  1002       __ daddi(AT, tmp3, -4);
  1003       __ bgez(AT, l_8);
  1004       __ delayed()->nop();
  1006     __ bind(l_6);
  1008     // copy 2 element at a time
  1009     { // FasterArrayCopy
  1010       __ daddi(AT, tmp3, -1);
  1011       __ blez(AT, l_1);
  1012       __ delayed()->nop();
  1014       __ bind(l_3);
  1015       __ lw(AT, tmp1, 0);
  1016       __ sw(AT, tmp2, 0);
  1017       __ daddi(tmp1, tmp1, 4);
  1018       __ daddi(tmp2, tmp2, 4);
  1019       __ daddi(tmp3, tmp3, -2);
  1020       __ daddi(AT, tmp3, -2);
  1021       __ bgez(AT, l_3);
  1022       __ delayed()->nop();
  1026     // do single element copy (8 bit), can this happen?
  1027     __ bind(l_1);
  1028     __ beq(R0, tmp3, l_4);
  1029     __ delayed()->nop();
  1031     { // FasterArrayCopy
  1033       __ bind(l_5);
  1034       __ lhu(AT, tmp1, 0);
  1035       __ daddi(tmp3, tmp3, -1);
  1036       __ sh(AT, tmp2, 0);
  1037       __ daddi(tmp1, tmp1, 2);
  1038       __ daddi(tmp2, tmp2, 2);
  1039       __ daddi(AT, tmp3, -1);
  1040       __ bgez(AT, l_5);
  1041       __ delayed()->nop();
  1043     __ bind(l_4);
  1044     __ pop(tmp3);
  1045     __ pop(tmp2);
  1046     __ pop(tmp1);
  1048     __ jr(RA);
  1049     __ delayed()->nop();
  1051     __ bind(l_debug);
  1052     __ stop("generate_disjoint_short_copy should not reach here");
  1053     return start;
  1056   // Arguments:
  1057   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1058   //             ignored
  1059   //   name    - stub name string
  1060   //
  1061   // Inputs:
  1062   //   c_rarg0   - source array address
  1063   //   c_rarg1   - destination array address
  1064   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1065   //
  1066   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  1067   // let the hardware handle it.  The two or four words within dwords
  1068   // or qwords that span cache line boundaries will still be loaded
  1069   // and stored atomically.
  1070   //
  1071   address generate_conjoint_short_copy(bool aligned, const char *name) {
  1072 		Label l_1, l_2, l_3, l_4, l_5;
  1073 		StubCodeMark mark(this, "StubRoutines", name);
  1074 		__ align(CodeEntryAlignment);
  1075 		address start = __ pc();
  1076 		address nooverlap_target = aligned ?
  1077 						StubRoutines::arrayof_jshort_disjoint_arraycopy() :
  1078 						StubRoutines::jshort_disjoint_arraycopy();
  1080 		array_overlap_test(nooverlap_target, 1);
  1082 		__ push(T3);	
  1083 		__ push(T0);	
  1084 		__ push(T1);	
  1085 		__ push(T8);	
  1087 		/*
  1088 			 __ pushl(esi);
  1089 			 __ movl(ecx, Address(esp, 4+12));      // count
  1090 			 __ pushl(edi);
  1091 			 __ movl(esi, Address(esp, 8+ 4));      // from
  1092 			 __ movl(edi, Address(esp, 8+ 8));      // to
  1093 		 */ 
  1094 		__ move(T1, A2);  
  1095 		__ move(T3, A0); 
  1096 		__ move(T0, A1);
  1099 		// copy dwords from high to low
  1100 		// __ leal(esi, Address(esi, ecx, Address::times_2, -4)); // from + count*2 - 4
  1101 		__ sll(AT, T1, Address::times_2); 
  1102 		__ add(AT, T3, AT); 
  1103 		__ lea(T3, Address( AT, -4)); 
  1104 		//__ std();
  1105 		//__ leal(edi, Address(edi, ecx, Address::times_2, -4)); // to + count*2 - 4
  1106 		__ sll(AT,T1 , Address::times_2); 
  1107 		__ add(AT, T0, AT); 
  1108 		__ lea(T0, Address( AT, -4)); 
  1109 		//  __ movl(eax, ecx);
  1110 		__ move(T8, T1); 
  1111 		__ bind(l_1);
  1112 		//   __ sarl(ecx, 1);              // dword count
  1113 		__ sra(T1,T1, 1); 
  1114 		//__ jcc(Assembler::equal, l_4);                   // no dwords to move
  1115 		__ beq(T1, R0, l_4);  
  1116 		__ delayed()->nop(); 
  1117 		/*    __ cmpl(ecx, 32);
  1118 					__ jcc(Assembler::above, l_3);                   // > 32 dwords
  1119 		// copy dwords with loop
  1120 		__ subl(edi, esi);
  1121 		 */     __ align(16);
  1122 		__ bind(l_2);
  1123 		//__ movl(edx, Address(esi));
  1124 		__ lw(AT, T3, 0);   
  1125 		//__ movl(Address(edi, esi, Address::times_1), edx);
  1126 		__ sw(AT, T0, 0); 
  1127 		//__ subl(esi, 4);
  1128 		__ addi(T3, T3, -4); 
  1129 		__ addi(T0, T0, -4); 
  1130 		//__ decl(ecx);
  1131 		__ addi(T1, T1, -1); 
  1132 		//  __ jcc(Assembler::notEqual, l_2);
  1133 		__ bne(T1, R0, l_2); 
  1134 		__ delayed()->nop(); 
  1135 		//  __ addl(edi, esi);
  1136 		// __ jmp(l_4);
  1137 		__ b(l_4);
  1138 		__ delayed()->nop();
  1139 		// copy dwords with repeat move
  1140 		__ bind(l_3);
  1141 		//   __ rep_movl();
  1142 		__ bind(l_4);
  1143 		//  __ andl(eax, 1);              // suffix count
  1144 		__ andi(T8, T8, 1);              // suffix count
  1145 		//__ jcc(Assembler::equal, l_5);                   // no suffix
  1146 		__ beq(T8, R0, l_5 );  
  1147 		__ delayed()->nop(); 
  1148 		// copy suffix
  1149 		//   __ movw(edx, Address(esi, 2));
  1150 		__ lh(AT, T3, 2); 
  1151 		//  __ movw(Address(edi, 2), edx);
  1152 		__ sh(AT, T0, 2); 
  1153 		__ bind(l_5);
  1154 		//    __ cld();
  1155 		//    __ popl(edi);
  1156 		//    __ popl(esi);
  1157 		//   __ ret(0);
  1158 		__ pop(T8);	
  1159 		__ pop(T1);	
  1160 		__ pop(T0);	
  1161 		__ pop(T3);	
  1162 		__ jr(RA); 
  1163 		__ delayed()->nop();   
  1164 		return start;
  1167   // Arguments:
  1168   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1169   //             ignored
  1170   //   is_oop  - true => oop array, so generate store check code
  1171   //   name    - stub name string
  1172   //
  1173   // Inputs:
  1174   //   c_rarg0   - source array address
  1175   //   c_rarg1   - destination array address
  1176   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1177   //
  1178   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1179   // the hardware handle it.  The two dwords within qwords that span
  1180   // cache line boundaries will still be loaded and stored atomically.
  1181   //
  1182   // Side Effects:
  1183   //   disjoint_int_copy_entry is set to the no-overlap entry point
  1184   //   used by generate_conjoint_int_oop_copy().
  1185   //
  1186   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
  1187 		Label l_2, l_3, l_4, l_stchk;
  1188 		StubCodeMark mark(this, "StubRoutines", name);
  1189 		__ align(CodeEntryAlignment);
  1190 		address start = __ pc();
  1191 		/*
  1192 			 __ pushl(esi);
  1193 			 __ movl(ecx, Address(esp, 4+12));      // count
  1194 			 __ pushl(edi);
  1195 			 __ movl(esi, Address(esp, 8+ 4));      // from
  1196 			 __ movl(edi, Address(esp, 8+ 8));      // to
  1197 		 */
  1198 		__ push(T3);	
  1199 		__ push(T0);	
  1200 		__ push(T1);	
  1201 		__ push(T8);	
  1202 		__ move(T1, A2);  
  1203 		__ move(T3, A0); 
  1204 		__ move(T0, A1);
  1206 		// __ cmpl(ecx, 32);
  1207 		// __ jcc(Assembler::belowEqual, l_2);                   // <= 32 dwords
  1208 		// __ rep_movl();
  1209 		__ b(l_2); 	
  1210 		__ delayed()->nop();	
  1211 		if (is_oop) {
  1212 		//  __ jmp(l_stchk);
  1213 			__ b(l_stchk); 
  1214 			__ delayed()->nop(); 
  1216 		//    __ popl(edi);
  1217 		//   __ popl(esi);
  1218 		//  __ ret(0);
  1219 		__ pop(T8);	
  1220 		__ pop(T1);	
  1221 		__ pop(T0);	
  1222 		__ pop(T3);	
  1223 		__ jr(RA); 
  1224 		__ delayed()->nop(); 
  1226 		__ bind(l_2);
  1227 		//  __ subl(edi, esi);
  1228 		//  __ testl(ecx, ecx);
  1229 		// __ jcc(Assembler::zero, l_4);
  1230 		__ beq(T1, R0, l_4);  
  1231 		__ delayed()->nop(); 
  1232 		__ align(16);
  1233 		__ bind(l_3);
  1234 		//__ movl(edx, Address(esi));
  1235 		__ lw(AT, T3, 0);   
  1236 		// __ movl(Address(edi, esi, Address::times_1), edx);
  1237 		__ sw(AT, T0, 0); 
  1238 		// __ addl(esi, 4);
  1239 		__ addi(T3, T3, 4);
  1240 		__ addi(T0, T0, 4);
  1241 		//   __ decl(ecx);
  1242 		__ addi(T1, T1, -1); 
  1243 		//    __ jcc(Assembler::notEqual, l_3);
  1244 		__ bne(T1, R0, l_3); 
  1245 		__ delayed()->nop(); 
  1246 		if (is_oop) {
  1247 			__ bind(l_stchk);
  1248 			//      __ movl(edi, Address(esp, 8+ 8));
  1249 			//     __ movl(ecx, Address(esp, 8+ 12));
  1250 			__ move(T0, A1); 
  1251 			__ move(T1, A2); 
  1252 			array_store_check();
  1254 		__ bind(l_4);
  1255 		//    __ popl(edi);
  1256 		//   __ popl(esi);
  1257 		//  __ ret(0);
  1258 		__ pop(T8);
  1259 		__ pop(T1);
  1260 		__ pop(T0);
  1261 		__ pop(T3);
  1262 		__ jr(RA); 
  1263 		__ delayed()->nop(); 
  1264 		return start;
  1267   // Arguments:
  1268   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1269   //             ignored
  1270   //   is_oop  - true => oop array, so generate store check code
  1271   //   name    - stub name string
  1272   //
  1273   // Inputs:
  1274   //   c_rarg0   - source array address
  1275   //   c_rarg1   - destination array address
  1276   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1277   //
  1278   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1279   // the hardware handle it.  The two dwords within qwords that span
  1280   // cache line boundaries will still be loaded and stored atomically.
  1281   //
  1282   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
  1283 		Label l_2, l_3, l_4, l_stchk;
  1284 		StubCodeMark mark(this, "StubRoutines", name);
  1285 		__ align(CodeEntryAlignment);
  1286 		address start = __ pc();
  1287 		address nooverlap_target;
  1289 		if (is_oop) {
  1290 			nooverlap_target = aligned ?
  1291 							StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1292 							StubRoutines::oop_disjoint_arraycopy();
  1293 		}else {
  1294 			nooverlap_target = aligned ?
  1295 							StubRoutines::arrayof_jint_disjoint_arraycopy() :
  1296 							StubRoutines::jint_disjoint_arraycopy();
  1299 		array_overlap_test(nooverlap_target, 2);
  1301 		__ push(T3);
  1302 		__ push(T0);
  1303 		__ push(T1);
  1304 		__ push(T8);
  1306 		/*
  1307 			 __ pushl(esi);
  1308 			 __ movl(ecx, Address(esp, 4+12));      // count
  1309 			 __ pushl(edi);
  1310 			 __ movl(esi, Address(esp, 8+ 4));      // from
  1311 			 __ movl(edi, Address(esp, 8+ 8));      // to
  1312 		 */ 
  1313 		__ move(T1, A2);  
  1314 		__ move(T3, A0); 
  1315 		__ move(T0, A1);
  1317 		//__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
  1318 		__ sll(AT, T1, Address::times_4); 
  1319 		__ add(AT, T3, AT); 
  1320 		__ lea(T3 , Address(AT, -4)); 
  1321 		//__ std();
  1322 		//__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
  1323 		__ sll(AT, T1, Address::times_4); 
  1324 		__ add(AT, T0, AT); 
  1325 		__ lea(T0 , Address(AT, -4)); 
  1327 		//    __ cmpl(ecx, 32);
  1328 		//   __ jcc(Assembler::above, l_3);                   // > 32 dwords
  1329 		//  __ testl(ecx, ecx);
  1330 		//__ jcc(Assembler::zero, l_4);
  1331 		__ beq(T1, R0, l_4); 
  1332 		__ delayed()->nop();  
  1333 		// __ subl(edi, esi);
  1334 		__ align(16);
  1335 		__ bind(l_2);
  1336 		// __ movl(edx, Address(esi));
  1337 		__ lw(AT, T3, 0);   
  1338 		// __ movl(Address(esi, edi, Address::times_1), edx);
  1339 		__ sw(AT, T0, 0); 
  1340 		// __ subl(esi, 4);
  1341 		__ addi(T3, T3, -4); 
  1342 		__ addi(T0, T0, -4); 
  1343 		//   __ decl(ecx);
  1344 		__ addi(T1, T1, -1); 
  1345 		//__ jcc(Assembler::notEqual, l_2);
  1346 		__ bne(T1, R0, l_2);  
  1347 		__ delayed()->nop(); 
  1348 		if (is_oop) {
  1349 			// __ jmp(l_stchk);
  1350 			__ b( l_stchk); 
  1351 			__ delayed()->nop(); 
  1353 		__ bind(l_4);
  1354 		//      __ cld();
  1355 		//     __ popl(edi);
  1356 		//    __ popl(esi);
  1357 		//   __ ret(0);
  1358 		__ pop(T8); 
  1359 		__ pop(T1); 
  1360 		__ pop(T0); 
  1361 		__ pop(T3); 
  1362 		__ jr(RA); 
  1363 		__ delayed()->nop(); 
  1364 		__ bind(l_3);
  1365 		//   __ rep_movl();
  1366 		if (is_oop) {
  1367 			__ bind(l_stchk);
  1368 			//  __ movl(edi, Address(esp, 8+ 8));
  1369 			__ move(T0, A1);  
  1370 			// __ movl(ecx, Address(esp, 8+ 12));
  1371 			__ move(T1, A2);  
  1372 			array_store_check();
  1374 		//    __ cld();
  1375 		//   __ popl(edi);
  1376 		//   __ popl(esi);
  1377 		//  __ ret(0);
  1378 		__ pop(T8);	
  1379 		__ pop(T1);	
  1380 		__ pop(T0);	
  1381 		__ pop(T3);	
  1382 		__ jr(RA);	
  1383 		__ delayed()->nop(); 
  1384 		return start;
  1387   // Arguments:
  1388   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1389   //             ignored
  1390   //   is_oop  - true => oop array, so generate store check code
  1391   //   name    - stub name string
  1392   //
  1393   // Inputs:
  1394   //   c_rarg0   - source array address
  1395   //   c_rarg1   - destination array address
  1396   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1397   //
  1398   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1399   // the hardware handle it.  The two dwords within qwords that span
  1400   // cache line boundaries will still be loaded and stored atomically.
  1401   //
  1402   // Side Effects:
  1403   //   disjoint_int_copy_entry is set to the no-overlap entry point
  1404   //   used by generate_conjoint_int_oop_copy().
  1405   //
  1406   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1407 		Label l_2, l_3, l_4, l_stchk;
  1408 		StubCodeMark mark(this, "StubRoutines", name);
  1409 		__ align(CodeEntryAlignment);
  1410 		address start = __ pc();
  1411 		__ push(T3);	
  1412 		__ push(T0);	
  1413 		__ push(T1);	
  1414 		__ push(T8);	
  1415 		__ move(T1, A2);  
  1416 		__ move(T3, A0); 
  1417 		__ move(T0, A1);
  1419 		// __ cmpl(ecx, 32);
  1420 		// __ jcc(Assembler::belowEqual, l_2);                   // <= 32 dwords
  1421 		// __ rep_movl();
  1422 		__ b(l_2); 	
  1423 		__ delayed()->nop();	
  1424 		if (is_oop) {
  1425 		//  __ jmp(l_stchk);
  1426 			__ b(l_stchk); 
  1427 			__ delayed()->nop(); 
  1429 		//    __ popl(edi);
  1430 		//   __ popl(esi);
  1431 		//  __ ret(0);
  1432 		__ pop(T8);	
  1433 		__ pop(T1);	
  1434 		__ pop(T0);	
  1435 		__ pop(T3);	
  1436 		__ jr(RA); 
  1437 		__ delayed()->nop(); 
  1439 		__ bind(l_2);
  1440 		//  __ subl(edi, esi);
  1441 		//  __ testl(ecx, ecx);
  1442 		// __ jcc(Assembler::zero, l_4);
  1443 		__ beq(T1, R0, l_4);  
  1444 		__ delayed()->nop(); 
  1445 		__ align(16);
  1446 		__ bind(l_3);
  1447 		//__ movl(edx, Address(esi));
  1448 		__ ld(AT, T3, 0);   
  1449 		// __ movl(Address(edi, esi, Address::times_1), edx);
  1450 		__ sd(AT, T0, 0); 
  1451 		// __ addl(esi, 4);
  1452 		__ addi(T3, T3, 8);
  1453 		__ addi(T0, T0, 8);
  1454 		//   __ decl(ecx);
  1455 		__ addi(T1, T1, -1); 
  1456 		//    __ jcc(Assembler::notEqual, l_3);
  1457 		__ bne(T1, R0, l_3); 
  1458 		__ delayed()->nop(); 
  1459 		if (is_oop) {
  1460 			__ bind(l_stchk);
  1461 			//      __ movl(edi, Address(esp, 8+ 8));
  1462 			//     __ movl(ecx, Address(esp, 8+ 12));
  1463 			__ move(T0, A1); 
  1464 			__ move(T1, A2); 
  1465 			array_store_check();
  1467 		__ bind(l_4);
  1468 		//    __ popl(edi);
  1469 		//   __ popl(esi);
  1470 		//  __ ret(0);
  1471 		__ pop(T8);
  1472 		__ pop(T1);
  1473 		__ pop(T0);
  1474 		__ pop(T3);
  1475 		__ jr(RA); 
  1476 		__ delayed()->nop(); 
  1477 		return start;
  1480   // Arguments:
  1481   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1482   //             ignored
  1483   //   is_oop  - true => oop array, so generate store check code
  1484   //   name    - stub name string
  1485   //
  1486   // Inputs:
  1487   //   c_rarg0   - source array address
  1488   //   c_rarg1   - destination array address
  1489   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1490   //
  1491   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1492   // the hardware handle it.  The two dwords within qwords that span
  1493   // cache line boundaries will still be loaded and stored atomically.
  1494   //
  1495   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1496 		Label l_2, l_3, l_4, l_stchk;
  1497 		StubCodeMark mark(this, "StubRoutines", name);
  1498 		__ align(CodeEntryAlignment);
  1499 		address start = __ pc();
  1500 		address nooverlap_target;
  1502 		if (is_oop) {
  1503 			nooverlap_target = aligned ?
  1504 							StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1505 							StubRoutines::oop_disjoint_arraycopy();
  1506 		}else {
  1507 			nooverlap_target = aligned ?
  1508 							StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1509 							StubRoutines::jlong_disjoint_arraycopy();
  1512 		array_overlap_test(nooverlap_target, 3);
  1514 		__ push(T3);
  1515 		__ push(T0);
  1516 		__ push(T1);
  1517 		__ push(T8);
  1519 		__ move(T1, A2);  
  1520 		__ move(T3, A0); 
  1521 		__ move(T0, A1);
  1523 		//__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
  1524 		__ sll(AT, T1, Address::times_8); 
  1525 		__ add(AT, T3, AT); 
  1526 		__ lea(T3 , Address(AT, -8)); 
  1527 		//__ std();
  1528 		//__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
  1529 		__ sll(AT, T1, Address::times_8); 
  1530 		__ add(AT, T0, AT); 
  1531 		__ lea(T0 , Address(AT, -8)); 
  1533 		//    __ cmpl(ecx, 32);
  1534 		//   __ jcc(Assembler::above, l_3);                   // > 32 dwords
  1535 		//  __ testl(ecx, ecx);
  1536 		//__ jcc(Assembler::zero, l_4);
  1537 		__ beq(T1, R0, l_4); 
  1538 		__ delayed()->nop();  
  1539 		// __ subl(edi, esi);
  1540 		__ align(16);
  1541 		__ bind(l_2);
  1542 		// __ movl(edx, Address(esi));
  1543 		__ ld(AT, T3, 0);   
  1544 		// __ movl(Address(esi, edi, Address::times_1), edx);
  1545 		__ sd(AT, T0, 0); 
  1546 		// __ subl(esi, 4);
  1547 		__ addi(T3, T3, -8); 
  1548 		__ addi(T0, T0, -8); 
  1549 		//   __ decl(ecx);
  1550 		__ addi(T1, T1, -1); 
  1551 		//__ jcc(Assembler::notEqual, l_2);
  1552 		__ bne(T1, R0, l_2);  
  1553 		__ delayed()->nop(); 
  1554 		if (is_oop) {
  1555 			// __ jmp(l_stchk);
  1556 			__ b( l_stchk); 
  1557 			__ delayed()->nop(); 
  1559 		__ bind(l_4);
  1560 		//      __ cld();
  1561 		//     __ popl(edi);
  1562 		//    __ popl(esi);
  1563 		//   __ ret(0);
  1564 		__ pop(T8); 
  1565 		__ pop(T1); 
  1566 		__ pop(T0); 
  1567 		__ pop(T3); 
  1568 		__ jr(RA); 
  1569 		__ delayed()->nop(); 
  1570 		__ bind(l_3);
  1571 		//   __ rep_movl();
  1572 		if (is_oop) {
  1573 			__ bind(l_stchk);
  1574 			//  __ movl(edi, Address(esp, 8+ 8));
  1575 			__ move(T0, A1);  
  1576 			// __ movl(ecx, Address(esp, 8+ 12));
  1577 			__ move(T1, A2);  
  1578 			array_store_check();
  1580 		//    __ cld();
  1581 		//   __ popl(edi);
  1582 		//   __ popl(esi);
  1583 		//  __ ret(0);
  1584 		__ pop(T8);	
  1585 		__ pop(T1);	
  1586 		__ pop(T0);	
  1587 		__ pop(T3);	
  1588 		__ jr(RA);	
  1589 		__ delayed()->nop(); 
  1590 		return start;
  1592 #if 0
  1593   // Arguments:
  1594   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  1595   //             ignored
  1596   //   is_oop  - true => oop array, so generate store check code
  1597   //   name    - stub name string
  1598   //
  1599   // Inputs:
  1600   //   c_rarg0   - source array address
  1601   //   c_rarg1   - destination array address
  1602   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1603   //
  1604   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1605     __ align(CodeEntryAlignment);
  1606     StubCodeMark mark(this, "StubRoutines", name);
  1607     address start = __ pc();
  1609     Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
  1610     const Register from        = rdi;  // source array address
  1611     const Register to          = rsi;  // destination array address
  1612     const Register qword_count = rdx;  // elements count
  1613     const Register saved_count = rcx;
  1615     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1616     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
  1618     address disjoint_copy_entry = NULL;
  1619     if (is_oop) {
  1620       assert(!UseCompressedOops, "shouldn't be called for compressed oops");
  1621       disjoint_copy_entry = disjoint_oop_copy_entry;
  1622       oop_copy_entry  = __ pc();
  1623       array_overlap_test(disjoint_oop_copy_entry, Address::times_8);
  1624     } else {
  1625       disjoint_copy_entry = disjoint_long_copy_entry;
  1626       long_copy_entry = __ pc();
  1627       array_overlap_test(disjoint_long_copy_entry, Address::times_8);
  1629     BLOCK_COMMENT("Entry:");
  1630     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1632     array_overlap_test(disjoint_copy_entry, Address::times_8);
  1633     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
  1634                       // r9 and r10 may be used to save non-volatile registers
  1636     // 'from', 'to' and 'qword_count' are now valid
  1638     if (is_oop) {
  1639       // Save to and count for store barrier
  1640       __ movptr(saved_count, qword_count);
  1641       // No registers are destroyed by this call
  1642       gen_write_ref_array_pre_barrier(to, saved_count);
  1645     __ jmp(L_copy_32_bytes);
  1647     // Copy trailing qwords
  1648   __ BIND(L_copy_8_bytes);
  1649     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
  1650     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
  1651     __ decrement(qword_count);
  1652     __ jcc(Assembler::notZero, L_copy_8_bytes);
  1654     if (is_oop) {
  1655       __ jmp(L_exit);
  1656     } else {
  1657       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
  1658       restore_arg_regs();
  1659       __ xorptr(rax, rax); // return 0
  1660       __ leave(); // required for proper stackwalking of RuntimeStub frame
  1661       __ ret(0);
  1664     // Copy in 32-bytes chunks
  1665     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
  1667     if (is_oop) {
  1668     __ BIND(L_exit);
  1669       __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
  1670       gen_write_ref_array_post_barrier(to, rcx, rax);
  1671       inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
  1672     } else {
  1673       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
  1675     restore_arg_regs();
  1676     __ xorptr(rax, rax); // return 0
  1677     __ leave(); // required for proper stackwalking of RuntimeStub frame
  1678     __ ret(0);
  1680     return start;
  1684   // Helper for generating a dynamic type check.
  1685   // Smashes no registers.
  1686   void generate_type_check(Register sub_klass,
  1687                            Register super_check_offset,
  1688                            Register super_klass,
  1689                            Label& L_success) {
  1690     assert_different_registers(sub_klass, super_check_offset, super_klass);
  1692     BLOCK_COMMENT("type_check:");
  1694     Label L_miss;
  1696     // a couple of useful fields in sub_klass:
  1697     int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
  1698                      Klass::secondary_supers_offset_in_bytes());
  1699     int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
  1700                      Klass::secondary_super_cache_offset_in_bytes());
  1701     Address secondary_supers_addr(sub_klass, ss_offset);
  1702     Address super_cache_addr(     sub_klass, sc_offset);
  1704     // if the pointers are equal, we are done (e.g., String[] elements)
  1705     __ cmpptr(super_klass, sub_klass);
  1706     __ jcc(Assembler::equal, L_success);
  1708     // check the supertype display:
  1709     Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  1710     __ cmpptr(super_klass, super_check_addr); // test the super type
  1711     __ jcc(Assembler::equal, L_success);
  1713     // if it was a primary super, we can just fail immediately
  1714     __ cmpl(super_check_offset, sc_offset);
  1715     __ jcc(Assembler::notEqual, L_miss);
  1717     // Now do a linear scan of the secondary super-klass chain.
  1718     // The repne_scan instruction uses fixed registers, which we must spill.
  1719     // (We need a couple more temps in any case.)
  1720     // This code is rarely used, so simplicity is a virtue here.
  1721     inc_counter_np(SharedRuntime::_partial_subtype_ctr);
  1723       __ push(rax);
  1724       __ push(rcx);
  1725       __ push(rdi);
  1726       assert_different_registers(sub_klass, super_klass, rax, rcx, rdi);
  1728       __ movptr(rdi, secondary_supers_addr);
  1729       // Load the array length.
  1730       __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
  1731       // Skip to start of data.
  1732       __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
  1733       // Scan rcx words at [rdi] for occurance of rax
  1734       // Set NZ/Z based on last compare
  1735       __ movptr(rax, super_klass);
  1736       if (UseCompressedOops) {
  1737         // Compare against compressed form.  Don't need to uncompress because
  1738         // looks like orig rax is restored in popq below.
  1739         __ encode_heap_oop(rax);
  1740         __ repne_scanl();
  1741       } else {
  1742         __ repne_scan();
  1745       // Unspill the temp. registers:
  1746       __ pop(rdi);
  1747       __ pop(rcx);
  1748       __ pop(rax);
  1750       __ jcc(Assembler::notEqual, L_miss);
  1753     // Success.  Cache the super we found and proceed in triumph.
  1754     __ movptr(super_cache_addr, super_klass); // note: rax is dead
  1755     __ jmp(L_success);
  1757     // Fall through on failure!
  1758     __ BIND(L_miss);
  1761   //
  1762   //  Generate checkcasting array copy stub
  1763   //
         //  The emitted loop loads each source oop, stores null elements
         //  unchecked, and passes the klass of every non-null element through
         //  generate_type_check() against ckval before storing it.  On the
         //  first element that fails the check the stub stops, emits the post
         //  barrier for the part already copied and returns ~K.
         //
         //  NOTE(review): the body is written with x86-64 registers and
         //  mnemonics (rax, rdi, jcc, lea, ...) although this is the MIPS stub
         //  generator; presumably it is reference code excluded from the build
         //  by a preprocessor guard that is not visible here -- confirm.
         //
  1764   //  Input:
  1765   //    c_rarg0   - source array address
  1766   //    c_rarg1   - destination array address
  1767   //    c_rarg2   - element count, treated as ssize_t, can be zero
  1768   //    c_rarg3   - size_t ckoff (super_check_offset)
  1769   // not Win64
  1770   //    c_rarg4   - oop ckval (super_klass)
  1771   // Win64
  1772   //    rsp+40    - oop ckval (super_klass)
  1773   //
  1774   //  Output:
  1775   //    rax ==  0  -  success
  1776   //    rax == -1^K - failure, where K is partial transfer count
  1777   //
         //  Returns the address taken right after alignment (before enter());
         //  checkcast_copy_entry is set to the pc right after enter().
         //
  1778   address generate_checkcast_copy(const char *name) {
  1780     Label L_load_element, L_store_element, L_do_card_marks, L_done;
  1782     // Input registers (after setup_arg_regs)
  1783     const Register from        = rdi;   // source array address
  1784     const Register to          = rsi;   // destination array address
  1785     const Register length      = rdx;   // elements count
  1786     const Register ckoff       = rcx;   // super_check_offset
  1787     const Register ckval       = r8;    // super_klass
  1789     // Registers used as temps (r13, r14 are save-on-entry)
  1790     const Register end_from    = from;  // source array end address
  1791     const Register end_to      = r13;   // destination array end address
  1792     const Register count       = rdx;   // -(count_remaining)
  1793     const Register r14_length  = r14;   // saved copy of length
  1794     // End pointers are inclusive, and if length is not zero they point
  1795     // to the last unit copied:  end_to[0] := end_from[0]
  1797     const Register rax_oop    = rax;    // actual oop copied
  1798     const Register r11_klass  = r11;    // oop._klass
  1800     //---------------------------------------------------------------
  1801     // Assembler stub will be used for this call to arraycopy
  1802     // if the two arrays are subtypes of Object[] but the
  1803     // destination array type is not equal to or a supertype
  1804     // of the source type.  Each element must be separately
  1805     // checked.
  1807     __ align(CodeEntryAlignment);
  1808     StubCodeMark mark(this, "StubRoutines", name);
  1809     address start = __ pc();
  1811     __ enter(); // required for proper stackwalking of RuntimeStub frame
           // Secondary entry used by generate_generic_copy(), which jumps here
           // after it has already executed its own enter().
  1813     checkcast_copy_entry  = __ pc();
  1814     BLOCK_COMMENT("Entry:");
  1816 #ifdef ASSERT
  1817     // caller guarantees that the arrays really are different
  1818     // otherwise, we would have to make conjoint checks
  1819     { Label L;
  1820       array_overlap_test(L, TIMES_OOP);
  1821       __ stop("checkcast_copy within a single array");
  1822       __ bind(L);
  1824 #endif //ASSERT
  1826     // allocate spill slots for r13, r14
           // The enumerators are word indices relative to rsp after the subptr
           // below; saved_rbp_offset evaluates to 2, so exactly two words are
           // reserved (one for r13, one for r14).
  1827     enum {
  1828       saved_r13_offset,
  1829       saved_r14_offset,
  1830       saved_rbp_offset,
  1831       saved_rip_offset,
  1832       saved_rarg0_offset
  1833     };
  1834     __ subptr(rsp, saved_rbp_offset * wordSize);
  1835     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
  1836     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
  1837     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
  1838                        // ckoff => rcx, ckval => r8
  1839                        // r9 and r10 may be used to save non-volatile registers
  1840 #ifdef _WIN64
  1841     // last argument (#4) is on stack on Win64
  1842     const int ckval_offset = saved_rarg0_offset + 4;
  1843     __ movptr(ckval, Address(rsp, ckval_offset * wordSize));
  1844 #endif
  1846     // check that int operands are properly extended to size_t
  1847     assert_clean_int(length, rax);
  1848     assert_clean_int(ckoff, rax);
  1850 #ifdef ASSERT
  1851     BLOCK_COMMENT("assert consistent ckoff/ckval");
  1852     // The ckoff and ckval must be mutually consistent,
  1853     // even though caller generates both.
  1854     { Label L;
  1855       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
  1856                         Klass::super_check_offset_offset_in_bytes());
  1857       __ cmpl(ckoff, Address(ckval, sco_offset));
  1858       __ jcc(Assembler::equal, L);
  1859       __ stop("super_check_offset inconsistent");
  1860       __ bind(L);
  1862 #endif //ASSERT
  1864     // Loop-invariant addresses.  They are exclusive end pointers.
  1865     Address end_from_addr(from, length, TIMES_OOP, 0);
  1866     Address   end_to_addr(to,   length, TIMES_OOP, 0);
  1867     // Loop-variant addresses.  They assume post-incremented count < 0.
  1868     Address from_element_addr(end_from, count, TIMES_OOP, 0);
  1869     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
  1871     gen_write_ref_array_pre_barrier(to, count);
  1873     // Copy from low to high addresses, indexed from the end of each array.
  1874     __ lea(end_from, end_from_addr);
  1875     __ lea(end_to,   end_to_addr);
  1876     __ movptr(r14_length, length);        // save a copy of the length
           // 'length' and 'count' are both rdx, so negating count below turns
           // the element count into the negative loop index in place.
  1877     assert(length == count, "");          // else fix next line:
  1878     __ negptr(count);                     // negate and test the length
  1879     __ jcc(Assembler::notZero, L_load_element);
  1881     // Empty array:  Nothing to do.
  1882     __ xorptr(rax, rax);                  // return 0 on (trivial) success
  1883     __ jmp(L_done);
  1885     // ======== begin loop ========
  1886     // (Loop is rotated; its entry is L_load_element.)
  1887     // Loop control:
  1888     //   for (count = -count; count != 0; count++)
  1889     // Base pointers src, dst are biased by 8*(count-1), to the last element.
  1890     __ align(16);
  1892     __ BIND(L_store_element);
  1893     __ store_heap_oop(rax_oop, to_element_addr);  // store the oop
           // NOTE(review): sync() does not match the x86 mnemonics used in the
           // rest of this stub; presumably a memory barrier added while porting
           // -- confirm.
  1894     __ sync();
  1895     __ increment(count);               // increment the count toward zero
  1896     __ jcc(Assembler::zero, L_do_card_marks);
  1898     // ======== loop entry is here ========
  1899     __ BIND(L_load_element);
  1900     __ load_heap_oop(rax_oop, from_element_addr); // load the oop
  1901     __ testptr(rax_oop, rax_oop);
           // A null element needs no type check: store it directly.
  1902     __ jcc(Assembler::zero, L_store_element);
  1904     __ load_klass(r11_klass, rax_oop);// query the object klass
           // Branches to L_store_element when the check succeeds and falls
           // through to the failure code below otherwise.
  1905     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
  1906     // ======== end loop ========
  1908     // It was a real error; we must depend on the caller to finish the job.
  1909     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
  1910     // Emit GC store barriers for the oops we have copied (r14 + rdx),
  1911     // and report their number to the caller.
  1912     assert_different_registers(rax, r14_length, count, to, end_to, rcx);
  1913     __ lea(end_to, to_element_addr);
  1914     gen_write_ref_array_post_barrier(to, end_to, rscratch1);
  1915     __ movptr(rax, r14_length);           // original oops
  1916     __ addptr(rax, count);                // K = (original - remaining) oops
  1917     __ notptr(rax);                       // report (-1^K) to caller
  1918     __ jmp(L_done);
  1920     // Come here on success only.
  1921     __ BIND(L_do_card_marks);
  1922     __ addptr(end_to, -wordSize);         // make an inclusive end pointer
  1923     gen_write_ref_array_post_barrier(to, end_to, rscratch1);
  1924     __ xorptr(rax, rax);                  // return 0 on success
  1926     // Common exit point (success or failure).
           // Reload r13/r14 from the two spill slots written in the prologue;
           // rax already holds the return value on every path that gets here.
  1927     __ BIND(L_done);
  1928     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
  1929     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
  1930     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
  1931     restore_arg_regs();
  1932     __ leave(); // required for proper stackwalking of RuntimeStub frame
  1933     __ ret(0);
  1935     return start;
  1938   //
  1939   //  Generate 'unsafe' array copy stub
  1940   //  Though just as safe as the other stubs, it takes an unscaled
  1941   //  size_t argument instead of an element count.
  1942   //
  1943   //  Input:
  1944   //    c_rarg0   - source array address
  1945   //    c_rarg1   - destination array address
  1946   //    c_rarg2   - byte count, treated as ssize_t, can be zero
  1947   //
  1948   // Examines the alignment of the operands and dispatches
  1949   // to a long, int, short, or byte copy loop.
  1950   //
         // The dispatch ORs source, destination and size together and tests the
         // low bits of the result, so the widest element size is only chosen
         // when all three are multiples of it.  The byte count is shifted down
         // to an element count before jumping, except on the byte path where it
         // is passed through unchanged.
         //
         // This stub emits enter() but no leave()/ret() of its own; every path
         // ends in a jump to one of the *_copy_entry addresses.
         // NOTE(review): presumably those entries are located after the
         // enter() of their own stubs and unwind this frame -- confirm.
         //
  1951   address generate_unsafe_copy(const char *name) {
  1953     Label L_long_aligned, L_int_aligned, L_short_aligned;
  1955     // Input registers (before setup_arg_regs)
  1956     const Register from        = c_rarg0;  // source array address
  1957     const Register to          = c_rarg1;  // destination array address
  1958     const Register size        = c_rarg2;  // byte count (size_t)
  1960     // Register used as a temp
  1961     const Register bits        = rax;      // test copy of low bits
  1963     __ align(CodeEntryAlignment);
  1964     StubCodeMark mark(this, "StubRoutines", name);
  1965     address start = __ pc();
  1967     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1969     // bump this on entry, not on exit:
  1970     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
           // bits = from | to | size: a low bit is clear in 'bits' only if it is
           // clear in all three operands.
  1972     __ mov(bits, from);
  1973     __ orptr(bits, to);
  1974     __ orptr(bits, size);
  1976     __ testb(bits, BytesPerLong-1);
  1977     __ jccb(Assembler::zero, L_long_aligned);
  1979     __ testb(bits, BytesPerInt-1);
  1980     __ jccb(Assembler::zero, L_int_aligned);
  1982     __ testb(bits, BytesPerShort-1);
           // Not even 2-byte aligned: hand the untouched byte count to the
           // byte copy entry.
  1983     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
           // L_short_aligned is reached by falling through from the test above.
  1985     __ BIND(L_short_aligned);
  1986     __ shrptr(size, LogBytesPerShort); // size => short_count
  1987     __ jump(RuntimeAddress(short_copy_entry));
  1989     __ BIND(L_int_aligned);
  1990     __ shrptr(size, LogBytesPerInt); // size => int_count
  1991     __ jump(RuntimeAddress(int_copy_entry));
  1993     __ BIND(L_long_aligned);
  1994     __ shrptr(size, LogBytesPerLong); // size => qword_count
  1995     __ jump(RuntimeAddress(long_copy_entry));
  1997     return start;
  2000   // Perform range checks on the proposed arraycopy.
  2001   // Kills temp, but nothing else.
  2002   // Also, clean the sign bits of src_pos and dst_pos.
         //
         // Emits two checks, each computing position + length as a 32-bit sum
         // in 'temp' and comparing it with the length field of the array; a
         // sum above the array length branches to L_failed.  When both checks
         // pass, src_pos and dst_pos are sign-extended in place to 64 bits.
         //
         //   src, dst         - array oops whose length fields are read
         //   src_pos, dst_pos - 32-bit start positions (rewritten, see above)
         //   length           - 32-bit element count (only read)
         //   temp             - scratch register, clobbered
         //   L_failed         - branch target for a failed check
         //
         // NOTE(review): presumably callers have already rejected negative
         // positions and lengths (generate_generic_copy does so before calling
         // this helper) -- confirm for any other caller.
  2003   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
  2004                               Register src_pos, // source position (c_rarg1)
  2005                               Register dst,     // destination array oop (c_rarg2)
  2006                               Register dst_pos, // destination position (c_rarg3)
  2007                               Register length,
  2008                               Register temp,
  2009                               Label& L_failed) {
  2010     BLOCK_COMMENT("arraycopy_range_checks:");
  2012     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
  2013     __ movl(temp, length);
  2014     __ addl(temp, src_pos);             // src_pos + length
  2015     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
  2016     __ jcc(Assembler::above, L_failed);
  2018     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
  2019     __ movl(temp, length);
  2020     __ addl(temp, dst_pos);             // dst_pos + length
  2021     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
  2022     __ jcc(Assembler::above, L_failed);
  2024     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  2025     // Move with sign extension can be used since they are positive.
  2026     __ movslq(src_pos, src_pos);
  2027     __ movslq(dst_pos, dst_pos);
  2029     BLOCK_COMMENT("arraycopy_range_checks done");
  2032   //
  2033   //  Generate generic array copy stubs
  2034   //
         //  The emitted stub validates its arguments (null arrays, negative
         //  positions or length), reads the layout helper of the source klass
         //  and then tail-jumps to one of:
         //    - byte/short/int/long_copy_entry, chosen by log2(element size),
         //      for primitive arrays of identical klass;
         //    - oop_copy_entry, for object arrays of identical klass or when
         //      the source array klass passes the type check against the
         //      destination array klass;
         //    - checkcast_copy_entry, when every element must be checked.
         //  Every failed check ends at L_failed, which returns -1.
         //
         //  NOTE(review): like its neighbours this body uses x86-64 registers
         //  and mnemonics; presumably it is reference code that is not compiled
         //  for MIPS -- confirm against the enclosing preprocessor guard.
         //
  2035   //  Input:
  2036   //    c_rarg0    -  src oop
  2037   //    c_rarg1    -  src_pos (32-bits)
  2038   //    c_rarg2    -  dst oop
  2039   //    c_rarg3    -  dst_pos (32-bits)
  2040   // not Win64
  2041   //    c_rarg4    -  element count (32-bits)
  2042   // Win64
  2043   //    rsp+40     -  element count (32-bits)
  2044   //
  2045   //  Output:
  2046   //    rax ==  0  -  success
  2047   //    rax == -1^K - failure, where K is partial transfer count
  2048   //
  2049   address generate_generic_copy(const char *name) {
  2051     Label L_failed, L_failed_0, L_objArray;
  2052     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
  2054     // Input registers
  2055     const Register src        = c_rarg0;  // source array oop
  2056     const Register src_pos    = c_rarg1;  // source position
  2057     const Register dst        = c_rarg2;  // destination array oop
  2058     const Register dst_pos    = c_rarg3;  // destination position
  2059     // elements count is on stack on Win64
           // NOTE(review): C_RARG4 is #defined here, but the #undef that follows
           // this function names length_arg, so C_RARG4 appears to stay defined
           // for the rest of the file -- confirm whether that is intended.
  2060 #ifdef _WIN64
  2061 #define C_RARG4 Address(rsp, 6 * wordSize)
  2062 #else
  2063 #define C_RARG4 c_rarg4
  2064 #endif
           // Pad with nops so that the 5-byte jmp emitted at L_failed_0 ends
           // exactly on a CodeEntryAlignment boundary; the assert after the jmp
           // checks this.
  2066     { int modulus = CodeEntryAlignment;
  2067       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
  2068       int advance = target - (__ offset() % modulus);
  2069       if (advance < 0)  advance += modulus;
  2070       if (advance > 0)  __ nop(advance);
  2072     StubCodeMark mark(this, "StubRoutines", name);
  2074     // Short-hop target to L_failed.  Makes for denser prologue code.
  2075     __ BIND(L_failed_0);
  2076     __ jmp(L_failed);
  2077     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
  2079     __ align(CodeEntryAlignment);
  2080     address start = __ pc();
  2082     __ enter(); // required for proper stackwalking of RuntimeStub frame
  2084     // bump this on entry, not on exit:
  2085     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
  2087     //-----------------------------------------------------------------------
  2088     // Assembler stub will be used for this call to arraycopy
  2089     // if the following conditions are met:
  2090     //
  2091     // (1) src and dst must not be null.
  2092     // (2) src_pos must not be negative.
  2093     // (3) dst_pos must not be negative.
  2094     // (4) length  must not be negative.
  2095     // (5) src klass and dst klass should be the same and not NULL.
  2096     // (6) src and dst should be arrays.
  2097     // (7) src_pos + length must not exceed length of src.
  2098     // (8) dst_pos + length must not exceed length of dst.
  2099     //
  2101     //  if (src == NULL) return -1;
  2102     __ testptr(src, src);         // src oop
  2103     size_t j1off = __ offset();
  2104     __ jccb(Assembler::zero, L_failed_0);
  2106     //  if (src_pos < 0) return -1;
  2107     __ testl(src_pos, src_pos); // src_pos (32-bits)
  2108     __ jccb(Assembler::negative, L_failed_0);
  2110     //  if (dst == NULL) return -1;
  2111     __ testptr(dst, dst);         // dst oop
  2112     __ jccb(Assembler::zero, L_failed_0);
  2114     //  if (dst_pos < 0) return -1;
  2115     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  2116     size_t j4off = __ offset();
  2117     __ jccb(Assembler::negative, L_failed_0);
  2119     // The first four tests are very dense code,
  2120     // but not quite dense enough to put four
  2121     // jumps in a 16-byte instruction fetch buffer.
  2122     // That's good, because some branch predictors
  2123     // do not like jumps so close together.
  2124     // Make sure of this.
  2125     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
  2127     // registers used as temp
  2128     const Register r11_length    = r11; // elements count to copy
  2129     const Register r10_src_klass = r10; // array klass
  2130     const Register r9_dst_klass  = r9;  // dest array klass
  2132     //  if (length < 0) return -1;
  2133     __ movl(r11_length, C_RARG4);       // length (elements count, 32-bits value)
  2134     __ testl(r11_length, r11_length);
  2135     __ jccb(Assembler::negative, L_failed_0);
  2137     __ load_klass(r10_src_klass, src);
  2138 #ifdef ASSERT
  2139     //  assert(src->klass() != NULL);
  2140     BLOCK_COMMENT("assert klasses not null");
  2141     { Label L1, L2;
  2142       __ testptr(r10_src_klass, r10_src_klass);
  2143       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
  2144       __ bind(L1);
  2145       __ stop("broken null klass");
  2146       __ bind(L2);
  2147       __ load_klass(r9_dst_klass, dst);
  2148       __ cmpq(r9_dst_klass, 0);
  2149       __ jcc(Assembler::equal, L1);     // this would be broken also
  2150       BLOCK_COMMENT("assert done");
  2152 #endif
  2154     // Load layout helper (32-bits)
  2155     //
  2156     //  |array_tag|     | header_size | element_type |     |log2_element_size|
  2157     // 32        30    24            16              8     2                 0
  2158     //
  2159     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  2160     //
  2162     int lh_offset = klassOopDesc::header_size() * HeapWordSize +
  2163                     Klass::layout_helper_offset_in_bytes();
  2165     const Register rax_lh = rax;  // layout helper
  2167     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
  2169     // Handle objArrays completely differently...
  2170     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  2171     __ cmpl(rax_lh, objArray_lh);
  2172     __ jcc(Assembler::equal, L_objArray);
  2174     //  if (src->klass() != dst->klass()) return -1;
  2175     __ load_klass(r9_dst_klass, dst);
  2176     __ cmpq(r10_src_klass, r9_dst_klass);
  2177     __ jcc(Assembler::notEqual, L_failed);
  2179     //  if (!src->is_Array()) return -1;
  2180     __ cmpl(rax_lh, Klass::_lh_neutral_value);
  2181     __ jcc(Assembler::greaterEqual, L_failed);
  2183     // At this point, it is known to be a typeArray (array_tag 0x3).
  2184 #ifdef ASSERT
  2185     { Label L;
  2186       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
  2187       __ jcc(Assembler::greaterEqual, L);
  2188       __ stop("must be a primitive array");
  2189       __ bind(L);
  2191 #endif
           // r10 is passed as the scratch register, so r10_src_klass is dead
           // from here on; r10 is reused as r10_offset below.
  2193     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2194                            r10, L_failed);
  2196     // typeArrayKlass
  2197     //
  2198     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  2199     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  2200     //
  2202     const Register r10_offset = r10;    // array offset
  2203     const Register rax_elsize = rax_lh; // element size
  2205     __ movl(r10_offset, rax_lh);
  2206     __ shrl(r10_offset, Klass::_lh_header_size_shift);
  2207     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
  2208     __ addptr(src, r10_offset);           // src array offset
  2209     __ addptr(dst, r10_offset);           // dst array offset
  2210     BLOCK_COMMENT("choose copy loop based on element size");
  2211     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
  2213     // next registers should be set before the jump to corresponding stub
  2214     const Register from     = c_rarg0;  // source array address
  2215     const Register to       = c_rarg1;  // destination array address
  2216     const Register count    = c_rarg2;  // elements count
  2218     // 'from', 'to', 'count' registers should be set in such order
  2219     // since they are the same as 'src', 'src_pos', 'dst'.
           // Each case below compares the log2 element size, forms the scaled
           // element addresses and jumps to the matching copy entry; a mismatch
           // falls to the next wider case.
  2221   __ BIND(L_copy_bytes);
  2222     __ cmpl(rax_elsize, 0);
  2223     __ jccb(Assembler::notEqual, L_copy_shorts);
  2224     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
  2225     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
  2226     __ movl2ptr(count, r11_length); // length
  2227     __ jump(RuntimeAddress(byte_copy_entry));
  2229   __ BIND(L_copy_shorts);
  2230     __ cmpl(rax_elsize, LogBytesPerShort);
  2231     __ jccb(Assembler::notEqual, L_copy_ints);
  2232     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
  2233     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
  2234     __ movl2ptr(count, r11_length); // length
  2235     __ jump(RuntimeAddress(short_copy_entry));
  2237   __ BIND(L_copy_ints);
  2238     __ cmpl(rax_elsize, LogBytesPerInt);
  2239     __ jccb(Assembler::notEqual, L_copy_longs);
  2240     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
  2241     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
  2242     __ movl2ptr(count, r11_length); // length
  2243     __ jump(RuntimeAddress(int_copy_entry));
  2245   __ BIND(L_copy_longs);
  2246 #ifdef ASSERT
  2247     { Label L;
  2248       __ cmpl(rax_elsize, LogBytesPerLong);
  2249       __ jcc(Assembler::equal, L);
  2250       __ stop("must be long copy, but elsize is wrong");
  2251       __ bind(L);
  2253 #endif
  2254     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
  2255     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
  2256     __ movl2ptr(count, r11_length); // length
  2257     __ jump(RuntimeAddress(long_copy_entry));
  2259     // objArrayKlass
  2260   __ BIND(L_objArray);
  2261     // live at this point:  r10_src_klass, src[_pos], dst[_pos]
  2263     Label L_plain_copy, L_checkcast_copy;
  2264     //  test array classes for subtyping
  2265     __ load_klass(r9_dst_klass, dst);
  2266     __ cmpq(r10_src_klass, r9_dst_klass); // usual case is exact equality
  2267     __ jcc(Assembler::notEqual, L_checkcast_copy);
  2269     // Identically typed arrays can be copied without element-wise checks.
  2270     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2271                            r10, L_failed);
  2273     __ lea(from, Address(src, src_pos, TIMES_OOP,
  2274                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  2275     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
  2276                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  2277     __ movl2ptr(count, r11_length); // length
           // L_plain_copy is also the success target of the array-klass type
           // check on the checkcast path below; that path has set up from, to
           // and count before it branches here.
  2278   __ BIND(L_plain_copy);
  2279     __ jump(RuntimeAddress(oop_copy_entry));
  2281   __ BIND(L_checkcast_copy);
  2282     // live at this point:  r10_src_klass, !r11_length
  2284       // assert(r11_length == C_RARG4); // will reload from here
             // r11 is reused for the destination klass on this path, which is
             // why the length has to be re-read from C_RARG4 later.
  2285       Register r11_dst_klass = r11;
  2286       __ load_klass(r11_dst_klass, dst);
  2288       // Before looking at dst.length, make sure dst is also an objArray.
  2289       __ cmpl(Address(r11_dst_klass, lh_offset), objArray_lh);
  2290       __ jcc(Assembler::notEqual, L_failed);
  2292       // It is safe to examine both src.length and dst.length.
  2293 #ifndef _WIN64
  2294       arraycopy_range_checks(src, src_pos, dst, dst_pos, C_RARG4,
  2295                              rax, L_failed);
  2296 #else
  2297       __ movl(r11_length, C_RARG4);     // reload
  2298       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2299                              rax, L_failed);
  2300       __ load_klass(r11_dst_klass, dst); // reload
  2301 #endif
  2303       // Marshal the base address arguments now, freeing registers.
  2304       __ lea(from, Address(src, src_pos, TIMES_OOP,
  2305                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
  2306       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
  2307                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
  2308       __ movl(count, C_RARG4);          // length (reloaded)
  2309       Register sco_temp = c_rarg3;      // this register is free now
  2310       assert_different_registers(from, to, count, sco_temp,
  2311                                  r11_dst_klass, r10_src_klass);
  2312       assert_clean_int(count, sco_temp);
  2314       // Generate the type check.
  2315       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
  2316                         Klass::super_check_offset_offset_in_bytes());
  2317       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
  2318       assert_clean_int(sco_temp, rax);
  2319       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
  2321       // Fetch destination element klass from the objArrayKlass header.
  2322       int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
  2323                        objArrayKlass::element_klass_offset_in_bytes());
  2324       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
  2325       __ movl(sco_temp,      Address(r11_dst_klass, sco_offset));
  2326       assert_clean_int(sco_temp, rax);
  2328       // the checkcast_copy loop needs two extra arguments:
  2329       assert(c_rarg3 == sco_temp, "#3 already in place");
  2330       __ movptr(C_RARG4, r11_dst_klass);  // dst.klass.element_klass
  2331       __ jump(RuntimeAddress(checkcast_copy_entry));
           // Shared failure exit: rax = ~0 (i.e. -1), then unwind the frame
           // created by enter() and return.
  2334   __ BIND(L_failed);
  2335     __ xorptr(rax, rax);
  2336     __ notptr(rax); // return -1
  2337     __ leave();   // required for proper stackwalking of RuntimeStub frame
  2338     __ ret(0);
  2340     return start;
  2343 #undef length_arg
  2344 #endif
  2346 //FIXME
         // (The FIXME above is the original marker; its reason is not recorded.)
         //
         // Generate the disjoint (non-overlapping) jlong array copy stub.
         //
         //  Input:
         //    A0 - source array address
         //    A1 - destination array address
         //    A2 - element count (number of 8-byte elements)
         //
         // The emitted loop copies from low to high addresses, one 64-bit
         // element per iteration (ld/sd through AT).  T3, T0 and T1 hold the
         // running source pointer, destination pointer and remaining count;
         // they are spilled on entry and restored before returning through RA.
         //
         // 'aligned' is accepted but does not influence the emitted code.
         // Returns the entry address of the generated stub.
  2347   address generate_disjoint_long_copy(bool aligned, const char *name) {
  2348 	  Label l_1, l_2;
  2349 	  StubCodeMark mark(this, "StubRoutines", name);
  2350 	  __ align(CodeEntryAlignment);
  2351 	  address start = __ pc();
  2353 	  //      __ movl(ecx, Address(esp, 4+8));       // count
  2354 	  //     __ movl(eax, Address(esp, 4+0));       // from
  2355 	  //    __ movl(edx, Address(esp, 4+4));       // to
       	  // Spill T3/T0/T1 BEFORE loading the arguments into them, so the pops
       	  // at the end restore the values the caller had in these registers
       	  // (same order as generate_conjoint_long_copy).  Pushing after the
       	  // moves would only save and restore the argument copies.
  2356 	  __ push(T3); 
  2357 	  __ push(T0);
  2358 	  __ push(T1);
  2359 	  __ move(T1, A2);  
  2360 	  __ move(T3, A0); 
  2361 	  __ move(T0, A1);
  2362 	  //__ subl(edx, eax);
  2363 	  //__ jmp(l_2);
       	  // Enter the loop at its test so that a zero count copies nothing.
  2364 	  __ b(l_2);  
  2365 	  __ delayed()->nop();   
  2366 	  __ align(16);
  2367 	  __ bind(l_1);
  2368 	  //   if (VM_Version::supports_mmx()) {
  2369 	  //     __ movq(mmx0, Address(eax));
  2370 	  //     __ movq(Address(eax, edx, Address::times_1), mmx0);
  2371 	  //   } else {
  2372 	  //   __ fild_d(Address(eax));
  2373 	  __ ld(AT, T3, 0);   
  2374 	  // __ fistp_d(Address(eax, edx, Address::times_1));
  2375 	  __ sd (AT, T0, 0); 
  2376 	  //   }
  2377 	  //   __ addl(eax, 8);
  2378 	  __ addi(T3, T3, 8); 
  2379 	  __ addi(T0, T0, 8); 
  2380 	  __ bind(l_2);
  2381 	  //    __ decl(ecx);
       	  // Decrement first, then loop while the result is still >= 0.
  2382 	  __ addi(T1, T1, -1); 
  2383 	  //    __ jcc(Assembler::greaterEqual, l_1);
  2384 	  __ bgez(T1, l_1);    
  2385 	  __ delayed()->nop(); 
  2386 	  //  if (VM_Version::supports_mmx()) {
  2387 	  //    __ emms();
  2388 	  //  }
  2389 	  //  __ ret(0);
       	  // Restore in reverse order of the pushes above.
  2390 	  __ pop(T1); 
  2391 	  __ pop(T0); 
  2392 	  __ pop(T3); 
  2393 	  __ jr(RA); 
  2394 	  __ delayed()->nop(); 
  2395 	  return start;
  // Generate the conjoint (possibly overlapping) jlong array copy stub.
  //
  //  Input:
  //    A0 - source array address
  //    A1 - destination array address
  //    A2 - element count (number of 8-byte elements)
  //
  // array_overlap_test() is given the matching disjoint stub as its target;
  // NOTE(review): presumably it branches there when the ranges do not
  // overlap -- its body is not visible here, confirm.  The code emitted
  // after it copies from high to low addresses, one 64-bit element per
  // iteration, starting at the last element of each array.
  //
  // The disjoint stub address is read while this stub is being generated, so
  // the disjoint jlong stubs have to be generated first.
  // Returns the entry address of the generated stub.
  2399   address generate_conjoint_long_copy(bool aligned, const char *name) {
  2400 	  Label l_1, l_2;
  2401 	  StubCodeMark mark(this, "StubRoutines", name);
  2402 	  __ align(CodeEntryAlignment);
  2403 	  address start = __ pc();
  2404 	  address nooverlap_target = aligned ?
  2405 		  StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  2406 		  StubRoutines::jlong_disjoint_arraycopy();
       	  // NOTE(review): the second argument (3) is presumably log2 of the
       	  // element size in bytes -- confirm against array_overlap_test().
  2407 	  array_overlap_test(nooverlap_target, 3);
       	  // Spill the three temporaries before they are overwritten below.
  2409 	  __ push(T3); 
  2410 	  __ push(T0); 
  2411 	  __ push(T1); 
  2413 		/*      __ movl(ecx, Address(esp, 4+8));       // count
  2414 						__ movl(eax, Address(esp, 4+0));       // from
  2415 						__ movl(edx, Address(esp, 4+4));       // to
  2416 						__ jmp(l_2);
  2418 		 */
  2419 	  __ move(T1, A2);  
  2420 	  __ move(T3, A0); 
  2421 	  __ move(T0, A1);
       	  // T3 = from + count * 8 - 8 and T0 = to + count * 8 - 8, i.e. the
       	  // addresses of the last element of each array.
       	  // NOTE(review): Address::times_8 is used as the shift amount, which
       	  // relies on its numeric value being 3 -- confirm.
  2422 	  __ sll(AT, T1, Address::times_8); 
  2423 	  __ add(AT, T3, AT); 
  2424 	  __ lea(T3 , Address(AT, -8)); 
  2425 	  __ sll(AT, T1, Address::times_8); 
  2426 	  __ add(AT, T0, AT); 
  2427 	  __ lea(T0 , Address(AT, -8)); 
       	  // Enter the loop at its test so that a zero count copies nothing.
  2431 	  __ b(l_2); 
  2432 	  __ delayed()->nop(); 
  2433 	  __ align(16);
  2434 		__ bind(l_1);
  2435 		/*      if (VM_Version::supports_mmx()) {
  2436 						__ movq(mmx0, Address(eax, ecx, Address::times_8));
  2437 						__ movq(Address(edx, ecx,Address::times_8), mmx0);
  2438 						} else {
  2439 						__ fild_d(Address(eax, ecx, Address::times_8));
  2440 						__ fistp_d(Address(edx, ecx,Address::times_8));
  2442 		 */    
  2443 		__ ld(AT, T3, 0);   
  2444 		__ sd (AT, T0, 0); 
       	  // Step both pointers down to the previous element.
  2445 	  __ addi(T3, T3, -8); 
  2446 	  __ addi(T0, T0,-8); 
  2447 	  __ bind(l_2);
  2448 	  //	    __ decl(ecx);
  2449 	  __ addi(T1, T1, -1); 
  2450 	  //__ jcc(Assembler::greaterEqual, l_1);
  2451 	  __ bgez(T1, l_1); 
  2452 	  __ delayed()->nop(); 
  2453 	  //      if (VM_Version::supports_mmx()) {
  2454 	  //      __ emms();
  2455 	  //   }
  2456 	  //  __ ret(0);
       	  // Restore in reverse order of the pushes above, then return via RA.
  2457 	  __ pop(T1); 
  2458 	  __ pop(T0); 
  2459 	  __ pop(T3); 
  2460 	  __ jr(RA); 
  2461 	  __ delayed()->nop();  
  2462 	  return start;
  // Generate all arraycopy stubs and publish their entry points in the
  // StubRoutines::_*_arraycopy fields.
  //
  // Statement order matters: each generate_* call emits code at the current
  // position, and generate_conjoint_long_copy() reads the jlong disjoint
  // entries from StubRoutines while it runs, so the disjoint stubs are
  // generated before the conjoint ones.
  2465   void generate_arraycopy_stubs() {
           // Oop copies use the int-sized generator with compressed oops and
           // the long-sized generator otherwise.
           // NOTE(review): the two bool arguments are presumably (aligned,
           // is_oop) -- the generators are not visible here, confirm.
  2466     if (UseCompressedOops) {
  2467       StubRoutines::_oop_disjoint_arraycopy    = generate_disjoint_int_oop_copy(false, true, "oop_disjoint_arraycopy");
  2468       StubRoutines::_oop_arraycopy   	= generate_conjoint_int_oop_copy(false, true, "oop_arraycopy");
  2469     } else {
  2470       StubRoutines::_oop_disjoint_arraycopy    = generate_disjoint_long_oop_copy(false, true, "oop_disjoint_arraycopy");
  2471       StubRoutines::_oop_arraycopy   	= generate_conjoint_long_oop_copy(false, true, "oop_arraycopy");
           // Disjoint primitive stubs.
  2474     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  2475     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  2476     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
  2477     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  2478     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
  2480     //  if (VM_Version::supports_mmx())
  2481     //if (false)
  2482     // StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_mmx_copy_aligned("arrayof_jshort_disjoint_arraycopy");
  2483     // else
  2484     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
  2485     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(true, false, "arrayof_jint_disjoint_arraycopy");
  2486     //StubRoutines::_arrayof_oop_disjoint_arraycopy   = generate_disjoint_int_oop_copy(true, true, "arrayof_oop_disjoint_arraycopy");
  2487     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
           // Conjoint primitive stubs.
  2489     StubRoutines::_jbyte_arraycopy  = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  2490     StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
  2491     StubRoutines::_jint_arraycopy   = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
  2492     StubRoutines::_jlong_arraycopy  = generate_conjoint_long_copy(false, "jlong_arraycopy");
  2494     StubRoutines::_arrayof_jbyte_arraycopy  = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
  2495     StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
  2496     StubRoutines::_arrayof_jint_arraycopy   = generate_conjoint_int_oop_copy(true, false, "arrayof_jint_arraycopy");
  2497     //StubRoutines::_arrayof_oop_arraycopy    = generate_conjoint_int_oop_copy(true, true, "arrayof_oop_arraycopy");
  2498     StubRoutines::_arrayof_jlong_arraycopy  = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
           // No separate "arrayof" oop stubs are generated; the arrayof oop
           // entries alias the plain oop stubs created at the top.
  2500     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  2501     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
  2504 //Wang: add a function to implement SafeFetch32 and SafeFetchN
         //
         // Emits a tiny stub consisting of one load followed by a move and a
         // return, and reports three addresses through the out-parameters:
         //   *entry           - start of the stub
         //   *fault_pc        - address of the load instruction (same as *entry)
         //   *continuation_pc - address of the instruction after the load
         // 'size' selects the load width: 4 emits lw, 8 emits ld; any other
         // value hits ShouldNotReachHere().
         //
         // The load targets A1, the register that carries errValue on entry, so
         // A1 holds *adr after a successful load.
         // NOTE(review): presumably a fault handler elsewhere resumes at
         // *continuation_pc when the load at *fault_pc traps, leaving errValue
         // in A1 -- that handler is not visible here, confirm.
  2505   void generate_safefetch(const char* name, int size, address* entry,
  2506                           address* fault_pc, address* continuation_pc) {
  2507     // safefetch signatures:
  2508     //   int      SafeFetch32(int*      adr, int      errValue);
  2509     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  2510     //
  2511     // arguments:
  2512     //   A0 = adr
  2513     //   A1 = errValue
  2514     //
  2515     // result:
  2516     //   V0  = *adr or errValue
  2518     StubCodeMark mark(this, "StubRoutines", name);
  2520     // Entry point, pc or function descriptor.
  2521     *entry = __ pc();
  2523     // Load *adr into A1, may fault.
  2524     *fault_pc = __ pc();
  2525     switch (size) {
  2526       case 4:
  2527         // int32_t
  2528         __ lw(A1, A0, 0); 
  2529         break;
  2530       case 8:
  2531         // int64_t
  2532         __ ld(A1, A0, 0); 
  2533         break;
  2534       default:
  2535         ShouldNotReachHere();
  2538     // return errValue or *adr
  2539     *continuation_pc = __ pc();
           // Move A1 into the result register V0 (add with R0) and return; the
           // branch delay slot is filled with a nop.
  2540     __ addu(V0,A1,R0);
  2541     __ jr(RA);
  2542     __ delayed()->nop();
  2546 #undef __
  2547 #define __ masm->
  2549   // Continuation point for throwing of implicit exceptions that are
  2550   // not handled in the current activation. Fabricates an exception
  2551   // oop and initiates normal exception dispatching in this
  2552   // frame. Since we need to preserve callee-saved values (currently
  2553   // only for C2, but done for C1 as well) we need a callee-saved oop
  2554   // map and therefore have to make these stubs into RuntimeStubs
  2555   // rather than BufferBlobs.  If the compiler needs all registers to
  2556   // be preserved between the fault point and the exception handler
  2557   // then it must assume responsibility for that in
  2558   // AbstractCompiler::continuation_for_implicit_null_exception or
  2559   // continuation_for_implicit_division_by_zero_exception. All other
  2560   // implicit exceptions (e.g., NullPointerException or
  2561   // AbstractMethodError on entry) are either at call sites or
  2562   // otherwise assume that stack unwinding will be initiated, so
  2563   // caller saved registers were assumed volatile in the compiler.
  2564   address generate_throw_exception(const char* name,
  2565                                    address runtime_entry,
  2566                                    bool restore_saved_exception_pc) {
  2567     // Information about frame layout at time of blocking runtime call.
  2568     // Note that we only have to preserve callee-saved registers since
  2569     // the compilers are responsible for supplying a continuation point
  2570 		// if they expect all registers to be preserved.
  2571 //#define aoqi_test
  2572 #ifdef aoqi_test
  2573 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2574 #endif
  2575 		enum layout {
  2576 			thread_off,    // last_java_sp                
  2577 			S7_off,        // callee saved register      sp + 1
  2578 			S6_off,        // callee saved register      sp + 2
  2579 			S5_off,        // callee saved register      sp + 3
  2580 			S4_off,        // callee saved register      sp + 4
  2581 			S3_off,        // callee saved register      sp + 5
  2582 			S2_off,        // callee saved register      sp + 6
  2583 			S1_off,        // callee saved register      sp + 7
  2584 			S0_off,        // callee saved register      sp + 8
  2585 			FP_off,
  2586 			ret_address,
  2587 			framesize
  2588 		};
  2590 		int insts_size = 2048;
  2591 		int locs_size  = 32;
  2593 		//  CodeBuffer* code     = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false, 
  2594 		//  NULL, NULL, NULL, false, NULL, name, false);
  2595 		CodeBuffer code (name , insts_size, locs_size);
  2596 #ifdef aoqi_test
  2597 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2598 #endif
  2599 		OopMapSet* oop_maps  = new OopMapSet();
  2600 #ifdef aoqi_test
  2601 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2602 #endif
  2603 		MacroAssembler* masm = new MacroAssembler(&code);
  2604 #ifdef aoqi_test
  2605 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2606 #endif
  2608 		address start = __ pc();
  2609     	//__ stop("generate_throw_exception");
  2610 		/*
  2611 			 __ move(AT, (int)&jerome1 );
  2612 			 __ sw(SP, AT, 0); 	
  2613 			 __ move(AT, (int)&jerome2 );
  2614 			 __ sw(FP, AT, 0); 	
  2615 			 __ move(AT, (int)&jerome3 );
  2616 			 __ sw(RA, AT, 0); 	
  2617 			 __ move(AT, (int)&jerome4 );
  2618 			 __ sw(R0, AT, 0); 	
  2619 			 __ move(AT, (int)&jerome5 );
  2620 			 __ sw(R0, AT, 0); 	
  2621 			 __ move(AT, (int)&jerome6 );
  2622 			 __ sw(R0, AT, 0); 	
  2623 			 __ move(AT, (int)&jerome7 );
  2624 			 __ sw(R0, AT, 0); 	
  2625 			 __ move(AT, (int)&jerome10 );
  2626 			 __ sw(R0, AT, 0); 	
  2628 			 __ pushad();
  2630 		//__ enter();
  2631 		__ call(CAST_FROM_FN_PTR(address, SharedRuntime::print_call_statistics), 
  2632 		relocInfo::runtime_call_type);
  2633 		__ delayed()->nop();
  2635 		//__ leave();
  2636 		__ popad();
  2638 		 */
  2640 		// This is an inlined and slightly modified version of call_VM
  2641 		// which has the ability to fetch the return PC out of
  2642 		// thread-local storage and also sets up last_Java_sp slightly
  2643 		// differently than the real call_VM
  2644 #ifndef OPT_THREAD	
  2645 		Register java_thread = TREG;
  2646 		__ get_thread(java_thread);
  2647 #else
  2648 		Register java_thread = TREG;
  2649 #endif
  2650 #ifdef aoqi_test
  2651 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2652 #endif
  2653 		if (restore_saved_exception_pc) {
  2654 			__ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset())); // eax
  2657 		__ enter(); // required for proper stackwalking of RuntimeStub frame
  2659 		__ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
  2660 		__ sd(S0, SP, S0_off * wordSize);
  2661 		__ sd(S1, SP, S1_off * wordSize);
  2662 		__ sd(S2, SP, S2_off * wordSize);
  2663 		__ sd(S3, SP, S3_off * wordSize);
  2664 		__ sd(S4, SP, S4_off * wordSize);
  2665 		__ sd(S5, SP, S5_off * wordSize);
  2666 		__ sd(S6, SP, S6_off * wordSize);
  2667 		__ sd(S7, SP, S7_off * wordSize);
  2669 		int frame_complete = __ pc() - start;
  2670 		// push java thread (becomes first argument of C function)
  2671 		__ sd(java_thread, SP, thread_off * wordSize);
  2672 		if (java_thread!=A0)
  2673 			__ move(A0, java_thread);
  2675 		// Set up last_Java_sp and last_Java_fp
  2676 		__ set_last_Java_frame(java_thread, SP, FP, NULL);
  2677 		__ relocate(relocInfo::internal_pc_type);
  2679 			intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + NativeCall::return_address_offset + 4;
  2680 			__ li48(AT, save_pc);
  2682 		__ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset())); 
  2684 		// Call runtime
  2685 		__ call(runtime_entry);
  2686 		__ delayed()->nop();
  2687 		// Generate oop map
  2688 		OopMap* map =  new OopMap(framesize, 0);        
  2689 		oop_maps->add_gc_map(__ offset(),  map);
  2691 		// restore the thread (cannot use the pushed argument since arguments
  2692 		// may be overwritten by C code generated by an optimizing compiler);
  2693 		// however can use the register value directly if it is callee saved.
  2694 #ifndef OPT_THREAD
  2695 		__ get_thread(java_thread);
  2696 #endif
  2698 		__ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  2699 		//  __ reset_last_Java_frame(java_thread, true);
  2700 		__ reset_last_Java_frame(java_thread, true, true);
  2702 		// Restore callee save registers.  This must be done after resetting the Java frame
  2703 		__ ld(S0, SP, S0_off * wordSize);
  2704 		__ ld(S1, SP, S1_off * wordSize);
  2705 		__ ld(S2, SP, S2_off * wordSize);
  2706 		__ ld(S3, SP, S3_off * wordSize);
  2707 		__ ld(S4, SP, S4_off * wordSize);
  2708 		__ ld(S5, SP, S5_off * wordSize);
  2709 		__ ld(S6, SP, S6_off * wordSize);
  2710 		__ ld(S7, SP, S7_off * wordSize);
  2712 		// discard arguments
  2713 		__ addi(SP, SP, (framesize-2) * wordSize); // epilog
  2714 		//	__ leave(); // required for proper stackwalking of RuntimeStub frame
  2715 		__ addi(SP, FP, wordSize);
  2716 		__ ld(FP, SP, -1*wordSize);
  2717 		// check for pending exceptions
  2718 #ifdef ASSERT
  2719 		Label L;
  2720 		__ lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  2721 		__ bne(AT, R0, L);
  2722 		__ delayed()->nop();
  2723 		__ should_not_reach_here();
  2724 		__ bind(L);
  2725 #endif //ASSERT
  2726 		__ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  2727 		__ delayed()->nop();
  2728 #ifdef aoqi_test
  2729 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2730 #endif
  2731 		RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code,frame_complete, 
  2732 										framesize, oop_maps, false);
  2733 #ifdef aoqi_test
  2734 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2735 #endif
  2736 		return stub->entry_point();
  2739   // Initialization
  2740   void generate_initial() {
  2741 /*
  2742 		// Generates all stubs and initializes the entry points
  2744     // This platform-specific stub is needed by generate_call_stub()
  2745     StubRoutines::mips::_mxcsr_std        = generate_fp_mask("mxcsr_std",        0x0000000000001F80);
  2747     // entry points that exist in all platforms Note: This is code
  2748     // that could be shared among different platforms - however the
  2749     // benefit seems to be smaller than the disadvantage of having a
  2750     // much more complicated generator structure. See also comment in
  2751     // stubRoutines.hpp.
  2753     StubRoutines::_forward_exception_entry = generate_forward_exception();
  2755     StubRoutines::_call_stub_entry =
  2756       generate_call_stub(StubRoutines::_call_stub_return_address);
  2758     // is referenced by megamorphic call
  2759     StubRoutines::_catch_exception_entry = generate_catch_exception();
  2761     // atomic calls
  2762     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
  2763     StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
  2764     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
  2765     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  2766     StubRoutines::_atomic_add_entry          = generate_atomic_add();
  2767     StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
  2768     StubRoutines::_fence_entry               = generate_orderaccess_fence();
  2770     StubRoutines::_handler_for_unsafe_access_entry =
  2771       generate_handler_for_unsafe_access();
  2773     // platform dependent
  2774     StubRoutines::mips::_get_previous_fp_entry = generate_get_previous_fp();
  2776     StubRoutines::mips::_verify_mxcsr_entry    = generate_verify_mxcsr();
  2777 */
  2778 		// Generates all stubs and initializes the entry points
  2780 		//-------------------------------------------------------------
  2781 		//-----------------------------------------------------------
  2782 		// entry points that exist in all platforms
  2783 		// Note: This is code that could be shared among different platforms - however the benefit seems to be smaller 
  2784 		// than the disadvantage of having a much more complicated generator structure. 
  2785 		// See also comment in stubRoutines.hpp.
  2786 		StubRoutines::_forward_exception_entry = generate_forward_exception();    
  2787 		StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
  2788 		// is referenced by megamorphic call    
  2789 		StubRoutines::_catch_exception_entry = generate_catch_exception();    
  2791 		StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
  2793 		// platform dependent
  2794 		StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
  2797 void generate_all() {
  2798 #ifdef aoqi_test
  2799 tty->print_cr("%s:%d", __func__, __LINE__);
  2800 #endif
  2801     // Generates all stubs and initializes the entry points
  2803     // These entry points require SharedInfo::stack0 to be set up in
  2804     // non-core builds and need to be relocatable, so they each
  2805     // fabricate a RuntimeStub internally.
  2806 	/*
  2807     StubRoutines::_throw_AbstractMethodError_entry =
  2808       generate_throw_exception("AbstractMethodError throw_exception",
  2809                                CAST_FROM_FN_PTR(address,
  2810                                                 SharedRuntime::
  2811                                                 throw_AbstractMethodError),
  2812                                false);
  2814     StubRoutines::_throw_IncompatibleClassChangeError_entry =
  2815       generate_throw_exception("IncompatibleClassChangeError throw_exception",
  2816                                CAST_FROM_FN_PTR(address,
  2817                                                 SharedRuntime::
  2818                                                 throw_IncompatibleClassChangeError),
  2819                                false);
  2821     StubRoutines::_throw_ArithmeticException_entry =
  2822       generate_throw_exception("ArithmeticException throw_exception",
  2823                                CAST_FROM_FN_PTR(address,
  2824                                                 SharedRuntime::
  2825                                                 throw_ArithmeticException),
  2826                                true);
  2828     StubRoutines::_throw_NullPointerException_entry =
  2829       generate_throw_exception("NullPointerException throw_exception",
  2830                                CAST_FROM_FN_PTR(address,
  2831                                                 SharedRuntime::
  2832                                                 throw_NullPointerException),
  2833                                true);
  2835     StubRoutines::_throw_NullPointerException_at_call_entry =
  2836       generate_throw_exception("NullPointerException at call throw_exception",
  2837                                CAST_FROM_FN_PTR(address,
  2838                                                 SharedRuntime::
  2839                                                 throw_NullPointerException_at_call),
  2840                                false);
  2842     StubRoutines::_throw_StackOverflowError_entry =
  2843       generate_throw_exception("StackOverflowError throw_exception",
  2844                                CAST_FROM_FN_PTR(address,
  2845                                                 SharedRuntime::
  2846                                                 throw_StackOverflowError),
  2847                                false);
  2849     // entry points that are platform specific
  2850     StubRoutines::mips::_f2i_fixup = generate_f2i_fixup();
  2851     StubRoutines::mips::_f2l_fixup = generate_f2l_fixup();
  2852     StubRoutines::mips::_d2i_fixup = generate_d2i_fixup();
  2853     StubRoutines::mips::_d2l_fixup = generate_d2l_fixup();
  2855     StubRoutines::mips::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
  2856     StubRoutines::mips::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
  2857     StubRoutines::mips::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
  2858     StubRoutines::mips::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
  2860     // support for verify_oop (must happen after universe_init)
  2861     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  2863     // arraycopy stubs used by compilers
  2864     generate_arraycopy_stubs();
  2865 	*/
  2866 #ifdef aoqi_test
  2867 tty->print_cr("%s:%d", __func__, __LINE__);
  2868 #endif
  2869 		StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
  2870 #ifdef aoqi_test
  2871 tty->print_cr("%s:%d", __func__, __LINE__);
  2872 #endif
  2873 //		StubRoutines::_throw_ArithmeticException_entry         = generate_throw_exception("ArithmeticException throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException),  true);
  2874 #ifdef aoqi_test
  2875 tty->print_cr("%s:%d", __func__, __LINE__);
  2876 #endif
  2877 //		StubRoutines::_throw_NullPointerException_entry        = generate_throw_exception("NullPointerException throw_exception",         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
  2878 #ifdef aoqi_test
  2879 tty->print_cr("%s:%d", __func__, __LINE__);
  2880 #endif
  2881 		StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
  2882 #ifdef aoqi_test
  2883 tty->print_cr("%s:%d", __func__, __LINE__);
  2884 #endif
  2885 		StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
  2886 #ifdef aoqi_test
  2887 tty->print_cr("%s:%d", __func__, __LINE__);
  2888 #endif
  2890 		//------------------------------------------------------
  2891 		//------------------------------------------------------------------
  2892 		// entry points that are platform specific  
  2894 		// support for verify_oop (must happen after universe_init)
  2895 #ifdef aoqi_test
  2896 tty->print_cr("%s:%d", __func__, __LINE__);
  2897 #endif
  2898 		StubRoutines::_verify_oop_subroutine_entry	   = generate_verify_oop();
  2899 #ifdef aoqi_test
  2900 tty->print_cr("%s:%d", __func__, __LINE__);
  2901 #endif
  2902 #ifndef CORE
  2903 		// arraycopy stubs used by compilers
  2904 		generate_arraycopy_stubs();
  2905 #ifdef aoqi_test
  2906 tty->print_cr("%s:%d", __func__, __LINE__);
  2907 #endif
  2908 #endif
  2910     // Safefetch stubs.
  2911     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
  2912                                                        &StubRoutines::_safefetch32_fault_pc,
  2913                                                        &StubRoutines::_safefetch32_continuation_pc);
  2914     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
  2915                                                        &StubRoutines::_safefetchN_fault_pc,
  2916                                                        &StubRoutines::_safefetchN_continuation_pc);
  2919  public:
  2920   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  2921     if (all) {
  2922       generate_all();
  2923     } else {
  2924       generate_initial();
  2927 }; // end class declaration
  2928 /*
  2929 address StubGenerator::disjoint_byte_copy_entry  = NULL;
  2930 address StubGenerator::disjoint_short_copy_entry = NULL;
  2931 address StubGenerator::disjoint_int_copy_entry   = NULL;
  2932 address StubGenerator::disjoint_long_copy_entry  = NULL;
  2933 address StubGenerator::disjoint_oop_copy_entry   = NULL;
  2935 address StubGenerator::byte_copy_entry  = NULL;
  2936 address StubGenerator::short_copy_entry = NULL;
  2937 address StubGenerator::int_copy_entry   = NULL;
  2938 address StubGenerator::long_copy_entry  = NULL;
  2939 address StubGenerator::oop_copy_entry   = NULL;
  2941 address StubGenerator::checkcast_copy_entry = NULL;
  2942 */
  2943 void StubGenerator_generate(CodeBuffer* code, bool all) {
  2944   StubGenerator g(code, all);

mercurial