src/cpu/mips/vm/stubGenerator_mips_64.cpp

author:      aoqi
date:        Mon, 30 May 2016 02:01:38 -0400
changeset:   13:bc227c49eaae
parent:      8:cf5765c81f87
child:       32:3b95e10c12fa
permissions: -rw-r--r--

[C2] Rewrite generate_disjoint_short_copy.
Eliminated unaligned accesses and optimized the copy algorithm:
xml.transform improved by 50%, total GEO improved by 13%.
Copy Algorithm:
Generate stub for disjoint short copy. If "aligned" is true, the
"from" and "to" addresses are assumed to be heapword aligned.

Arguments for generated stub:
  from:        A0
  to:          A1
  elm.count:   A2 treated as signed
  one element: 2 bytes

Strategy for aligned==true:

If length <= 9:
  1. copy 1 element at a time (l_5)

If length > 9:
  1. copy 4 elements at a time until less than 4 elements are left (l_7)
  2. copy 2 elements at a time until less than 2 elements are left (l_6)
  3. copy last element if one was left in step 2. (l_1)


Strategy for aligned==false:

If length <= 9: same as aligned==true case

If length > 9:
  1. continue with step 7 if the alignment of from and to mod 4
     is different.
  2. align from and to to 4 bytes by copying 1 element if necessary.
  3. at l_2, from and to are 4-byte aligned; continue with
     step 6 if they cannot be aligned to 8 bytes because they have
     different alignment mod 8.
  4. at this point we know that both from and to have the same
     alignment mod 8; now copy one element if necessary to get
     8-byte alignment of from and to.
  5. copy 4 elements at a time until less than 4 elements are
     left; after steps 3 and 4, all loads/stores are aligned.
  6. copy 2 elements at a time until less than 2 elements are
     left. (l_6)
  7. copy 1 element at a time. (l_5)
  8. copy last element if one was left in step 6. (l_1)
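
As a rough illustration (not part of the generated stub), the aligned==false
strategy corresponds to the following C-style sketch; the function name and
the src/dst/n parameters are stand-ins, not identifiers from the patch:

    void disjoint_short_copy_sketch(jshort* src, jshort* dst, long n) {
      if (n > 9 && ((((uintptr_t)src ^ (uintptr_t)dst) & 3) == 0)) {
        if ((uintptr_t)src & 3) {                  // step 2: align to 4 bytes
          *dst++ = *src++; n--;
        }
        if ((((uintptr_t)src ^ (uintptr_t)dst) & 7) == 0) {
          if ((uintptr_t)src & 7) {                // step 4: align to 8 bytes
            *(juint*)dst = *(juint*)src; src += 2; dst += 2; n -= 2;
          }
          while (n >= 4) {                         // step 5: 4 elements at a time
            *(julong*)dst = *(julong*)src; src += 4; dst += 4; n -= 4;
          }
        }
        while (n >= 2) {                           // step 6: 2 elements at a time
          *(juint*)dst = *(juint*)src; src += 2; dst += 2; n -= 2;
        }
      }
      while (n > 0) {                              // steps 7/8: 1 element at a time
        *dst++ = *src++; n--;
      }
    }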

TODO:

1. use Loongson 128-bit load/store
2. use loop unrolling optimization when len is big enough, for example if
   len > 0x2000:
__ bind(l_x);
__ ld(AT, tmp1, 0);
__ ld(tmp, tmp1, 8);
__ sd(AT, tmp2, 0);
__ sd(tmp, tmp2, 8);
__ ld(AT, tmp1, 16);
__ ld(tmp, tmp1, 24);
__ sd(AT, tmp2, 16);
__ sd(tmp, tmp2, 24);
__ daddi(tmp1, tmp1, 32);
__ daddi(tmp2, tmp2, 32);
__ daddi(tmp3, tmp3, -16);
__ daddi(AT, tmp3, -16);
__ bgez(AT, l_x);
__ delayed()->nop();

/*
 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_mips.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
//#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
//#define a__ ((Assembler*)_masm)->

//#ifdef PRODUCT
//#define BLOCK_COMMENT(str) /* nothing */
//#else
//#define BLOCK_COMMENT(str) __ block_comment(str)
//#endif

//#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
// Stub Code definitions

static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc = thread->saved_exception_pc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine:  return garbage from the load
  // therefore, compute npc
  //address npc = Assembler::locate_next_instruction(pc);
  address npc = (address)((unsigned long)pc + sizeof(unsigned long));

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}
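
// Note: sizeof(unsigned long) is 8 in this LP64 port, so npc above advances
// two 4-byte MIPS instruction slots past the faulting pc (compare the
// commented-out Assembler::locate_next_instruction() variant).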
class StubGenerator: public StubCodeGenerator {
 private:

  // ABI mips n64
  // This figure is not the MIPS ABI; it is the convention for calling Java from C.
  // Call stubs are used to call Java from C.
  //
  //    [ return_from_Java     ]
  //    [ argument word n-1    ] <--- sp
  //      ...
  //    [ argument word 0      ]
  //      ...
  //-10 [ S6                   ]
  // -9 [ S5                   ]
  // -8 [ S4                   ]
  // -7 [ S3                   ]
  // -6 [ S0                   ]
  // -5 [ TSR(S2)              ]
  // -4 [ LVP(S7)              ]
  // -3 [ BCP(S1)              ]
  // -2 [ saved fp             ] <--- fp_after_call
  // -1 [ return address       ]
  //  0 [ ptr. to call wrapper ] <--- a0 (old sp -->) fp
  //  1 [ result               ] <--- a1
  //  2 [ result_type          ] <--- a2
  //  3 [ method               ] <--- a3
  //  4 [ entry_point          ] <--- a4
  //  5 [ parameters           ] <--- a5
  //  6 [ parameter_size       ] <--- a6
  //  7 [ thread               ] <--- a7
  //
  // _LP64: n64 does not save the parameters on the stack.
  //
  //    [ return_from_Java     ]
  //    [ argument word n-1    ] <--- sp
  //      ...
  //    [ argument word 0      ]
  //      ...
  //-14 [ thread               ]
  //-13 [ result_type          ] <--- a2
  //-12 [ result               ] <--- a1
  //-11 [ ptr. to call wrapper ] <--- a0
  //-10 [ S6                   ]
  // -9 [ S5                   ]
  // -8 [ S4                   ]
  // -7 [ S3                   ]
  // -6 [ S0                   ]
  // -5 [ TSR(S2)              ]
  // -4 [ LVP(S7)              ]
  // -3 [ BCP(S1)              ]
  // -2 [ saved fp             ] <--- fp_after_call
  // -1 [ return address       ]
  //  0 [                      ] <--- old sp
  /*
   * 2014/01/16 Fu: Find a right place in the call_stub for GP.
   * GP will point to the starting point of Interpreter::dispatch_table(itos).
   * It should be saved/restored before/after Java calls.
   */
  enum call_stub_layout {
    RA_off          = -1,
    FP_off          = -2,
    BCP_off         = -3,
    LVP_off         = -4,
    TSR_off         = -5,
    S1_off          = -6,
    S3_off          = -7,
    S4_off          = -8,
    S5_off          = -9,
    S6_off          = -10,
    result_off      = -11,
    result_type_off = -12,
    thread_off      = -13,
    total_off       = thread_off - 3,
    GP_off          = -16,
  };
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!

    // stub code
    // save ra and fp
    __ sd(RA, SP, RA_off * wordSize);
    __ sd(FP, SP, FP_off * wordSize);
    __ sd(BCP, SP, BCP_off * wordSize);
    __ sd(LVP, SP, LVP_off * wordSize);
    __ sd(GP, SP, GP_off * wordSize);
    __ sd(TSR, SP, TSR_off * wordSize);
    __ sd(S1, SP, S1_off * wordSize);
    __ sd(S3, SP, S3_off * wordSize);
    __ sd(S4, SP, S4_off * wordSize);
    __ sd(S5, SP, S5_off * wordSize);
    __ sd(S6, SP, S6_off * wordSize);

    __ li48(GP, (long)Interpreter::dispatch_table(itos));

    // I think 14 is the max gap between argument and callee saved register
    __ daddi(FP, SP, (-2) * wordSize);
    __ daddi(SP, SP, total_off * wordSize);
    //FIXME, aoqi. find a suitable place to save A1 & A2.
    /*
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, 3 * wordSize);
    __ sd(A2, FP, 4 * wordSize);
    __ sd(A3, FP, 5 * wordSize);
    __ sd(A4, FP, 6 * wordSize);
    __ sd(A5, FP, 7 * wordSize);
    __ sd(A6, FP, 8 * wordSize);
    __ sd(A7, FP, 9 * wordSize);
    */
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, result_off * wordSize);
    __ sd(A2, FP, result_type_off * wordSize);
    __ sd(A7, FP, thread_off * wordSize);

#ifdef OPT_THREAD
    //__ get_thread(TREG);
    __ move(TREG, A7);

    //__ ld(TREG, FP, thread_off * wordSize);
#endif
    //add for compressedoops
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
      __ beq(AT, R0, L);
      __ delayed()->nop();
      /* FIXME: I do not know how to realize stop in mips arch, do it in the future */
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    // A5: parameter
    // A6: parameter_size
    // T0: parameter_size_tmp(--)
    // T2: offset(++)
    // T3: tmp
    Label parameters_done;
    // judge if the parameter_size equals 0
    __ beq(A6, R0, parameters_done);
    __ delayed()->nop();
    __ dsll(AT, A6, Interpreter::logStackElementSize);
    __ dsub(SP, SP, AT);
    __ move(AT, -StackAlignmentInBytes);
    __ andr(SP, SP, AT);
    // Copy Java parameters in reverse order (receiver last)
    // Note that the argument order is inverted in the process
    // source is edx[ecx: N-1..0]
    // dest   is esp[ebx: 0..N-1]
    Label loop;
    __ move(T0, A6);
    __ move(T2, R0);
    __ bind(loop);

    // get parameter
    __ dsll(T3, T0, LogBytesPerWord);
    __ dadd(T3, T3, A5);
    __ ld(AT, T3, -wordSize);
    __ dsll(T3, T2, LogBytesPerWord);
    __ dadd(T3, T3, SP);
    __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
    __ daddi(T2, T2, 1);
    __ daddi(T0, T0, -1);
    __ bne(T0, R0, loop);
    __ delayed()->nop();
    // advance to next parameter

    // call Java function
    __ bind(parameters_done);

    // receiver in V0, methodOop in Rmethod
    __ move(Rmethod, A3);
    __ move(Rsender, SP);             // set sender sp
    __ jalr(A4);
    __ delayed()->nop();
    return_address = __ pc();

    Label common_return;
    __ bind(common_return);

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ ld(T0, FP, result_off * wordSize);       // result --> T0
    Label is_long, is_float, is_double, exit;
    __ ld(T2, FP, result_type_off * wordSize);  // result_type --> T2
    __ daddi(T3, T2, (-1) * T_LONG);
    __ beq(T3, R0, is_long);
    __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
    __ beq(T3, R0, is_float);
    __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
    __ beq(T3, R0, is_double);
    __ delayed()->nop();

    // handle T_INT case
    __ sd(V0, T0, 0 * wordSize);
    __ bind(exit);

    // restore
    __ daddi(SP, FP, 2 * wordSize);
    __ ld(RA, SP, RA_off * wordSize);
    __ ld(FP, SP, FP_off * wordSize);
    __ ld(BCP, SP, BCP_off * wordSize);
    __ ld(LVP, SP, LVP_off * wordSize);
    __ ld(GP, SP, GP_off * wordSize);
    __ ld(TSR, SP, TSR_off * wordSize);

    __ ld(S1, SP, S1_off * wordSize);
    __ ld(S3, SP, S3_off * wordSize);
    __ ld(S4, SP, S4_off * wordSize);
    __ ld(S5, SP, S5_off * wordSize);
    __ ld(S6, SP, S6_off * wordSize);

    // return
    __ jr(RA);
    __ delayed()->nop();

    // handle return types different from T_INT
    __ bind(is_long);
    __ sd(V0, T0, 0 * wordSize);
    //__ sd(V1, T0, 1 * wordSize);
    __ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_float);
    __ swc1(F0, T0, 0 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_double);
    __ sdc1(F0, T0, 0 * wordSize);
    //__ sdc1(F1, T0, 1 * wordSize);
    __ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();
    //FIXME, 1.6 mips version add operation of fpu here
    StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
    __ b(common_return);
    __ delayed()->nop();
    return start;
  }
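
  // For orientation (not from this changeset): HotSpot invokes the stub
  // generated above through a CallStub function pointer roughly of the form
  //   call_stub(call_wrapper, result, result_type, method,
  //             entry_point, parameters, parameter_size, thread);
  // which is what the A0..A7 assignments in the frame-layout comment before
  // generate_call_stub() reflect.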
  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    Register thread = TREG;

    // get thread directly
#ifndef OPT_THREAD
    __ ld(thread, FP, thread_off * wordSize);
#endif

#ifdef ASSERT
    // verify that threads correspond
    { Label L;
      __ get_thread(T8);
      __ beq(T8, thread, L);
      __ delayed()->nop();
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    // set pending exception
    __ verify_oop(V0);
    __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ li(AT, (long)__FILE__);
    __ sd(AT, thread, in_bytes(Thread::exception_file_offset()));
    __ li(AT, (long)__LINE__);
    __ sd(AT, thread, in_bytes(Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
    __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
    __ delayed()->nop();

    return start;
  }
  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    Register thread = TREG;
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into Java
    // (interpreted or compiled) code; i.e., the return address becomes the
    // throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack but
    // the exception handler will reset the stack pointer -> ignore them.
    // A potential result in registers can be ignored as well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    { Label L;
      __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
      __ bne(AT, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into T9
    __ ld(A1, SP, 0);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
    __ move(T9, V0);
    __ pop(V1);

#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ bne(V0, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // V0: exception
    // T9: exception handler
    // V1: throwing pc
    __ verify_oop(V0);
    __ jr(T9);
    __ delayed()->nop();

    return start;
  }
  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). This is used as part of debugging,
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp  (FP, 0);
    const Address older_fp(V0, 0);
    address start = __ pc();
    __ enter();
    __ ld(V0, old_fp);   // caller's fp (64-bit load)
    __ ld(V0, older_fp); // the frame for ps()
    __ leave();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
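
  // In other words (illustrative only): the stub computes V0 = *(intptr_t*)FP
  // and then V0 = *(intptr_t*)V0, returning the frame pointer saved two
  // frames up the chain.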
  // The following routine generates a subroutine to throw an
  // asynchronous UnknownError when an unsafe access gets a fault that
  // could not be reasonably prevented by the programmer.  (Example:
  // SIGBUS/OBJERR.)
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();
    __ pushad();                      // push registers
    //  Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
    __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
    __ delayed()->nop();
    __ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord);
    __ popad();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();
    __ reinit_heapbase();
    __ verify_oop_subroutine();
    address end = __ pc();
    return start;
  }
  //
  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //     A0    -  array1
  //     A1    -  array2
  //     A2    -  element count
  //
  //  Note: this code can only use %eax, %ecx, and %edx
  //

  // use T9 as temp
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    int elem_size = 1 << log2_elem_size;
    Address::ScaleFactor sf = Address::times_1;

    switch (log2_elem_size) {
      case 0: sf = Address::times_1; break;
      case 1: sf = Address::times_2; break;
      case 2: sf = Address::times_4; break;
      case 3: sf = Address::times_8; break;
    }

    __ dsll(AT, A2, sf);
    __ dadd(AT, AT, A0);
    __ lea(T9, Address(AT, -elem_size));
    __ dsub(AT, A1, A0);
    __ blez(AT, no_overlap_target);
    __ delayed()->nop();
    __ dsub(AT, A1, T9);
    __ bgtz(AT, no_overlap_target);
    __ delayed()->nop();

    // 2016/05/10 aoqi: if A0 = 0xf... and A1 = 0x0..., then goto no_overlap_target
    Label L;
    __ bgez(A0, L);
    __ delayed()->nop();
    __ bgtz(A1, no_overlap_target);
    __ delayed()->nop();
    __ bind(L);
  }
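
  // Illustrative pseudo-code for the test above (names are not from this
  // file): with elem_size = 1 << log2_elem_size, control transfers to
  // no_overlap_target when
  //   to <= from  ||  to > from + (count - 1) * elem_size
  // i.e. when a forward element-by-element copy cannot clobber not-yet-read
  // source elements; the trailing bgez/bgtz pair handles the case where the
  // signed subtractions above would overflow because from and to lie on
  // opposite sides of the address-space sign boundary.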
  //
  //  Generate store check for array
  //
  //  Input:
  //     %edi    -  starting address
  //     %ecx    -  element count
  //
  //  The 2 input registers are overwritten
  //

  //
  //  Generate store check for array
  //
  //  Input:
  //     T0    -  starting address(edi)
  //     T1    -  element count  (ecx)
  //
  //  The 2 input registers are overwritten
  //

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

  void array_store_check() {
    BarrierSet* bs = Universe::heap()->barrier_set();
    assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
    CardTableModRefBS* ct = (CardTableModRefBS*)bs;
    assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
    Label l_0;

    __ dsll(AT, T1, TIMES_OOP);
    __ dadd(AT, T0, AT);
    __ daddiu(T1, AT, -BytesPerHeapOop);

    __ shr(T0, CardTableModRefBS::card_shift);
    __ shr(T1, CardTableModRefBS::card_shift);

    __ dsub(T1, T1, T0);   // end --> cards count
    __ bind(l_0);

    __ li48(AT, (long)ct->byte_map_base);
    __ dadd(AT, AT, T0);
    __ dadd(AT, AT, T1);
    __ sb(R0, AT, 0);
    //__ daddi(T1, T1, -4);
    __ daddi(T1, T1, -1);
    __ bgez(T1, l_0);
    __ delayed()->nop();
  }
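
  // The loop above is card-table dirtying; an illustrative C sketch (local
  // names here are stand-ins):
  //   jbyte* base  = ct->byte_map_base;
  //   size_t first = (uintptr_t)start >> CardTableModRefBS::card_shift;
  //   size_t last  = ((uintptr_t)start + (count - 1) * BytesPerHeapOop)
  //                  >> CardTableModRefBS::card_shift;
  //   for (long i = (long)(last - first); i >= 0; i--)
  //     base[first + i] = 0;   // 0 == dirty card value
  // i.e. every card spanned by the stored oops is dirtied, from the last
  // card down to the first.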
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, const char *name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    Label l_0, l_1, l_2, l_3, l_4, l_5, l_6;

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ move(T3, A0);
    __ move(T0, A1);
    __ move(T1, A2);
    __ move(T8, T1);             // original count in T8
    __ daddi(AT, T1, -3);
    __ blez(AT, l_4);
    __ delayed()->nop();
    if (!aligned) {
      //TODO: copy 8 bytes at one time
      // 2016/5/8 Jin: only when src and dest have the same alignment can we do lw/sw
      __ andi(AT, T3, 3);
      __ andi(T9, T0, 3);
      __ bne(AT, T9, l_5);
      __ delayed()->nop();

      // align source address at dword address boundary
      __ move(T1, 4);
      __ sub(T1, T1, T3);
      __ andi(T1, T1, 3);
      __ beq(T1, R0, l_1);
      __ delayed()->nop();
      __ sub(T8, T8, T1);
      __ bind(l_0);
      __ lb(AT, T3, 0);
      __ sb(AT, T0, 0);
      __ addi(T3, T3, 1);
      __ addi(T0, T0, 1);
      __ addi(T1, T1, -1);
      __ bne(T1, R0, l_0);
      __ delayed()->nop();
      __ bind(l_1);
      __ move(T1, T8);
    }
    __ shr(T1, 2);
    __ beq(T1, R0, l_4);     // no dwords to move
    __ delayed()->nop();
    // copy aligned dwords
    __ bind(l_2);
    __ align(16);
    __ bind(l_3);
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0);
    __ addi(T3, T3, 4);
    __ addi(T0, T0, 4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();
    __ bind(l_4);
    __ move(T1, T8);
    __ andi(T1, T1, 3);
    __ beq(T1, R0, l_6);
    __ delayed()->nop();
    // copy suffix
    __ bind(l_5);
    __ lb(AT, T3, 0);
    __ sb(AT, T0, 0);
    __ addi(T3, T3, 1);
    __ addi(T0, T0, 1);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_5);
    __ delayed()->nop();
    __ bind(l_6);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
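
  // Structurally (illustrative only): when from and to share alignment mod 4
  // the stub does prefix/body/suffix, roughly
  //   while ((from & 3) && n) { *to++ = *from++; n--; }    // l_0: align source
  //   while (n >= 4) { *(jint*)to = *(jint*)from;
  //                    from += 4; to += 4; n -= 4; }       // l_3: dword body
  //   while (n > 0)  { *to++ = *from++; n--; }             // l_5: suffix
  // otherwise it falls back to the pure byte loop at l_5.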
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   A0   - source array address
  //   A1   - destination array address
  //   A2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
    Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;

    address nooverlap_target = aligned ?
      StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
      StubRoutines::jbyte_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 0);

    const Register from      = A0;   // source array address
    const Register to        = A1;   // destination array address
    const Register count     = A2;   // elements count
    const Register end_from  = T3;   // source array end address
    const Register end_to    = T0;   // destination array end address
    const Register end_count = T1;   // remaining elements count

    __ push(end_from);
    __ push(end_to);
    __ push(end_count);
    __ push(T8);

    // copy from high to low
    __ move(end_count, count);
    __ dadd(end_from, from, end_count);
    __ dadd(end_to, to, end_count);

    // 2016/05/08 aoqi: if end_from and end_to have different alignment, unaligned copy is performed.
    __ andi(AT, end_from, 3);
    __ andi(T8, end_to, 3);
    __ bne(AT, T8, l_copy_byte);
    __ delayed()->nop();

    // First deal with the unaligned data at the top.
    __ bind(l_unaligned);
    __ beq(end_count, R0, l_exit);
    __ delayed()->nop();

    __ andi(AT, end_from, 3);
    __ bne(AT, R0, l_from_unaligned);
    __ delayed()->nop();

    __ andi(AT, end_to, 3);
    __ beq(AT, R0, l_4_bytes_aligned);
    __ delayed()->nop();

    __ bind(l_from_unaligned);
    __ lb(AT, end_from, -1);
    __ sb(AT, end_to, -1);
    __ daddi(end_from, end_from, -1);
    __ daddi(end_to, end_to, -1);
    __ daddi(end_count, end_count, -1);
    __ b(l_unaligned);
    __ delayed()->nop();

    // now end_to, end_from point to 4-byte aligned high-ends
    //     end_count contains byte count that is not copied.
    // copy 4 bytes at a time
    __ bind(l_4_bytes_aligned);

    __ move(T8, end_count);
    __ daddi(AT, end_count, -3);
    __ blez(AT, l_copy_suffix);
    __ delayed()->nop();

    //__ andi(T8, T8, 3);
    __ lea(end_from, Address(end_from, -4));
    __ lea(end_to, Address(end_to, -4));

    __ dsrl(end_count, end_count, 2);
    __ align(16);
    __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes
    __ lw(AT, end_from, 0);
    __ sw(AT, end_to, 0);
    __ addi(end_from, end_from, -4);
    __ addi(end_to, end_to, -4);
    __ addi(end_count, end_count, -1);
    __ bne(end_count, R0, l_copy_4_bytes_loop);
    __ delayed()->nop();

    __ b(l_copy_suffix);
    __ delayed()->nop();
    // copy dwords aligned or not with repeat move
    // l_copy_suffix
    // copy suffix (0-3 bytes)
    __ bind(l_copy_suffix);
    __ andi(T8, T8, 3);
    __ beq(T8, R0, l_exit);
    __ delayed()->nop();
    __ addi(end_from, end_from, 3);
    __ addi(end_to, end_to, 3);
    __ bind(l_copy_suffix_loop);
    __ lb(AT, end_from, 0);
    __ sb(AT, end_to, 0);
    __ addi(end_from, end_from, -1);
    __ addi(end_to, end_to, -1);
    __ addi(T8, T8, -1);
    __ bne(T8, R0, l_copy_suffix_loop);
    __ delayed()->nop();

    __ bind(l_copy_byte);
    __ beq(end_count, R0, l_exit);
    __ delayed()->nop();
    __ lb(AT, end_from, -1);
    __ sb(AT, end_to, -1);
    __ daddi(end_from, end_from, -1);
    __ daddi(end_to, end_to, -1);
    __ daddi(end_count, end_count, -1);
    __ b(l_copy_byte);
    __ delayed()->nop();

    __ bind(l_exit);
    __ pop(T8);
    __ pop(end_count);
    __ pop(end_to);
    __ pop(end_from);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
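
  // A conjoint copy must tolerate overlap with to > from, so the stub above
  // copies from the high end downwards, conceptually
  //   for (long i = n - 1; i >= 0; i--) to[i] = from[i];
  // the forward (disjoint) stub is only entered through the
  // array_overlap_test() dispatch when overlap is impossible.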
  // Generate stub for disjoint short copy.  If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  A0
  //      to:    A1
  //  elm.count: A2 treated as signed
  //  one element: 2 bytes
  //
  // Strategy for aligned==true:
  //
  //  If length <= 9:
  //     1. copy 1 element at a time (l_5)
  //
  //  If length > 9:
  //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  //     3. copy last element if one was left in step 2. (l_1)
  //
  //
  // Strategy for aligned==false:
  //
  //  If length <= 9: same as aligned==true case
  //
  //  If length > 9:
  //     1. continue with step 7 if the alignment of from and to mod 4
  //        is different.
  //     2. align from and to to 4 bytes by copying 1 element if necessary.
  //     3. at l_2, from and to are 4-byte aligned; continue with
  //        step 6 if they cannot be aligned to 8 bytes because they have
  //        different alignment mod 8.
  //     4. at this point we know that both from and to have the same
  //        alignment mod 8; now copy one element if necessary to get
  //        8-byte alignment of from and to.
  //     5. copy 4 elements at a time until less than 4 elements are
  //        left; after steps 3 and 4, all loads/stores are aligned.
  //     6. copy 2 elements at a time until less than 2 elements are
  //        left. (l_6)
  //     7. copy 1 element at a time. (l_5)
  //     8. copy last element if one was left in step 6. (l_1)
  //
  //  TODO:
  //
  //  1. use Loongson 128-bit load/store
  //  2. use loop unrolling optimization when len is big enough, for example if len > 0x2000:
  //    __ bind(l_x);
  //    __ ld(AT, tmp1, 0);
  //    __ ld(tmp, tmp1, 8);
  //    __ sd(AT, tmp2, 0);
  //    __ sd(tmp, tmp2, 8);
  //    __ ld(AT, tmp1, 16);
  //    __ ld(tmp, tmp1, 24);
  //    __ sd(AT, tmp2, 16);
  //    __ sd(tmp, tmp2, 24);
  //    __ daddi(tmp1, tmp1, 32);
  //    __ daddi(tmp2, tmp2, 32);
  //    __ daddi(tmp3, tmp3, -16);
  //    __ daddi(AT, tmp3, -16);
  //    __ bgez(AT, l_x);
  //    __ delayed()->nop();
  //
  address generate_disjoint_short_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);
    __ move(tmp2, A1);
    __ move(tmp3, A2);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
    Label l_debug;
    // don't try anything fancy if arrays don't have many elements
    __ daddi(AT, tmp3, -9);
    __ blez(AT, l_1);
    __ delayed()->nop();

    if (!aligned) {
      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
      __ delayed()->nop();

      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
      __ delayed()->nop();

      // At this point it is guaranteed that both from and to have the same alignment mod 4.

      // Copy 1 element if necessary to align to 4 bytes.
      __ andi(AT, A0, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_2);

      // At this point both from and to are at least 4-byte aligned.

      // Copy 4 elements at a time.
      // Align to 8 bytes, but only if both from and to have the same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned
      __ delayed()->nop();

      // Copy a 2-element word if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sw(AT, tmp2, 0);
      { // FasterArrayCopy
        __ daddi(tmp1, tmp1, 4);
        __ daddi(tmp2, tmp2, 4);
      }
    }

    __ bind(l_7);

    // Copy 4 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.

    { // FasterArrayCopy
      __ daddi(AT, tmp3, -15);
      __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain
      __ delayed()->nop();

      __ bind(l_8);
      // For Loongson, there is 128-bit memory access. TODO
      __ ld(AT, tmp1, 0);
      __ sd(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);
      __ bgez(AT, l_8);
      __ delayed()->nop();
    }

    __ bind(l_6);

    // copy 2 elements at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -1);
      __ blez(AT, l_1);
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
      __ daddi(tmp3, tmp3, -2);
      __ daddi(AT, tmp3, -2);
      __ bgez(AT, l_3);
      __ delayed()->nop();
    }

    // do single element copy (16 bit), can this happen?
    __ bind(l_1);
    __ beq(R0, tmp3, l_4);
    __ delayed()->nop();

    { // FasterArrayCopy
      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -1);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_5);
      __ delayed()->nop();
    }

    __ bind(l_4);
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    __ bind(l_debug);
    __ stop("generate_disjoint_short_copy should not reach here");
    return start;
  }
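
  // Note on the l_8 loop above: each iteration moves one 8-byte dword, i.e.
  // 4 short elements; tmp3 is decremented by 4 and the extra
  // "daddi(AT, tmp3, -4); bgez(AT, l_8)" pair keeps looping only while at
  // least 4 elements remain for the next iteration.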
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, const char *name) {
    Label l_1, l_2, l_3, l_4, l_5;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target = aligned ?
      StubRoutines::arrayof_jshort_disjoint_arraycopy() :
      StubRoutines::jshort_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 1);

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    /*
       __ pushl(esi);
       __ movl(ecx, Address(esp, 4+12));      // count
       __ pushl(edi);
       __ movl(esi, Address(esp, 8+ 4));      // from
       __ movl(edi, Address(esp, 8+ 8));      // to
     */
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // copy dwords from high to low
    // __ leal(esi, Address(esi, ecx, Address::times_2, -4)); // from + count*2 - 4
    __ sll(AT, T1, Address::times_2);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -4));
    //__ std();
    //__ leal(edi, Address(edi, ecx, Address::times_2, -4)); // to + count*2 - 4
    __ sll(AT, T1, Address::times_2);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -4));
    //  __ movl(eax, ecx);
    __ move(T8, T1);
    __ bind(l_1);
    //   __ sarl(ecx, 1);              // dword count
    __ sra(T1, T1, 1);
    //__ jcc(Assembler::equal, l_4);                   // no dwords to move
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    /*    __ cmpl(ecx, 32);
          __ jcc(Assembler::above, l_3);                   // > 32 dwords
    // copy dwords with loop
    __ subl(edi, esi);
     */
    __ align(16);
    __ bind(l_2);
    //__ movl(edx, Address(esi));
    __ lw(AT, T3, 0);
    //__ movl(Address(edi, esi, Address::times_1), edx);
    __ sw(AT, T0, 0);
    //__ subl(esi, 4);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    //__ decl(ecx);
    __ addi(T1, T1, -1);
    //  __ jcc(Assembler::notEqual, l_2);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();
    //  __ addl(edi, esi);
    // __ jmp(l_4);
    __ b(l_4);
    __ delayed()->nop();
    // copy dwords with repeat move
    __ bind(l_3);
    //   __ rep_movl();
    __ bind(l_4);
    //  __ andl(eax, 1);              // suffix count
    __ andi(T8, T8, 1);              // suffix count
    //__ jcc(Assembler::equal, l_5);                   // no suffix
    __ beq(T8, R0, l_5);
    __ delayed()->nop();
    // copy suffix
    //   __ movw(edx, Address(esi, 2));
    __ lh(AT, T3, 2);
    //  __ movw(Address(edi, 2), edx);
    __ sh(AT, T0, 2);
    __ bind(l_5);
    //    __ cld();
    //    __ popl(edi);
    //    __ popl(esi);
    //   __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_2, l_3, l_4, l_stchk;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    /*
       __ pushl(esi);
       __ movl(ecx, Address(esp, 4+12));      // count
       __ pushl(edi);
       __ movl(esi, Address(esp, 8+ 4));      // from
       __ movl(edi, Address(esp, 8+ 8));      // to
     */
    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // __ cmpl(ecx, 32);
    // __ jcc(Assembler::belowEqual, l_2);                   // <= 32 dwords
    // __ rep_movl();
    __ b(l_2);
    __ delayed()->nop();
    if (is_oop) {
      //  __ jmp(l_stchk);
      __ b(l_stchk);
      __ delayed()->nop();
    }
    //    __ popl(edi);
    //   __ popl(esi);
    //  __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();

    __ bind(l_2);
    //  __ subl(edi, esi);
    //  __ testl(ecx, ecx);
    // __ jcc(Assembler::zero, l_4);
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    __ align(16);
    __ bind(l_3);
    //__ movl(edx, Address(esi));
    __ lw(AT, T3, 0);
    // __ movl(Address(edi, esi, Address::times_1), edx);
    __ sw(AT, T0, 0);
    // __ addl(esi, 4);
    __ addi(T3, T3, 4);
    __ addi(T0, T0, 4);
    //   __ decl(ecx);
    __ addi(T1, T1, -1);
    //    __ jcc(Assembler::notEqual, l_3);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();
    if (is_oop) {
      __ bind(l_stchk);
      //      __ movl(edi, Address(esp, 8+ 8));
      //     __ movl(ecx, Address(esp, 8+ 12));
      __ move(T0, A1);
      __ move(T1, A2);
      array_store_check();
    }
    __ bind(l_4);
    //    __ popl(edi);
    //   __ popl(esi);
    //  __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_2, l_3, l_4, l_stchk;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target;

    if (is_oop) {
      nooverlap_target = aligned ?
        StubRoutines::arrayof_oop_disjoint_arraycopy() :
        StubRoutines::oop_disjoint_arraycopy();
    } else {
      nooverlap_target = aligned ?
        StubRoutines::arrayof_jint_disjoint_arraycopy() :
        StubRoutines::jint_disjoint_arraycopy();
    }

    array_overlap_test(nooverlap_target, 2);

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    /*
       __ pushl(esi);
       __ movl(ecx, Address(esp, 4+12));      // count
       __ pushl(edi);
       __ movl(esi, Address(esp, 8+ 4));      // from
       __ movl(edi, Address(esp, 8+ 8));      // to
     */
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    //__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
    __ sll(AT, T1, Address::times_4);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -4));
    //__ std();
    //__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
    __ sll(AT, T1, Address::times_4);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -4));

    //    __ cmpl(ecx, 32);
    //   __ jcc(Assembler::above, l_3);                   // > 32 dwords
    //  __ testl(ecx, ecx);
    //__ jcc(Assembler::zero, l_4);
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    // __ subl(edi, esi);
    __ align(16);
    __ bind(l_2);
    // __ movl(edx, Address(esi));
    __ lw(AT, T3, 0);
    // __ movl(Address(esi, edi, Address::times_1), edx);
    __ sw(AT, T0, 0);
    // __ subl(esi, 4);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    //   __ decl(ecx);
    __ addi(T1, T1, -1);
    //__ jcc(Assembler::notEqual, l_2);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();
    if (is_oop) {
      // __ jmp(l_stchk);
      __ b(l_stchk);
      __ delayed()->nop();
    }
    __ bind(l_4);
    //      __ cld();
    //     __ popl(edi);
    //    __ popl(esi);
    //   __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    __ bind(l_3);
    //   __ rep_movl();
    if (is_oop) {
      __ bind(l_stchk);
      //  __ movl(edi, Address(esp, 8+ 8));
      __ move(T0, A1);
      // __ movl(ecx, Address(esp, 8+ 12));
      __ move(T1, A2);
      array_store_check();
    }
    //    __ cld();
    //   __ popl(edi);
    //   __ popl(esi);
    //  __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_2, l_3, l_4, l_stchk;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // __ cmpl(ecx, 32);
    // __ jcc(Assembler::belowEqual, l_2);                   // <= 32 dwords
    // __ rep_movl();
    __ b(l_2);
    __ delayed()->nop();
    if (is_oop) {
      //  __ jmp(l_stchk);
      __ b(l_stchk);
      __ delayed()->nop();
    }
    //    __ popl(edi);
    //   __ popl(esi);
    //  __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();

    __ bind(l_2);
    //  __ subl(edi, esi);
    //  __ testl(ecx, ecx);
    // __ jcc(Assembler::zero, l_4);
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    __ align(16);
    __ bind(l_3);
    //__ movl(edx, Address(esi));
    __ ld(AT, T3, 0);
    // __ movl(Address(edi, esi, Address::times_1), edx);
    __ sd(AT, T0, 0);
    // __ addl(esi, 4);
    __ addi(T3, T3, 8);
    __ addi(T0, T0, 8);
    //   __ decl(ecx);
    __ addi(T1, T1, -1);
    //    __ jcc(Assembler::notEqual, l_3);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();
    if (is_oop) {
      __ bind(l_stchk);
      //      __ movl(edi, Address(esp, 8+ 8));
      //     __ movl(ecx, Address(esp, 8+ 12));
      __ move(T0, A1);
      __ move(T1, A2);
      array_store_check();
    }
    __ bind(l_4);
    //    __ popl(edi);
    //   __ popl(esi);
    //  __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  1480   // Arguments:
  1481   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1482   //             ignored
  1483   //   is_oop  - true => oop array, so generate store check code
  1484   //   name    - stub name string
  1485   //
  1486   // Inputs:
  1487   //   c_rarg0   - source array address
  1488   //   c_rarg1   - destination array address
  1489   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1490   //
  1491   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1492   // the hardware handle it.  The two dwords within qwords that span
  1493   // cache line boundaries will still be loaded and stored atomicly.
  1494   //
  1495   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1496 		Label l_2, l_3, l_4, l_stchk;
  1497 		StubCodeMark mark(this, "StubRoutines", name);
  1498 		__ align(CodeEntryAlignment);
  1499 		address start = __ pc();
  1500 		address nooverlap_target;
  1502 		if (is_oop) {
  1503 			nooverlap_target = aligned ?
  1504 							StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1505 							StubRoutines::oop_disjoint_arraycopy();
  1506 		}else {
  1507 			nooverlap_target = aligned ?
  1508 							StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1509 							StubRoutines::jlong_disjoint_arraycopy();
  1512 		array_overlap_test(nooverlap_target, 3);
  1514 		__ push(T3);
  1515 		__ push(T0);
  1516 		__ push(T1);
  1517 		__ push(T8);
  1519 		__ move(T1, A2);  
  1520 		__ move(T3, A0); 
  1521 		__ move(T0, A1);
  1523 		//__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
  1524 		__ sll(AT, T1, Address::times_8); 
  1525 		__ add(AT, T3, AT); 
  1526 		__ lea(T3 , Address(AT, -8)); 
  1527 		//__ std();
  1528 		//__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
  1529 		__ sll(AT, T1, Address::times_8); 
  1530 		__ add(AT, T0, AT); 
  1531 		__ lea(T0 , Address(AT, -8)); 
  1533 		//    __ cmpl(ecx, 32);
  1534 		//   __ jcc(Assembler::above, l_3);                   // > 32 dwords
  1535 		//  __ testl(ecx, ecx);
  1536 		//__ jcc(Assembler::zero, l_4);
  1537 		__ beq(T1, R0, l_4); 
  1538 		__ delayed()->nop();  
  1539 		// __ subl(edi, esi);
  1540 		__ align(16);
  1541 		__ bind(l_2);
  1542 		// __ movl(edx, Address(esi));
  1543 		__ ld(AT, T3, 0);   
  1544 		// __ movl(Address(esi, edi, Address::times_1), edx);
  1545 		__ sd(AT, T0, 0); 
  1546 		// __ subl(esi, 4);
  1547 		__ addi(T3, T3, -8); 
  1548 		__ addi(T0, T0, -8); 
  1549 		//   __ decl(ecx);
  1550 		__ addi(T1, T1, -1); 
  1551 		//__ jcc(Assembler::notEqual, l_2);
  1552 		__ bne(T1, R0, l_2);  
  1553 		__ delayed()->nop(); 
  1554 		if (is_oop) {
  1555 			// __ jmp(l_stchk);
  1556 			__ b( l_stchk); 
  1557 			__ delayed()->nop(); 
  1558 		}
  1559 		__ bind(l_4);
  1560 		//      __ cld();
  1561 		//     __ popl(edi);
  1562 		//    __ popl(esi);
  1563 		//   __ ret(0);
  1564 		__ pop(T8); 
  1565 		__ pop(T1); 
  1566 		__ pop(T0); 
  1567 		__ pop(T3); 
  1568 		__ jr(RA); 
  1569 		__ delayed()->nop(); 
  1570 		__ bind(l_3);
  1571 		//   __ rep_movl();
  1572 		if (is_oop) {
  1573 			__ bind(l_stchk);
  1574 			//  __ movl(edi, Address(esp, 8+ 8));
  1575 			__ move(T0, A1);  
  1576 			// __ movl(ecx, Address(esp, 8+ 12));
  1577 			__ move(T1, A2);  
  1578 			array_store_check();
  1579 		}
  1580 		//    __ cld();
  1581 		//   __ popl(edi);
  1582 		//   __ popl(esi);
  1583 		//  __ ret(0);
  1584 		__ pop(T8);	
  1585 		__ pop(T1);	
  1586 		__ pop(T0);	
  1587 		__ pop(T3);	
  1588 		__ jr(RA);	
  1589 		__ delayed()->nop(); 
  1590 		return start;
  1591   }
  1592 #if 0
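         // Note: everything in this '#if 0' block is the unported x86_64
         // version of these stubs (rdi/rsi/rax etc.), kept for reference
         // only; none of it is compiled.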
  1593   // Arguments:
  1594   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  1595   //             ignored
  1596   //   is_oop  - true => oop array, so generate store check code
  1597   //   name    - stub name string
  1598   //
  1599   // Inputs:
  1600   //   c_rarg0   - source array address
  1601   //   c_rarg1   - destination array address
  1602   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1603   //
  1604   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1605     __ align(CodeEntryAlignment);
  1606     StubCodeMark mark(this, "StubRoutines", name);
  1607     address start = __ pc();
  1609     Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
  1610     const Register from        = rdi;  // source array address
  1611     const Register to          = rsi;  // destination array address
  1612     const Register qword_count = rdx;  // elements count
  1613     const Register saved_count = rcx;
  1615     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1616     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
  1618     address disjoint_copy_entry = NULL;
  1619     if (is_oop) {
  1620       assert(!UseCompressedOops, "shouldn't be called for compressed oops");
  1621       disjoint_copy_entry = disjoint_oop_copy_entry;
  1622       oop_copy_entry  = __ pc();
  1623       array_overlap_test(disjoint_oop_copy_entry, Address::times_8);
  1624     } else {
  1625       disjoint_copy_entry = disjoint_long_copy_entry;
  1626       long_copy_entry = __ pc();
  1627       array_overlap_test(disjoint_long_copy_entry, Address::times_8);
  1628     }
  1629     BLOCK_COMMENT("Entry:");
  1630     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1632     array_overlap_test(disjoint_copy_entry, Address::times_8);
  1633     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
  1634                       // r9 and r10 may be used to save non-volatile registers
  1636     // 'from', 'to' and 'qword_count' are now valid
  1638     if (is_oop) {
  1639       // Save to and count for store barrier
  1640       __ movptr(saved_count, qword_count);
  1641       // No registers are destroyed by this call
  1642       gen_write_ref_array_pre_barrier(to, saved_count);
  1643     }
  1645     __ jmp(L_copy_32_bytes);
  1647     // Copy trailing qwords
  1648   __ BIND(L_copy_8_bytes);
  1649     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
  1650     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
  1651     __ decrement(qword_count);
  1652     __ jcc(Assembler::notZero, L_copy_8_bytes);
  1654     if (is_oop) {
  1655       __ jmp(L_exit);
  1656     } else {
  1657       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
  1658       restore_arg_regs();
  1659       __ xorptr(rax, rax); // return 0
  1660       __ leave(); // required for proper stackwalking of RuntimeStub frame
  1661       __ ret(0);
  1662     }
  1664     // Copy in 32-bytes chunks
  1665     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
  1667     if (is_oop) {
  1668     __ BIND(L_exit);
  1669       __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
  1670       gen_write_ref_array_post_barrier(to, rcx, rax);
  1671       inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
  1672     } else {
  1673       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
  1675     restore_arg_regs();
  1676     __ xorptr(rax, rax); // return 0
  1677     __ leave(); // required for proper stackwalking of RuntimeStub frame
  1678     __ ret(0);
  1680     return start;
  1681   }
  1684   // Helper for generating a dynamic type check.
  1685   // Smashes no registers.
  1686   void generate_type_check(Register sub_klass,
  1687                            Register super_check_offset,
  1688                            Register super_klass,
  1689                            Label& L_success) {
  1690     assert_different_registers(sub_klass, super_check_offset, super_klass);
  1692     BLOCK_COMMENT("type_check:");
  1694     Label L_miss;
  1696     // a couple of useful fields in sub_klass:
  1697     int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
  1698                      Klass::secondary_supers_offset_in_bytes());
  1699     int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
  1700                      Klass::secondary_super_cache_offset_in_bytes());
  1701     Address secondary_supers_addr(sub_klass, ss_offset);
  1702     Address super_cache_addr(     sub_klass, sc_offset);
  1704     // if the pointers are equal, we are done (e.g., String[] elements)
  1705     __ cmpptr(super_klass, sub_klass);
  1706     __ jcc(Assembler::equal, L_success);
  1708     // check the supertype display:
  1709     Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  1710     __ cmpptr(super_klass, super_check_addr); // test the super type
  1711     __ jcc(Assembler::equal, L_success);
  1713     // if it was a primary super, we can just fail immediately
  1714     __ cmpl(super_check_offset, sc_offset);
  1715     __ jcc(Assembler::notEqual, L_miss);
  1717     // Now do a linear scan of the secondary super-klass chain.
  1718     // The repne_scan instruction uses fixed registers, which we must spill.
  1719     // (We need a couple more temps in any case.)
  1720     // This code is rarely used, so simplicity is a virtue here.
  1721     inc_counter_np(SharedRuntime::_partial_subtype_ctr);
  1722     {
  1723       __ push(rax);
  1724       __ push(rcx);
  1725       __ push(rdi);
  1726       assert_different_registers(sub_klass, super_klass, rax, rcx, rdi);
  1728       __ movptr(rdi, secondary_supers_addr);
  1729       // Load the array length.
  1730       __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
  1731       // Skip to start of data.
  1732       __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
  1733       // Scan rcx words at [rdi] for occurrence of rax
  1734       // Set NZ/Z based on last compare
  1735       __ movptr(rax, super_klass);
  1736       if (UseCompressedOops) {
  1737         // Compare against compressed form.  Don't need to uncompress because
  1738         // looks like orig rax is restored in popq below.
  1739         __ encode_heap_oop(rax);
  1740         __ repne_scanl();
  1741       } else {
  1742         __ repne_scan();
  1743       }
  1745       // Unspill the temp. registers:
  1746       __ pop(rdi);
  1747       __ pop(rcx);
  1748       __ pop(rax);
  1750       __ jcc(Assembler::notEqual, L_miss);
  1751     }
  1753     // Success.  Cache the super we found and proceed in triumph.
  1754     __ movptr(super_cache_addr, super_klass); // note: rax is dead
  1755     __ jmp(L_success);
  1757     // Fall through on failure!
  1758     __ BIND(L_miss);
  1759   }
  1761   //
  1762   //  Generate checkcasting array copy stub
  1763   //
  1764   //  Input:
  1765   //    c_rarg0   - source array address
  1766   //    c_rarg1   - destination array address
  1767   //    c_rarg2   - element count, treated as ssize_t, can be zero
  1768   //    c_rarg3   - size_t ckoff (super_check_offset)
  1769   // not Win64
  1770   //    c_rarg4   - oop ckval (super_klass)
  1771   // Win64
  1772   //    rsp+40    - oop ckval (super_klass)
  1773   //
  1774   //  Output:
  1775   //    rax ==  0  -  success
  1776   //    rax == -1^K - failure, where K is partial transfer count
  1777   //
  1778   address generate_checkcast_copy(const char *name) {
  1780     Label L_load_element, L_store_element, L_do_card_marks, L_done;
  1782     // Input registers (after setup_arg_regs)
  1783     const Register from        = rdi;   // source array address
  1784     const Register to          = rsi;   // destination array address
  1785     const Register length      = rdx;   // elements count
  1786     const Register ckoff       = rcx;   // super_check_offset
  1787     const Register ckval       = r8;    // super_klass
  1789     // Registers used as temps (r13, r14 are save-on-entry)
  1790     const Register end_from    = from;  // source array end address
  1791     const Register end_to      = r13;   // destination array end address
  1792     const Register count       = rdx;   // -(count_remaining)
  1793     const Register r14_length  = r14;   // saved copy of length
  1794     // End pointers are inclusive, and if length is not zero they point
  1795     // to the last unit copied:  end_to[0] := end_from[0]
  1797     const Register rax_oop    = rax;    // actual oop copied
  1798     const Register r11_klass  = r11;    // oop._klass
  1800     //---------------------------------------------------------------
  1801     // Assembler stub will be used for this call to arraycopy
  1802     // if the two arrays are subtypes of Object[] but the
  1803     // destination array type is not equal to or a supertype
  1804     // of the source type.  Each element must be separately
  1805     // checked.
  1807     __ align(CodeEntryAlignment);
  1808     StubCodeMark mark(this, "StubRoutines", name);
  1809     address start = __ pc();
  1811     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1813     checkcast_copy_entry  = __ pc();
  1814     BLOCK_COMMENT("Entry:");
  1816 #ifdef ASSERT
  1817     // caller guarantees that the arrays really are different
  1818     // otherwise, we would have to make conjoint checks
  1819     { Label L;
  1820       array_overlap_test(L, TIMES_OOP);
  1821       __ stop("checkcast_copy within a single array");
  1822       __ bind(L);
  1823     }
  1824 #endif //ASSERT
  1826     // allocate spill slots for r13, r14
  1827     enum {
  1828       saved_r13_offset,
  1829       saved_r14_offset,
  1830       saved_rbp_offset,
  1831       saved_rip_offset,
  1832       saved_rarg0_offset
  1833     };
  1834     __ subptr(rsp, saved_rbp_offset * wordSize);
  1835     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
  1836     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
  1837     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
  1838                        // ckoff => rcx, ckval => r8
  1839                        // r9 and r10 may be used to save non-volatile registers
  1840 #ifdef _WIN64
  1841     // last argument (#4) is on stack on Win64
  1842     const int ckval_offset = saved_rarg0_offset + 4;
  1843     __ movptr(ckval, Address(rsp, ckval_offset * wordSize));
  1844 #endif
  1846     // check that int operands are properly extended to size_t
  1847     assert_clean_int(length, rax);
  1848     assert_clean_int(ckoff, rax);
  1850 #ifdef ASSERT
  1851     BLOCK_COMMENT("assert consistent ckoff/ckval");
  1852     // The ckoff and ckval must be mutually consistent,
  1853     // even though caller generates both.
  1854     { Label L;
  1855       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
  1856                         Klass::super_check_offset_offset_in_bytes());
  1857       __ cmpl(ckoff, Address(ckval, sco_offset));
  1858       __ jcc(Assembler::equal, L);
  1859       __ stop("super_check_offset inconsistent");
  1860       __ bind(L);
  1861     }
  1862 #endif //ASSERT
  1864     // Loop-invariant addresses.  They are exclusive end pointers.
  1865     Address end_from_addr(from, length, TIMES_OOP, 0);
  1866     Address   end_to_addr(to,   length, TIMES_OOP, 0);
  1867     // Loop-variant addresses.  They assume post-incremented count < 0.
  1868     Address from_element_addr(end_from, count, TIMES_OOP, 0);
  1869     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
  1871     gen_write_ref_array_pre_barrier(to, count);
  1873     // Copy from low to high addresses, indexed from the end of each array.
  1874     __ lea(end_from, end_from_addr);
  1875     __ lea(end_to,   end_to_addr);
  1876     __ movptr(r14_length, length);        // save a copy of the length
  1877     assert(length == count, "");          // else fix next line:
  1878     __ negptr(count);                     // negate and test the length
  1879     __ jcc(Assembler::notZero, L_load_element);
  1881     // Empty array:  Nothing to do.
  1882     __ xorptr(rax, rax);                  // return 0 on (trivial) success
  1883     __ jmp(L_done);
  1885     // ======== begin loop ========
  1886     // (Loop is rotated; its entry is L_load_element.)
  1887     // Loop control:
  1888     //   for (count = -count; count != 0; count++)
  1889     // Base pointers src, dst are biased by 8*(count-1), to last element.
  1890     __ align(16);
  1892     __ BIND(L_store_element);
  1893     __ store_heap_oop(rax_oop, to_element_addr);  // store the oop
  1894     __ increment(count);               // increment the count toward zero
  1895     __ jcc(Assembler::zero, L_do_card_marks);
  1897     // ======== loop entry is here ========
  1898     __ BIND(L_load_element);
  1899     __ load_heap_oop(rax_oop, from_element_addr); // load the oop
  1900     __ testptr(rax_oop, rax_oop);
  1901     __ jcc(Assembler::zero, L_store_element);
  1903     __ load_klass(r11_klass, rax_oop);// query the object klass
  1904     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
  1905     // ======== end loop ========
  1907     // It was a real error; we must depend on the caller to finish the job.
  1908     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
  1909     // Emit GC store barriers for the oops we have copied (r14 + rdx),
  1910     // and report their number to the caller.
  1911     assert_different_registers(rax, r14_length, count, to, end_to, rcx);
  1912     __ lea(end_to, to_element_addr);
  1913     gen_write_ref_array_post_barrier(to, end_to, rscratch1);
  1914     __ movptr(rax, r14_length);           // original oops
  1915     __ addptr(rax, count);                // K = (original - remaining) oops
  1916     __ notptr(rax);                       // report (-1^K) to caller
  1917     __ jmp(L_done);
  1919     // Come here on success only.
  1920     __ BIND(L_do_card_marks);
  1921     __ addptr(end_to, -wordSize);         // make an inclusive end pointer
  1922     gen_write_ref_array_post_barrier(to, end_to, rscratch1);
  1923     __ xorptr(rax, rax);                  // return 0 on success
  1925     // Common exit point (success or failure).
  1926     __ BIND(L_done);
  1927     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
  1928     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
  1929     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
  1930     restore_arg_regs();
  1931     __ leave(); // required for proper stackwalking of RuntimeStub frame
  1932     __ ret(0);
  1934     return start;
  1935   }
  1937   //
  1938   //  Generate 'unsafe' array copy stub
  1939   //  Though just as safe as the other stubs, it takes an unscaled
  1940   //  size_t argument instead of an element count.
  1941   //
  1942   //  Input:
  1943   //    c_rarg0   - source array address
  1944   //    c_rarg1   - destination array address
  1945   //    c_rarg2   - byte count, treated as ssize_t, can be zero
  1946   //
  1947   // Examines the alignment of the operands and dispatches
  1948   // to a long, int, short, or byte copy loop.
  1949   //
  1950   address generate_unsafe_copy(const char *name) {
  1952     Label L_long_aligned, L_int_aligned, L_short_aligned;
  1954     // Input registers (before setup_arg_regs)
  1955     const Register from        = c_rarg0;  // source array address
  1956     const Register to          = c_rarg1;  // destination array address
  1957     const Register size        = c_rarg2;  // byte count (size_t)
  1959     // Register used as a temp
  1960     const Register bits        = rax;      // test copy of low bits
  1962     __ align(CodeEntryAlignment);
  1963     StubCodeMark mark(this, "StubRoutines", name);
  1964     address start = __ pc();
  1966     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1968     // bump this on entry, not on exit:
  1969     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
  1971     __ mov(bits, from);
  1972     __ orptr(bits, to);
  1973     __ orptr(bits, size);
  1975     __ testb(bits, BytesPerLong-1);
  1976     __ jccb(Assembler::zero, L_long_aligned);
  1978     __ testb(bits, BytesPerInt-1);
  1979     __ jccb(Assembler::zero, L_int_aligned);
  1981     __ testb(bits, BytesPerShort-1);
  1982     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
  1984     __ BIND(L_short_aligned);
  1985     __ shrptr(size, LogBytesPerShort); // size => short_count
  1986     __ jump(RuntimeAddress(short_copy_entry));
  1988     __ BIND(L_int_aligned);
  1989     __ shrptr(size, LogBytesPerInt); // size => int_count
  1990     __ jump(RuntimeAddress(int_copy_entry));
  1992     __ BIND(L_long_aligned);
  1993     __ shrptr(size, LogBytesPerLong); // size => qword_count
  1994     __ jump(RuntimeAddress(long_copy_entry));
  1996     return start;
  1997   }
  1999   // Perform range checks on the proposed arraycopy.
  2000   // Kills temp, but nothing else.
  2001   // Also, clean the sign bits of src_pos and dst_pos.
  2002   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
  2003                               Register src_pos, // source position (c_rarg1)
  2004                               Register dst,     // destination array oop (c_rarg2)
  2005                               Register dst_pos, // destination position (c_rarg3)
  2006                               Register length,
  2007                               Register temp,
  2008                               Label& L_failed) {
  2009     BLOCK_COMMENT("arraycopy_range_checks:");
  2011     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
  2012     __ movl(temp, length);
  2013     __ addl(temp, src_pos);             // src_pos + length
  2014     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
  2015     __ jcc(Assembler::above, L_failed);
  2017     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
  2018     __ movl(temp, length);
  2019     __ addl(temp, dst_pos);             // dst_pos + length
  2020     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
  2021     __ jcc(Assembler::above, L_failed);
  2023     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  2024     // Move with sign extension can be used since they are positive.
  2025     __ movslq(src_pos, src_pos);
  2026     __ movslq(dst_pos, dst_pos);
  2028     BLOCK_COMMENT("arraycopy_range_checks done");
  2031   //
  2032   //  Generate generic array copy stubs
  2033   //
  2034   //  Input:
  2035   //    c_rarg0    -  src oop
  2036   //    c_rarg1    -  src_pos (32-bits)
  2037   //    c_rarg2    -  dst oop
  2038   //    c_rarg3    -  dst_pos (32-bits)
  2039   // not Win64
  2040   //    c_rarg4    -  element count (32-bits)
  2041   // Win64
  2042   //    rsp+40     -  element count (32-bits)
  2043   //
  2044   //  Output:
  2045   //    rax ==  0  -  success
  2046   //    rax == -1^K - failure, where K is partial transfer count
  2047   //
  2048   address generate_generic_copy(const char *name) {
  2050     Label L_failed, L_failed_0, L_objArray;
  2051     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
  2053     // Input registers
  2054     const Register src        = c_rarg0;  // source array oop
  2055     const Register src_pos    = c_rarg1;  // source position
  2056     const Register dst        = c_rarg2;  // destination array oop
  2057     const Register dst_pos    = c_rarg3;  // destination position
  2058     // elements count is on stack on Win64
  2059 #ifdef _WIN64
  2060 #define C_RARG4 Address(rsp, 6 * wordSize)
  2061 #else
  2062 #define C_RARG4 c_rarg4
  2063 #endif
  2065     { int modulus = CodeEntryAlignment;
  2066       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
  2067       int advance = target - (__ offset() % modulus);
  2068       if (advance < 0)  advance += modulus;
  2069       if (advance > 0)  __ nop(advance);
  2070     }
  2071     StubCodeMark mark(this, "StubRoutines", name);
  2073     // Short-hop target to L_failed.  Makes for denser prologue code.
  2074     __ BIND(L_failed_0);
  2075     __ jmp(L_failed);
  2076     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
  2078     __ align(CodeEntryAlignment);
  2079     address start = __ pc();
  2081     __ enter(); // required for proper stackwalking of RuntimeStub frame
  2083     // bump this on entry, not on exit:
  2084     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
  2086     //-----------------------------------------------------------------------
  2087     // Assembler stub will be used for this call to arraycopy
  2088     // if the following conditions are met:
  2089     //
  2090     // (1) src and dst must not be null.
  2091     // (2) src_pos must not be negative.
  2092     // (3) dst_pos must not be negative.
  2093     // (4) length  must not be negative.
  2094     // (5) src klass and dst klass should be the same and not NULL.
  2095     // (6) src and dst should be arrays.
  2096     // (7) src_pos + length must not exceed length of src.
  2097     // (8) dst_pos + length must not exceed length of dst.
  2098     //
  2100     //  if (src == NULL) return -1;
  2101     __ testptr(src, src);         // src oop
  2102     size_t j1off = __ offset();
  2103     __ jccb(Assembler::zero, L_failed_0);
  2105     //  if (src_pos < 0) return -1;
  2106     __ testl(src_pos, src_pos); // src_pos (32-bits)
  2107     __ jccb(Assembler::negative, L_failed_0);
  2109     //  if (dst == NULL) return -1;
  2110     __ testptr(dst, dst);         // dst oop
  2111     __ jccb(Assembler::zero, L_failed_0);
  2113     //  if (dst_pos < 0) return -1;
  2114     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  2115     size_t j4off = __ offset();
  2116     __ jccb(Assembler::negative, L_failed_0);
  2118     // The first four tests are very dense code,
  2119     // but not quite dense enough to put four
  2120     // jumps in a 16-byte instruction fetch buffer.
  2121     // That's good, because some branch predictors
  2122     // do not like jumps so close together.
  2123     // Make sure of this.
  2124     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
  2126     // registers used as temp
  2127     const Register r11_length    = r11; // elements count to copy
  2128     const Register r10_src_klass = r10; // array klass
  2129     const Register r9_dst_klass  = r9;  // dest array klass
  2131     //  if (length < 0) return -1;
  2132     __ movl(r11_length, C_RARG4);       // length (elements count, 32-bits value)
  2133     __ testl(r11_length, r11_length);
  2134     __ jccb(Assembler::negative, L_failed_0);
  2136     __ load_klass(r10_src_klass, src);
  2137 #ifdef ASSERT
  2138     //  assert(src->klass() != NULL);
  2139     BLOCK_COMMENT("assert klasses not null");
  2140     { Label L1, L2;
  2141       __ testptr(r10_src_klass, r10_src_klass);
  2142       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
  2143       __ bind(L1);
  2144       __ stop("broken null klass");
  2145       __ bind(L2);
  2146       __ load_klass(r9_dst_klass, dst);
  2147       __ cmpq(r9_dst_klass, 0);
  2148       __ jcc(Assembler::equal, L1);     // this would be broken also
  2149       BLOCK_COMMENT("assert done");
  2151 #endif
  2153     // Load layout helper (32-bits)
  2154     //
  2155     //  |array_tag|     | header_size | element_type |     |log2_element_size|
  2156     // 32        30    24            16              8     2                 0
  2157     //
  2158     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  2159     //
  2161     int lh_offset = klassOopDesc::header_size() * HeapWordSize +
  2162                     Klass::layout_helper_offset_in_bytes();
  2164     const Register rax_lh = rax;  // layout helper
  2166     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
  2168     // Handle objArrays completely differently...
  2169     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  2170     __ cmpl(rax_lh, objArray_lh);
  2171     __ jcc(Assembler::equal, L_objArray);
  2173     //  if (src->klass() != dst->klass()) return -1;
  2174     __ load_klass(r9_dst_klass, dst);
  2175     __ cmpq(r10_src_klass, r9_dst_klass);
  2176     __ jcc(Assembler::notEqual, L_failed);
  2178     //  if (!src->is_Array()) return -1;
  2179     __ cmpl(rax_lh, Klass::_lh_neutral_value);
  2180     __ jcc(Assembler::greaterEqual, L_failed);
  2182     // At this point, it is known to be a typeArray (array_tag 0x3).
  2183 #ifdef ASSERT
  2184     { Label L;
  2185       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
  2186       __ jcc(Assembler::greaterEqual, L);
  2187       __ stop("must be a primitive array");
  2188       __ bind(L);
  2189     }
  2190 #endif
  2192     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2193                            r10, L_failed);
  2195     // typeArrayKlass
  2196     //
  2197     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  2198     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  2199     //
  2201     const Register r10_offset = r10;    // array offset
  2202     const Register rax_elsize = rax_lh; // element size
  2204     __ movl(r10_offset, rax_lh);
  2205     __ shrl(r10_offset, Klass::_lh_header_size_shift);
  2206     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
  2207     __ addptr(src, r10_offset);           // src array offset
  2208     __ addptr(dst, r10_offset);           // dst array offset
  2209     BLOCK_COMMENT("choose copy loop based on element size");
  2210     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
  2212     // next registers should be set before the jump to corresponding stub
  2213     const Register from     = c_rarg0;  // source array address
  2214     const Register to       = c_rarg1;  // destination array address
  2215     const Register count    = c_rarg2;  // elements count
  2217     // 'from', 'to', 'count' registers should be set in such order
  2218     // since they are the same as 'src', 'src_pos', 'dst'.
  2220   __ BIND(L_copy_bytes);
  2221     __ cmpl(rax_elsize, 0);
  2222     __ jccb(Assembler::notEqual, L_copy_shorts);
  2223     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
  2224     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
  2225     __ movl2ptr(count, r11_length); // length
  2226     __ jump(RuntimeAddress(byte_copy_entry));
  2228   __ BIND(L_copy_shorts);
  2229     __ cmpl(rax_elsize, LogBytesPerShort);
  2230     __ jccb(Assembler::notEqual, L_copy_ints);
  2231     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
  2232     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
  2233     __ movl2ptr(count, r11_length); // length
  2234     __ jump(RuntimeAddress(short_copy_entry));
  2236   __ BIND(L_copy_ints);
  2237     __ cmpl(rax_elsize, LogBytesPerInt);
  2238     __ jccb(Assembler::notEqual, L_copy_longs);
  2239     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
  2240     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
  2241     __ movl2ptr(count, r11_length); // length
  2242     __ jump(RuntimeAddress(int_copy_entry));
  2244   __ BIND(L_copy_longs);
  2245 #ifdef ASSERT
  2246     { Label L;
  2247       __ cmpl(rax_elsize, LogBytesPerLong);
  2248       __ jcc(Assembler::equal, L);
  2249       __ stop("must be long copy, but elsize is wrong");
  2250       __ bind(L);
  2251     }
  2252 #endif
  2253     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
  2254     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
  2255     __ movl2ptr(count, r11_length); // length
  2256     __ jump(RuntimeAddress(long_copy_entry));
  2258     // objArrayKlass
  2259   __ BIND(L_objArray);
  2260     // live at this point:  r10_src_klass, src[_pos], dst[_pos]
  2262     Label L_plain_copy, L_checkcast_copy;
  2263     //  test array classes for subtyping
  2264     __ load_klass(r9_dst_klass, dst);
  2265     __ cmpq(r10_src_klass, r9_dst_klass); // usual case is exact equality
  2266     __ jcc(Assembler::notEqual, L_checkcast_copy);
  2268     // Identically typed arrays can be copied without element-wise checks.
  2269     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2270                            r10, L_failed);
  2272     __ lea(from, Address(src, src_pos, TIMES_OOP,
  2273                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  2274     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
  2275                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  2276     __ movl2ptr(count, r11_length); // length
  2277   __ BIND(L_plain_copy);
  2278     __ jump(RuntimeAddress(oop_copy_entry));
  2280   __ BIND(L_checkcast_copy);
  2281     // live at this point:  r10_src_klass, !r11_length
  2282     {
  2283       // assert(r11_length == C_RARG4); // will reload from here
  2284       Register r11_dst_klass = r11;
  2285       __ load_klass(r11_dst_klass, dst);
  2287       // Before looking at dst.length, make sure dst is also an objArray.
  2288       __ cmpl(Address(r11_dst_klass, lh_offset), objArray_lh);
  2289       __ jcc(Assembler::notEqual, L_failed);
  2291       // It is safe to examine both src.length and dst.length.
  2292 #ifndef _WIN64
  2293       arraycopy_range_checks(src, src_pos, dst, dst_pos, C_RARG4,
  2294                              rax, L_failed);
  2295 #else
  2296       __ movl(r11_length, C_RARG4);     // reload
  2297       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2298                              rax, L_failed);
  2299       __ load_klass(r11_dst_klass, dst); // reload
  2300 #endif
  2302       // Marshal the base address arguments now, freeing registers.
  2303       __ lea(from, Address(src, src_pos, TIMES_OOP,
  2304                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
  2305       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
  2306                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
  2307       __ movl(count, C_RARG4);          // length (reloaded)
  2308       Register sco_temp = c_rarg3;      // this register is free now
  2309       assert_different_registers(from, to, count, sco_temp,
  2310                                  r11_dst_klass, r10_src_klass);
  2311       assert_clean_int(count, sco_temp);
  2313       // Generate the type check.
  2314       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
  2315                         Klass::super_check_offset_offset_in_bytes());
  2316       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
  2317       assert_clean_int(sco_temp, rax);
  2318       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
  2320       // Fetch destination element klass from the objArrayKlass header.
  2321       int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
  2322                        objArrayKlass::element_klass_offset_in_bytes());
  2323       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
  2324       __ movl(sco_temp,      Address(r11_dst_klass, sco_offset));
  2325       assert_clean_int(sco_temp, rax);
  2327       // the checkcast_copy loop needs two extra arguments:
  2328       assert(c_rarg3 == sco_temp, "#3 already in place");
  2329       __ movptr(C_RARG4, r11_dst_klass);  // dst.klass.element_klass
  2330       __ jump(RuntimeAddress(checkcast_copy_entry));
  2331     }
  2333   __ BIND(L_failed);
  2334     __ xorptr(rax, rax);
  2335     __ notptr(rax); // return -1
  2336     __ leave();   // required for proper stackwalking of RuntimeStub frame
  2337     __ ret(0);
  2339     return start;
  2340   }
  2342 #undef length_arg
  2343 #endif
  2345 //FIXME
  2346   address generate_disjoint_long_copy(bool aligned, const char *name) {
  2347 	  Label l_1, l_2;
  2348 	  StubCodeMark mark(this, "StubRoutines", name);
  2349 	  __ align(CodeEntryAlignment);
  2350 	  address start = __ pc();
  2352 	  //      __ movl(ecx, Address(esp, 4+8));       // count
  2353 	  //     __ movl(eax, Address(esp, 4+0));       // from
  2354 	  //    __ movl(edx, Address(esp, 4+4));       // to
  2355 	  __ move(T1, A2);  
  2356 	  __ move(T3, A0); 
  2357 	  __ move(T0, A1);
  2358 	  __ push(T3); 
  2359 	  __ push(T0);
  2360 	  __ push(T1);
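       	  // Same register convention as the other copy stubs in this file:
       	  // T3 = from, T0 = to, T1 = element count.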
  2361 	  //__ subl(edx, eax);
  2362 	  //__ jmp(l_2);
  2363 	  __ b(l_2);  
  2364 	  __ delayed()->nop();   
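       	  // The loop is entered at l_2 so the count is decremented and tested
       	  // before the first copy; a zero (or negative) A2 copies nothing.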
  2365 	  __ align(16);
  2366 	  __ bind(l_1);
  2367 	  //   if (VM_Version::supports_mmx()) {
  2368 	  //     __ movq(mmx0, Address(eax));
  2369 	  //     __ movq(Address(eax, edx, Address::times_1), mmx0);
  2370 	  //   } else {
  2371 	  //   __ fild_d(Address(eax));
  2372 	  __ ld(AT, T3, 0);   
  2373 	  // __ fistp_d(Address(eax, edx, Address::times_1));
  2374 	  __ sd (AT, T0, 0); 
  2375 	  //   }
  2376 	  //   __ addl(eax, 8);
  2377 	  __ addi(T3, T3, 8); 
  2378 	  __ addi(T0, T0, 8); 
  2379 	  __ bind(l_2);
  2380 	  //    __ decl(ecx);
  2381 	  __ addi(T1, T1, -1); 
  2382 	  //    __ jcc(Assembler::greaterEqual, l_1);
  2383 	  __ bgez(T1, l_1);    
  2384 	  __ delayed()->nop(); 
  2385 	  //  if (VM_Version::supports_mmx()) {
  2386 	  //    __ emms();
  2387 	  //  }
  2388 	  //  __ ret(0);
  2389 	  __ pop(T1); 
  2390 	  __ pop(T0); 
  2391 	  __ pop(T3); 
  2392 	  __ jr(RA); 
  2393 	  __ delayed()->nop(); 
  2394 	  return start;
  2395   }
  2398   address generate_conjoint_long_copy(bool aligned, const char *name) {
  2399 	  Label l_1, l_2;
  2400 	  StubCodeMark mark(this, "StubRoutines", name);
  2401 	  __ align(CodeEntryAlignment);
  2402 	  address start = __ pc();
  2403 	  address nooverlap_target = aligned ?
  2404 		  StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  2405 		  StubRoutines::jlong_disjoint_arraycopy();
  2406 	  array_overlap_test(nooverlap_target, 3);
  2408 	  __ push(T3); 
  2409 	  __ push(T0); 
  2410 	  __ push(T1); 
  2412 		/*      __ movl(ecx, Address(esp, 4+8));       // count
  2413 						__ movl(eax, Address(esp, 4+0));       // from
  2414 						__ movl(edx, Address(esp, 4+4));       // to
  2415 						__ jmp(l_2);
  2417 		 */
  2418 	  __ move(T1, A2);  
  2419 	  __ move(T3, A0); 
  2420 	  __ move(T0, A1);
  2421 	  __ sll(AT, T1, Address::times_8); 
  2422 	  __ add(AT, T3, AT); 
  2423 	  __ lea(T3 , Address(AT, -8)); 
  2424 	  __ sll(AT, T1, Address::times_8); 
  2425 	  __ add(AT, T0, AT); 
  2426 	  __ lea(T0 , Address(AT, -8)); 
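       	  // T3 and T0 are biased to the last element (base + count*8 - 8) and
       	  // the loop below walks backward, so overlapping regions copy safely.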
  2430 	  __ b(l_2); 
  2431 	  __ delayed()->nop(); 
  2432 	  __ align(16);
  2433 		__ bind(l_1);
  2434 		/*      if (VM_Version::supports_mmx()) {
  2435 						__ movq(mmx0, Address(eax, ecx, Address::times_8));
  2436 						__ movq(Address(edx, ecx,Address::times_8), mmx0);
  2437 						} else {
  2438 						__ fild_d(Address(eax, ecx, Address::times_8));
  2439 						__ fistp_d(Address(edx, ecx,Address::times_8));
  2441 		 */    
  2442 		__ ld(AT, T3, 0);   
  2443 		__ sd (AT, T0, 0); 
  2444 	  __ addi(T3, T3, -8); 
  2445 	  __ addi(T0, T0, -8); 
  2446 	  __ bind(l_2);
  2447 	  //	    __ decl(ecx);
  2448 	  __ addi(T1, T1, -1); 
  2449 	  //__ jcc(Assembler::greaterEqual, l_1);
  2450 	  __ bgez(T1, l_1); 
  2451 	  __ delayed()->nop(); 
  2452 	  //      if (VM_Version::supports_mmx()) {
  2453 	  //      __ emms();
  2454 	  //   }
  2455 	  //  __ ret(0);
  2456 	  __ pop(T1); 
  2457 	  __ pop(T0); 
  2458 	  __ pop(T3); 
  2459 	  __ jr(RA); 
  2460 	  __ delayed()->nop();  
  2461 	  return start;
  2462   }
  2464   void generate_arraycopy_stubs() {
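           // With compressed oops a reference is 32 bits wide, so the oop
           // arraycopy stubs reuse the int variants; otherwise the 64-bit long
           // variants are used.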
  2465     if (UseCompressedOops) {
  2466       StubRoutines::_oop_disjoint_arraycopy    = generate_disjoint_int_oop_copy(false, true, "oop_disjoint_arraycopy");
  2467       StubRoutines::_oop_arraycopy   	= generate_conjoint_int_oop_copy(false, true, "oop_arraycopy");
  2468     } else {
  2469       StubRoutines::_oop_disjoint_arraycopy    = generate_disjoint_long_oop_copy(false, true, "oop_disjoint_arraycopy");
  2470       StubRoutines::_oop_arraycopy   	= generate_conjoint_long_oop_copy(false, true, "oop_arraycopy");
  2471     }
  2473     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  2474     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  2475     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
  2476     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  2477     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
  2479     //  if (VM_Version::supports_mmx())
  2480     //if (false)
  2481     // StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_mmx_copy_aligned("arrayof_jshort_disjoint_arraycopy");
  2482     // else
  2483     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
  2484     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(true, false, "arrayof_jint_disjoint_arraycopy");
  2485     //StubRoutines::_arrayof_oop_disjoint_arraycopy   = generate_disjoint_int_oop_copy(true, true, "arrayof_oop_disjoint_arraycopy");
  2486     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
  2488     StubRoutines::_jbyte_arraycopy  = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  2489     StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
  2490     StubRoutines::_jint_arraycopy   = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
  2491     StubRoutines::_jlong_arraycopy  = generate_conjoint_long_copy(false, "jlong_arraycopy");
  2493     StubRoutines::_arrayof_jbyte_arraycopy  = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
  2494     StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
  2495     StubRoutines::_arrayof_jint_arraycopy   = generate_conjoint_int_oop_copy(true, false, "arrayof_jint_arraycopy");
  2496     //StubRoutines::_arrayof_oop_arraycopy    = generate_conjoint_int_oop_copy(true, true, "arrayof_oop_arraycopy");
  2497     StubRoutines::_arrayof_jlong_arraycopy  = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
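           // No heapword-aligned specializations are generated for oop arrays;
           // the arrayof entry points below simply alias the stubs created above.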
  2499     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  2500     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
  2501   }
  2503 //Wang: add a function to implement SafeFetch32 and SafeFetchN
  2504   void generate_safefetch(const char* name, int size, address* entry,
  2505                           address* fault_pc, address* continuation_pc) {
  2506     // safefetch signatures:
  2507     //   int      SafeFetch32(int*      adr, int      errValue);
  2508     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  2509     //
  2510     // arguments:
  2511     //   A0 = adr
  2512     //   A1 = errValue
  2513     //
  2514     // result:
  2515     //   V0 = *adr or errValue
  2517     StubCodeMark mark(this, "StubRoutines", name);
  2519     // Entry point, pc or function descriptor.
  2520     *entry = __ pc();
  2522     // Load *adr into A1, may fault.
  2523     *fault_pc = __ pc();
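           // If the load below faults, the signal handler recognizes *fault_pc
           // and resumes execution at *continuation_pc with errValue still in A1.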
  2524     switch (size) {
  2525       case 4:
  2526         // int32_t
  2527         __ lw(A1, A0, 0); 
  2528         break;
  2529       case 8:
  2530         // int64_t
  2531         __ ld(A1, A0, 0); 
  2532         break;
  2533       default:
  2534         ShouldNotReachHere();
  2535     }
  2537     // return errValue or *adr
  2538     *continuation_pc = __ pc();
  2539     __ addu(V0, A1, R0);
  2540     __ jr(RA);
  2541     __ delayed()->nop();
  2542   }
  2545 #undef __
  2546 #define __ masm->
  2548   // Continuation point for throwing of implicit exceptions that are
  2549   // not handled in the current activation. Fabricates an exception
  2550   // oop and initiates normal exception dispatching in this
  2551   // frame. Since we need to preserve callee-saved values (currently
  2552   // only for C2, but done for C1 as well) we need a callee-saved oop
  2553   // map and therefore have to make these stubs into RuntimeStubs
  2554   // rather than BufferBlobs.  If the compiler needs all registers to
  2555   // be preserved between the fault point and the exception handler
  2556   // then it must assume responsibility for that in
  2557   // AbstractCompiler::continuation_for_implicit_null_exception or
  2558   // continuation_for_implicit_division_by_zero_exception. All other
  2559   // implicit exceptions (e.g., NullPointerException or
  2560   // AbstractMethodError on entry) are either at call sites or
  2561   // otherwise assume that stack unwinding will be initiated, so
  2562   // caller saved registers were assumed volatile in the compiler.
  2563   address generate_throw_exception(const char* name,
  2564                                    address runtime_entry,
  2565                                    bool restore_saved_exception_pc) {
  2566     // Information about frame layout at time of blocking runtime call.
  2567     // Note that we only have to preserve callee-saved registers since
  2568     // the compilers are responsible for supplying a continuation point
  2569 		// if they expect all registers to be preserved.
  2570 //#define aoqi_test
  2571 #ifdef aoqi_test
  2572 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2573 #endif
  2574 		enum layout {
  2575 			thread_off,    // last_java_sp                
  2576 			S7_off,        // callee saved register      sp + 1
  2577 			S6_off,        // callee saved register      sp + 2
  2578 			S5_off,        // callee saved register      sp + 3
  2579 			S4_off,        // callee saved register      sp + 4
  2580 			S3_off,        // callee saved register      sp + 5
  2581 			S2_off,        // callee saved register      sp + 6
  2582 			S1_off,        // callee saved register      sp + 7
  2583 			S0_off,        // callee saved register      sp + 8
  2584 			FP_off,
  2585 			ret_address,
  2586 			framesize
  2587 		};
  2589 		int insts_size = 2048;
  2590 		int locs_size  = 32;
  2592 		//  CodeBuffer* code     = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false, 
  2593 		//  NULL, NULL, NULL, false, NULL, name, false);
  2594 		CodeBuffer code (name , insts_size, locs_size);
  2595 #ifdef aoqi_test
  2596 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2597 #endif
  2598 		OopMapSet* oop_maps  = new OopMapSet();
  2599 #ifdef aoqi_test
  2600 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2601 #endif
  2602 		MacroAssembler* masm = new MacroAssembler(&code);
  2603 #ifdef aoqi_test
  2604 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2605 #endif
  2607 		address start = __ pc();
  2608     	//__ stop("generate_throw_exception");
  2609 		/*
  2610 			 __ move(AT, (int)&jerome1 );
  2611 			 __ sw(SP, AT, 0); 	
  2612 			 __ move(AT, (int)&jerome2 );
  2613 			 __ sw(FP, AT, 0); 	
  2614 			 __ move(AT, (int)&jerome3 );
  2615 			 __ sw(RA, AT, 0); 	
  2616 			 __ move(AT, (int)&jerome4 );
  2617 			 __ sw(R0, AT, 0); 	
  2618 			 __ move(AT, (int)&jerome5 );
  2619 			 __ sw(R0, AT, 0); 	
  2620 			 __ move(AT, (int)&jerome6 );
  2621 			 __ sw(R0, AT, 0); 	
  2622 			 __ move(AT, (int)&jerome7 );
  2623 			 __ sw(R0, AT, 0); 	
  2624 			 __ move(AT, (int)&jerome10 );
  2625 			 __ sw(R0, AT, 0); 	
  2627 			 __ pushad();
  2629 		//__ enter();
  2630 		__ call(CAST_FROM_FN_PTR(address, SharedRuntime::print_call_statistics), 
  2631 		relocInfo::runtime_call_type);
  2632 		__ delayed()->nop();
  2634 		//__ leave();
  2635 		__ popad();
  2637 		 */
  2639 		// This is an inlined and slightly modified version of call_VM
  2640 		// which has the ability to fetch the return PC out of
  2641 		// thread-local storage and also sets up last_Java_sp slightly
  2642 		// differently than the real call_VM
  2643 #ifndef OPT_THREAD	
  2644 		Register java_thread = TREG;
  2645 		__ get_thread(java_thread);
  2646 #else
  2647 		Register java_thread = TREG;
  2648 #endif
  2649 #ifdef aoqi_test
  2650 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2651 #endif
  2652 		if (restore_saved_exception_pc) {
  2653 			__ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset())); // eax
  2654 		}
  2656 		__ enter(); // required for proper stackwalking of RuntimeStub frame
  2658 		__ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
  2659 		__ sd(S0, SP, S0_off * wordSize);
  2660 		__ sd(S1, SP, S1_off * wordSize);
  2661 		__ sd(S2, SP, S2_off * wordSize);
  2662 		__ sd(S3, SP, S3_off * wordSize);
  2663 		__ sd(S4, SP, S4_off * wordSize);
  2664 		__ sd(S5, SP, S5_off * wordSize);
  2665 		__ sd(S6, SP, S6_off * wordSize);
  2666 		__ sd(S7, SP, S7_off * wordSize);
  2668 		int frame_complete = __ pc() - start;
  2669 		// push java thread (becomes first argument of C function)
  2670 		__ sd(java_thread, SP, thread_off * wordSize);
  2671 		if (java_thread != A0)
  2672 			__ move(A0, java_thread);
  2674 		// Set up last_Java_sp and last_Java_fp
  2675 		__ set_last_Java_frame(java_thread, SP, FP, NULL);
  2676 		__ relocate(relocInfo::internal_pc_type);
  2677 		{
  2678 			intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + NativeCall::return_address_offset + 4;
  2679 			__ li48(AT, save_pc);
  2680 		}
  2681 		__ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset())); 
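       		// Note: save_pc computed above points just past the li48/call
       		// sequence (the trailing +4 presumably covers the branch delay
       		// slot); publishing it as last_Java_pc lets the stack walker
       		// unwind through this frame during the runtime call below.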
  2683 		// Call runtime
  2684 		__ call(runtime_entry);
  2685 		__ delayed()->nop();
  2686 		// Generate oop map
  2687 		OopMap* map =  new OopMap(framesize, 0);        
  2688 		oop_maps->add_gc_map(__ offset(),  map);
  2690 		// restore the thread (cannot use the pushed argument since arguments
  2691 		// may be overwritten by C code generated by an optimizing compiler);
  2692 		// however can use the register value directly if it is callee saved.
  2693 #ifndef OPT_THREAD
  2694 		__ get_thread(java_thread);
  2695 #endif
  2697 		__ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  2698 		//  __ reset_last_Java_frame(java_thread, true);
  2699 		__ reset_last_Java_frame(java_thread, true, true);
  2701 		// Restore callee save registers.  This must be done after resetting the Java frame
  2702 		__ ld(S0, SP, S0_off * wordSize);
  2703 		__ ld(S1, SP, S1_off * wordSize);
  2704 		__ ld(S2, SP, S2_off * wordSize);
  2705 		__ ld(S3, SP, S3_off * wordSize);
  2706 		__ ld(S4, SP, S4_off * wordSize);
  2707 		__ ld(S5, SP, S5_off * wordSize);
  2708 		__ ld(S6, SP, S6_off * wordSize);
  2709 		__ ld(S7, SP, S7_off * wordSize);
  2711 		// discard arguments
  2712 		__ addi(SP, SP, (framesize-2) * wordSize); // epilog
  2713 		//	__ leave(); // required for proper stackwalking of RuntimeStub frame
  2714 		__ addi(SP, FP, wordSize);
  2715 		__ ld(FP, SP, -1*wordSize);
  2716 		// check for pending exceptions
  2717 #ifdef ASSERT
  2718 		Label L;
  2719 		__ lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  2720 		__ bne(AT, R0, L);
  2721 		__ delayed()->nop();
  2722 		__ should_not_reach_here();
  2723 		__ bind(L);
  2724 #endif //ASSERT
  2725 		__ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  2726 		__ delayed()->nop();
  2727 #ifdef aoqi_test
  2728 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2729 #endif
  2730 		RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code,frame_complete, 
  2731 										framesize, oop_maps, false);
  2732 #ifdef aoqi_test
  2733 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
  2734 #endif
  2735 		return stub->entry_point();
  2736   }
  2738   // Initialization
  2739   void generate_initial() {
  2740 /*
  2741 		// Generates all stubs and initializes the entry points
  2743     // This platform-specific stub is needed by generate_call_stub()
  2744     StubRoutines::mips::_mxcsr_std        = generate_fp_mask("mxcsr_std",        0x0000000000001F80);
  2746     // entry points that exist in all platforms Note: This is code
  2747     // that could be shared among different platforms - however the
  2748     // benefit seems to be smaller than the disadvantage of having a
  2749     // much more complicated generator structure. See also comment in
  2750     // stubRoutines.hpp.
  2752     StubRoutines::_forward_exception_entry = generate_forward_exception();
  2754     StubRoutines::_call_stub_entry =
  2755       generate_call_stub(StubRoutines::_call_stub_return_address);
  2757     // is referenced by megamorphic call
  2758     StubRoutines::_catch_exception_entry = generate_catch_exception();
  2760     // atomic calls
  2761     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
  2762     StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
  2763     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
  2764     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  2765     StubRoutines::_atomic_add_entry          = generate_atomic_add();
  2766     StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
  2767     StubRoutines::_fence_entry               = generate_orderaccess_fence();
  2769     StubRoutines::_handler_for_unsafe_access_entry =
  2770       generate_handler_for_unsafe_access();
  2772     // platform dependent
  2773     StubRoutines::mips::_get_previous_fp_entry = generate_get_previous_fp();
  2775     StubRoutines::mips::_verify_mxcsr_entry    = generate_verify_mxcsr();
  2776 */
  2777 		// Generates all stubs and initializes the entry points
  2779 		//-------------------------------------------------------------
  2780 		//-----------------------------------------------------------
  2781 		// entry points that exist in all platforms
  2782 		// Note: This is code that could be shared among different platforms - however the benefit seems to be smaller 
  2783 		// than the disadvantage of having a much more complicated generator structure. 
  2784 		// See also comment in stubRoutines.hpp.
  2785 		StubRoutines::_forward_exception_entry = generate_forward_exception();    
  2786 		StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
  2787 		// is referenced by megamorphic call    
  2788 		StubRoutines::_catch_exception_entry = generate_catch_exception();    
  2790 		StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
  2792 		// platform dependent
  2793 		StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
  2794   }
  2796 void generate_all() {
  2797 #ifdef aoqi_test
  2798 tty->print_cr("%s:%d", __func__, __LINE__);
  2799 #endif
  2800     // Generates all stubs and initializes the entry points
  2802     // These entry points require SharedInfo::stack0 to be set up in
  2803     // non-core builds and need to be relocatable, so they each
  2804     // fabricate a RuntimeStub internally.
  2805 	/*
  2806     StubRoutines::_throw_AbstractMethodError_entry =
  2807       generate_throw_exception("AbstractMethodError throw_exception",
  2808                                CAST_FROM_FN_PTR(address,
  2809                                                 SharedRuntime::
  2810                                                 throw_AbstractMethodError),
  2811                                false);
  2813     StubRoutines::_throw_IncompatibleClassChangeError_entry =
  2814       generate_throw_exception("IncompatibleClassChangeError throw_exception",
  2815                                CAST_FROM_FN_PTR(address,
  2816                                                 SharedRuntime::
  2817                                                 throw_IncompatibleClassChangeError),
  2818                                false);
  2820     StubRoutines::_throw_ArithmeticException_entry =
  2821       generate_throw_exception("ArithmeticException throw_exception",
  2822                                CAST_FROM_FN_PTR(address,
  2823                                                 SharedRuntime::
  2824                                                 throw_ArithmeticException),
  2825                                true);
  2827     StubRoutines::_throw_NullPointerException_entry =
  2828       generate_throw_exception("NullPointerException throw_exception",
  2829                                CAST_FROM_FN_PTR(address,
  2830                                                 SharedRuntime::
  2831                                                 throw_NullPointerException),
  2832                                true);
  2834     StubRoutines::_throw_NullPointerException_at_call_entry =
  2835       generate_throw_exception("NullPointerException at call throw_exception",
  2836                                CAST_FROM_FN_PTR(address,
  2837                                                 SharedRuntime::
  2838                                                 throw_NullPointerException_at_call),
  2839                                false);
  2841     StubRoutines::_throw_StackOverflowError_entry =
  2842       generate_throw_exception("StackOverflowError throw_exception",
  2843                                CAST_FROM_FN_PTR(address,
  2844                                                 SharedRuntime::
  2845                                                 throw_StackOverflowError),
  2846                                false);
  2848     // entry points that are platform specific
  2849     StubRoutines::mips::_f2i_fixup = generate_f2i_fixup();
  2850     StubRoutines::mips::_f2l_fixup = generate_f2l_fixup();
  2851     StubRoutines::mips::_d2i_fixup = generate_d2i_fixup();
  2852     StubRoutines::mips::_d2l_fixup = generate_d2l_fixup();
  2854     StubRoutines::mips::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
  2855     StubRoutines::mips::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
  2856     StubRoutines::mips::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
  2857     StubRoutines::mips::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
  2859     // support for verify_oop (must happen after universe_init)
  2860     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  2862     // arraycopy stubs used by compilers
  2863     generate_arraycopy_stubs();
  2864 	*/
  2865 #ifdef aoqi_test
  2866 tty->print_cr("%s:%d", __func__, __LINE__);
  2867 #endif
    StubRoutines::_throw_AbstractMethodError_entry           = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
//  StubRoutines::_throw_ArithmeticException_entry           = generate_throw_exception("ArithmeticException throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException),  true);
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
//  StubRoutines::_throw_NullPointerException_entry          = generate_throw_exception("NullPointerException throw_exception",         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
    StubRoutines::_throw_NullPointerException_at_call_entry  = generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
    StubRoutines::_throw_StackOverflowError_entry            = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
    //------------------------------------------------------------------
    // entry points that are platform specific

    // support for verify_oop (must happen after universe_init)
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
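    // Usage sketch (assumption, based on the shared MacroAssembler interface,
    // not on code in this file): generated code reaches the subroutine above
    // through the verify_oop macro, roughly
    //   __ verify_oop(V0);   // stops with a diagnostic if V0 holds a bad oop
    // V0 as the example register is illustrative only.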
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
#ifndef CORE
    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();
#ifdef aoqi_test
    tty->print_cr("%s:%d", __func__, __LINE__);
#endif
#endif
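    // Hedged note: generate_arraycopy_stubs() fills in the shared
    // StubRoutines arraycopy entry points (e.g. StubRoutines::_jbyte_arraycopy,
    // StubRoutines::_jshort_disjoint_arraycopy), which the compilers call for
    // System.arraycopy and related copy idioms; the exact set populated by
    // this port should be confirmed against generate_arraycopy_stubs() itself.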
    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
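    // Usage sketch (from the shared runtime, not this file): SafeFetch reads a
    // word that may fault. The signal handler recognizes _safefetch*_fault_pc
    // and resumes at _safefetch*_continuation_pc with the error value, so
    //   int      v = SafeFetch32((int*) addr, -1);     // -1 if addr is unmapped
    //   intptr_t p = SafeFetchN((intptr_t*) addr, 0);  // 0 on fault
    // SafeFetch32/SafeFetchN are the shared HotSpot wrapper names.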

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration
/*
address StubGenerator::disjoint_byte_copy_entry  = NULL;
address StubGenerator::disjoint_short_copy_entry = NULL;
address StubGenerator::disjoint_int_copy_entry   = NULL;
address StubGenerator::disjoint_long_copy_entry  = NULL;
address StubGenerator::disjoint_oop_copy_entry   = NULL;

address StubGenerator::byte_copy_entry  = NULL;
address StubGenerator::short_copy_entry = NULL;
address StubGenerator::int_copy_entry   = NULL;
address StubGenerator::long_copy_entry  = NULL;
address StubGenerator::oop_copy_entry   = NULL;

address StubGenerator::checkcast_copy_entry = NULL;
*/
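// Hedged note on the entry point below: in the shared HotSpot startup
// sequence this factory is typically invoked twice, once per stub blob, e.g.
//   StubGenerator_generate(code, false);  // early "initial" stubs
//   StubGenerator_generate(code, true);   // remaining stubs, after universe_init
// The actual call sites live in the shared runtime code, not in this file.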
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
