src/cpu/mips/vm/stubGenerator_mips_64.cpp

Fri, 29 Apr 2016 00:06:10 +0800

author
aoqi
date
Fri, 29 Apr 2016 00:06:10 +0800
changeset 1
2d8a650513c2
child 7
e26ad49b7194
permissions
-rw-r--r--

Added MIPS 64-bit port.

     1 /*
     2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/macroAssembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "interpreter/interpreter.hpp"
    30 #include "nativeInst_mips.hpp"
    31 #include "oops/instanceOop.hpp"
    32 #include "oops/method.hpp"
    33 #include "oops/objArrayKlass.hpp"
    34 #include "oops/oop.inline.hpp"
    35 #include "prims/methodHandles.hpp"
    36 #include "runtime/frame.inline.hpp"
    37 #include "runtime/handles.inline.hpp"
    38 #include "runtime/sharedRuntime.hpp"
    39 #include "runtime/stubCodeGenerator.hpp"
    40 #include "runtime/stubRoutines.hpp"
    41 #include "runtime/thread.inline.hpp"
    42 #include "utilities/top.hpp"
    43 #ifdef COMPILER2
    44 #include "opto/runtime.hpp"
    45 #endif
    48 // Declaration and definition of StubGenerator (no .hpp file).
    49 // For a more detailed description of the stub routine structure
    50 // see the comment in stubRoutines.hpp
// Shorthand: emit instructions through this StubGenerator's MacroAssembler.
#define __ _masm->
//#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
//#define a__ ((Assembler*)_masm)->

//#ifdef PRODUCT
//#define BLOCK_COMMENT(str) /* nothing */
//#else
//#define BLOCK_COMMENT(str) __ block_comment(str)
//#endif

//#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// NOTE(review): MXCSR is the x86 SSE control/status register; this constant
// appears to be a leftover from the x86 stub generator and is not referenced
// in the visible MIPS code -- confirm before removing.
const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
    65 // Stub Code definitions
// Runtime helper invoked after an unsafe memory access has faulted.
// Reads the faulting pc saved on the current thread, computes the pc of
// the instruction to resume at, flags a pending unsafe-access error on
// the thread, and returns the resume pc.
static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc = thread->saved_exception_pc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine:  return garbage from the load
  // therefore, compute npc
  //address npc = Assembler::locate_next_instruction(pc);
  // NOTE(review): this advances by sizeof(unsigned long) == 8 on LP64,
  // i.e. two 4-byte MIPS instructions (the faulting one plus one more,
  // possibly a delay slot) -- confirm this matches the instruction
  // sequences emitted for unsafe accesses.
  address npc = (address)((unsigned long)pc + sizeof(unsigned long));

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}
    83 class StubGenerator: public StubCodeGenerator {
    84  private:
    86   // ABI mips n64
    87   // This fig is not MIPS ABI. It is call Java from C ABI.
    88   // Call stubs are used to call Java from C
    89   //
    90   //    [ return_from_Java     ]
    91   //    [ argument word n-1    ] <--- sp
    92   //      ...
    93   //    [ argument word 0      ]
    94   //      ...
    95   //-10 [ S6     	       ]
    96   // -9 [ S5		       ] 
    97   // -8 [ S4		       ]
    98   // -7 [ S3                   ]
    99   // -6 [ S0  		       ]
   100   // -5 [ TSR(S2)	       ]
   101   // -4 [ LVP(S7)              ]
   102   // -3 [ BCP(S1)              ]
   103   // -2 [ saved fp             ] <--- fp_after_call
   104   // -1 [ return address       ] 
   105   //  0 [ ptr. to call wrapper ] <--- a0 (old sp -->)fp
   106   //  1 [ result               ] <--- a1
   107   //  2 [ result_type          ] <--- a2
   108   //  3 [ method               ] <--- a3
   109   //  4 [ entry_point          ] <--- a4
   110   //  5 [ parameters           ] <--- a5
   111   //  6 [ parameter_size       ] <--- a6
   112   //  7 [ thread               ] <--- a7
   114   //
   115   // _LP64: n64 does not save paras in sp.
   116   //
   117   //    [ return_from_Java     ]
   118   //    [ argument word n-1    ] <--- sp
   119   //      ...
   120   //    [ argument word 0      ]
   121   //      ...
   122   //-14 [ thread               ]
   123   //-13 [ result_type          ] <--- a2
   124   //-12 [ result               ] <--- a1
   125   //-11 [ ptr. to call wrapper ] <--- a0
   126   //-10 [ S6     	       ]
   127   // -9 [ S5		       ] 
   128   // -8 [ S4		       ]
   129   // -7 [ S3                   ]
   130   // -6 [ S0  		       ]
   131   // -5 [ TSR(S2)	       ]
   132   // -4 [ LVP(S7)              ]
   133   // -3 [ BCP(S1)              ]
   134   // -2 [ saved fp             ] <--- fp_after_call
   135   // -1 [ return address       ] 
   136   //  0 [        	       ] <--- old sp
   137   /*
   138    * 2014/01/16 Fu: Find a right place in the call_stub for GP.
   139    * GP will point to the starting point of Interpreter::dispatch_table(itos). 
   140    * It should be saved/restored before/after Java calls. 
   141    *
   142    */
   // Word offsets, relative to the caller's SP on entry to the call stub,
   // of the registers and incoming arguments the stub spills.  Negative
   // offsets grow downward from the old SP; see the stack diagrams above.
   enum call_stub_layout {
     RA_off           = -1,   // return address
     FP_off           = -2,   // saved frame pointer (fp_after_call points here)
     BCP_off          = -3,   // bytecode pointer (S1)
     LVP_off          = -4,   // locals pointer (S7)
     TSR_off          = -5,   // thread state register (S2)
     S1_off           = -6,
     S3_off           = -7,
     S4_off           = -8,
     S5_off           = -9,
     S6_off           = -10,
     result_off       = -11,  // A1: where to store the Java result
     result_type_off  = -12,  // A2: BasicType of the result
     thread_off       = -13,  // A7: current JavaThread
     total_off        = thread_off - 3,  // -16: total SP adjustment in words
     GP_off           = -16,  // NOTE(review): GP occupies slot -16, the same
                              // value as total_off, i.e. the very last word
                              // of the frame -- verify this is intentional.
   };
  // Generates the call stub: the C-to-Java entry point used by JavaCalls.
  //
  // Incoming n64 arguments (see the stack diagram above):
  //   A0 - call wrapper address      A4 - entry point
  //   A1 - result address            A5 - parameters
  //   A2 - result BasicType          A6 - parameter count (words)
  //   A3 - method                    A7 - thread
  //
  // Side effect: return_address is set to the pc that called Java code
  // returns to (the instruction after the jalr).
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!

    // stub code
    // Save RA, FP and every callee-saved register this stub (or the Java
    // code it calls) may clobber, at negative word offsets from the
    // caller's SP (offsets defined by call_stub_layout).
    __ sd(RA, SP, RA_off * wordSize);
    __ sd(FP, SP, FP_off * wordSize);
    __ sd(BCP, SP, BCP_off * wordSize);
    __ sd(LVP, SP, LVP_off * wordSize);
    __ sd(GP, SP, GP_off * wordSize);
    __ sd(TSR, SP, TSR_off * wordSize);
    __ sd(S1, SP, S1_off * wordSize);
    __ sd(S3, SP, S3_off * wordSize);
    __ sd(S4, SP, S4_off * wordSize);
    __ sd(S5, SP, S5_off * wordSize);
    __ sd(S6, SP, S6_off * wordSize);

    // While executing Java, GP points at the interpreter's itos dispatch
    // table (see the 2014/01/16 note above).
    __ li48(GP, (long)Interpreter::dispatch_table(itos));

    // I think 14 is the max gap between argument and callee saved register
    // Establish the new frame: FP -> the saved-FP slot, then drop SP
    // below all of the spill slots.
    __ daddi(FP, SP, (-2) * wordSize);
    __ daddi(SP, SP, total_off * wordSize);
//FIXME, aoqi. find a suitable place to save A1 & A2.
    /*
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, 3 * wordSize);
    __ sd(A2, FP, 4 * wordSize);
    __ sd(A3, FP, 5 * wordSize);
    __ sd(A4, FP, 6 * wordSize);
    __ sd(A5, FP, 7 * wordSize);
    __ sd(A6, FP, 8 * wordSize);
    __ sd(A7, FP, 9 * wordSize);
    */
    // Spill the arguments that are still needed after the Java call.
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, result_off * wordSize);
    __ sd(A2, FP, result_type_off * wordSize);
    __ sd(A7, FP, thread_off * wordSize);
#ifdef OPT_THREAD
    //__ get_thread(TREG);
    // When OPT_THREAD is set, the thread lives in a dedicated register.
    __ move(TREG, A7);

    //__ ld(TREG, FP, thread_off * wordSize);
#endif
    //add for compressedoops
    __ reinit_heapbase();
#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
      __ beq(AT, R0, L);
      __ delayed()->nop();
      /* FIXME: I do not know how to realize stop in mips arch, do it in the future */
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    // A5: parameter
    // A6: parameter_size
    // T0: parameter_size_tmp(--)
    // T2: offset(++)
    // T3: tmp
    Label parameters_done;
    // judge if the parameter_size equals 0
    __ beq(A6, R0, parameters_done);
    __ delayed()->nop();
    // Reserve stack space for the parameters and keep SP aligned.
    __ dsll(AT, A6, Interpreter::logStackElementSize);
    __ dsub(SP, SP, AT);
    __ move(AT, -StackAlignmentInBytes);
    __ andr(SP, SP, AT);
    // Copy Java parameters in reverse order (receiver last)
    // Note that the argument order is inverted in the process
    // source is edx[ecx: N-1..0]
    // dest   is esp[ebx: 0..N-1]
    Label loop;
    __ move(T0, A6);
    __ move(T2, R0);
    __ bind(loop);

    // get parameter: AT = parameters[T0 - 1], store to expression stack
    // slot T2, then walk T0 down and T2 up.
    __ dsll(T3, T0, LogBytesPerWord);
    __ dadd(T3, T3, A5);
    __ ld(AT, T3, -wordSize);
    __ dsll(T3, T2, LogBytesPerWord);
    __ dadd(T3, T3, SP);
    __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
    __ daddi(T2, T2, 1);
    __ daddi(T0, T0, -1);
    __ bne(T0, R0, loop);
    __ delayed()->nop();
    // advance to next parameter

    // call Java function
    __ bind(parameters_done);

    // receiver in V0, methodOop in Rmethod
    __ move(Rmethod, A3);
    __ move(Rsender, SP);             //set sender sp
    __ jalr(A4);
    __ delayed()->nop();
    return_address = __ pc();

    Label common_return;
    __ bind(common_return);

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ ld(T0, FP, result_off * wordSize);       // result --> T0
    Label is_long, is_float, is_double, exit;
    __ ld(T2, FP, result_type_off * wordSize);  // result_type --> T2
    // Compare T2 against T_LONG/T_FLOAT/T_DOUBLE using the delay slots
    // to pipeline the next comparison.
    __ daddi(T3, T2, (-1) * T_LONG);
    __ beq(T3, R0, is_long);
    __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
    __ beq(T3, R0, is_float);
    __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
    __ beq(T3, R0, is_double);
    __ delayed()->nop();

    // handle T_INT case
    __ sd(V0, T0, 0 * wordSize);
    __ bind(exit);

    // restore: tear down the frame and reload the saved registers
    __ daddi(SP, FP, 2 * wordSize );
    __ ld(RA, SP, RA_off * wordSize);
    __ ld(FP, SP, FP_off * wordSize);
    __ ld(BCP, SP, BCP_off * wordSize);
    __ ld(LVP, SP, LVP_off * wordSize);
    __ ld(GP, SP, GP_off * wordSize);
    __ ld(TSR, SP, TSR_off * wordSize);

    __ ld(S1, SP, S1_off * wordSize);
    __ ld(S3, SP, S3_off * wordSize);
    __ ld(S4, SP, S4_off * wordSize);
    __ ld(S5, SP, S5_off * wordSize);
    __ ld(S6, SP, S6_off * wordSize);

    // return
    __ jr(RA);
    __ delayed()->nop();

    // handle return types different from T_INT
    __ bind(is_long);
    __ sd(V0, T0, 0 * wordSize);
    //__ sd(V1, T0, 1 * wordSize);
    __ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_float);
    __ swc1(F0, T0, 0 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_double);
    __ sdc1(F0, T0, 0 * wordSize);
    //__ sdc1(F1, T0, 1 * wordSize);
    __ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();
    //FIXME, 1.6 mips version add operation of fpu here
    // Secondary entry used when returning from compiled code; shares the
    // result-storing/restore sequence above.
    StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
    __ b(common_return);
    __ delayed()->nop();
    return start;
  }
   338   // Return point for a Java call if there's an exception thrown in
   339   // Java code.  The exception is caught and transformed into a
   340   // pending exception stored in JavaThread that can be tested from
   341   // within the VM.
   342   //
   343   // Note: Usually the parameters are removed by the callee. In case
   344   // of an exception crossing an activation frame boundary, that is
   345   // not the case if the callee is compiled code => need to setup the
   346   // rsp.
   347   //
   348   // rax: exception oop
  // Generates the catch_exception stub (see the comment block above):
  // installs the exception oop (in V0) as the thread's pending exception,
  // records the throwing location for debugging, then completes the
  // return to the VM through the call stub's return address.
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    Register thread = TREG;

    // get thread directly from the slot spilled by generate_call_stub
#ifndef OPT_THREAD
    __ ld(thread, FP, thread_off * wordSize);
#endif

#ifdef ASSERT
    // verify that threads correspond
    { Label L;
      __ get_thread(T8);
      __ beq(T8, thread, L);
      __ delayed()->nop();
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    // set pending exception (exception oop is in V0)
    __ verify_oop(V0);
    __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
    // Record file/line of this stub as the exception origin, for debugging.
    __ li(AT, (long)__FILE__);
    __ sd(AT, thread, in_bytes(Thread::exception_file_offset   ()));
    __ li(AT, (long)__LINE__);
    __ sd(AT, thread, in_bytes(Thread::exception_line_offset   ()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
    __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
    __ delayed()->nop();

    return start;
  }
   387   // Continuation point for runtime calls returning with a pending
   388   // exception.  The pending exception check happened in the runtime
   389   // or native call stub.  The pending exception in Thread is
   390   // converted into a Java-level exception.
   391   //
   392   // Contract with Java-level exception handlers:
   393   // rax: exception
   394   // rdx: throwing pc
   395   //
   396   // NOTE: At entry of this stub, exception-pc must be on stack !!
  // Generates the forward_exception stub (see the comment block above):
  // converts a pending exception set by a runtime/native call into a
  // Java-level exception and jumps to the appropriate handler with
  // V0 = exception oop, V1 = throwing pc, T9 = handler address.
  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    //Register thread = TREG;
    Register thread = TREG;
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into Java
    // (interpreted or compiled) code; i.e., the return address becomes the
    // throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack but
    // the exception handler will reset the stack pointer -> ignore them.
    // A potential result in registers can be ignored as well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    { Label L;
      __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
      __ bne(AT, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into T9
    __ ld(A1, SP, 0);   // A1 = return address on stack = throwing pc
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
    __ move(T9, V0);    // handler address returned by the VM
    __ pop(V1);         // remove return address; V1 = throwing pc

#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    // Load the pending exception into V0 and clear it on the thread.
    __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ bne(V0, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // V0: exception
    // T9: exception handler
    // V1: throwing pc
    __ verify_oop(V0);
    __ jr(T9);
    __ delayed()->nop();

    return start;
  }
   459   // Support for intptr_t get_previous_fp()
   460   //
   461   // This routine is used to find the previous frame pointer for the
   462   // caller (current_frame_guess). This is used as part of debugging
   463   // ps() is seemingly lost trying to find frames.
   464   // This code assumes that caller current_frame_guess) has a frame.
   465   address generate_get_previous_fp() {
   466     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
   467     const Address old_fp       (FP,  0);
   468     const Address older_fp       (V0,  0);
   469     address start = __ pc();
   470     __ enter();    
   471     __ lw(V0, old_fp); // callers fp
   472     __ lw(V0, older_fp); // the frame for ps()
   473     __ leave();
   474     __ jr(RA);
   475     __ delayed()->nop();
   476     return start;
   477   }
   478   // The following routine generates a subroutine to throw an
   479   // asynchronous UnknownError when an unsafe access gets a fault that
   480   // could not be reasonably prevented by the programmer.  (Example:
   481   // SIGBUS/OBJERR.)
  // Generates the stub jumped to when an unsafe access faults (see the
  // comment above).  Saves all registers, calls handle_unsafe_access()
  // to obtain the resume pc, stores that pc just above the saved-register
  // area (presumably the saved exception-pc slot -- confirm against the
  // pushad() frame layout), restores the registers and returns.
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();
    __ pushad();                      // push registers
    //  Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
    __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
    __ delayed()->nop();
    // NOTE(review): sw stores only the low 32 bits of the returned pc;
    // on a 64-bit VM an sd would be expected here -- verify.
    __ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord);
    __ popad();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
   496   // Non-destructive plausibility checks for oops
   497   //
   498   // Arguments:
   499   //    all args on stack!
   500   //
   501   // Stack after saving c_rarg3:
   502   //    [tos + 0]: saved c_rarg3
   503   //    [tos + 1]: saved c_rarg2
   504   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
   505   //    [tos + 3]: saved flags
   506   //    [tos + 4]: return address
   507   //  * [tos + 5]: error message (char*)
   508   //  * [tos + 6]: object to verify (oop)
   509   //  * [tos + 7]: saved rax - saved by caller and bashed
   510   //  * = popped on exit
   511   address generate_verify_oop() {
   512 	  StubCodeMark mark(this, "StubRoutines", "verify_oop");
   513 	  address start = __ pc();
   514 	  __ reinit_heapbase();
   515 	  __ verify_oop_subroutine(); 
   516     address end = __ pc();
   517 	  return start;
   518   }
   520   //
   521   //  Generate overlap test for array copy stubs
   522   //
   523   //  Input:
   524   //     A0    -  array1
   525   //     A1    -  array2
   526   //     A2    -  element count
   527   //
   528   //  Note: this code can only use %eax, %ecx, and %edx
   529   //
   531  // use T9 as temp 
  // Emits an overlap test for the conjoint array-copy stubs.
  // Branches to no_overlap_target when the source (A0) and destination
  // (A1) ranges of A2 elements (each 1 << log2_elem_size bytes) do not
  // overlap, so the faster forward (disjoint) copy can be used.
  // Clobbers AT and T9.
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    int elem_size = 1 << log2_elem_size;
    Address::ScaleFactor sf = Address::times_1;

    switch (log2_elem_size) {
      case 0: sf = Address::times_1; break;
      case 1: sf = Address::times_2; break;
      case 2: sf = Address::times_4; break;
      case 3: sf = Address::times_8; break;
    }

    // T9 = address of the last source element
    __ dsll(AT, A2, sf);
    __ dadd(AT, AT, A0);
    __ lea(T9, Address(AT, -elem_size));
    // dest <= src: a forward copy is always safe
    __ dsub(AT, A1, A0);
    __ blez(AT, no_overlap_target);
    __ delayed()->nop();
    // dest beyond the last source element: ranges are disjoint
    __ dsub(AT, A1, T9);
    __ bgtz(AT, no_overlap_target);
    __ delayed()->nop();

  }
   555   //
   556   //  Generate store check for array
   557   //
   558   //  Input:
   559   //     %edi    -  starting address
   560   //     %ecx    -  element count
   561   //
   562   //  The 2 input registers are overwritten
   563   //
   565   //
   566   //  Generate store check for array
   567   //
   568   //  Input:
   569   //     T0    -  starting address(edi)
   570   //     T1    -  element count  (ecx)
   571   //
   572   //  The 2 input registers are overwritten
   573   //
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

  // Emits the card-table store check for an array region.
  //   T0 - starting address, T1 - element count (both are overwritten).
  // Dirties (writes 0 to) every card covering [T0, T0 + T1 heap oops),
  // walking from the last card down to the first.
  void array_store_check() {
    BarrierSet* bs = Universe::heap()->barrier_set();
    assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
    CardTableModRefBS* ct = (CardTableModRefBS*)bs;
    assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
    Label l_0;

    // T1 = address of the last oop in the region
    __ dsll(AT, T1, TIMES_OOP);
    __ dadd(AT, T0, AT);
    __ daddiu(T1, AT, - BytesPerHeapOop);

    // Convert the start and end addresses into card indices.
    __ shr(T0, CardTableModRefBS::card_shift);
    __ shr(T1, CardTableModRefBS::card_shift);

    __ dsub(T1, T1, T0);   // end --> cards count
    __ bind(l_0);
    // Dirty card byte_map_base[T0 + T1], counting T1 down to 0.
    __ li48(AT, (long)ct->byte_map_base);
    __ dadd(AT, AT, T0);
    __ dadd(AT, AT, T1);
    __ sb(R0, AT, 0);
    //__ daddi(T1, T1, -4);
    __ daddi(T1, T1, - 1);
    __ bgez(T1, l_0);
    __ delayed()->nop();
  }
   604   // Arguments:
   605   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   606   //             ignored
   607   //   name    - stub name string
   608   //
   609   // Inputs:
   610   //   c_rarg0   - source array address
   611   //   c_rarg1   - destination array address
   612   //   c_rarg2   - element count, treated as ssize_t, can be zero
   613   //
   614   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
   615   // we let the hardware handle it.  The one to eight bytes within words,
   616   // dwords or qwords that span cache line boundaries will still be loaded
   617   // and stored atomically.
   618   //
   619   // Side Effects:
   620   //   disjoint_byte_copy_entry is set to the no-overlap entry point
   621   //   used by generate_conjoint_byte_copy().
   622   //
  // Generates the disjoint (forward) byte array copy stub.  See the
  // comment block above for arguments: A0 = from, A1 = to, A2 = count.
  // Copies an optional unaligned prefix byte-by-byte, then aligned
  // 4-byte words, then a 0..3 byte suffix.
  address generate_disjoint_byte_copy(bool aligned, const char *name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    Label l_0, l_1, l_2, l_3, l_4, l_5, l_6;

    // Working registers: T3 = from, T0 = to, T1 = count, T8 = saved count.
    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ move(T3, A0);
    __ move(T0, A1);
    __ move(T1, A2);
    __ move(T8, T1);             // original count in T1
    __ daddi(AT, T1, -3);
    __ blez(AT, l_4);            // fewer than 4 bytes: suffix loop only
    __ delayed()->nop();
    if (!aligned) {
      // align source address at dword address boundary
      __ move(T1, 4);
      __ sub(T1, T1, T3);
      __ andi(T1, T1, 3);        // T1 = 0..3 leading bytes to copy
      __ beq(T1, R0, l_1);
      __ delayed()->nop();
      __ sub(T8, T8, T1);        // remove prefix from the remaining count
      __ bind(l_0);
      // prefix byte-copy loop
      __ lb(AT, T3, 0);
      __ sb(AT, T0, 0);
      __ addi(T3, T3, 1);
      __ addi(T0, T0, 1);
      __ addi(T1, T1, -1);
      __ bne(T1, R0, l_0);
      __ delayed()->nop();
      __ bind(l_1);
      __ move(T1, T8);
    }
    __ shr(T1, 2);               // T1 = dword count
    __ beq(T1, R0, l_4);     // no dwords to move
    __ delayed()->nop();
    // copy aligned dwords
    __ bind(l_2);
    __ align(16);
    __ bind(l_3);
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0 );
    __ addi(T3, T3, 4);
    __ addi(T0, T0, 4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();
    __ bind(l_4);
    // T1 = 0..3 trailing bytes left over from the dword loop
    __ move(T1, T8);
    __ andi(T1, T1, 3);
    __ beq(T1, R0, l_6);
    __ delayed()->nop();
    // copy suffix
    __ bind(l_5);
    __ lb(AT, T3, 0);
    __ sb(AT, T0, 0);
    __ addi(T3, T3, 1);
    __ addi(T0, T0, 1);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_5 );
    __ delayed()->nop();
    __ bind(l_6);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
   697   // Arguments:
   698   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   699   //             ignored
   700   //   name    - stub name string
   701   //
   702   // Inputs:
   703   //   c_rarg0   - source array address
   704   //   c_rarg1   - destination array address
   705   //   c_rarg2   - element count, treated as ssize_t, can be zero
   706   //
   707   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
   708   // we let the hardware handle it.  The one to eight bytes within words,
   709   // dwords or qwords that span cache line boundaries will still be loaded
   710   // and stored atomically.
   711   //
  // Generates the conjoint (potentially overlapping) byte array copy
  // stub: A0 = from, A1 = to, A2 = count.  If the ranges do not overlap,
  // array_overlap_test branches to the faster disjoint stub; otherwise
  // copies backward (high to low): 4-byte words first, then a 0..3 byte
  // suffix at the front of the region.
  address generate_conjoint_byte_copy(bool aligned, const char *name) {
    Label l_1, l_2, l_3, l_4, l_5;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target = aligned ?
        StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
        StubRoutines::jbyte_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 0);

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    // copy from high to low
    // T3/T0 start at (end - 4) of source/dest; T1 = count, T8 = original count.
    __ move(T3, A0);
    __ move(T0, A1);
    __ move(T1, A2);
    __ dadd(AT, T3, T1);
    __ lea(T3, Address(AT, -4));
    __ dadd(AT, T0, T1);
    __ lea(T0, Address(AT, -4));
    __ move(T8, T1);
    __ daddi(AT, T1, -3);
    __ blez(AT, l_3);            // fewer than 4 bytes: suffix loop only
    __ delayed()->nop();
    __ dsrl(T1, T1, 2);          // T1 = dword count
    __ align(16);
    __ bind(l_1);
    // backward dword-copy loop
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_1);
    __ delayed()->nop();
    __ b(l_3);
    __ delayed()->nop();
    // copy dwords aligned or not with repeat move
    __ bind(l_2);
    __ bind(l_3);
    // copy suffix (0-3 bytes)
    __ andi(T8, T8, 3);
    __ beq(T8, R0, l_5);
    __ delayed()->nop();
    // reposition pointers onto the last remaining byte
    __ addi(T3, T3, 3);
    __ addi(T0, T0, 3);
    __ bind(l_4);
    __ lb(AT, T3, 0);
    __ sb(AT, T0, 0);
    __ addi(T3, T3, -1);
    __ addi(T0, T0, -1);
    __ addi(T8, T8, -1);
    __ bne(T8, R0, l_4);
    __ delayed()->nop();
    __ bind(l_5);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
   780   // Arguments:
   781   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   782   //             ignored
   783   //   name    - stub name string
   784   //
   785   // Inputs:
   786   //   c_rarg0   - source array address
   787   //   c_rarg1   - destination array address
   788   //   c_rarg2   - element count, treated as ssize_t, can be zero
   789   //
   790   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
   791   // let the hardware handle it.  The two or four words within dwords
   792   // or qwords that span cache line boundaries will still be loaded
   793   // and stored atomically.
   794   //
   795   // Side Effects:
   796   //   disjoint_short_copy_entry is set to the no-overlap entry point
   797   //   used by generate_conjoint_short_copy().
   798   //
  // Generates the disjoint (forward) jshort array copy stub:
  // A0 = from, A1 = to, A2 = element count.  Copies at most one prefix
  // short to align the source, then aligned 4-byte words, then at most
  // one suffix short.
  address generate_disjoint_short_copy(bool aligned, const char *name) {
    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();

    // Working registers: T3 = from, T0 = to, T1 = count, T8 = scratch.
    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    if (!aligned) {
      __ beq(T1, R0, l_5);     // nothing to copy
      __ delayed()->nop();
      // align source address at dword address boundary
      __ move(T8, T3); // original from
      __ andi(T8, T8, 3); // either 0 or 2
      __ beq(T8, R0, l_1); // no prefix
      __ delayed()->nop();
      // copy prefix: one jshort to reach dword alignment
      __ lh(AT, T3, 0);
      __ sh(AT, T0, 0);
      __ add(T3, T3, T8);
      __ add(T0, T0, T8);
      __ addi(T1, T1, -1);
      __ bind(l_1);
    }
    __ move(T8, T1);            // word count less prefix
    __ sra(T1, T1, 1);          // T1 = dword count
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    // copy aligned dwords
    __ bind(l_2);
    __ align(16);
    __ bind(l_3);
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0 );
    __ addi(T3, T3, 4);
    __ addi(T0, T0, 4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();
    __ bind(l_4);
    // one trailing jshort remains iff the (post-prefix) count was odd
    __ andi(T8, T8, 1);
    __ beq(T8, R0, l_5);
    __ delayed()->nop();
    // copy suffix
    __ lh(AT, T3, 0);
    __ sh(AT, T0, 0);
    __ bind(l_5);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
   861   // Arguments:
   862   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   863   //             ignored
   864   //   name    - stub name string
   865   //
   866   // Inputs:
   867   //   c_rarg0   - source array address
   868   //   c_rarg1   - destination array address
   869   //   c_rarg2   - element count, treated as ssize_t, can be zero
   870   //
   871   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
   872   // let the hardware handle it.  The two or four words within dwords
   873   // or qwords that span cache line boundaries will still be loaded
   874   // and stored atomically.
   875   //
   876   address generate_conjoint_short_copy(bool aligned, const char *name) {
   877 		Label l_1, l_2, l_3, l_4, l_5;
   878 		StubCodeMark mark(this, "StubRoutines", name);
   879 		__ align(CodeEntryAlignment);
   880 		address start = __ pc();
   881 		address nooverlap_target = aligned ?
   882 						StubRoutines::arrayof_jshort_disjoint_arraycopy() :
   883 						StubRoutines::jshort_disjoint_arraycopy();
   885 		array_overlap_test(nooverlap_target, 1);
   887 		__ push(T3);	
   888 		__ push(T0);	
   889 		__ push(T1);	
   890 		__ push(T8);	
   892 		/*
   893 			 __ pushl(esi);
   894 			 __ movl(ecx, Address(esp, 4+12));      // count
   895 			 __ pushl(edi);
   896 			 __ movl(esi, Address(esp, 8+ 4));      // from
   897 			 __ movl(edi, Address(esp, 8+ 8));      // to
   898 		 */ 
   899 		__ move(T1, A2);  
   900 		__ move(T3, A0); 
   901 		__ move(T0, A1);
   904 		// copy dwords from high to low
   905 		// __ leal(esi, Address(esi, ecx, Address::times_2, -4)); // from + count*2 - 4
   906 		__ sll(AT, T1, Address::times_2); 
   907 		__ add(AT, T3, AT); 
   908 		__ lea(T3, Address( AT, -4)); 
   909 		//__ std();
   910 		//__ leal(edi, Address(edi, ecx, Address::times_2, -4)); // to + count*2 - 4
   911 		__ sll(AT,T1 , Address::times_2); 
   912 		__ add(AT, T0, AT); 
   913 		__ lea(T0, Address( AT, -4)); 
   914 		//  __ movl(eax, ecx);
   915 		__ move(T8, T1); 
   916 		__ bind(l_1);
   917 		//   __ sarl(ecx, 1);              // dword count
   918 		__ sra(T1,T1, 1); 
   919 		//__ jcc(Assembler::equal, l_4);                   // no dwords to move
   920 		__ beq(T1, R0, l_4);  
   921 		__ delayed()->nop(); 
   922 		/*    __ cmpl(ecx, 32);
   923 					__ jcc(Assembler::above, l_3);                   // > 32 dwords
   924 		// copy dwords with loop
   925 		__ subl(edi, esi);
   926 		 */     __ align(16);
   927 		__ bind(l_2);
   928 		//__ movl(edx, Address(esi));
   929 		__ lw(AT, T3, 0);   
   930 		//__ movl(Address(edi, esi, Address::times_1), edx);
   931 		__ sw(AT, T0, 0); 
   932 		//__ subl(esi, 4);
   933 		__ addi(T3, T3, -4); 
   934 		__ addi(T0, T0, -4); 
   935 		//__ decl(ecx);
   936 		__ addi(T1, T1, -1); 
   937 		//  __ jcc(Assembler::notEqual, l_2);
   938 		__ bne(T1, R0, l_2); 
   939 		__ delayed()->nop(); 
   940 		//  __ addl(edi, esi);
   941 		// __ jmp(l_4);
   942 		__ b(l_4);
   943 		__ delayed()->nop();
   944 		// copy dwords with repeat move
   945 		__ bind(l_3);
   946 		//   __ rep_movl();
   947 		__ bind(l_4);
   948 		//  __ andl(eax, 1);              // suffix count
   949 		__ andi(T8, T8, 1);              // suffix count
   950 		//__ jcc(Assembler::equal, l_5);                   // no suffix
   951 		__ beq(T8, R0, l_5 );  
   952 		__ delayed()->nop(); 
   953 		// copy suffix
   954 		//   __ movw(edx, Address(esi, 2));
   955 		__ lh(AT, T3, 2); 
   956 		//  __ movw(Address(edi, 2), edx);
   957 		__ sh(AT, T0, 2); 
   958 		__ bind(l_5);
   959 		//    __ cld();
   960 		//    __ popl(edi);
   961 		//    __ popl(esi);
   962 		//   __ ret(0);
   963 		__ pop(T8);	
   964 		__ pop(T1);	
   965 		__ pop(T0);	
   966 		__ pop(T3);	
   967 		__ jr(RA); 
   968 		__ delayed()->nop();   
   969 		return start;
   970   }
   972   // Arguments:
   973   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   974   //             ignored
   975   //   is_oop  - true => oop array, so generate store check code
   976   //   name    - stub name string
   977   //
   978   // Inputs:
   979   //   c_rarg0   - source array address
   980   //   c_rarg1   - destination array address
   981   //   c_rarg2   - element count, treated as ssize_t, can be zero
   982   //
   983   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
   984   // the hardware handle it.  The two dwords within qwords that span
   985   // cache line boundaries will still be loaded and stored atomically.
   986   //
   987   // Side Effects:
   988   //   disjoint_int_copy_entry is set to the no-overlap entry point
   989   //   used by generate_conjoint_int_oop_copy().
   990   //
   991   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
   992 		Label l_2, l_3, l_4, l_stchk;
   993 		StubCodeMark mark(this, "StubRoutines", name);
   994 		__ align(CodeEntryAlignment);
   995 		address start = __ pc();
   996 		/*
   997 			 __ pushl(esi);
   998 			 __ movl(ecx, Address(esp, 4+12));      // count
   999 			 __ pushl(edi);
  1000 			 __ movl(esi, Address(esp, 8+ 4));      // from
  1001 			 __ movl(edi, Address(esp, 8+ 8));      // to
  1002 		 */
  1003 		__ push(T3);	
  1004 		__ push(T0);	
  1005 		__ push(T1);	
  1006 		__ push(T8);	
  1007 		__ move(T1, A2);  
  1008 		__ move(T3, A0); 
  1009 		__ move(T0, A1);
  1011 		// __ cmpl(ecx, 32);
  1012 		// __ jcc(Assembler::belowEqual, l_2);                   // <= 32 dwords
  1013 		// __ rep_movl();
  1014 		__ b(l_2); 	
  1015 		__ delayed()->nop();	
  1016 		if (is_oop) {
  1017 		//  __ jmp(l_stchk);
  1018 			__ b(l_stchk); 
  1019 			__ delayed()->nop(); 
  1021 		//    __ popl(edi);
  1022 		//   __ popl(esi);
  1023 		//  __ ret(0);
  1024 		__ pop(T8);	
  1025 		__ pop(T1);	
  1026 		__ pop(T0);	
  1027 		__ pop(T3);	
  1028 		__ jr(RA); 
  1029 		__ delayed()->nop(); 
  1031 		__ bind(l_2);
  1032 		//  __ subl(edi, esi);
  1033 		//  __ testl(ecx, ecx);
  1034 		// __ jcc(Assembler::zero, l_4);
  1035 		__ beq(T1, R0, l_4);  
  1036 		__ delayed()->nop(); 
  1037 		__ align(16);
  1038 		__ bind(l_3);
  1039 		//__ movl(edx, Address(esi));
  1040 		__ lw(AT, T3, 0);   
  1041 		// __ movl(Address(edi, esi, Address::times_1), edx);
  1042 		__ sw(AT, T0, 0); 
  1043 		// __ addl(esi, 4);
  1044 		__ addi(T3, T3, 4);
  1045 		__ addi(T0, T0, 4);
  1046 		//   __ decl(ecx);
  1047 		__ addi(T1, T1, -1); 
  1048 		//    __ jcc(Assembler::notEqual, l_3);
  1049 		__ bne(T1, R0, l_3); 
  1050 		__ delayed()->nop(); 
  1051 		if (is_oop) {
  1052 			__ bind(l_stchk);
  1053 			//      __ movl(edi, Address(esp, 8+ 8));
  1054 			//     __ movl(ecx, Address(esp, 8+ 12));
  1055 			__ move(T0, A1); 
  1056 			__ move(T1, A2); 
  1057 			array_store_check();
  1059 		__ bind(l_4);
  1060 		//    __ popl(edi);
  1061 		//   __ popl(esi);
  1062 		//  __ ret(0);
  1063 		__ pop(T8);
  1064 		__ pop(T1);
  1065 		__ pop(T0);
  1066 		__ pop(T3);
  1067 		__ jr(RA); 
  1068 		__ delayed()->nop(); 
  1069 		return start;
  1072   // Arguments:
  1073   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1074   //             ignored
  1075   //   is_oop  - true => oop array, so generate store check code
  1076   //   name    - stub name string
  1077   //
  1078   // Inputs:
  1079   //   c_rarg0   - source array address
  1080   //   c_rarg1   - destination array address
  1081   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1082   //
  1083   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1084   // the hardware handle it.  The two dwords within qwords that span
  1085   // cache line boundaries will still be loaded and stored atomically.
  1086   //
  1087   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
  1088 		Label l_2, l_3, l_4, l_stchk;
  1089 		StubCodeMark mark(this, "StubRoutines", name);
  1090 		__ align(CodeEntryAlignment);
  1091 		address start = __ pc();
  1092 		address nooverlap_target;
  1094 		if (is_oop) {
  1095 			nooverlap_target = aligned ?
  1096 							StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1097 							StubRoutines::oop_disjoint_arraycopy();
  1098 		}else {
  1099 			nooverlap_target = aligned ?
  1100 							StubRoutines::arrayof_jint_disjoint_arraycopy() :
  1101 							StubRoutines::jint_disjoint_arraycopy();
  1104 		array_overlap_test(nooverlap_target, 2);
  1106 		__ push(T3);
  1107 		__ push(T0);
  1108 		__ push(T1);
  1109 		__ push(T8);
  1111 		/*
  1112 			 __ pushl(esi);
  1113 			 __ movl(ecx, Address(esp, 4+12));      // count
  1114 			 __ pushl(edi);
  1115 			 __ movl(esi, Address(esp, 8+ 4));      // from
  1116 			 __ movl(edi, Address(esp, 8+ 8));      // to
  1117 		 */ 
  1118 		__ move(T1, A2);  
  1119 		__ move(T3, A0); 
  1120 		__ move(T0, A1);
  1122 		//__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
  1123 		__ sll(AT, T1, Address::times_4); 
  1124 		__ add(AT, T3, AT); 
  1125 		__ lea(T3 , Address(AT, -4)); 
  1126 		//__ std();
  1127 		//__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
  1128 		__ sll(AT, T1, Address::times_4); 
  1129 		__ add(AT, T0, AT); 
  1130 		__ lea(T0 , Address(AT, -4)); 
  1132 		//    __ cmpl(ecx, 32);
  1133 		//   __ jcc(Assembler::above, l_3);                   // > 32 dwords
  1134 		//  __ testl(ecx, ecx);
  1135 		//__ jcc(Assembler::zero, l_4);
  1136 		__ beq(T1, R0, l_4); 
  1137 		__ delayed()->nop();  
  1138 		// __ subl(edi, esi);
  1139 		__ align(16);
  1140 		__ bind(l_2);
  1141 		// __ movl(edx, Address(esi));
  1142 		__ lw(AT, T3, 0);   
  1143 		// __ movl(Address(esi, edi, Address::times_1), edx);
  1144 		__ sw(AT, T0, 0); 
  1145 		// __ subl(esi, 4);
  1146 		__ addi(T3, T3, -4); 
  1147 		__ addi(T0, T0, -4); 
  1148 		//   __ decl(ecx);
  1149 		__ addi(T1, T1, -1); 
  1150 		//__ jcc(Assembler::notEqual, l_2);
  1151 		__ bne(T1, R0, l_2);  
  1152 		__ delayed()->nop(); 
  1153 		if (is_oop) {
  1154 			// __ jmp(l_stchk);
  1155 			__ b( l_stchk); 
  1156 			__ delayed()->nop(); 
  1158 		__ bind(l_4);
  1159 		//      __ cld();
  1160 		//     __ popl(edi);
  1161 		//    __ popl(esi);
  1162 		//   __ ret(0);
  1163 		__ pop(T8); 
  1164 		__ pop(T1); 
  1165 		__ pop(T0); 
  1166 		__ pop(T3); 
  1167 		__ jr(RA); 
  1168 		__ delayed()->nop(); 
  1169 		__ bind(l_3);
  1170 		//   __ rep_movl();
  1171 		if (is_oop) {
  1172 			__ bind(l_stchk);
  1173 			//  __ movl(edi, Address(esp, 8+ 8));
  1174 			__ move(T0, A1);  
  1175 			// __ movl(ecx, Address(esp, 8+ 12));
  1176 			__ move(T1, A2);  
  1177 			array_store_check();
  1179 		//    __ cld();
  1180 		//   __ popl(edi);
  1181 		//   __ popl(esi);
  1182 		//  __ ret(0);
  1183 		__ pop(T8);	
  1184 		__ pop(T1);	
  1185 		__ pop(T0);	
  1186 		__ pop(T3);	
  1187 		__ jr(RA);	
  1188 		__ delayed()->nop(); 
  1189 		return start;
  1192   // Arguments:
  1193   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1194   //             ignored
  1195   //   is_oop  - true => oop array, so generate store check code
  1196   //   name    - stub name string
  1197   //
  1198   // Inputs:
  1199   //   c_rarg0   - source array address
  1200   //   c_rarg1   - destination array address
  1201   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1202   //
  1203   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1204   // the hardware handle it.  The two dwords within qwords that span
  1205   // cache line boundaries will still be loaded and stored atomically.
  1206   //
  1207   // Side Effects:
  1208   //   disjoint_long_copy_entry is set to the no-overlap entry point
  1209   //   used by generate_conjoint_long_oop_copy().
  1210   //
  1211   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1212 		Label l_2, l_3, l_4, l_stchk;
  1213 		StubCodeMark mark(this, "StubRoutines", name);
  1214 		__ align(CodeEntryAlignment);
  1215 		address start = __ pc();
  1216 		__ push(T3);	
  1217 		__ push(T0);	
  1218 		__ push(T1);	
  1219 		__ push(T8);	
  1220 		__ move(T1, A2);  
  1221 		__ move(T3, A0); 
  1222 		__ move(T0, A1);
  1224 		// __ cmpl(ecx, 32);
  1225 		// __ jcc(Assembler::belowEqual, l_2);                   // <= 32 dwords
  1226 		// __ rep_movl();
  1227 		__ b(l_2); 	
  1228 		__ delayed()->nop();	
  1229 		if (is_oop) {
  1230 		//  __ jmp(l_stchk);
  1231 			__ b(l_stchk); 
  1232 			__ delayed()->nop(); 
  1234 		//    __ popl(edi);
  1235 		//   __ popl(esi);
  1236 		//  __ ret(0);
  1237 		__ pop(T8);	
  1238 		__ pop(T1);	
  1239 		__ pop(T0);	
  1240 		__ pop(T3);	
  1241 		__ jr(RA); 
  1242 		__ delayed()->nop(); 
  1244 		__ bind(l_2);
  1245 		//  __ subl(edi, esi);
  1246 		//  __ testl(ecx, ecx);
  1247 		// __ jcc(Assembler::zero, l_4);
  1248 		__ beq(T1, R0, l_4);  
  1249 		__ delayed()->nop(); 
  1250 		__ align(16);
  1251 		__ bind(l_3);
  1252 		//__ movl(edx, Address(esi));
  1253 		__ ld(AT, T3, 0);   
  1254 		// __ movl(Address(edi, esi, Address::times_1), edx);
  1255 		__ sd(AT, T0, 0); 
  1256 		// __ addl(esi, 4);
  1257 		__ addi(T3, T3, 8);
  1258 		__ addi(T0, T0, 8);
  1259 		//   __ decl(ecx);
  1260 		__ addi(T1, T1, -1); 
  1261 		//    __ jcc(Assembler::notEqual, l_3);
  1262 		__ bne(T1, R0, l_3); 
  1263 		__ delayed()->nop(); 
  1264 		if (is_oop) {
  1265 			__ bind(l_stchk);
  1266 			//      __ movl(edi, Address(esp, 8+ 8));
  1267 			//     __ movl(ecx, Address(esp, 8+ 12));
  1268 			__ move(T0, A1); 
  1269 			__ move(T1, A2); 
  1270 			array_store_check();
  1272 		__ bind(l_4);
  1273 		//    __ popl(edi);
  1274 		//   __ popl(esi);
  1275 		//  __ ret(0);
  1276 		__ pop(T8);
  1277 		__ pop(T1);
  1278 		__ pop(T0);
  1279 		__ pop(T3);
  1280 		__ jr(RA); 
  1281 		__ delayed()->nop(); 
  1282 		return start;
  1285   // Arguments:
  1286   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1287   //             ignored
  1288   //   is_oop  - true => oop array, so generate store check code
  1289   //   name    - stub name string
  1290   //
  1291   // Inputs:
  1292   //   c_rarg0   - source array address
  1293   //   c_rarg1   - destination array address
  1294   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1295   //
  1296   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1297   // the hardware handle it.  The two dwords within qwords that span
  1298   // cache line boundaries will still be loaded and stored atomically.
  1299   //
  1300   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1301 		Label l_2, l_3, l_4, l_stchk;
  1302 		StubCodeMark mark(this, "StubRoutines", name);
  1303 		__ align(CodeEntryAlignment);
  1304 		address start = __ pc();
  1305 		address nooverlap_target;
  1307 		if (is_oop) {
  1308 			nooverlap_target = aligned ?
  1309 							StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1310 							StubRoutines::oop_disjoint_arraycopy();
  1311 		}else {
  1312 			nooverlap_target = aligned ?
  1313 							StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1314 							StubRoutines::jlong_disjoint_arraycopy();
  1317 		array_overlap_test(nooverlap_target, 3);
  1319 		__ push(T3);
  1320 		__ push(T0);
  1321 		__ push(T1);
  1322 		__ push(T8);
  1324 		__ move(T1, A2);  
  1325 		__ move(T3, A0); 
  1326 		__ move(T0, A1);
  1328 		//__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
  1329 		__ sll(AT, T1, Address::times_8); 
  1330 		__ add(AT, T3, AT); 
  1331 		__ lea(T3 , Address(AT, -8)); 
  1332 		//__ std();
  1333 		//__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
  1334 		__ sll(AT, T1, Address::times_8); 
  1335 		__ add(AT, T0, AT); 
  1336 		__ lea(T0 , Address(AT, -8)); 
  1338 		//    __ cmpl(ecx, 32);
  1339 		//   __ jcc(Assembler::above, l_3);                   // > 32 dwords
  1340 		//  __ testl(ecx, ecx);
  1341 		//__ jcc(Assembler::zero, l_4);
  1342 		__ beq(T1, R0, l_4); 
  1343 		__ delayed()->nop();  
  1344 		// __ subl(edi, esi);
  1345 		__ align(16);
  1346 		__ bind(l_2);
  1347 		// __ movl(edx, Address(esi));
  1348 		__ ld(AT, T3, 0);   
  1349 		// __ movl(Address(esi, edi, Address::times_1), edx);
  1350 		__ sd(AT, T0, 0); 
  1351 		// __ subl(esi, 4);
  1352 		__ addi(T3, T3, -8); 
  1353 		__ addi(T0, T0, -8); 
  1354 		//   __ decl(ecx);
  1355 		__ addi(T1, T1, -1); 
  1356 		//__ jcc(Assembler::notEqual, l_2);
  1357 		__ bne(T1, R0, l_2);  
  1358 		__ delayed()->nop(); 
  1359 		if (is_oop) {
  1360 			// __ jmp(l_stchk);
  1361 			__ b( l_stchk); 
  1362 			__ delayed()->nop(); 
  1364 		__ bind(l_4);
  1365 		//      __ cld();
  1366 		//     __ popl(edi);
  1367 		//    __ popl(esi);
  1368 		//   __ ret(0);
  1369 		__ pop(T8); 
  1370 		__ pop(T1); 
  1371 		__ pop(T0); 
  1372 		__ pop(T3); 
  1373 		__ jr(RA); 
  1374 		__ delayed()->nop(); 
  1375 		__ bind(l_3);
  1376 		//   __ rep_movl();
  1377 		if (is_oop) {
  1378 			__ bind(l_stchk);
  1379 			//  __ movl(edi, Address(esp, 8+ 8));
  1380 			__ move(T0, A1);  
  1381 			// __ movl(ecx, Address(esp, 8+ 12));
  1382 			__ move(T1, A2);  
  1383 			array_store_check();
  1385 		//    __ cld();
  1386 		//   __ popl(edi);
  1387 		//   __ popl(esi);
  1388 		//  __ ret(0);
  1389 		__ pop(T8);	
  1390 		__ pop(T1);	
  1391 		__ pop(T0);	
  1392 		__ pop(T3);	
  1393 		__ jr(RA);	
  1394 		__ delayed()->nop(); 
  1395 		return start;
  1397 #if 0
  1398   // Arguments:
  1399   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  1400   //             ignored
  1401   //   is_oop  - true => oop array, so generate store check code
  1402   //   name    - stub name string
  1403   //
  1404   // Inputs:
  1405   //   c_rarg0   - source array address
  1406   //   c_rarg1   - destination array address
  1407   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1408   //
  1409   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1410     __ align(CodeEntryAlignment);
  1411     StubCodeMark mark(this, "StubRoutines", name);
  1412     address start = __ pc();
  1414     Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
  1415     const Register from        = rdi;  // source array address
  1416     const Register to          = rsi;  // destination array address
  1417     const Register qword_count = rdx;  // elements count
  1418     const Register saved_count = rcx;
  1420     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1421     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
  1423     address disjoint_copy_entry = NULL;
  1424     if (is_oop) {
  1425       assert(!UseCompressedOops, "shouldn't be called for compressed oops");
  1426       disjoint_copy_entry = disjoint_oop_copy_entry;
  1427       oop_copy_entry  = __ pc();
  1428       array_overlap_test(disjoint_oop_copy_entry, Address::times_8);
  1429     } else {
  1430       disjoint_copy_entry = disjoint_long_copy_entry;
  1431       long_copy_entry = __ pc();
  1432       array_overlap_test(disjoint_long_copy_entry, Address::times_8);
  1434     BLOCK_COMMENT("Entry:");
  1435     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1437     array_overlap_test(disjoint_copy_entry, Address::times_8);
  1438     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
  1439                       // r9 and r10 may be used to save non-volatile registers
  1441     // 'from', 'to' and 'qword_count' are now valid
  1443     if (is_oop) {
  1444       // Save to and count for store barrier
  1445       __ movptr(saved_count, qword_count);
  1446       // No registers are destroyed by this call
  1447       gen_write_ref_array_pre_barrier(to, saved_count);
  1450     __ jmp(L_copy_32_bytes);
  1452     // Copy trailing qwords
  1453   __ BIND(L_copy_8_bytes);
  1454     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
  1455     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
  1456     __ decrement(qword_count);
  1457     __ jcc(Assembler::notZero, L_copy_8_bytes);
  1459     if (is_oop) {
  1460       __ jmp(L_exit);
  1461     } else {
  1462       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
  1463       restore_arg_regs();
  1464       __ xorptr(rax, rax); // return 0
  1465       __ leave(); // required for proper stackwalking of RuntimeStub frame
  1466       __ ret(0);
  1469     // Copy in 32-bytes chunks
  1470     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
  1472     if (is_oop) {
  1473     __ BIND(L_exit);
  1474       __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
  1475       gen_write_ref_array_post_barrier(to, rcx, rax);
  1476       inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
  1477     } else {
  1478       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
  1480     restore_arg_regs();
  1481     __ xorptr(rax, rax); // return 0
  1482     __ leave(); // required for proper stackwalking of RuntimeStub frame
  1483     __ ret(0);
  1485     return start;
  1489   // Helper for generating a dynamic type check.
  1490   // Smashes no registers.
  1491   void generate_type_check(Register sub_klass,
  1492                            Register super_check_offset,
  1493                            Register super_klass,
  1494                            Label& L_success) {
  1495     assert_different_registers(sub_klass, super_check_offset, super_klass);
  1497     BLOCK_COMMENT("type_check:");
  1499     Label L_miss;
  1501     // a couple of useful fields in sub_klass:
  1502     int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
  1503                      Klass::secondary_supers_offset_in_bytes());
  1504     int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
  1505                      Klass::secondary_super_cache_offset_in_bytes());
  1506     Address secondary_supers_addr(sub_klass, ss_offset);
  1507     Address super_cache_addr(     sub_klass, sc_offset);
  1509     // if the pointers are equal, we are done (e.g., String[] elements)
  1510     __ cmpptr(super_klass, sub_klass);
  1511     __ jcc(Assembler::equal, L_success);
  1513     // check the supertype display:
  1514     Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  1515     __ cmpptr(super_klass, super_check_addr); // test the super type
  1516     __ jcc(Assembler::equal, L_success);
  1518     // if it was a primary super, we can just fail immediately
  1519     __ cmpl(super_check_offset, sc_offset);
  1520     __ jcc(Assembler::notEqual, L_miss);
  1522     // Now do a linear scan of the secondary super-klass chain.
  1523     // The repne_scan instruction uses fixed registers, which we must spill.
  1524     // (We need a couple more temps in any case.)
  1525     // This code is rarely used, so simplicity is a virtue here.
  1526     inc_counter_np(SharedRuntime::_partial_subtype_ctr);
  1528       __ push(rax);
  1529       __ push(rcx);
  1530       __ push(rdi);
  1531       assert_different_registers(sub_klass, super_klass, rax, rcx, rdi);
  1533       __ movptr(rdi, secondary_supers_addr);
  1534       // Load the array length.
  1535       __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
  1536       // Skip to start of data.
  1537       __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
  1538       // Scan rcx words at [rdi] for occurance of rax
  1539       // Set NZ/Z based on last compare
  1540       __ movptr(rax, super_klass);
  1541       if (UseCompressedOops) {
  1542         // Compare against compressed form.  Don't need to uncompress because
  1543         // looks like orig rax is restored in popq below.
  1544         __ encode_heap_oop(rax);
  1545         __ repne_scanl();
  1546       } else {
  1547         __ repne_scan();
  1550       // Unspill the temp. registers:
  1551       __ pop(rdi);
  1552       __ pop(rcx);
  1553       __ pop(rax);
  1555       __ jcc(Assembler::notEqual, L_miss);
  1558     // Success.  Cache the super we found and proceed in triumph.
  1559     __ movptr(super_cache_addr, super_klass); // note: rax is dead
  1560     __ jmp(L_success);
  1562     // Fall through on failure!
  1563     __ BIND(L_miss);
  1566   //
  1567   //  Generate checkcasting array copy stub
  1568   //
  1569   //  Input:
  1570   //    c_rarg0   - source array address
  1571   //    c_rarg1   - destination array address
  1572   //    c_rarg2   - element count, treated as ssize_t, can be zero
  1573   //    c_rarg3   - size_t ckoff (super_check_offset)
  1574   // not Win64
  1575   //    c_rarg4   - oop ckval (super_klass)
  1576   // Win64
  1577   //    rsp+40    - oop ckval (super_klass)
  1578   //
  1579   //  Output:
  1580   //    rax ==  0  -  success
  1581   //    rax == -1^K - failure, where K is partial transfer count
  1582   //
  1583   address generate_checkcast_copy(const char *name) {
  1585     Label L_load_element, L_store_element, L_do_card_marks, L_done;
  1587     // Input registers (after setup_arg_regs)
  1588     const Register from        = rdi;   // source array address
  1589     const Register to          = rsi;   // destination array address
  1590     const Register length      = rdx;   // elements count
  1591     const Register ckoff       = rcx;   // super_check_offset
  1592     const Register ckval       = r8;    // super_klass
  1594     // Registers used as temps (r13, r14 are save-on-entry)
  1595     const Register end_from    = from;  // source array end address
  1596     const Register end_to      = r13;   // destination array end address
  1597     const Register count       = rdx;   // -(count_remaining)
  1598     const Register r14_length  = r14;   // saved copy of length
  1599     // End pointers are inclusive, and if length is not zero they point
  1600     // to the last unit copied:  end_to[0] := end_from[0]
  1602     const Register rax_oop    = rax;    // actual oop copied
  1603     const Register r11_klass  = r11;    // oop._klass
  1605     //---------------------------------------------------------------
  1606     // Assembler stub will be used for this call to arraycopy
  1607     // if the two arrays are subtypes of Object[] but the
  1608     // destination array type is not equal to or a supertype
  1609     // of the source type.  Each element must be separately
  1610     // checked.
  1612     __ align(CodeEntryAlignment);
  1613     StubCodeMark mark(this, "StubRoutines", name);
  1614     address start = __ pc();
  1616     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1618     checkcast_copy_entry  = __ pc();
  1619     BLOCK_COMMENT("Entry:");
  1621 #ifdef ASSERT
  1622     // caller guarantees that the arrays really are different
  1623     // otherwise, we would have to make conjoint checks
  1624     { Label L;
  1625       array_overlap_test(L, TIMES_OOP);
  1626       __ stop("checkcast_copy within a single array");
  1627       __ bind(L);
  1629 #endif //ASSERT
  1631     // allocate spill slots for r13, r14
  1632     enum {
  1633       saved_r13_offset,
  1634       saved_r14_offset,
  1635       saved_rbp_offset,
  1636       saved_rip_offset,
  1637       saved_rarg0_offset
  1638     };
  1639     __ subptr(rsp, saved_rbp_offset * wordSize);
  1640     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
  1641     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
  1642     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
  1643                        // ckoff => rcx, ckval => r8
  1644                        // r9 and r10 may be used to save non-volatile registers
  1645 #ifdef _WIN64
  1646     // last argument (#4) is on stack on Win64
  1647     const int ckval_offset = saved_rarg0_offset + 4;
  1648     __ movptr(ckval, Address(rsp, ckval_offset * wordSize));
  1649 #endif
  1651     // check that int operands are properly extended to size_t
  1652     assert_clean_int(length, rax);
  1653     assert_clean_int(ckoff, rax);
  1655 #ifdef ASSERT
  1656     BLOCK_COMMENT("assert consistent ckoff/ckval");
  1657     // The ckoff and ckval must be mutually consistent,
  1658     // even though caller generates both.
  1659     { Label L;
  1660       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
  1661                         Klass::super_check_offset_offset_in_bytes());
  1662       __ cmpl(ckoff, Address(ckval, sco_offset));
  1663       __ jcc(Assembler::equal, L);
  1664       __ stop("super_check_offset inconsistent");
  1665       __ bind(L);
  1667 #endif //ASSERT
  1669     // Loop-invariant addresses.  They are exclusive end pointers.
  1670     Address end_from_addr(from, length, TIMES_OOP, 0);
  1671     Address   end_to_addr(to,   length, TIMES_OOP, 0);
  1672     // Loop-variant addresses.  They assume post-incremented count < 0.
  1673     Address from_element_addr(end_from, count, TIMES_OOP, 0);
  1674     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
  1676     gen_write_ref_array_pre_barrier(to, count);
  1678     // Copy from low to high addresses, indexed from the end of each array.
  1679     __ lea(end_from, end_from_addr);
  1680     __ lea(end_to,   end_to_addr);
  1681     __ movptr(r14_length, length);        // save a copy of the length
  1682     assert(length == count, "");          // else fix next line:
  1683     __ negptr(count);                     // negate and test the length
  1684     __ jcc(Assembler::notZero, L_load_element);
  1686     // Empty array:  Nothing to do.
  1687     __ xorptr(rax, rax);                  // return 0 on (trivial) success
  1688     __ jmp(L_done);
  1690     // ======== begin loop ========
  1691     // (Loop is rotated; its entry is L_load_element.)
  1692     // Loop control:
  1693     //   for (count = -count; count != 0; count++)
  1694     // Base pointers src, dst are biased by 8*(count-1),to last element.
  1695     __ align(16);
  1697     __ BIND(L_store_element);
  1698     __ store_heap_oop(rax_oop, to_element_addr);  // store the oop
  1699     __ increment(count);               // increment the count toward zero
  1700     __ jcc(Assembler::zero, L_do_card_marks);
  1702     // ======== loop entry is here ========
  1703     __ BIND(L_load_element);
  1704     __ load_heap_oop(rax_oop, from_element_addr); // load the oop
  1705     __ testptr(rax_oop, rax_oop);
  1706     __ jcc(Assembler::zero, L_store_element);
  1708     __ load_klass(r11_klass, rax_oop);// query the object klass
  1709     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
  1710     // ======== end loop ========
  1712     // It was a real error; we must depend on the caller to finish the job.
  1713     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
  1714     // Emit GC store barriers for the oops we have copied (r14 + rdx),
  1715     // and report their number to the caller.
  1716     assert_different_registers(rax, r14_length, count, to, end_to, rcx);
  1717     __ lea(end_to, to_element_addr);
  1718     gen_write_ref_array_post_barrier(to, end_to, rscratch1);
  1719     __ movptr(rax, r14_length);           // original oops
  1720     __ addptr(rax, count);                // K = (original - remaining) oops
  1721     __ notptr(rax);                       // report (-1^K) to caller
  1722     __ jmp(L_done);
  1724     // Come here on success only.
  1725     __ BIND(L_do_card_marks);
  1726     __ addptr(end_to, -wordSize);         // make an inclusive end pointer
  1727     gen_write_ref_array_post_barrier(to, end_to, rscratch1);
  1728     __ xorptr(rax, rax);                  // return 0 on success
  1730     // Common exit point (success or failure).
  1731     __ BIND(L_done);
  1732     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
  1733     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
  1734     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
  1735     restore_arg_regs();
  1736     __ leave(); // required for proper stackwalking of RuntimeStub frame
  1737     __ ret(0);
  1739     return start;
  1742   //
  1743   //  Generate 'unsafe' array copy stub
  1744   //  Though just as safe as the other stubs, it takes an unscaled
  1745   //  size_t argument instead of an element count.
  1746   //
  1747   //  Input:
  1748   //    c_rarg0   - source array address
  1749   //    c_rarg1   - destination array address
  1750   //    c_rarg2   - byte count, treated as ssize_t, can be zero
  1751   //
  1752   // Examines the alignment of the operands and dispatches
  1753   // to a long, int, short, or byte copy loop.
  1754   //
  1755   address generate_unsafe_copy(const char *name) {
  1757     Label L_long_aligned, L_int_aligned, L_short_aligned;
  1759     // Input registers (before setup_arg_regs)
  1760     const Register from        = c_rarg0;  // source array address
  1761     const Register to          = c_rarg1;  // destination array address
  1762     const Register size        = c_rarg2;  // byte count (size_t)
  1764     // Register used as a temp
  1765     const Register bits        = rax;      // test copy of low bits
  1767     __ align(CodeEntryAlignment);
  1768     StubCodeMark mark(this, "StubRoutines", name);
  1769     address start = __ pc();
  1771     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1773     // bump this on entry, not on exit:
  1774     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
  1776     __ mov(bits, from);
  1777     __ orptr(bits, to);
  1778     __ orptr(bits, size);
  1780     __ testb(bits, BytesPerLong-1);
  1781     __ jccb(Assembler::zero, L_long_aligned);
  1783     __ testb(bits, BytesPerInt-1);
  1784     __ jccb(Assembler::zero, L_int_aligned);
  1786     __ testb(bits, BytesPerShort-1);
  1787     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
  1789     __ BIND(L_short_aligned);
  1790     __ shrptr(size, LogBytesPerShort); // size => short_count
  1791     __ jump(RuntimeAddress(short_copy_entry));
  1793     __ BIND(L_int_aligned);
  1794     __ shrptr(size, LogBytesPerInt); // size => int_count
  1795     __ jump(RuntimeAddress(int_copy_entry));
  1797     __ BIND(L_long_aligned);
  1798     __ shrptr(size, LogBytesPerLong); // size => qword_count
  1799     __ jump(RuntimeAddress(long_copy_entry));
  1801     return start;
  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  // NOTE(review): x86-64 code (movl/jcc/movslq) inherited from the x86
  // port; appears to be in a preprocessor-disabled region for MIPS.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");
    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ movl(temp, length);
    __ addl(temp, src_pos);             // src_pos + length
    __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed); // unsigned compare
    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ movl(temp, length);
    __ addl(temp, dst_pos);             // dst_pos + length
    __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed); // unsigned compare
    // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
    // Move with sign extension can be used since they are positive.
    __ movslq(src_pos, src_pos);
    __ movslq(dst_pos, dst_pos);
    BLOCK_COMMENT("arraycopy_range_checks done");
  1836   //
  1837   //  Generate generic array copy stubs
  1838   //
  1839   //  Input:
  1840   //    c_rarg0    -  src oop
  1841   //    c_rarg1    -  src_pos (32-bits)
  1842   //    c_rarg2    -  dst oop
  1843   //    c_rarg3    -  dst_pos (32-bits)
  1844   // not Win64
  1845   //    c_rarg4    -  element count (32-bits)
  1846   // Win64
  1847   //    rsp+40     -  element count (32-bits)
  1848   //
  1849   //  Output:
  1850   //    rax ==  0  -  success
  1851   //    rax == -1^K - failure, where K is partial transfer count
  1852   //
  1853   address generate_generic_copy(const char *name) {
  1855     Label L_failed, L_failed_0, L_objArray;
  1856     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
  1858     // Input registers
  1859     const Register src        = c_rarg0;  // source array oop
  1860     const Register src_pos    = c_rarg1;  // source position
  1861     const Register dst        = c_rarg2;  // destination array oop
  1862     const Register dst_pos    = c_rarg3;  // destination position
  1863     // elements count is on stack on Win64
  1864 #ifdef _WIN64
  1865 #define C_RARG4 Address(rsp, 6 * wordSize)
  1866 #else
  1867 #define C_RARG4 c_rarg4
  1868 #endif
  1870     { int modulus = CodeEntryAlignment;
  1871       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
  1872       int advance = target - (__ offset() % modulus);
  1873       if (advance < 0)  advance += modulus;
  1874       if (advance > 0)  __ nop(advance);
  1876     StubCodeMark mark(this, "StubRoutines", name);
  1878     // Short-hop target to L_failed.  Makes for denser prologue code.
  1879     __ BIND(L_failed_0);
  1880     __ jmp(L_failed);
  1881     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
  1883     __ align(CodeEntryAlignment);
  1884     address start = __ pc();
  1886     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1888     // bump this on entry, not on exit:
  1889     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
  1891     //-----------------------------------------------------------------------
  1892     // Assembler stub will be used for this call to arraycopy
  1893     // if the following conditions are met:
  1894     //
  1895     // (1) src and dst must not be null.
  1896     // (2) src_pos must not be negative.
  1897     // (3) dst_pos must not be negative.
  1898     // (4) length  must not be negative.
  1899     // (5) src klass and dst klass should be the same and not NULL.
  1900     // (6) src and dst should be arrays.
  1901     // (7) src_pos + length must not exceed length of src.
  1902     // (8) dst_pos + length must not exceed length of dst.
  1903     //
  1905     //  if (src == NULL) return -1;
  1906     __ testptr(src, src);         // src oop
  1907     size_t j1off = __ offset();
  1908     __ jccb(Assembler::zero, L_failed_0);
  1910     //  if (src_pos < 0) return -1;
  1911     __ testl(src_pos, src_pos); // src_pos (32-bits)
  1912     __ jccb(Assembler::negative, L_failed_0);
  1914     //  if (dst == NULL) return -1;
  1915     __ testptr(dst, dst);         // dst oop
  1916     __ jccb(Assembler::zero, L_failed_0);
  1918     //  if (dst_pos < 0) return -1;
  1919     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  1920     size_t j4off = __ offset();
  1921     __ jccb(Assembler::negative, L_failed_0);
  1923     // The first four tests are very dense code,
  1924     // but not quite dense enough to put four
  1925     // jumps in a 16-byte instruction fetch buffer.
  1926     // That's good, because some branch predicters
  1927     // do not like jumps so close together.
  1928     // Make sure of this.
  1929     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
  1931     // registers used as temp
  1932     const Register r11_length    = r11; // elements count to copy
  1933     const Register r10_src_klass = r10; // array klass
  1934     const Register r9_dst_klass  = r9;  // dest array klass
  1936     //  if (length < 0) return -1;
  1937     __ movl(r11_length, C_RARG4);       // length (elements count, 32-bits value)
  1938     __ testl(r11_length, r11_length);
  1939     __ jccb(Assembler::negative, L_failed_0);
  1941     __ load_klass(r10_src_klass, src);
  1942 #ifdef ASSERT
  1943     //  assert(src->klass() != NULL);
  1944     BLOCK_COMMENT("assert klasses not null");
  1945     { Label L1, L2;
  1946       __ testptr(r10_src_klass, r10_src_klass);
  1947       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
  1948       __ bind(L1);
  1949       __ stop("broken null klass");
  1950       __ bind(L2);
  1951       __ load_klass(r9_dst_klass, dst);
  1952       __ cmpq(r9_dst_klass, 0);
  1953       __ jcc(Assembler::equal, L1);     // this would be broken also
  1954       BLOCK_COMMENT("assert done");
  1956 #endif
  1958     // Load layout helper (32-bits)
  1959     //
  1960     //  |array_tag|     | header_size | element_type |     |log2_element_size|
  1961     // 32        30    24            16              8     2                 0
  1962     //
  1963     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  1964     //
  1966     int lh_offset = klassOopDesc::header_size() * HeapWordSize +
  1967                     Klass::layout_helper_offset_in_bytes();
  1969     const Register rax_lh = rax;  // layout helper
  1971     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
  1973     // Handle objArrays completely differently...
  1974     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  1975     __ cmpl(rax_lh, objArray_lh);
  1976     __ jcc(Assembler::equal, L_objArray);
  1978     //  if (src->klass() != dst->klass()) return -1;
  1979     __ load_klass(r9_dst_klass, dst);
  1980     __ cmpq(r10_src_klass, r9_dst_klass);
  1981     __ jcc(Assembler::notEqual, L_failed);
  1983     //  if (!src->is_Array()) return -1;
  1984     __ cmpl(rax_lh, Klass::_lh_neutral_value);
  1985     __ jcc(Assembler::greaterEqual, L_failed);
  1987     // At this point, it is known to be a typeArray (array_tag 0x3).
  1988 #ifdef ASSERT
  1989     { Label L;
  1990       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
  1991       __ jcc(Assembler::greaterEqual, L);
  1992       __ stop("must be a primitive array");
  1993       __ bind(L);
  1995 #endif
  1997     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  1998                            r10, L_failed);
  2000     // typeArrayKlass
  2001     //
  2002     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  2003     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  2004     //
  2006     const Register r10_offset = r10;    // array offset
  2007     const Register rax_elsize = rax_lh; // element size
  2009     __ movl(r10_offset, rax_lh);
  2010     __ shrl(r10_offset, Klass::_lh_header_size_shift);
  2011     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
  2012     __ addptr(src, r10_offset);           // src array offset
  2013     __ addptr(dst, r10_offset);           // dst array offset
  2014     BLOCK_COMMENT("choose copy loop based on element size");
  2015     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
  2017     // next registers should be set before the jump to corresponding stub
  2018     const Register from     = c_rarg0;  // source array address
  2019     const Register to       = c_rarg1;  // destination array address
  2020     const Register count    = c_rarg2;  // elements count
  2022     // 'from', 'to', 'count' registers should be set in such order
  2023     // since they are the same as 'src', 'src_pos', 'dst'.
  2025   __ BIND(L_copy_bytes);
  2026     __ cmpl(rax_elsize, 0);
  2027     __ jccb(Assembler::notEqual, L_copy_shorts);
  2028     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
  2029     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
  2030     __ movl2ptr(count, r11_length); // length
  2031     __ jump(RuntimeAddress(byte_copy_entry));
  2033   __ BIND(L_copy_shorts);
  2034     __ cmpl(rax_elsize, LogBytesPerShort);
  2035     __ jccb(Assembler::notEqual, L_copy_ints);
  2036     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
  2037     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
  2038     __ movl2ptr(count, r11_length); // length
  2039     __ jump(RuntimeAddress(short_copy_entry));
  2041   __ BIND(L_copy_ints);
  2042     __ cmpl(rax_elsize, LogBytesPerInt);
  2043     __ jccb(Assembler::notEqual, L_copy_longs);
  2044     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
  2045     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
  2046     __ movl2ptr(count, r11_length); // length
  2047     __ jump(RuntimeAddress(int_copy_entry));
  2049   __ BIND(L_copy_longs);
  2050 #ifdef ASSERT
  2051     { Label L;
  2052       __ cmpl(rax_elsize, LogBytesPerLong);
  2053       __ jcc(Assembler::equal, L);
  2054       __ stop("must be long copy, but elsize is wrong");
  2055       __ bind(L);
  2057 #endif
  2058     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
  2059     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
  2060     __ movl2ptr(count, r11_length); // length
  2061     __ jump(RuntimeAddress(long_copy_entry));
  2063     // objArrayKlass
  2064   __ BIND(L_objArray);
  2065     // live at this point:  r10_src_klass, src[_pos], dst[_pos]
  2067     Label L_plain_copy, L_checkcast_copy;
  2068     //  test array classes for subtyping
  2069     __ load_klass(r9_dst_klass, dst);
  2070     __ cmpq(r10_src_klass, r9_dst_klass); // usual case is exact equality
  2071     __ jcc(Assembler::notEqual, L_checkcast_copy);
  2073     // Identically typed arrays can be copied without element-wise checks.
  2074     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2075                            r10, L_failed);
  2077     __ lea(from, Address(src, src_pos, TIMES_OOP,
  2078                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  2079     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
  2080                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  2081     __ movl2ptr(count, r11_length); // length
  2082   __ BIND(L_plain_copy);
  2083     __ jump(RuntimeAddress(oop_copy_entry));
  2085   __ BIND(L_checkcast_copy);
  2086     // live at this point:  r10_src_klass, !r11_length
  2088       // assert(r11_length == C_RARG4); // will reload from here
  2089       Register r11_dst_klass = r11;
  2090       __ load_klass(r11_dst_klass, dst);
  2092       // Before looking at dst.length, make sure dst is also an objArray.
  2093       __ cmpl(Address(r11_dst_klass, lh_offset), objArray_lh);
  2094       __ jcc(Assembler::notEqual, L_failed);
  2096       // It is safe to examine both src.length and dst.length.
  2097 #ifndef _WIN64
  2098       arraycopy_range_checks(src, src_pos, dst, dst_pos, C_RARG4,
  2099                              rax, L_failed);
  2100 #else
  2101       __ movl(r11_length, C_RARG4);     // reload
  2102       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
  2103                              rax, L_failed);
  2104       __ load_klass(r11_dst_klass, dst); // reload
  2105 #endif
  2107       // Marshal the base address arguments now, freeing registers.
  2108       __ lea(from, Address(src, src_pos, TIMES_OOP,
  2109                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
  2110       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
  2111                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
  2112       __ movl(count, C_RARG4);          // length (reloaded)
  2113       Register sco_temp = c_rarg3;      // this register is free now
  2114       assert_different_registers(from, to, count, sco_temp,
  2115                                  r11_dst_klass, r10_src_klass);
  2116       assert_clean_int(count, sco_temp);
  2118       // Generate the type check.
  2119       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
  2120                         Klass::super_check_offset_offset_in_bytes());
  2121       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
  2122       assert_clean_int(sco_temp, rax);
  2123       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
  2125       // Fetch destination element klass from the objArrayKlass header.
  2126       int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
  2127                        objArrayKlass::element_klass_offset_in_bytes());
  2128       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
  2129       __ movl(sco_temp,      Address(r11_dst_klass, sco_offset));
  2130       assert_clean_int(sco_temp, rax);
  2132       // the checkcast_copy loop needs two extra arguments:
  2133       assert(c_rarg3 == sco_temp, "#3 already in place");
  2134       __ movptr(C_RARG4, r11_dst_klass);  // dst.klass.element_klass
  2135       __ jump(RuntimeAddress(checkcast_copy_entry));
  2138   __ BIND(L_failed);
  2139     __ xorptr(rax, rax);
  2140     __ notptr(rax); // return -1
  2141     __ leave();   // required for proper stackwalking of RuntimeStub frame
  2142     __ ret(0);
  2144     return start;
  2147 #undef length_arg
  2148 #endif
//FIXME
  // Disjoint (forward) copy of jlong elements.
  //   A0 - source address
  //   A1 - destination address
  //   A2 - element count
  // Copies one 8-byte word per iteration, front to back.  The commented-out
  // x86 lines document the code this MIPS version was transcribed from.
  // NOTE(review): the 'aligned' flag is ignored -- the aligned and
  // unaligned variants generate identical code; confirm this is intended.
  address generate_disjoint_long_copy(bool aligned, const char *name) {
	  Label l_1, l_2;
	  StubCodeMark mark(this, "StubRoutines", name);
	  __ align(CodeEntryAlignment);
	  address start = __ pc();
	  //      __ movl(ecx, Address(esp, 4+8));       // count
	  //     __ movl(eax, Address(esp, 4+0));       // from
	  //    __ movl(edx, Address(esp, 4+4));       // to
	  __ move(T1, A2);  // T1 = count
	  __ move(T3, A0);  // T3 = from
	  __ move(T0, A1);  // T0 = to
	  __ push(T3); 
	  __ push(T0);
	  __ push(T1);
	  //__ subl(edx, eax);
	  //__ jmp(l_2);
	  // Enter at the decrement-and-test so a zero count copies nothing.
	  __ b(l_2);  
	  __ delayed()->nop();   
	  __ align(16);
	  __ bind(l_1);
	  //   if (VM_Version::supports_mmx()) {
	  //     __ movq(mmx0, Address(eax));
	  //     __ movq(Address(eax, edx, Address::times_1), mmx0);
	  //   } else {
	  //   __ fild_d(Address(eax));
	  __ ld(AT, T3, 0);   
	  // __ fistp_d(Address(eax, edx, Address::times_1));
	  __ sd (AT, T0, 0); 
	  //   }
	  //   __ addl(eax, 8);
	  __ addi(T3, T3, 8); 
	  __ addi(T0, T0, 8); 
	  __ bind(l_2);
	  //    __ decl(ecx);
	  __ addi(T1, T1, -1); 
	  //    __ jcc(Assembler::greaterEqual, l_1);
	  __ bgez(T1, l_1);    
	  __ delayed()->nop(); 
	  //  if (VM_Version::supports_mmx()) {
	  //    __ emms();
	  //  }
	  //  __ ret(0);
	  __ pop(T1); 
	  __ pop(T0); 
	  __ pop(T3); 
	  __ jr(RA); 
	  __ delayed()->nop(); 
	  return start;
  // Conjoint (potentially overlapping) copy of jlong elements.
  //   A0 - source address
  //   A1 - destination address
  //   A2 - element count
  // array_overlap_test dispatches to the disjoint stub when the ranges do
  // not actually overlap (shift 3 = log2 of the 8-byte element size);
  // otherwise the loop below copies back to front so an overlapping
  // destination above the source is handled safely.
  address generate_conjoint_long_copy(bool aligned, const char *name) {
	  Label l_1, l_2;
	  StubCodeMark mark(this, "StubRoutines", name);
	  __ align(CodeEntryAlignment);
	  address start = __ pc();
	  address nooverlap_target = aligned ?
		  StubRoutines::arrayof_jlong_disjoint_arraycopy() :
		  StubRoutines::jlong_disjoint_arraycopy();
	  array_overlap_test(nooverlap_target, 3);
	  __ push(T3); 
	  __ push(T0); 
	  __ push(T1); 
		/*      __ movl(ecx, Address(esp, 4+8));       // count
						__ movl(eax, Address(esp, 4+0));       // from
						__ movl(edx, Address(esp, 4+4));       // to
						__ jmp(l_2);
		 */
	  __ move(T1, A2);  // T1 = count
	  __ move(T3, A0);  // T3 = from
	  __ move(T0, A1);  // T0 = to
	  // Point T3/T0 at the last element of each range: base + count*8 - 8.
	  __ sll(AT, T1, Address::times_8); 
	  __ add(AT, T3, AT); 
	  __ lea(T3 , Address(AT, -8)); 
	  __ sll(AT, T1, Address::times_8); 
	  __ add(AT, T0, AT); 
	  __ lea(T0 , Address(AT, -8)); 
	  // Enter at the decrement-and-test so a zero count copies nothing.
	  __ b(l_2); 
	  __ delayed()->nop(); 
	  __ align(16);
		__ bind(l_1);
		/*      if (VM_Version::supports_mmx()) {
						__ movq(mmx0, Address(eax, ecx, Address::times_8));
						__ movq(Address(edx, ecx,Address::times_8), mmx0);
						} else {
						__ fild_d(Address(eax, ecx, Address::times_8));
						__ fistp_d(Address(edx, ecx,Address::times_8));
		 */    
		__ ld(AT, T3, 0);   
		__ sd (AT, T0, 0); 
	  __ addi(T3, T3, -8); 
	  __ addi(T0, T0,-8); 
	  __ bind(l_2);
	  //	    __ decl(ecx);
	  __ addi(T1, T1, -1); 
	  //__ jcc(Assembler::greaterEqual, l_1);
	  __ bgez(T1, l_1); 
	  __ delayed()->nop(); 
	  //      if (VM_Version::supports_mmx()) {
	  //      __ emms();
	  //   }
	  //  __ ret(0);
	  __ pop(T1); 
	  __ pop(T0); 
	  __ pop(T3); 
	  __ jr(RA); 
	  __ delayed()->nop();  
	  return start;
  // Generate all arraycopy stubs and publish their entry points via
  // StubRoutines.  Oop copies use the int variants under compressed oops
  // (narrow 32-bit oops) and the long variants otherwise.
  void generate_arraycopy_stubs() {
    if (UseCompressedOops) {
      StubRoutines::_oop_disjoint_arraycopy    = generate_disjoint_int_oop_copy(false, true, "oop_disjoint_arraycopy");
      StubRoutines::_oop_arraycopy   	= generate_conjoint_int_oop_copy(false, true, "oop_arraycopy");
    } else {
      StubRoutines::_oop_disjoint_arraycopy    = generate_disjoint_long_oop_copy(false, true, "oop_disjoint_arraycopy");
      StubRoutines::_oop_arraycopy   	= generate_conjoint_long_oop_copy(false, true, "oop_arraycopy");
    StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
    StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
    StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
    //  if (VM_Version::supports_mmx())
    //if (false)
    // StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_mmx_copy_aligned("arrayof_jshort_disjoint_arraycopy");
    // else
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(true, false, "arrayof_jint_disjoint_arraycopy");
    //StubRoutines::_arrayof_oop_disjoint_arraycopy   = generate_disjoint_int_oop_copy(true, true, "arrayof_oop_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy  = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
    StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
    StubRoutines::_jint_arraycopy   = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
    StubRoutines::_jlong_arraycopy  = generate_conjoint_long_copy(false, "jlong_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy  = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy   = generate_conjoint_int_oop_copy(true, false, "arrayof_jint_arraycopy");
    //StubRoutines::_arrayof_oop_arraycopy    = generate_conjoint_int_oop_copy(true, true, "arrayof_oop_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy  = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
    // The arrayof oop variants simply alias the plain oop stubs.
    StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
//Wang: add a function to implement SafeFetch32 and SafeFetchN
  // Generates a stub that loads a value which may fault; the VM's fault
  // handler redirects a faulting load to *continuation_pc, where the
  // caller-supplied error value (still in A1) is returned instead.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   A0 = adr
    //   A1 = errValue
    //
    // result:
    //   V0 = *adr or errValue
    //   (was documented as "PPC_RET" -- stale comment from the PPC port)
    StubCodeMark mark(this, "StubRoutines", name);
    // Entry point, pc or function descriptor.
    *entry = __ pc();
    // Load *adr into A1, may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ lw(A1, A0, 0); 
        break;
      case 8:
        // int64_t
        __ ld(A1, A0, 0); 
        break;
      default:
        ShouldNotReachHere();
    // return errValue or *adr
    *continuation_pc = __ pc();
    __ addu(V0,A1,R0);
    __ jr(RA);
    __ delayed()->nop();
  2350 #undef __
  2351 #define __ masm->
  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.
  //
  // name                       - stub name, also used for the CodeBuffer
  // runtime_entry              - VM routine that creates and posts the exception
  // restore_saved_exception_pc - if true, reload RA from the thread's
  //                              saved_exception_pc before building the frame
  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   bool restore_saved_exception_pc) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
		// if they expect all registers to be preserved.
//#define aoqi_test
#ifdef aoqi_test
tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
		// Slot layout of the frame built below (offsets in words from SP).
		enum layout {
			thread_off,    // last_java_sp                
			S7_off,        // callee saved register      sp + 1
			S6_off,        // callee saved register      sp + 2
			S5_off,        // callee saved register      sp + 3
			S4_off,        // callee saved register      sp + 4
			S3_off,        // callee saved register      sp + 5
			S2_off,        // callee saved register      sp + 6
			S1_off,        // callee saved register      sp + 7
			S0_off,        // callee saved register      sp + 8
			FP_off,
			ret_address,
			framesize
		};
		int insts_size = 2048;
		int locs_size  = 32;
		//  CodeBuffer* code     = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false, 
		//  NULL, NULL, NULL, false, NULL, name, false);
		CodeBuffer code (name , insts_size, locs_size);
#ifdef aoqi_test
tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
		OopMapSet* oop_maps  = new OopMapSet();
#ifdef aoqi_test
tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
		MacroAssembler* masm = new MacroAssembler(&code);
#ifdef aoqi_test
tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
		address start = __ pc();
    	//__ stop("generate_throw_exception");
		/*
			 __ move(AT, (int)&jerome1 );
			 __ sw(SP, AT, 0); 	
			 __ move(AT, (int)&jerome2 );
			 __ sw(FP, AT, 0); 	
			 __ move(AT, (int)&jerome3 );
			 __ sw(RA, AT, 0); 	
			 __ move(AT, (int)&jerome4 );
			 __ sw(R0, AT, 0); 	
			 __ move(AT, (int)&jerome5 );
			 __ sw(R0, AT, 0); 	
			 __ move(AT, (int)&jerome6 );
			 __ sw(R0, AT, 0); 	
			 __ move(AT, (int)&jerome7 );
			 __ sw(R0, AT, 0); 	
			 __ move(AT, (int)&jerome10 );
			 __ sw(R0, AT, 0); 	
			 __ pushad();
		//__ enter();
		__ call(CAST_FROM_FN_PTR(address, SharedRuntime::print_call_statistics), 
		relocInfo::runtime_call_type);
		__ delayed()->nop();
		//__ leave();
		__ popad();
		 */
		// This is an inlined and slightly modified version of call_VM
		// which has the ability to fetch the return PC out of
		// thread-local storage and also sets up last_Java_sp slightly
		// differently than the real call_VM
#ifndef OPT_THREAD	
		Register java_thread = TREG;
		__ get_thread(java_thread);
#else
		Register java_thread = TREG;
#endif
#ifdef aoqi_test
tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
		if (restore_saved_exception_pc) {
			__ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset())); // eax
		__ enter(); // required for proper stackwalking of RuntimeStub frame
		__ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
		// Save callee-saved registers into the frame (see 'layout' above).
		__ sd(S0, SP, S0_off * wordSize);
		__ sd(S1, SP, S1_off * wordSize);
		__ sd(S2, SP, S2_off * wordSize);
		__ sd(S3, SP, S3_off * wordSize);
		__ sd(S4, SP, S4_off * wordSize);
		__ sd(S5, SP, S5_off * wordSize);
		__ sd(S6, SP, S6_off * wordSize);
		__ sd(S7, SP, S7_off * wordSize);
		int frame_complete = __ pc() - start;
		// push java thread (becomes first argument of C function)
		__ sd(java_thread, SP, thread_off * wordSize);
		if (java_thread!=A0)
			__ move(A0, java_thread);
		// Set up last_Java_sp and last_Java_fp
		__ set_last_Java_frame(java_thread, SP, FP, NULL);
		__ relocate(relocInfo::internal_pc_type);
			// Precompute the PC just past the call sequence and record it as
			// last_Java_pc so the stack walker can find this frame.
			intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + NativeCall::return_address_offset + 4;
			__ li48(AT, save_pc);
		__ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset())); 
		// Call runtime
		__ call(runtime_entry);
		__ delayed()->nop();
		// Generate oop map
		OopMap* map =  new OopMap(framesize, 0);        
		oop_maps->add_gc_map(__ offset(),  map);
		// restore the thread (cannot use the pushed argument since arguments
		// may be overwritten by C code generated by an optimizing compiler);
		// however can use the register value directly if it is callee saved.
#ifndef OPT_THREAD
		__ get_thread(java_thread);
#endif
		__ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
		//  __ reset_last_Java_frame(java_thread, true);
		__ reset_last_Java_frame(java_thread, true, true);
		// Restore callee save registers.  This must be done after resetting the Java frame
		__ ld(S0, SP, S0_off * wordSize);
		__ ld(S1, SP, S1_off * wordSize);
		__ ld(S2, SP, S2_off * wordSize);
		__ ld(S3, SP, S3_off * wordSize);
		__ ld(S4, SP, S4_off * wordSize);
		__ ld(S5, SP, S5_off * wordSize);
		__ ld(S6, SP, S6_off * wordSize);
		__ ld(S7, SP, S7_off * wordSize);
		// discard arguments
		__ addi(SP, SP, (framesize-2) * wordSize); // epilog
		//	__ leave(); // required for proper stackwalking of RuntimeStub frame
		__ addi(SP, FP, wordSize);
		__ ld(FP, SP, -1*wordSize);
		// check for pending exceptions
#ifdef ASSERT
		Label L;
		__ lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
		__ bne(AT, R0, L);
		__ delayed()->nop();
		__ should_not_reach_here();
		__ bind(L);
#endif //ASSERT
		// The runtime call posted the exception; hand off to the common
		// forwarding entry to dispatch it.
		__ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
		__ delayed()->nop();
#ifdef aoqi_test
tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
		RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code,frame_complete, 
										framesize, oop_maps, false);
#ifdef aoqi_test
tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
		return stub->entry_point();
  2543   // Initialization
  // NOTE(review): this listing is a scraped hg view — each line still carries
  // its original file line number, gaps in that numbering are dropped lines,
  // and the function's closing brace (around original line 2599) is not
  // visible in this chunk.
  //
  // generate_initial(): generates the stubs that must exist early in VM
  // startup — the forward-exception entry, the call stub (and its return
  // address), the catch-exception entry, the unsafe-access handler, and the
  // platform-specific get_previous_fp helper.  The big /* ... */ block below
  // is the original (x86-derived) template kept for reference; only the code
  // after the closing */ is live.
  2544   void generate_initial() {
  2545 /*
  2546 		// Generates all stubs and initializes the entry points
  2548     // This platform-specific stub is needed by generate_call_stub()
  2549     StubRoutines::mips::_mxcsr_std        = generate_fp_mask("mxcsr_std",        0x0000000000001F80);
  2551     // entry points that exist in all platforms Note: This is code
  2552     // that could be shared among different platforms - however the
  2553     // benefit seems to be smaller than the disadvantage of having a
  2554     // much more complicated generator structure. See also comment in
  2555     // stubRoutines.hpp.
  2557     StubRoutines::_forward_exception_entry = generate_forward_exception();
  2559     StubRoutines::_call_stub_entry =
  2560       generate_call_stub(StubRoutines::_call_stub_return_address);
  2562     // is referenced by megamorphic call
  2563     StubRoutines::_catch_exception_entry = generate_catch_exception();
  2565     // atomic calls
  2566     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
  2567     StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
  2568     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
  2569     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  2570     StubRoutines::_atomic_add_entry          = generate_atomic_add();
  2571     StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
  2572     StubRoutines::_fence_entry               = generate_orderaccess_fence();
  2574     StubRoutines::_handler_for_unsafe_access_entry =
  2575       generate_handler_for_unsafe_access();
  2577     // platform dependent
  2578     StubRoutines::mips::_get_previous_fp_entry = generate_get_previous_fp();
  2580     StubRoutines::mips::_verify_mxcsr_entry    = generate_verify_mxcsr();
  2581 */
  2582 		// Generates all stubs and initializes the entry points
  2584 		//-------------------------------------------------------------
  2585 		//-----------------------------------------------------------
  2586 		// entry points that exist in all platforms
  2587 		// Note: This is code that could be shared among different platforms - however the benefit seems to be smaller 
  2588 		// than the disadvantage of having a much more complicated generator structure. 
  2589 		// See also comment in stubRoutines.hpp.
  2585 		StubRoutines::_forward_exception_entry = generate_forward_exception();    
  2591 		StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
  2592 		// is referenced by megamorphic call    
  2593 		StubRoutines::_catch_exception_entry = generate_catch_exception();    
  2595 		StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
  2597 		// platform dependent
  // NOTE(review): the live code uses the StubRoutines::gs2 namespace while the
  // commented-out template above uses StubRoutines::mips for the same entry —
  // presumably gs2 (Loongson Godson) is this port's platform namespace;
  // confirm against stubRoutines_mips_64.hpp.
  2598 		StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
// NOTE(review): scraped hg listing — original line numbers are embedded in
// each line, and this function's closing brace (around original line 2721)
// was dropped from the visible chunk.
//
// generate_all(): generates the remaining stubs once SharedInfo::stack0 is
// available — the throw_exception RuntimeStubs (AbstractMethodError,
// NullPointerException-at-call, StackOverflowError; the plain Arithmetic/
// NullPointerException entries are commented out here), the verify_oop
// subroutine, the arraycopy stubs (non-CORE builds), and the SafeFetch32/
// SafeFetchN stubs.  The /* ... */ block is the x86-derived template kept
// for reference; the repeated "#ifdef aoqi_test" printouts are porting-time
// debug tracing.
  2601 void generate_all() {
  2602 #ifdef aoqi_test
  2603 tty->print_cr("%s:%d", __func__, __LINE__);
  2604 #endif
  2605     // Generates all stubs and initializes the entry points
  2607     // These entry points require SharedInfo::stack0 to be set up in
  2608     // non-core builds and need to be relocatable, so they each
  2609     // fabricate a RuntimeStub internally.
  2610 	/*
  2611     StubRoutines::_throw_AbstractMethodError_entry =
  2612       generate_throw_exception("AbstractMethodError throw_exception",
  2613                                CAST_FROM_FN_PTR(address,
  2614                                                 SharedRuntime::
  2615                                                 throw_AbstractMethodError),
  2616                                false);
  2618     StubRoutines::_throw_IncompatibleClassChangeError_entry =
  2619       generate_throw_exception("IncompatibleClassChangeError throw_exception",
  2620                                CAST_FROM_FN_PTR(address,
  2621                                                 SharedRuntime::
  2622                                                 throw_IncompatibleClassChangeError),
  2623                                false);
  2625     StubRoutines::_throw_ArithmeticException_entry =
  2626       generate_throw_exception("ArithmeticException throw_exception",
  2627                                CAST_FROM_FN_PTR(address,
  2628                                                 SharedRuntime::
  2629                                                 throw_ArithmeticException),
  2630                                true);
  2632     StubRoutines::_throw_NullPointerException_entry =
  2633       generate_throw_exception("NullPointerException throw_exception",
  2634                                CAST_FROM_FN_PTR(address,
  2635                                                 SharedRuntime::
  2636                                                 throw_NullPointerException),
  2637                                true);
  2639     StubRoutines::_throw_NullPointerException_at_call_entry =
  2640       generate_throw_exception("NullPointerException at call throw_exception",
  2641                                CAST_FROM_FN_PTR(address,
  2642                                                 SharedRuntime::
  2643                                                 throw_NullPointerException_at_call),
  2644                                false);
  2646     StubRoutines::_throw_StackOverflowError_entry =
  2647       generate_throw_exception("StackOverflowError throw_exception",
  2648                                CAST_FROM_FN_PTR(address,
  2649                                                 SharedRuntime::
  2650                                                 throw_StackOverflowError),
  2651                                false);
  2653     // entry points that are platform specific
  2654     StubRoutines::mips::_f2i_fixup = generate_f2i_fixup();
  2655     StubRoutines::mips::_f2l_fixup = generate_f2l_fixup();
  2656     StubRoutines::mips::_d2i_fixup = generate_d2i_fixup();
  2657     StubRoutines::mips::_d2l_fixup = generate_d2l_fixup();
  2659     StubRoutines::mips::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
  2660     StubRoutines::mips::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
  2661     StubRoutines::mips::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
  2662     StubRoutines::mips::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
  2664     // support for verify_oop (must happen after universe_init)
  2665     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  2667     // arraycopy stubs used by compilers
  2668     generate_arraycopy_stubs();
  2669 	*/
  2670 #ifdef aoqi_test
  2671 tty->print_cr("%s:%d", __func__, __LINE__);
  2672 #endif
  2673 		StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
  2674 #ifdef aoqi_test
  2675 tty->print_cr("%s:%d", __func__, __LINE__);
  2676 #endif
// NOTE(review): the ArithmeticException and NullPointerException entries are
// disabled in this port — left NULL unless set elsewhere; confirm intent.
  2677 //		StubRoutines::_throw_ArithmeticException_entry         = generate_throw_exception("ArithmeticException throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException),  true);
  2678 #ifdef aoqi_test
  2679 tty->print_cr("%s:%d", __func__, __LINE__);
  2680 #endif
  2681 //		StubRoutines::_throw_NullPointerException_entry        = generate_throw_exception("NullPointerException throw_exception",         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
  2682 #ifdef aoqi_test
  2683 tty->print_cr("%s:%d", __func__, __LINE__);
  2684 #endif
  2685 		StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
  2686 #ifdef aoqi_test
  2687 tty->print_cr("%s:%d", __func__, __LINE__);
  2688 #endif
  2689 		StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
  2690 #ifdef aoqi_test
  2691 tty->print_cr("%s:%d", __func__, __LINE__);
  2692 #endif
  2694 		//------------------------------------------------------
  2695 		//------------------------------------------------------------------
  2696 		// entry points that are platform specific  
  2698 		// support for verify_oop (must happen after universe_init)
  2699 #ifdef aoqi_test
  2700 tty->print_cr("%s:%d", __func__, __LINE__);
  2701 #endif
  2702 		StubRoutines::_verify_oop_subroutine_entry	   = generate_verify_oop();
  2703 #ifdef aoqi_test
  2704 tty->print_cr("%s:%d", __func__, __LINE__);
  2705 #endif
  2706 #ifndef CORE
  2707 		// arraycopy stubs used by compilers
  2708 		generate_arraycopy_stubs();
  2709 #ifdef aoqi_test
  2710 tty->print_cr("%s:%d", __func__, __LINE__);
  2711 #endif
  2712 #endif
  2714     // Safefetch stubs.
  2715     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
  2716                                                        &StubRoutines::_safefetch32_fault_pc,
  2717                                                        &StubRoutines::_safefetch32_continuation_pc);
  2718     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
  2719                                                        &StubRoutines::_safefetchN_fault_pc,
  2720                                                        &StubRoutines::_safefetchN_continuation_pc);
  2723  public:
  // Constructor: dispatches to the two generation phases — generate_all()
  // for the full (post-universe_init) set of stubs, generate_initial() for
  // the early-startup subset.  NOTE(review): the two closing braces
  // (original lines 2729-2730) were dropped by the scraped listing; only
  // the "};" class terminator is visible below.
  2724   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  2725     if (all) {
  2726       generate_all();
  2727     } else {
  2728       generate_initial();
  2731 }; // end class declaration
  2732 /*
  2733 address StubGenerator::disjoint_byte_copy_entry  = NULL;
  2734 address StubGenerator::disjoint_short_copy_entry = NULL;
  2735 address StubGenerator::disjoint_int_copy_entry   = NULL;
  2736 address StubGenerator::disjoint_long_copy_entry  = NULL;
  2737 address StubGenerator::disjoint_oop_copy_entry   = NULL;
  2739 address StubGenerator::byte_copy_entry  = NULL;
  2740 address StubGenerator::short_copy_entry = NULL;
  2741 address StubGenerator::int_copy_entry   = NULL;
  2742 address StubGenerator::long_copy_entry  = NULL;
  2743 address StubGenerator::oop_copy_entry   = NULL;
  2745 address StubGenerator::checkcast_copy_entry = NULL;
  2746 */
  2747 void StubGenerator_generate(CodeBuffer* code, bool all) {
  2748   StubGenerator g(code, all);

mercurial