src/cpu/mips/vm/stubGenerator_mips_64.cpp

author:      aoqi
date:        Wed, 29 Mar 2017 09:41:51 +0800
changeset:   392 (4bfb40d1e17a)
parent:      373 (3a34fc828b4a)
child:       401 (721a83ed5111)
permissions: -rw-r--r--

#4662: TieredCompilation is turned off; tiered compilation is not supported yet.

/*
 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_mips.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
//#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
//#define a__ ((Assembler*)_masm)->

//#ifdef PRODUCT
//#define BLOCK_COMMENT(str) /* nothing */
//#else
//#define BLOCK_COMMENT(str) __ block_comment(str)
//#endif

//#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions

// Stub Code definitions
static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc = thread->saved_exception_pc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine:  return garbage from the load
  // therefore, compute npc
  //address npc = Assembler::locate_next_instruction(pc);
  address npc = (address)((unsigned long)pc + sizeof(unsigned long));

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}
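
// Note: MIPS instructions are a fixed 4 bytes wide, so the next pc can be
// computed without decoding the faulting instruction; advancing by
// sizeof(unsigned long) (8 on n64) as above steps over the faulting
// instruction plus one following instruction slot.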
class StubGenerator: public StubCodeGenerator {
 private:

  // ABI mips n64
  // This diagram is not the MIPS ABI; it shows the convention used to call
  // Java from C.  Call stubs are used to call Java from C.
  //
  //    [ return_from_Java     ]
  //    [ argument word n-1    ] <--- sp
  //      ...
  //    [ argument word 0      ]
  //      ...
  //-10 [ S6                   ]
  // -9 [ S5                   ]
  // -8 [ S4                   ]
  // -7 [ S3                   ]
  // -6 [ S0                   ]
  // -5 [ TSR(S2)              ]
  // -4 [ LVP(S7)              ]
  // -3 [ BCP(S1)              ]
  // -2 [ saved fp             ] <--- fp_after_call
  // -1 [ return address       ]
  //  0 [ ptr. to call wrapper ] <--- a0 (old sp -->) fp
  //  1 [ result               ] <--- a1
  //  2 [ result_type          ] <--- a2
  //  3 [ method               ] <--- a3
  //  4 [ entry_point          ] <--- a4
  //  5 [ parameters           ] <--- a5
  //  6 [ parameter_size       ] <--- a6
  //  7 [ thread               ] <--- a7
  //
  // _LP64: n64 does not save the incoming arguments on the stack, so the
  // layout is:
  //
  //    [ return_from_Java     ]
  //    [ argument word n-1    ] <--- sp
  //      ...
  //    [ argument word 0      ]
  //      ...
  //-14 [ thread               ]
  //-13 [ result_type          ] <--- a2
  //-12 [ result               ] <--- a1
  //-11 [ ptr. to call wrapper ] <--- a0
  //-10 [ S6                   ]
  // -9 [ S5                   ]
  // -8 [ S4                   ]
  // -7 [ S3                   ]
  // -6 [ S0                   ]
  // -5 [ TSR(S2)              ]
  // -4 [ LVP(S7)              ]
  // -3 [ BCP(S1)              ]
  // -2 [ saved fp             ] <--- fp_after_call
  // -1 [ return address       ]
  //  0 [                      ] <--- old sp
  /*
   * 2014/01/16 Fu: Find a right place in the call_stub for GP.
   * GP will point to the starting point of Interpreter::dispatch_table(itos).
   * It should be saved/restored before/after Java calls.
   *
   */
  enum call_stub_layout {
    RA_off          = -1,
    FP_off          = -2,
    BCP_off         = -3,
    LVP_off         = -4,
    TSR_off         = -5,
    S1_off          = -6,
    S3_off          = -7,
    S4_off          = -8,
    S5_off          = -9,
    S6_off          = -10,
    result_off      = -11,
    result_type_off = -12,
    thread_off      = -13,
    total_off       = thread_off - 3,
    GP_off          = -16,
  };
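
  // For reference, the generated call stub is invoked from C++ through the
  // CallStub function pointer type declared in stubRoutines.hpp (existing
  // signature, shown here as a sketch; the arguments arrive in a0..a7 as
  // diagrammed above):
  //
  //   typedef void (*CallStub)(address   link,
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);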
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!

    // stub code
    // save ra and fp
    __ sd(RA, SP, RA_off * wordSize);
    __ sd(FP, SP, FP_off * wordSize);
    __ sd(BCP, SP, BCP_off * wordSize);
    __ sd(LVP, SP, LVP_off * wordSize);
    __ sd(GP, SP, GP_off * wordSize);
    __ sd(TSR, SP, TSR_off * wordSize);
    __ sd(S1, SP, S1_off * wordSize);
    __ sd(S3, SP, S3_off * wordSize);
    __ sd(S4, SP, S4_off * wordSize);
    __ sd(S5, SP, S5_off * wordSize);
    __ sd(S6, SP, S6_off * wordSize);

    __ set64(GP, (long)Interpreter::dispatch_table(itos));

    // I think 14 is the max gap between argument and callee-saved registers
    __ daddi(FP, SP, (-2) * wordSize);
    __ daddi(SP, SP, total_off * wordSize);
    //FIXME, aoqi. find a suitable place to save A1 & A2.
    /*
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, 3 * wordSize);
    __ sd(A2, FP, 4 * wordSize);
    __ sd(A3, FP, 5 * wordSize);
    __ sd(A4, FP, 6 * wordSize);
    __ sd(A5, FP, 7 * wordSize);
    __ sd(A6, FP, 8 * wordSize);
    __ sd(A7, FP, 9 * wordSize);
    */
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, result_off * wordSize);
    __ sd(A2, FP, result_type_off * wordSize);
    __ sd(A7, FP, thread_off * wordSize);

#ifdef OPT_THREAD
    //__ get_thread(TREG);
    __ move(TREG, A7);
    //__ ld(TREG, FP, thread_off * wordSize);
#endif
    // add for compressed oops
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
      __ beq(AT, R0, L);
      __ delayed()->nop();
      /* FIXME: I do not know how to realize stop in mips arch, do it in the future */
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    // A5: parameters
    // A6: parameter_size
    // T0: parameter_size_tmp (counts down)
    // T2: offset (counts up)
    // T3: tmp
    Label parameters_done;
    // judge if the parameter_size equals 0
    __ beq(A6, R0, parameters_done);
    __ delayed()->nop();
    __ dsll(AT, A6, Interpreter::logStackElementSize);
    __ dsub(SP, SP, AT);
    __ move(AT, -StackAlignmentInBytes);
    __ andr(SP, SP, AT);
    // Copy Java parameters in reverse order (receiver last).
    // Note that the argument order is inverted in the process.
    // source is A5[T0: N-1..0]
    // dest   is SP[T2: 0..N-1]
    Label loop;
    __ move(T0, A6);
    __ move(T2, R0);
    __ bind(loop);

    // get parameter
    __ dsll(T3, T0, LogBytesPerWord);
    __ dadd(T3, T3, A5);
    __ ld(AT, T3, -wordSize);
    __ dsll(T3, T2, LogBytesPerWord);
    __ dadd(T3, T3, SP);
    __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
    __ daddi(T2, T2, 1);
    __ daddi(T0, T0, -1);
    __ bne(T0, R0, loop);
    __ delayed()->nop();
    // advance to next parameter

    // call Java function
    __ bind(parameters_done);

    // receiver in V0, methodOop in Rmethod
    __ move(Rmethod, A3);
    __ move(Rsender, SP);  // set sender sp
    __ jalr(A4);
    __ delayed()->nop();
    return_address = __ pc();

    Label common_return;
    __ bind(common_return);

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ ld(T0, FP, result_off * wordSize);       // result --> T0
    Label is_long, is_float, is_double, exit;
    __ ld(T2, FP, result_type_off * wordSize);  // result_type --> T2
    __ daddi(T3, T2, (-1) * T_LONG);
    __ beq(T3, R0, is_long);
    __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
    __ beq(T3, R0, is_float);
    __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
    __ beq(T3, R0, is_double);
    __ delayed()->nop();

    // handle T_INT case
    __ sd(V0, T0, 0 * wordSize);
    __ bind(exit);

    // restore
    __ daddi(SP, FP, 2 * wordSize);
    __ ld(RA, SP, RA_off * wordSize);
    __ ld(FP, SP, FP_off * wordSize);
    __ ld(BCP, SP, BCP_off * wordSize);
    __ ld(LVP, SP, LVP_off * wordSize);
    __ ld(GP, SP, GP_off * wordSize);
    __ ld(TSR, SP, TSR_off * wordSize);

    __ ld(S1, SP, S1_off * wordSize);
    __ ld(S3, SP, S3_off * wordSize);
    __ ld(S4, SP, S4_off * wordSize);
    __ ld(S5, SP, S5_off * wordSize);
    __ ld(S6, SP, S6_off * wordSize);

    // return
    __ jr(RA);
    __ delayed()->nop();

    // handle return types different from T_INT
    __ bind(is_long);
    __ sd(V0, T0, 0 * wordSize);
    //__ sd(V1, T0, 1 * wordSize);
    //__ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_float);
    __ swc1(F0, T0, 0 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_double);
    __ sdc1(F0, T0, 0 * wordSize);
    //__ sdc1(F1, T0, 1 * wordSize);
    //__ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    //FIXME: the 1.6 mips version adds an fpu operation here
    StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
    __ b(common_return);
    __ delayed()->nop();

    return start;
  }
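
  // Example of the result handling above: a Java method returning jint comes
  // back with result_type == T_INT, so the integer path stores V0 through the
  // result pointer; T_FLOAT and T_DOUBLE results arrive in F0 and are stored
  // with swc1/sdc1; T_LONG stores the full 64-bit V0.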
  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  //
  // V0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    Register thread = TREG;

    // get thread directly
#ifndef OPT_THREAD
    __ ld(thread, FP, thread_off * wordSize);
#endif

#ifdef ASSERT
    // verify that threads correspond
    { Label L;
      __ get_thread(T8);
      __ beq(T8, thread, L);
      __ delayed()->nop();
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    // set pending exception
    __ verify_oop(V0);
    __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ li(AT, (long)__FILE__);
    __ sd(AT, thread, in_bytes(Thread::exception_file_offset()));
    __ li(AT, (long)__LINE__);
    __ sd(AT, thread, in_bytes(Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
    __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
    __ delayed()->nop();

    return start;
  }
  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // V0: exception
  // V1: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    Register thread = TREG;
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into Java
    // (interpreted or compiled) code; i.e., the return address becomes the
    // throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack but
    // the exception handler will reset the stack pointer -> ignore them.
    // A potential result in registers can be ignored as well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    { Label L;
      __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
      __ bne(AT, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into T9
    __ ld(A1, SP, 0);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
    __ move(T9, V0);
    __ pop(V1);

#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ bne(V0, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // V0: exception
    // T9: exception handler
    // V1: throwing pc
    __ verify_oop(V0);
    __ jr(T9);
    __ delayed()->nop();

    return start;
  }
  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp  (FP, 0);
    const Address older_fp(V0, 0);
    address start = __ pc();

    __ enter();
    // frame pointers are 8 bytes on n64, so use 64-bit loads
    __ ld(V0, old_fp);   // caller's fp
    __ ld(V0, older_fp); // the frame for ps()
    __ leave();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // The following routine generates a subroutine to throw an
  // asynchronous UnknownError when an unsafe access gets a fault that
  // could not be reasonably prevented by the programmer.  (Example:
  // SIGBUS/OBJERR.)
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();
    __ pushad();                      // push registers
    //  Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
    __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
    __ delayed()->nop();
    __ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord);
    __ popad();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();
    __ reinit_heapbase();
    __ verify_oop_subroutine();
    address end = __ pc();
    return start;
  }
  //
  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //     A0    -  array1
  //     A1    -  array2
  //     A2    -  element count
  //
  //  Note: this code clobbers only AT and T9.
  //

  // use T9 as temp
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    int elem_size = 1 << log2_elem_size;
    Address::ScaleFactor sf = Address::times_1;

    switch (log2_elem_size) {
      case 0: sf = Address::times_1; break;
      case 1: sf = Address::times_2; break;
      case 2: sf = Address::times_4; break;
      case 3: sf = Address::times_8; break;
    }

    __ dsll(AT, A2, sf);
    __ dadd(AT, AT, A0);
    __ lea(T9, Address(AT, -elem_size));  // T9 = address of last source element
    __ dsub(AT, A1, A0);
    __ blez(AT, no_overlap_target);       // dest <= src: forward copy is safe
    __ delayed()->nop();
    __ dsub(AT, A1, T9);
    __ bgtz(AT, no_overlap_target);       // dest above last src element: no overlap
    __ delayed()->nop();

    // 2016/05/10 aoqi: if A0 = 0xf... and A1 = 0x0..., then goto no_overlap_target
    Label L;
    __ bgez(A0, L);
    __ delayed()->nop();
    __ bgtz(A1, no_overlap_target);
    __ delayed()->nop();
    __ bind(L);
  }
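
  // Worked example of the test above (log2_elem_size = 0): with A0 = 0x1000,
  // A1 = 0x1004 and A2 = 16, the last source byte sits at T9 = 0x100f.
  // A1 - A0 = 4 > 0 and A1 <= T9, so the regions overlap and control falls
  // through to the conjoint (backward) copy; with A1 = 0x1010, or A1 <= A0,
  // the stub branches to no_overlap_target and the disjoint copy is used.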
  //
  //  Generate store check for array
  //
  //  Input:
  //     T0    -  starting address
  //     T1    -  element count
  //
  //  The 2 input registers are overwritten
  //

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

  void array_store_check() {
    BarrierSet* bs = Universe::heap()->barrier_set();
    assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
    CardTableModRefBS* ct = (CardTableModRefBS*)bs;
    assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
    Label l_0;

    __ dsll(AT, T1, TIMES_OOP);
    __ dadd(AT, T0, AT);
    __ daddiu(T1, AT, -BytesPerHeapOop);   // T1 = address of last element

    __ shr(T0, CardTableModRefBS::card_shift);
    __ shr(T1, CardTableModRefBS::card_shift);

    __ dsub(T1, T1, T0);   // end --> cards count
    __ bind(l_0);

    __ set64(AT, (long)ct->byte_map_base);
    __ dadd(AT, AT, T0);
    __ dadd(AT, AT, T1);
    __ sb(R0, AT, 0);      // dirty the card
    __ sync();
    __ bgez(T1, l_0);
    __ delayed()->daddi(T1, T1, -1);
  }
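
  // The loop above is the usual card-table post-barrier: every card spanned
  // by the stored-to range is marked dirty.  Equivalent C sketch (names from
  // cardTableModRefBS.hpp):
  //
  //   jbyte* base = ct->byte_map_base;
  //   for (intptr_t c = last_card; c >= first_card; c--)
  //     base[c] = 0;   // CardTableModRefBS::dirty_card == 0
  //
  // Storing R0 works precisely because the dirty value is zero.  (As
  // generated, the delay-slot decrement makes the loop also dirty one card
  // below first_card; this is benign, since spuriously dirty cards are
  // simply rescanned by the GC.)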
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);
    __ move(tmp2, A1);
    __ move(tmp3, A2);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11;
    Label l_debug;

    __ daddi(AT, tmp3, -9); // why is the number 9?
    __ blez(AT, l_9);
    __ delayed()->nop();

    if (!aligned) {
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_9); // if arrays don't have the same alignment mod 2, do 1 element copy
      __ delayed()->nop();

      __ andi(AT, tmp1, 1);
      __ beq(AT, R0, l_10); // copy 1 element if necessary to align to 2 bytes
      __ delayed()->nop();

      __ lb(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 1);
      __ sb(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 1);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_10);

      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 2 element copy
      __ delayed()->nop();

      // At this point it is guaranteed that both, from and to, have the same alignment mod 4.

      // Copy 2 elements if necessary to align to 4 bytes.
      __ andi(AT, tmp1, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -2);
      __ bind(l_2);

      // At this point the positions of both, from and to, are at least 4 byte aligned.

      // Copy 8 elements at a time.
      // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 4 bytes at a time; either from or to will be unaligned
      __ delayed()->nop();

      // Copy 4 elements if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -4);
      __ sw(AT, tmp2, 0);
      { // FasterArrayCopy
        __ daddi(tmp1, tmp1, 4);
        __ daddi(tmp2, tmp2, 4);
      }
    }

    __ bind(l_7);

    // Copy 8 elements (one dword) at a time; either the loads or the stores
    // can be unaligned if aligned == false.

    { // FasterArrayCopy
      __ daddi(AT, tmp3, -7);
      __ blez(AT, l_6); // go to 4-byte copies if fewer than 8 elements remain
      __ delayed()->nop();

      __ bind(l_8);
      // For Loongson, there is 128-bit memory access. TODO
      __ ld(AT, tmp1, 0);
      __ sd(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -8);
      __ daddi(AT, tmp3, -8);
      __ bgez(AT, l_8);
      __ delayed()->nop();
    }
    __ bind(l_6);

    // copy 4 bytes at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -3);
      __ blez(AT, l_1);
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);
      __ bgez(AT, l_3);
      __ delayed()->nop();
    }

    // copy 2 bytes at a time
    __ bind(l_1);
    {
      __ daddi(AT, tmp3, -1);
      __ blez(AT, l_9);
      __ delayed()->nop();

      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(AT, tmp3, -2);
      __ bgez(AT, l_5);
      __ delayed()->nop();
    }

    // do 1 element copy -- byte
    __ bind(l_9);
    __ beq(R0, tmp3, l_4);
    __ delayed()->nop();

    {
      __ bind(l_11);
      __ lb(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -1);
      __ sb(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 1);
      __ daddi(tmp2, tmp2, 1);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_11);
      __ delayed()->nop();
    }

    __ bind(l_4);
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
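
  // The ladder above, as a rough C sketch (simplified: the stub additionally
  // requires 'from' and 'to' to share alignment before widening the access):
  //
  //   while (n >= 8) { *(int64_t*)to = *(int64_t*)from; from += 8; to += 8; n -= 8; }
  //   while (n >= 4) { *(int32_t*)to = *(int32_t*)from; from += 4; to += 4; n -= 4; }
  //   while (n >= 2) { *(int16_t*)to = *(int16_t*)from; from += 2; to += 2; n -= 2; }
  //   while (n >= 1) { *to++ = *from++; n--; }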
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   A0   - source array address
  //   A1   - destination array address
  //   A2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
    Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;

    address nooverlap_target = aligned ?
      StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
      StubRoutines::jbyte_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 0);

    const Register from      = A0;   // source array address
    const Register to        = A1;   // destination array address
    const Register count     = A2;   // elements count
    const Register end_from  = T3;   // source array end address
    const Register end_to    = T0;   // destination array end address
    const Register end_count = T1;   // remaining elements count

    __ push(end_from);
    __ push(end_to);
    __ push(end_count);
    __ push(T8);

    // copy from high to low
    __ move(end_count, count);
    __ dadd(end_from, from, end_count);
    __ dadd(end_to, to, end_count);

    // 2016/05/08 aoqi: if end_from and end_to have different alignments, an unaligned (byte) copy is performed.
    __ andi(AT, end_from, 3);
    __ andi(T8, end_to, 3);
    __ bne(AT, T8, l_copy_byte);
    __ delayed()->nop();

    // First deal with the unaligned data at the top.
    __ bind(l_unaligned);
    __ beq(end_count, R0, l_exit);
    __ delayed()->nop();

    __ andi(AT, end_from, 3);
    __ bne(AT, R0, l_from_unaligned);
    __ delayed()->nop();

    __ andi(AT, end_to, 3);
    __ beq(AT, R0, l_4_bytes_aligned);
    __ delayed()->nop();

    __ bind(l_from_unaligned);
    __ lb(AT, end_from, -1);
    __ sb(AT, end_to, -1);
    __ daddi(end_from, end_from, -1);
    __ daddi(end_to, end_to, -1);
    __ daddi(end_count, end_count, -1);
    __ b(l_unaligned);
    __ delayed()->nop();

    // now end_to, end_from point to 4-byte aligned high-ends
    //     end_count contains byte count that is not copied.
    // copy 4 bytes at a time
    __ bind(l_4_bytes_aligned);

    __ move(T8, end_count);
    __ daddi(AT, end_count, -3);
    __ blez(AT, l_copy_suffix);
    __ delayed()->nop();

    //__ andi(T8, T8, 3);
    __ lea(end_from, Address(end_from, -4));
    __ lea(end_to, Address(end_to, -4));

    __ dsrl(end_count, end_count, 2);
    __ align(16);
    __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes
    __ lw(AT, end_from, 0);
    __ sw(AT, end_to, 0);
    __ addi(end_from, end_from, -4);
    __ addi(end_to, end_to, -4);
    __ addi(end_count, end_count, -1);
    __ bne(end_count, R0, l_copy_4_bytes_loop);
    __ delayed()->nop();

    __ b(l_copy_suffix);
    __ delayed()->nop();
    // copy dwords aligned or not with repeat move
    // l_copy_suffix
    // copy suffix (0-3 bytes)
    __ bind(l_copy_suffix);
    __ andi(T8, T8, 3);
    __ beq(T8, R0, l_exit);
    __ delayed()->nop();
    __ addi(end_from, end_from, 3);
    __ addi(end_to, end_to, 3);
    __ bind(l_copy_suffix_loop);
    __ lb(AT, end_from, 0);
    __ sb(AT, end_to, 0);
    __ addi(end_from, end_from, -1);
    __ addi(end_to, end_to, -1);
    __ addi(T8, T8, -1);
    __ bne(T8, R0, l_copy_suffix_loop);
    __ delayed()->nop();

    __ bind(l_copy_byte);
    __ beq(end_count, R0, l_exit);
    __ delayed()->nop();
    __ lb(AT, end_from, -1);
    __ sb(AT, end_to, -1);
    __ daddi(end_from, end_from, -1);
    __ daddi(end_to, end_to, -1);
    __ daddi(end_count, end_count, -1);
    __ b(l_copy_byte);
    __ delayed()->nop();

    __ bind(l_exit);
    __ pop(T8);
    __ pop(end_count);
    __ pop(end_to);
    __ pop(end_from);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Generate stub for disjoint short copy.  If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  A0
  //      to:    A1
  //  elm.count: A2 treated as signed
  //  one element: 2 bytes
  //
  // Strategy for aligned==true:
  //
  //  If length <= 9:
  //     1. copy 1 element at a time (l_5)
  //
  //  If length > 9:
  //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  //     3. copy last element if one was left in step 2. (l_1)
  //
  //
  // Strategy for aligned==false:
  //
  //  If length <= 9: same as aligned==true case
  //
  //  If length > 9:
  //     1. continue with step 7. if the alignment of from and to mod 4
  //        is different.
  //     2. align from and to to 4 bytes by copying 1 element if necessary
  //     3. at l_2 from and to are 4 byte aligned; continue with
  //        6. if they cannot be aligned to 8 bytes because they have
  //        got different alignment mod 8.
  //     4. at this point we know that both, from and to, have the same
  //        alignment mod 8, now copy one element if necessary to get
  //        8 byte alignment of from and to.
  //     5. copy 4 elements at a time until less than 4 elements are
  //        left; depending on step 3. all load/stores are aligned.
  //     6. copy 2 elements at a time until less than 2 elements are
  //        left. (l_6)
  //     7. copy 1 element at a time. (l_5)
  //     8. copy last element if one was left in step 6. (l_1)
  address generate_disjoint_short_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;
    Register tmp4 = T8;
    Register tmp5 = T9;
    Register tmp6 = T2;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);
    __ move(tmp2, A1);
    __ move(tmp3, A2);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11, l_12, l_13, l_14;
    Label l_debug;
    // don't try anything fancy if arrays don't have many elements
    __ daddi(AT, tmp3, -23);
    __ blez(AT, l_14);
    __ delayed()->nop();
    // move push here
    __ push(tmp4);
    __ push(tmp5);
    __ push(tmp6);

    if (!aligned) {
      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
      __ delayed()->nop();

      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
      __ delayed()->nop();

      // At this point it is guaranteed that both, from and to, have the same alignment mod 4.

      // Copy 1 element if necessary to align to 4 bytes.
      __ andi(AT, A0, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_2);

      // At this point the positions of both, from and to, are at least 4 byte aligned.

      // Copy 4 elements at a time.
      // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 4 bytes at a time; either from or to will be unaligned
      __ delayed()->nop();

      // Copy a 2-element word if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
    } // end of if (!aligned)

    __ bind(l_7);
    // At this point the positions of both, from and to, are at least 8 byte aligned.
    // Copy 8 elements at a time.
    // Align to 16 bytes, but only if both, from and to, have the same alignment mod 16.
    __ xorr(AT, tmp1, tmp2);
    __ andi(AT, AT, 15);
    __ bne(AT, R0, l_9);
    __ delayed()->nop();

    // Copy a 4-element word if necessary to align to 16 bytes.
    __ andi(AT, tmp1, 15);
    __ beq(AT, R0, l_10);
    __ delayed()->nop();

    __ ld(AT, tmp1, 0);
    __ daddi(tmp3, tmp3, -4);
    __ sd(AT, tmp2, 0);
    __ daddi(tmp1, tmp1, 8);
    __ daddi(tmp2, tmp2, 8);

    __ bind(l_10);

    // Copy 16 elements (32 bytes) at a time; either the loads or the stores
    // can be unaligned if aligned == false.
    { // FasterArrayCopy
      __ bind(l_11);
      // For Loongson, the 128-bit memory access instructions gslq/gssq
      // load/store a pair of 64-bit registers per access.
      if (UseLoongsonISA) {
        __ gslq(AT, tmp4, tmp1, 0);
        __ gslq(tmp5, tmp6, tmp1, 16);
        __ daddi(tmp1, tmp1, 32);
        __ daddi(tmp2, tmp2, 32);
        __ gssq(AT, tmp4, tmp2, -32);
        __ gssq(tmp5, tmp6, tmp2, -16);
      } else {
        __ ld(AT, tmp1, 0);
        __ ld(tmp4, tmp1, 8);
        __ ld(tmp5, tmp1, 16);
        __ ld(tmp6, tmp1, 24);
        __ daddi(tmp1, tmp1, 32);
        __ sd(AT, tmp2, 0);
        __ sd(tmp4, tmp2, 8);
        __ sd(tmp5, tmp2, 16);
        __ sd(tmp6, tmp2, 24);
        __ daddi(tmp2, tmp2, 32);
      }
      __ daddi(tmp3, tmp3, -16);
      __ daddi(AT, tmp3, -16);
      __ bgez(AT, l_11);
      __ delayed()->nop();
    }

    __ bind(l_9);

    // Copy 16 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -15); // the loop is unrolled, so at least 16 elements must remain
      __ blez(AT, l_4); // fewer than 16 elements remain
      __ delayed()->nop();

      __ bind(l_8);
      __ ld(AT, tmp1, 0);
      __ ld(tmp4, tmp1, 8);
      __ ld(tmp5, tmp1, 16);
      __ ld(tmp6, tmp1, 24);
      __ sd(AT, tmp2, 0);
      __ sd(tmp4, tmp2, 8);
      __ sd(tmp5, tmp2, 16);
      __ daddi(tmp1, tmp1, 32);
      __ daddi(tmp2, tmp2, 32);
      __ daddi(tmp3, tmp3, -16);
      __ daddi(AT, tmp3, -16);
      __ bgez(AT, l_8);
      __ sd(tmp6, tmp2, -8); // (in the branch delay slot)
    }

    __ bind(l_6);

    // copy 8 elements at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -7);
      __ blez(AT, l_4);
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ lw(tmp4, tmp1, 4);
      __ lw(tmp5, tmp1, 8);
      __ lw(tmp6, tmp1, 12);
      __ sw(AT, tmp2, 0);
      __ sw(tmp4, tmp2, 4);
      __ sw(tmp5, tmp2, 8);
      __ daddi(tmp1, tmp1, 16);
      __ daddi(tmp2, tmp2, 16);
      __ daddi(tmp3, tmp3, -8);
      __ daddi(AT, tmp3, -8);
      __ bgez(AT, l_3);
      __ sw(tmp6, tmp2, -4); // (in the branch delay slot)
    }

    __ bind(l_1);
    // copy 4 elements at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -3);
      __ blez(AT, l_4);
      __ delayed()->nop();

      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ lhu(tmp4, tmp1, 2);
      __ lhu(tmp5, tmp1, 4);
      __ lhu(tmp6, tmp1, 6);
      __ sh(AT, tmp2, 0);
      __ sh(tmp4, tmp2, 2);
      __ sh(tmp5, tmp2, 4);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);
      __ bgez(AT, l_5);
      __ sh(tmp6, tmp2, -2); // (in the branch delay slot)
    }

    // single element
    __ bind(l_4);

    __ pop(tmp6);
    __ pop(tmp5);
    __ pop(tmp4);

    __ bind(l_14);
    { // FasterArrayCopy
      __ beq(R0, tmp3, l_13);
      __ delayed()->nop();

      __ bind(l_12);
      __ lhu(AT, tmp1, 0);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -1);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_12);
      __ delayed()->nop();
    }

    __ bind(l_13);
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    __ bind(l_debug);
    __ stop("generate_disjoint_short_copy should not reach here");
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, const char *name) {
    Label l_1, l_2, l_3, l_4, l_5;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target = aligned ?
      StubRoutines::arrayof_jshort_disjoint_arraycopy() :
      StubRoutines::jshort_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 1);

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    /*
       __ pushl(esi);
       __ movl(ecx, Address(esp, 4+12));      // count
       __ pushl(edi);
       __ movl(esi, Address(esp, 8+ 4));      // from
       __ movl(edi, Address(esp, 8+ 8));      // to
     */
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // copy dwords from high to low
    // __ leal(esi, Address(esi, ecx, Address::times_2, -4)); // from + count*2 - 4
    __ sll(AT, T1, Address::times_2);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -4));
    //__ std();
    //__ leal(edi, Address(edi, ecx, Address::times_2, -4)); // to + count*2 - 4
    __ sll(AT, T1, Address::times_2);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -4));
    //  __ movl(eax, ecx);
    __ move(T8, T1);
    __ bind(l_1);
    //   __ sarl(ecx, 1);              // dword count
    __ sra(T1, T1, 1);
    //__ jcc(Assembler::equal, l_4);                   // no dwords to move
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    /*    __ cmpl(ecx, 32);
          __ jcc(Assembler::above, l_3);                   // > 32 dwords
    // copy dwords with loop
    __ subl(edi, esi);
     */
    __ align(16);
    __ bind(l_2);
    //__ movl(edx, Address(esi));
    __ lw(AT, T3, 0);
    //__ movl(Address(edi, esi, Address::times_1), edx);
    __ sw(AT, T0, 0);
    //__ subl(esi, 4);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    //__ decl(ecx);
    __ addi(T1, T1, -1);
    //  __ jcc(Assembler::notEqual, l_2);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();
    //  __ addl(edi, esi);
    // __ jmp(l_4);
    __ b(l_4);
    __ delayed()->nop();
    // copy dwords with repeat move
    __ bind(l_3);
    //   __ rep_movl();
    __ bind(l_4);
    //  __ andl(eax, 1);              // suffix count
    __ andi(T8, T8, 1);              // suffix count
    //__ jcc(Assembler::equal, l_5);                   // no suffix
    __ beq(T8, R0, l_5);
    __ delayed()->nop();
    // copy suffix
    //   __ movw(edx, Address(esi, 2));
    __ lh(AT, T3, 2);
    //  __ movw(Address(edi, 2), edx);
    __ sh(AT, T0, 2);
    __ bind(l_5);
    //    __ cld();
    //    __ popl(edi);
    //    __ popl(esi);
    //   __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_3, l_4, l_5, l_6, l_7;
    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();
    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    if (is_oop) {
      if (Use3A2000) __ sync();
    }

    if (!aligned) {
      __ xorr(AT, T3, T0);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_5); // not same alignment mod 8 -> copy 1 element each time
      __ delayed()->nop();

      __ andi(AT, T3, 7);
      __ beq(AT, R0, l_6); // copy 2 elements each time
      __ delayed()->nop();

      __ lw(AT, T3, 0);
      __ daddi(T1, T1, -1);
      __ sw(AT, T0, 0);
      __ daddi(T3, T3, 4);
      __ daddi(T0, T0, 4);

      __ bind(l_6);
      __ daddi(AT, T1, -1);
      __ blez(AT, l_5);
      __ delayed()->nop();

      __ bind(l_7);
      __ ld(AT, T3, 0);
      __ sd(AT, T0, 0);
      __ daddi(T3, T3, 8);
      __ daddi(T0, T0, 8);
      __ daddi(T1, T1, -2);
      __ daddi(AT, T1, -2);
      __ bgez(AT, l_7);
      __ delayed()->nop();
    }

    __ bind(l_5);
    __ beq(T1, R0, l_4);
    __ delayed()->nop();

    __ align(16);
    __ bind(l_3);
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0);
    __ addi(T3, T3, 4);
    __ addi(T0, T0, 4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();

    if (is_oop) {
      __ move(T0, A1);
      __ move(T1, A2);
      array_store_check();
    }

    // exit
    __ bind(l_4);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_2, l_4;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target;

    if (is_oop) {
      nooverlap_target = aligned ?
        StubRoutines::arrayof_oop_disjoint_arraycopy() :
        StubRoutines::oop_disjoint_arraycopy();
    } else {
      nooverlap_target = aligned ?
        StubRoutines::arrayof_jint_disjoint_arraycopy() :
        StubRoutines::jint_disjoint_arraycopy();
    }

    array_overlap_test(nooverlap_target, 2);

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);
    // T3: source array address
    // T0: destination array address
    // T1: element count

    if (is_oop) {
      if (Use3A2000) __ sync();
    }

    __ sll(AT, T1, Address::times_4);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -4));
    __ sll(AT, T1, Address::times_4);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -4));

    __ beq(T1, R0, l_4);
    __ delayed()->nop();

    __ align(16);
    __ bind(l_2);
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();

    if (is_oop) {
      __ move(T0, A1);
      __ move(T1, A2);
      array_store_check();
    }

    __ bind(l_4);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_3, l_4;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);
    // T3: source array address
    // T0: destination array address
    // T1: element count

    if (is_oop) {
      if (Use3A2000) __ sync();
    }

    __ beq(T1, R0, l_4);
    __ delayed()->nop();

    __ align(16);
    __ bind(l_3);
    __ ld(AT, T3, 0);
    __ sd(AT, T0, 0);
    __ addi(T3, T3, 8);
    __ addi(T0, T0, 8);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();

    if (is_oop) {
      __ move(T0, A1);
      __ move(T1, A2);
      array_store_check();
    }

    // exit
    __ bind(l_4);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  1579   // Arguments:
  1580   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1581   //             ignored
  1582   //   is_oop  - true => oop array, so generate store check code
  1583   //   name    - stub name string
  1584   //
  1585   // Inputs:
  1586   //   c_rarg0   - source array address
  1587   //   c_rarg1   - destination array address
  1588   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1589   //
  1590   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1591   // the hardware handle it.  The two dwords within qwords that span
  1592   // cache line boundaries will still be loaded and stored atomicly.
  1593   //
  1594   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1595 		Label l_2, l_4;
  1596 		StubCodeMark mark(this, "StubRoutines", name);
  1597 		__ align(CodeEntryAlignment);
  1598 		address start = __ pc();
  1599 		address nooverlap_target;
  1601 		if (is_oop) {
  1602 			nooverlap_target = aligned ?
  1603 							StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1604 							StubRoutines::oop_disjoint_arraycopy();
  1605 		}else {
  1606 			nooverlap_target = aligned ?
  1607 							StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1608 							StubRoutines::jlong_disjoint_arraycopy();
  1611 		array_overlap_test(nooverlap_target, 3);
  1613 		__ push(T3);
  1614 		__ push(T0);
  1615 		__ push(T1);
  1616 		__ push(T8);
  1618 		__ move(T1, A2);  
  1619 		__ move(T3, A0); 
  1620 		__ move(T0, A1);
  1622 		if (is_oop) {
  1623 			if (Use3A2000) __ sync();
  1624 		}
  1626 		__ sll(AT, T1, Address::times_8);   // point T3 and T0 at the last element of each range
  1627 		__ add(AT, T3, AT);
  1628 		__ lea(T3, Address(AT, -8));
  1629 		__ sll(AT, T1, Address::times_8);
  1630 		__ add(AT, T0, AT);
  1631 		__ lea(T0, Address(AT, -8));
  1633 		__ beq(T1, R0, l_4); 
  1634 		__ delayed()->nop();  
  1636 		__ align(16);
  1637 		__ bind(l_2);
  1638 		__ ld(AT, T3, 0);    // copy one 64-bit element per iteration, walking backward
  1639 		__ sd(AT, T0, 0);
  1640 		__ addi(T3, T3, -8);
  1641 		__ addi(T0, T0, -8);
  1642 		__ addi(T1, T1, -1);
  1643 		__ bne(T1, R0, l_2);
  1644 		__ delayed()->nop();
  1646 		if (is_oop) {
  1647 			__ move(T0, A1);   // restore destination and count for the store check
  1648 			__ move(T1, A2);
  1649 			array_store_check();
  1650 		}
  1651 		__ bind(l_4);
  1652 		__ pop(T8);
  1653 		__ pop(T1);
  1654 		__ pop(T0);
  1655 		__ pop(T3);
  1656 		__ jr(RA);
  1657 		__ delayed()->nop();
  1658 		return start;
  1659   }
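
  // Editor's sketch (never called by the VM; name invented for
  // exposition): C-level model of the conjoint stub above. Copying from
  // the last element downward keeps an overlapping copy with to > from
  // correct, which is why overlapping requests fall through to this stub.
  static void conjoint_long_copy_model(const jlong* from, jlong* to,
                                       ssize_t count) {
    for (ssize_t i = count - 1; i >= 0; i--) {
      to[i] = from[i];   // mirrors the backward ld/sd loop l_2 above
    }
  }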
  1661 //FIXME
  1662   address generate_disjoint_long_copy(bool aligned, const char *name) {
  1663 	  Label l_1, l_2;
  1664 	  StubCodeMark mark(this, "StubRoutines", name);
  1665 	  __ align(CodeEntryAlignment);
  1666 	  address start = __ pc();
  1671 	  __ move(T1, A2);   // T1: element count
  1672 	  __ move(T3, A0);   // T3: source array address
  1673 	  __ move(T0, A1);   // T0: destination array address
  1674 	  __ push(T3);
  1675 	  __ push(T0);
  1676 	  __ push(T1);
  1679 	  __ b(l_2);         // enter the loop at the count check
  1680 	  __ delayed()->nop();
  1681 	  __ align(16);
  1682 	  __ bind(l_1);
  1688 	  __ ld(AT, T3, 0);  // copy one 64-bit element per iteration
  1690 	  __ sd(AT, T0, 0);
  1693 	  __ addi(T3, T3, 8);
  1694 	  __ addi(T0, T0, 8);
  1695 	  __ bind(l_2);
  1697 	  __ addi(T1, T1, -1);   // decrement count; loop while it stays >= 0
  1699 	  __ bgez(T1, l_1);
  1700 	  __ delayed()->nop();
  1705 	  __ pop(T1);
  1706 	  __ pop(T0);
  1707 	  __ pop(T3);
  1708 	  __ jr(RA);
  1709 	  __ delayed()->nop();
  1710 	  return start;
  1711   }
  1714   address generate_conjoint_long_copy(bool aligned, const char *name) {
  1715 	  Label l_1, l_2;
  1716 	  StubCodeMark mark(this, "StubRoutines", name);
  1717 	  __ align(CodeEntryAlignment);
  1718 	  address start = __ pc();
  1719 	  address nooverlap_target = aligned ?
  1720 		  StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1721 		  StubRoutines::jlong_disjoint_arraycopy();
  1722 	  array_overlap_test(nooverlap_target, 3);
  1724 	  __ push(T3);
  1725 	  __ push(T0);
  1726 	  __ push(T1);
  1734 	  __ move(T1, A2);   // T1: element count
  1735 	  __ move(T3, A0);   // T3: source array address
  1736 	  __ move(T0, A1);   // T0: destination array address
  1737 	  __ sll(AT, T1, Address::times_8);   // point T3 and T0 at the last element of each range
  1738 	  __ add(AT, T3, AT);
  1739 	  __ lea(T3, Address(AT, -8));
  1740 	  __ sll(AT, T1, Address::times_8);
  1741 	  __ add(AT, T0, AT);
  1742 	  __ lea(T0, Address(AT, -8));
  1746 	  __ b(l_2);         // enter the loop at the count check
  1747 	  __ delayed()->nop();
  1748 	  __ align(16);
  1749 	  __ bind(l_1);
  1758 	  __ ld(AT, T3, 0);  // copy one 64-bit element per iteration, walking backward
  1759 	  __ sd(AT, T0, 0);
  1760 	  __ addi(T3, T3, -8);
  1761 	  __ addi(T0, T0, -8);
  1762 	  __ bind(l_2);
  1764 	  __ addi(T1, T1, -1);   // decrement count; loop while it stays >= 0
  1766 	  __ bgez(T1, l_1);
  1767 	  __ delayed()->nop();
  1772 	  __ pop(T1);
  1773 	  __ pop(T0);
  1774 	  __ pop(T3);
  1775 	  __ jr(RA);
  1776 	  __ delayed()->nop();
  1777 	  return start;
  1778   }
  1780   void generate_arraycopy_stubs() {
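    // Editor's note: with compressed oops an oop element is a 32-bit
    // narrowOop, so the int copy stubs are reused for oop arrays;
    // otherwise oops are full 64-bit words and the long copy stubs apply.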
  1781     if (UseCompressedOops) {
  1782       StubRoutines::_oop_disjoint_arraycopy          = generate_disjoint_int_oop_copy(false, true,
  1783                                                                                       "oop_disjoint_arraycopy");
  1784       StubRoutines::_oop_arraycopy                   = generate_conjoint_int_oop_copy(false, true,
  1785                                                                                       "oop_arraycopy");
  1786       StubRoutines::_oop_disjoint_arraycopy_uninit   = generate_disjoint_int_oop_copy(false, true,
  1787                                                                                       "oop_disjoint_arraycopy_uninit");
  1788       StubRoutines::_oop_arraycopy_uninit            = generate_conjoint_int_oop_copy(false, true,
  1789                                                                                       "oop_arraycopy_uninit");
  1790     } else {
  1791       StubRoutines::_oop_disjoint_arraycopy          = generate_disjoint_long_oop_copy(false, true,
  1792                                                                                        "oop_disjoint_arraycopy");
  1793       StubRoutines::_oop_arraycopy                   = generate_conjoint_long_oop_copy(false, true,
  1794                                                                                        "oop_arraycopy");
  1795       StubRoutines::_oop_disjoint_arraycopy_uninit   = generate_disjoint_long_oop_copy(false, true,
  1796                                                                                        "oop_disjoint_arraycopy_uninit");
  1797       StubRoutines::_oop_arraycopy_uninit            = generate_conjoint_long_oop_copy(false, true,
  1798                                                                                        "oop_arraycopy_uninit");
  1799     }
  1801     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  1802     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  1803     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
  1804     StubRoutines::_jlong_disjoint_arraycopy          = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  1806     StubRoutines::_jbyte_arraycopy  = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  1807     StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
  1808     StubRoutines::_jint_arraycopy   = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
  1809     StubRoutines::_jlong_arraycopy  = generate_conjoint_long_copy(false, "jlong_arraycopy");
  1811     // We don't generate specialized code for HeapWord-aligned source
  1812     // arrays, so just use the code we've already generated
  1813     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  1814     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
  1816     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  1817     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
  1819     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
  1820     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
  1822     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  1823     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
  1825     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  1826     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
  1828     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
  1829     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
  1830   }
  1832   // Wang: stub generator implementing SafeFetch32 and SafeFetchN.
  1833   void generate_safefetch(const char* name, int size, address* entry,
  1834                           address* fault_pc, address* continuation_pc) {
  1835     // safefetch signatures:
  1836     //   int      SafeFetch32(int*      adr, int      errValue);
  1837     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  1838     //
  1839     // arguments:
  1840     //   A0 = adr
  1841     //   A1 = errValue
  1842     //
  1843     // result:
  1844     //   V0 = *adr or errValue
  1846     StubCodeMark mark(this, "StubRoutines", name);
  1848     // Entry point, pc or function descriptor.
  1849     *entry = __ pc();
  1851     // Load *adr into A1, may fault.
  1852     *fault_pc = __ pc();
  1853     switch (size) {
  1854       case 4:
  1855         // int32_t
  1856         __ lw(A1, A0, 0); 
  1857         break;
  1858       case 8:
  1859         // int64_t
  1860         __ ld(A1, A0, 0); 
  1861         break;
  1862       default:
  1863         ShouldNotReachHere();
  1864     }
  1866     // return errValue or *adr
  1867     *continuation_pc = __ pc();
  1868     __ addu(V0, A1, R0);   // V0 = A1 + 0, i.e. move the result into the return register
  1869     __ jr(RA);
  1870     __ delayed()->nop();
  1871   }
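
  // Editor's note: callers reach these stubs through the SafeFetch32/
  // SafeFetchN inlines in stubRoutines.hpp. If the load at *fault_pc
  // faults, the VM's signal handler redirects execution to
  // *continuation_pc with A1 still holding errValue, so the caller
  // observes errValue instead of a crash, e.g.
  //
  //   intptr_t v = SafeFetchN(adr, -1);   // -1 if adr is unmapped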
  1874 #undef __
  1875 #define __ masm->
  1877   // Continuation point for throwing of implicit exceptions that are
  1878   // not handled in the current activation. Fabricates an exception
  1879   // oop and initiates normal exception dispatching in this
  1880   // frame. Since we need to preserve callee-saved values (currently
  1881   // only for C2, but done for C1 as well) we need a callee-saved oop
  1882   // map and therefore have to make these stubs into RuntimeStubs
  1883   // rather than BufferBlobs.  If the compiler needs all registers to
  1884   // be preserved between the fault point and the exception handler
  1885   // then it must assume responsibility for that in
  1886   // AbstractCompiler::continuation_for_implicit_null_exception or
  1887   // continuation_for_implicit_division_by_zero_exception. All other
  1888   // implicit exceptions (e.g., NullPointerException or
  1889   // AbstractMethodError on entry) are either at call sites or
  1890   // otherwise assume that stack unwinding will be initiated, so
  1891   // caller saved registers were assumed volatile in the compiler.
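  //
  // Editor's outline of the stub generated below (a summary, not new code):
  //   1. save the callee-saved registers and the JavaThread* argument
  //   2. record last_Java_sp/fp/pc so the runtime can walk the stack
  //   3. call runtime_entry(thread), which posts the pending exception
  //   4. restore registers and jump to StubRoutines::forward_exception_entry()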
  1892   address generate_throw_exception(const char* name,
  1893                                    address runtime_entry,
  1894                                    bool restore_saved_exception_pc) {
  1895     // Information about frame layout at time of blocking runtime call.
  1896     // Note that we only have to preserve callee-saved registers since
  1897     // the compilers are responsible for supplying a continuation point
  1898     // if they expect all registers to be preserved.
  1903 		enum layout {
  1904 			thread_off,    // last_java_sp                
  1905 			S7_off,        // callee saved register      sp + 1
  1906 			S6_off,        // callee saved register      sp + 2
  1907 			S5_off,        // callee saved register      sp + 3
  1908 			S4_off,        // callee saved register      sp + 4
  1909 			S3_off,        // callee saved register      sp + 5
  1910 			S2_off,        // callee saved register      sp + 6
  1911 			S1_off,        // callee saved register      sp + 7
  1912 			S0_off,        // callee saved register      sp + 8
  1913 			FP_off,
  1914 			ret_address,
  1915 			framesize
  1916 		};
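		// Editor's sketch of the frame built by the prolog below (slot
		// offsets from SP in words, taken from the enum above):
		//   SP +  0 : JavaThread*   (thread_off, becomes the first C argument)
		//   SP +  1 : S7  ...  SP + 8 : S0   (callee-saved registers)
		//   SP +  9 : saved FP            (FP_off, stored by enter())
		//   SP + 10 : return address      (ret_address, stored by enter())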
  1918 		int insts_size = 2048;
  1919 		int locs_size  = 32;
  1923 		CodeBuffer code(name, insts_size, locs_size);
  1927 		OopMapSet* oop_maps = new OopMapSet();
  1931 		MacroAssembler* masm = new MacroAssembler(&code);
  1936 		address start = __ pc();
  1968 		// This is an inlined and slightly modified version of call_VM
  1969 		// which has the ability to fetch the return PC out of
  1970 		// thread-local storage and also sets up last_Java_sp slightly
  1971 		// differently than the real call_VM
  1972 #ifndef OPT_THREAD	
  1973 		Register java_thread = TREG;
  1974 		__ get_thread(java_thread);
  1975 #else
  1976 		Register java_thread = TREG;
  1977 #endif
  1981 		if (restore_saved_exception_pc) {
  1982 			__ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset()));
  1983 		}
  1985 		__ enter(); // required for proper stackwalking of RuntimeStub frame
  1987 		__ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
  1988 		__ sd(S0, SP, S0_off * wordSize);
  1989 		__ sd(S1, SP, S1_off * wordSize);
  1990 		__ sd(S2, SP, S2_off * wordSize);
  1991 		__ sd(S3, SP, S3_off * wordSize);
  1992 		__ sd(S4, SP, S4_off * wordSize);
  1993 		__ sd(S5, SP, S5_off * wordSize);
  1994 		__ sd(S6, SP, S6_off * wordSize);
  1995 		__ sd(S7, SP, S7_off * wordSize);
  1997 		int frame_complete = __ pc() - start;
  1998 		// push java thread (becomes first argument of C function)
  1999 		__ sd(java_thread, SP, thread_off * wordSize);
  2000 		if (java_thread != A0)
  2001 			__ move(A0, java_thread);
  2003 		// Set up last_Java_sp and last_Java_fp
  2004 		__ set_last_Java_frame(java_thread, SP, FP, NULL);
  2005 		__ relocate(relocInfo::internal_pc_type);
  2006 		{
  2007 			intptr_t save_pc = (intptr_t)__ pc() + NativeMovConstReg::instruction_size + 28;
  2008 			__ patchable_set48(AT, save_pc);
  2009 		}
  2010 		__ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
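		// Editor's note: save_pc precomputes the address that will follow
		// the runtime call below, so last_Java_pc points into this stub
		// when the runtime walks the stack; the constant 28 appears to
		// cover the instructions between the patchable_set48 sequence and
		// that point (an assumption, not verified against encodings).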
  2012 		// Call runtime
  2013 		__ call(runtime_entry);
  2014 		__ delayed()->nop();
  2015 		// Generate oop map
  2016 		OopMap* map = new OopMap(framesize, 0);
  2017 		oop_maps->add_gc_map(__ offset(), map);
  2019 		// restore the thread (cannot use the pushed argument since arguments
  2020 		// may be overwritten by C code generated by an optimizing compiler);
  2021 		// however can use the register value directly if it is callee saved.
  2022 #ifndef OPT_THREAD
  2023 		__ get_thread(java_thread);
  2024 #endif
  2026 		__ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  2028 		__ reset_last_Java_frame(java_thread, true, true);
  2030 		// Restore callee save registers.  This must be done after resetting the Java frame
  2031 		__ ld(S0, SP, S0_off * wordSize);
  2032 		__ ld(S1, SP, S1_off * wordSize);
  2033 		__ ld(S2, SP, S2_off * wordSize);
  2034 		__ ld(S3, SP, S3_off * wordSize);
  2035 		__ ld(S4, SP, S4_off * wordSize);
  2036 		__ ld(S5, SP, S5_off * wordSize);
  2037 		__ ld(S6, SP, S6_off * wordSize);
  2038 		__ ld(S7, SP, S7_off * wordSize);
  2040 		// discard arguments
  2041 		__ addi(SP, SP, (framesize-2) * wordSize); // epilog
  2043 		__ addi(SP, FP, wordSize);    // inline leave(): required for proper stackwalking of RuntimeStub frame
  2044 		__ ld(FP, SP, -1 * wordSize);
  2045 		// check for pending exceptions
  2046 #ifdef ASSERT
  2047 		Label L;
  2048 		__ ld(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  2049 		__ bne(AT, R0, L);
  2050 		__ delayed()->nop();
  2051 		__ should_not_reach_here();
  2052 		__ bind(L);
  2053 #endif //ASSERT
  2054 		__ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  2055 		__ delayed()->nop();
  2059 		RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete,
  2060 										framesize, oop_maps, false);
  2064 		return stub->entry_point();
  2065   }
  2067   // Initialization
  2068   void generate_initial() {
  2069 /*
  2070 		// Generates all stubs and initializes the entry points
  2072     // This platform-specific stub is needed by generate_call_stub()
  2073     StubRoutines::mips::_mxcsr_std        = generate_fp_mask("mxcsr_std",        0x0000000000001F80);
  2075     // entry points that exist in all platforms Note: This is code
  2076     // that could be shared among different platforms - however the
  2077     // benefit seems to be smaller than the disadvantage of having a
  2078     // much more complicated generator structure. See also comment in
  2079     // stubRoutines.hpp.
  2081     StubRoutines::_forward_exception_entry = generate_forward_exception();
  2083     StubRoutines::_call_stub_entry =
  2084       generate_call_stub(StubRoutines::_call_stub_return_address);
  2086     // is referenced by megamorphic call
  2087     StubRoutines::_catch_exception_entry = generate_catch_exception();
  2089     // atomic calls
  2090     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
  2091     StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
  2092     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
  2093     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  2094     StubRoutines::_atomic_add_entry          = generate_atomic_add();
  2095     StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
  2096     StubRoutines::_fence_entry               = generate_orderaccess_fence();
  2098     StubRoutines::_handler_for_unsafe_access_entry =
  2099       generate_handler_for_unsafe_access();
  2101     // platform dependent
  2102     StubRoutines::mips::_get_previous_fp_entry = generate_get_previous_fp();
  2104     StubRoutines::mips::_verify_mxcsr_entry    = generate_verify_mxcsr();
  2105 */
  2106 		// Generates all stubs and initializes the entry points
  2109 		//------------------------------------------------------------------
  2110 		// entry points that exist in all platforms
  2111 		// Note: This is code that could be shared among different platforms - however the benefit seems to be smaller 
  2112 		// than the disadvantage of having a much more complicated generator structure. 
  2113 		// See also comment in stubRoutines.hpp.
  2114 		StubRoutines::_forward_exception_entry = generate_forward_exception();    
  2115 		StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
  2116 		// is referenced by megamorphic call    
  2117 		StubRoutines::_catch_exception_entry = generate_catch_exception();    
  2119 		StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
  2121 		// platform dependent
  2122 		StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
  2123   }
  2125   void generate_all() {
  2129     // Generates all stubs and initializes the entry points
  2131     // These entry points require SharedInfo::stack0 to be set up in
  2132     // non-core builds and need to be relocatable, so they each
  2133     // fabricate a RuntimeStub internally.
  2134 	/*
  2135     StubRoutines::_throw_AbstractMethodError_entry =
  2136       generate_throw_exception("AbstractMethodError throw_exception",
  2137                                CAST_FROM_FN_PTR(address,
  2138                                                 SharedRuntime::
  2139                                                 throw_AbstractMethodError),
  2140                                false);
  2142     StubRoutines::_throw_IncompatibleClassChangeError_entry =
  2143       generate_throw_exception("IncompatibleClassChangeError throw_exception",
  2144                                CAST_FROM_FN_PTR(address,
  2145                                                 SharedRuntime::
  2146                                                 throw_IncompatibleClassChangeError),
  2147                                false);
  2149     StubRoutines::_throw_ArithmeticException_entry =
  2150       generate_throw_exception("ArithmeticException throw_exception",
  2151                                CAST_FROM_FN_PTR(address,
  2152                                                 SharedRuntime::
  2153                                                 throw_ArithmeticException),
  2154                                true);
  2156     StubRoutines::_throw_NullPointerException_entry =
  2157       generate_throw_exception("NullPointerException throw_exception",
  2158                                CAST_FROM_FN_PTR(address,
  2159                                                 SharedRuntime::
  2160                                                 throw_NullPointerException),
  2161                                true);
  2163     StubRoutines::_throw_NullPointerException_at_call_entry =
  2164       generate_throw_exception("NullPointerException at call throw_exception",
  2165                                CAST_FROM_FN_PTR(address,
  2166                                                 SharedRuntime::
  2167                                                 throw_NullPointerException_at_call),
  2168                                false);
  2170     StubRoutines::_throw_StackOverflowError_entry =
  2171       generate_throw_exception("StackOverflowError throw_exception",
  2172                                CAST_FROM_FN_PTR(address,
  2173                                                 SharedRuntime::
  2174                                                 throw_StackOverflowError),
  2175                                false);
  2177     // entry points that are platform specific
  2178     StubRoutines::mips::_f2i_fixup = generate_f2i_fixup();
  2179     StubRoutines::mips::_f2l_fixup = generate_f2l_fixup();
  2180     StubRoutines::mips::_d2i_fixup = generate_d2i_fixup();
  2181     StubRoutines::mips::_d2l_fixup = generate_d2l_fixup();
  2183     StubRoutines::mips::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
  2184     StubRoutines::mips::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
  2185     StubRoutines::mips::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
  2186     StubRoutines::mips::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
  2188     // support for verify_oop (must happen after universe_init)
  2189     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  2191     // arraycopy stubs used by compilers
  2192     generate_arraycopy_stubs();
  2193 	*/
  2197 		StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
  2201 //		StubRoutines::_throw_ArithmeticException_entry         = generate_throw_exception("ArithmeticException throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException),  true);
  2205 //		StubRoutines::_throw_NullPointerException_entry        = generate_throw_exception("NullPointerException throw_exception",         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
  2209 		StubRoutines::_throw_NullPointerException_at_call_entry = generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
  2213 		StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
  2219 		//------------------------------------------------------------------
  2220 		// entry points that are platform specific
  2222 		// support for verify_oop (must happen after universe_init)
  2226 		StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  2230 #ifndef CORE
  2231 		// arraycopy stubs used by compilers
  2232 		generate_arraycopy_stubs();
  2236 #endif
  2238     // Safefetch stubs.
  2239     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
  2240                                                        &StubRoutines::_safefetch32_fault_pc,
  2241                                                        &StubRoutines::_safefetch32_continuation_pc);
  2242     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
  2243                                                        &StubRoutines::_safefetchN_fault_pc,
  2244                                                        &StubRoutines::_safefetchN_continuation_pc);
  2245   }
  2247  public:
  2248   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  2249     if (all) {
  2250       generate_all();
  2251     } else {
  2252       generate_initial();
  2253     }
  2254   }
  2255 }; // end class declaration
  2256 /*
  2257 address StubGenerator::disjoint_byte_copy_entry  = NULL;
  2258 address StubGenerator::disjoint_short_copy_entry = NULL;
  2259 address StubGenerator::disjoint_int_copy_entry   = NULL;
  2260 address StubGenerator::disjoint_long_copy_entry  = NULL;
  2261 address StubGenerator::disjoint_oop_copy_entry   = NULL;
  2263 address StubGenerator::byte_copy_entry  = NULL;
  2264 address StubGenerator::short_copy_entry = NULL;
  2265 address StubGenerator::int_copy_entry   = NULL;
  2266 address StubGenerator::long_copy_entry  = NULL;
  2267 address StubGenerator::oop_copy_entry   = NULL;
  2269 address StubGenerator::checkcast_copy_entry = NULL;
  2270 */
  2271 void StubGenerator_generate(CodeBuffer* code, bool all) {
  2272   StubGenerator g(code, all);
  2273 }
