src/cpu/mips/vm/stubGenerator_mips_64.cpp

author:      aoqi
date:        Wed, 29 Mar 2017 09:41:51 +0800
changeset:   392 (4bfb40d1e17a)
parent:      373 (3a34fc828b4a)
child:       401 (721a83ed5111)
permissions: -rw-r--r--

#4662: TieredCompilation is turned off; tiered compilation is not supported yet.

/*
 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_mips.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
//#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
//#define a__ ((Assembler*)_masm)->

//#ifdef PRODUCT
//#define BLOCK_COMMENT(str) /* nothing */
//#else
//#define BLOCK_COMMENT(str) __ block_comment(str)
//#endif

//#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions

// Stub Code definitions
static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc = thread->saved_exception_pc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine:  return garbage from the load
  // therefore, compute npc
  //address npc = Assembler::locate_next_instruction(pc);
  address npc = (address)((unsigned long)pc + sizeof(unsigned long));

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}
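
// Note: MIPS instructions are a fixed 4 bytes wide, so the next pc can be
// computed without decoding the faulting instruction; advancing by
// sizeof(unsigned long) (8 on n64) as above steps over the faulting
// instruction plus one following instruction slot.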
class StubGenerator: public StubCodeGenerator {
 private:

  // ABI mips n64
  // This diagram is not the MIPS ABI; it shows the convention used to call
  // Java from C.  Call stubs are used to call Java from C.
  //
  //    [ return_from_Java     ]
  //    [ argument word n-1    ] <--- sp
  //      ...
  //    [ argument word 0      ]
  //      ...
  //-10 [ S6                   ]
  // -9 [ S5                   ]
  // -8 [ S4                   ]
  // -7 [ S3                   ]
  // -6 [ S0                   ]
  // -5 [ TSR(S2)              ]
  // -4 [ LVP(S7)              ]
  // -3 [ BCP(S1)              ]
  // -2 [ saved fp             ] <--- fp_after_call
  // -1 [ return address       ]
  //  0 [ ptr. to call wrapper ] <--- a0 (old sp -->) fp
  //  1 [ result               ] <--- a1
  //  2 [ result_type          ] <--- a2
  //  3 [ method               ] <--- a3
  //  4 [ entry_point          ] <--- a4
  //  5 [ parameters           ] <--- a5
  //  6 [ parameter_size       ] <--- a6
  //  7 [ thread               ] <--- a7
  //
  // _LP64: n64 does not save the incoming arguments on the stack, so the
  // layout is:
  //
  //    [ return_from_Java     ]
  //    [ argument word n-1    ] <--- sp
  //      ...
  //    [ argument word 0      ]
  //      ...
  //-14 [ thread               ]
  //-13 [ result_type          ] <--- a2
  //-12 [ result               ] <--- a1
  //-11 [ ptr. to call wrapper ] <--- a0
  //-10 [ S6                   ]
  // -9 [ S5                   ]
  // -8 [ S4                   ]
  // -7 [ S3                   ]
  // -6 [ S0                   ]
  // -5 [ TSR(S2)              ]
  // -4 [ LVP(S7)              ]
  // -3 [ BCP(S1)              ]
  // -2 [ saved fp             ] <--- fp_after_call
  // -1 [ return address       ]
  //  0 [                      ] <--- old sp
  /*
   * 2014/01/16 Fu: Find a right place in the call_stub for GP.
   * GP will point to the starting point of Interpreter::dispatch_table(itos).
   * It should be saved/restored before/after Java calls.
   *
   */
  enum call_stub_layout {
    RA_off          = -1,
    FP_off          = -2,
    BCP_off         = -3,
    LVP_off         = -4,
    TSR_off         = -5,
    S1_off          = -6,
    S3_off          = -7,
    S4_off          = -8,
    S5_off          = -9,
    S6_off          = -10,
    result_off      = -11,
    result_type_off = -12,
    thread_off      = -13,
    total_off       = thread_off - 3,
    GP_off          = -16,
  };
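
  // For reference, the generated call stub is invoked from C++ through the
  // CallStub function pointer type declared in stubRoutines.hpp (existing
  // signature, shown here as a sketch; the arguments arrive in a0..a7 as
  // diagrammed above):
  //
  //   typedef void (*CallStub)(address   link,
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);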
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!

    // stub code
    // save ra and fp
    __ sd(RA, SP, RA_off * wordSize);
    __ sd(FP, SP, FP_off * wordSize);
    __ sd(BCP, SP, BCP_off * wordSize);
    __ sd(LVP, SP, LVP_off * wordSize);
    __ sd(GP, SP, GP_off * wordSize);
    __ sd(TSR, SP, TSR_off * wordSize);
    __ sd(S1, SP, S1_off * wordSize);
    __ sd(S3, SP, S3_off * wordSize);
    __ sd(S4, SP, S4_off * wordSize);
    __ sd(S5, SP, S5_off * wordSize);
    __ sd(S6, SP, S6_off * wordSize);

    __ set64(GP, (long)Interpreter::dispatch_table(itos));

    // I think 14 is the max gap between argument and callee-saved registers
    __ daddi(FP, SP, (-2) * wordSize);
    __ daddi(SP, SP, total_off * wordSize);
    //FIXME, aoqi. find a suitable place to save A1 & A2.
    /*
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, 3 * wordSize);
    __ sd(A2, FP, 4 * wordSize);
    __ sd(A3, FP, 5 * wordSize);
    __ sd(A4, FP, 6 * wordSize);
    __ sd(A5, FP, 7 * wordSize);
    __ sd(A6, FP, 8 * wordSize);
    __ sd(A7, FP, 9 * wordSize);
    */
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, result_off * wordSize);
    __ sd(A2, FP, result_type_off * wordSize);
    __ sd(A7, FP, thread_off * wordSize);

#ifdef OPT_THREAD
    //__ get_thread(TREG);
    __ move(TREG, A7);
    //__ ld(TREG, FP, thread_off * wordSize);
#endif
    // add for compressed oops
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
      __ beq(AT, R0, L);
      __ delayed()->nop();
      /* FIXME: I do not know how to realize stop in mips arch, do it in the future */
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    // A5: parameters
    // A6: parameter_size
    // T0: parameter_size_tmp (counts down)
    // T2: offset (counts up)
    // T3: tmp
    Label parameters_done;
    // judge if the parameter_size equals 0
    __ beq(A6, R0, parameters_done);
    __ delayed()->nop();
    __ dsll(AT, A6, Interpreter::logStackElementSize);
    __ dsub(SP, SP, AT);
    __ move(AT, -StackAlignmentInBytes);
    __ andr(SP, SP, AT);
    // Copy Java parameters in reverse order (receiver last).
    // Note that the argument order is inverted in the process.
    // source is A5[T0: N-1..0]
    // dest   is SP[T2: 0..N-1]
    Label loop;
    __ move(T0, A6);
    __ move(T2, R0);
    __ bind(loop);

    // get parameter
    __ dsll(T3, T0, LogBytesPerWord);
    __ dadd(T3, T3, A5);
    __ ld(AT, T3, -wordSize);
    __ dsll(T3, T2, LogBytesPerWord);
    __ dadd(T3, T3, SP);
    __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
    __ daddi(T2, T2, 1);
    __ daddi(T0, T0, -1);
    __ bne(T0, R0, loop);
    __ delayed()->nop();
    // advance to next parameter

    // call Java function
    __ bind(parameters_done);

    // receiver in V0, methodOop in Rmethod
    __ move(Rmethod, A3);
    __ move(Rsender, SP);  // set sender sp
    __ jalr(A4);
    __ delayed()->nop();
    return_address = __ pc();

    Label common_return;
    __ bind(common_return);

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ ld(T0, FP, result_off * wordSize);       // result --> T0
    Label is_long, is_float, is_double, exit;
    __ ld(T2, FP, result_type_off * wordSize);  // result_type --> T2
    __ daddi(T3, T2, (-1) * T_LONG);
    __ beq(T3, R0, is_long);
    __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
    __ beq(T3, R0, is_float);
    __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
    __ beq(T3, R0, is_double);
    __ delayed()->nop();

    // handle T_INT case
    __ sd(V0, T0, 0 * wordSize);
    __ bind(exit);

    // restore
    __ daddi(SP, FP, 2 * wordSize);
    __ ld(RA, SP, RA_off * wordSize);
    __ ld(FP, SP, FP_off * wordSize);
    __ ld(BCP, SP, BCP_off * wordSize);
    __ ld(LVP, SP, LVP_off * wordSize);
    __ ld(GP, SP, GP_off * wordSize);
    __ ld(TSR, SP, TSR_off * wordSize);

    __ ld(S1, SP, S1_off * wordSize);
    __ ld(S3, SP, S3_off * wordSize);
    __ ld(S4, SP, S4_off * wordSize);
    __ ld(S5, SP, S5_off * wordSize);
    __ ld(S6, SP, S6_off * wordSize);

    // return
    __ jr(RA);
    __ delayed()->nop();

    // handle return types different from T_INT
    __ bind(is_long);
    __ sd(V0, T0, 0 * wordSize);
    //__ sd(V1, T0, 1 * wordSize);
    //__ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_float);
    __ swc1(F0, T0, 0 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_double);
    __ sdc1(F0, T0, 0 * wordSize);
    //__ sdc1(F1, T0, 1 * wordSize);
    //__ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    //FIXME: the 1.6 mips version adds an fpu operation here
    StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
    __ b(common_return);
    __ delayed()->nop();

    return start;
  }
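
  // Example of the result handling above: a Java method returning jint comes
  // back with result_type == T_INT, so the integer path stores V0 through the
  // result pointer; T_FLOAT and T_DOUBLE results arrive in F0 and are stored
  // with swc1/sdc1; T_LONG stores the full 64-bit V0.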
  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  //
  // V0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    Register thread = TREG;

    // get thread directly
#ifndef OPT_THREAD
    __ ld(thread, FP, thread_off * wordSize);
#endif

#ifdef ASSERT
    // verify that threads correspond
    { Label L;
      __ get_thread(T8);
      __ beq(T8, thread, L);
      __ delayed()->nop();
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    // set pending exception
    __ verify_oop(V0);
    __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ li(AT, (long)__FILE__);
    __ sd(AT, thread, in_bytes(Thread::exception_file_offset()));
    __ li(AT, (long)__LINE__);
    __ sd(AT, thread, in_bytes(Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
    __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
    __ delayed()->nop();

    return start;
  }
  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // V0: exception
  // V1: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    Register thread = TREG;
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into Java
    // (interpreted or compiled) code; i.e., the return address becomes the
    // throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack but
    // the exception handler will reset the stack pointer -> ignore them.
    // A potential result in registers can be ignored as well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    { Label L;
      __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
      __ bne(AT, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into T9
    __ ld(A1, SP, 0);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
    __ move(T9, V0);
    __ pop(V1);

#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ bne(V0, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // V0: exception
    // T9: exception handler
    // V1: throwing pc
    __ verify_oop(V0);
    __ jr(T9);
    __ delayed()->nop();

    return start;
  }
  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp  (FP, 0);
    const Address older_fp(V0, 0);
    address start = __ pc();

    __ enter();
    // frame pointers are 8 bytes on n64, so use 64-bit loads
    __ ld(V0, old_fp);   // caller's fp
    __ ld(V0, older_fp); // the frame for ps()
    __ leave();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // The following routine generates a subroutine to throw an
  // asynchronous UnknownError when an unsafe access gets a fault that
  // could not be reasonably prevented by the programmer.  (Example:
  // SIGBUS/OBJERR.)
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();
    __ pushad();                      // push registers
    //  Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
    __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
    __ delayed()->nop();
    __ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord);
    __ popad();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();
    __ reinit_heapbase();
    __ verify_oop_subroutine();
    address end = __ pc();
    return start;
  }
  //
  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //     A0    -  array1
  //     A1    -  array2
  //     A2    -  element count
  //
  //  Note: this code clobbers only AT and T9.
  //

  // use T9 as temp
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    int elem_size = 1 << log2_elem_size;
    Address::ScaleFactor sf = Address::times_1;

    switch (log2_elem_size) {
      case 0: sf = Address::times_1; break;
      case 1: sf = Address::times_2; break;
      case 2: sf = Address::times_4; break;
      case 3: sf = Address::times_8; break;
    }

    __ dsll(AT, A2, sf);
    __ dadd(AT, AT, A0);
    __ lea(T9, Address(AT, -elem_size));  // T9 = address of last source element
    __ dsub(AT, A1, A0);
    __ blez(AT, no_overlap_target);       // dest <= src: forward copy is safe
    __ delayed()->nop();
    __ dsub(AT, A1, T9);
    __ bgtz(AT, no_overlap_target);       // dest above last src element: no overlap
    __ delayed()->nop();

    // 2016/05/10 aoqi: if A0 = 0xf... and A1 = 0x0..., then goto no_overlap_target
    Label L;
    __ bgez(A0, L);
    __ delayed()->nop();
    __ bgtz(A1, no_overlap_target);
    __ delayed()->nop();
    __ bind(L);
  }
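
  // Worked example of the test above (log2_elem_size = 0): with A0 = 0x1000,
  // A1 = 0x1004 and A2 = 16, the last source byte sits at T9 = 0x100f.
  // A1 - A0 = 4 > 0 and A1 <= T9, so the regions overlap and control falls
  // through to the conjoint (backward) copy; with A1 = 0x1010, or A1 <= A0,
  // the stub branches to no_overlap_target and the disjoint copy is used.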
  //
  //  Generate store check for array
  //
  //  Input:
  //     T0    -  starting address
  //     T1    -  element count
  //
  //  The 2 input registers are overwritten
  //

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

  void array_store_check() {
    BarrierSet* bs = Universe::heap()->barrier_set();
    assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
    CardTableModRefBS* ct = (CardTableModRefBS*)bs;
    assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
    Label l_0;

    __ dsll(AT, T1, TIMES_OOP);
    __ dadd(AT, T0, AT);
    __ daddiu(T1, AT, -BytesPerHeapOop);   // T1 = address of last element

    __ shr(T0, CardTableModRefBS::card_shift);
    __ shr(T1, CardTableModRefBS::card_shift);

    __ dsub(T1, T1, T0);   // end --> cards count
    __ bind(l_0);

    __ set64(AT, (long)ct->byte_map_base);
    __ dadd(AT, AT, T0);
    __ dadd(AT, AT, T1);
    __ sb(R0, AT, 0);      // dirty the card
    __ sync();
    __ bgez(T1, l_0);
    __ delayed()->daddi(T1, T1, -1);
  }
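
  // The loop above is the usual card-table post-barrier: every card spanned
  // by the stored-to range is marked dirty.  Equivalent C sketch (names from
  // cardTableModRefBS.hpp):
  //
  //   jbyte* base = ct->byte_map_base;
  //   for (intptr_t c = last_card; c >= first_card; c--)
  //     base[c] = 0;   // CardTableModRefBS::dirty_card == 0
  //
  // Storing R0 works precisely because the dirty value is zero.  (As
  // generated, the delay-slot decrement makes the loop also dirty one card
  // below first_card; this is benign, since spuriously dirty cards are
  // simply rescanned by the GC.)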
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);
    __ move(tmp2, A1);
    __ move(tmp3, A2);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11;
    Label l_debug;

    __ daddi(AT, tmp3, -9); // why is the number 9?
    __ blez(AT, l_9);
    __ delayed()->nop();

    if (!aligned) {
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_9); // if arrays don't have the same alignment mod 2, do 1 element copy
      __ delayed()->nop();

      __ andi(AT, tmp1, 1);
      __ beq(AT, R0, l_10); // copy 1 element if necessary to align to 2 bytes
      __ delayed()->nop();

      __ lb(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 1);
      __ sb(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 1);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_10);

      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 2 element copy
      __ delayed()->nop();

      // At this point it is guaranteed that both, from and to, have the same alignment mod 4.

      // Copy 2 elements if necessary to align to 4 bytes.
      __ andi(AT, tmp1, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -2);
      __ bind(l_2);

      // At this point the positions of both, from and to, are at least 4 byte aligned.

      // Copy 8 elements at a time.
      // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 4 bytes at a time; either from or to will be unaligned
      __ delayed()->nop();

      // Copy 4 elements if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -4);
      __ sw(AT, tmp2, 0);
      { // FasterArrayCopy
        __ daddi(tmp1, tmp1, 4);
        __ daddi(tmp2, tmp2, 4);
      }
    }

    __ bind(l_7);

    // Copy 8 elements (one dword) at a time; either the loads or the stores
    // can be unaligned if aligned == false.

    { // FasterArrayCopy
      __ daddi(AT, tmp3, -7);
      __ blez(AT, l_6); // go to 4-byte copies if fewer than 8 elements remain
      __ delayed()->nop();

      __ bind(l_8);
      // For Loongson, there is 128-bit memory access. TODO
      __ ld(AT, tmp1, 0);
      __ sd(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -8);
      __ daddi(AT, tmp3, -8);
      __ bgez(AT, l_8);
      __ delayed()->nop();
    }
    __ bind(l_6);

    // copy 4 bytes at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -3);
      __ blez(AT, l_1);
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);
      __ bgez(AT, l_3);
      __ delayed()->nop();
    }

    // copy 2 bytes at a time
    __ bind(l_1);
    {
      __ daddi(AT, tmp3, -1);
      __ blez(AT, l_9);
      __ delayed()->nop();

      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(AT, tmp3, -2);
      __ bgez(AT, l_5);
      __ delayed()->nop();
    }

    // do 1 element copy -- byte
    __ bind(l_9);
    __ beq(R0, tmp3, l_4);
    __ delayed()->nop();

    {
      __ bind(l_11);
      __ lb(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -1);
      __ sb(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 1);
      __ daddi(tmp2, tmp2, 1);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_11);
      __ delayed()->nop();
    }

    __ bind(l_4);
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
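
  // The ladder above, as a rough C sketch (simplified: the stub additionally
  // requires 'from' and 'to' to share alignment before widening the access):
  //
  //   while (n >= 8) { *(int64_t*)to = *(int64_t*)from; from += 8; to += 8; n -= 8; }
  //   while (n >= 4) { *(int32_t*)to = *(int32_t*)from; from += 4; to += 4; n -= 4; }
  //   while (n >= 2) { *(int16_t*)to = *(int16_t*)from; from += 2; to += 2; n -= 2; }
  //   while (n >= 1) { *to++ = *from++; n--; }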
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   A0   - source array address
  //   A1   - destination array address
  //   A2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
    Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;

    address nooverlap_target = aligned ?
      StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
      StubRoutines::jbyte_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 0);

    const Register from      = A0;   // source array address
    const Register to        = A1;   // destination array address
    const Register count     = A2;   // elements count
    const Register end_from  = T3;   // source array end address
    const Register end_to    = T0;   // destination array end address
    const Register end_count = T1;   // remaining elements count

    __ push(end_from);
    __ push(end_to);
    __ push(end_count);
    __ push(T8);

    // copy from high to low
    __ move(end_count, count);
    __ dadd(end_from, from, end_count);
    __ dadd(end_to, to, end_count);

    // 2016/05/08 aoqi: if end_from and end_to have different alignments, an unaligned (byte) copy is performed.
    __ andi(AT, end_from, 3);
    __ andi(T8, end_to, 3);
    __ bne(AT, T8, l_copy_byte);
    __ delayed()->nop();

    // First deal with the unaligned data at the top.
    __ bind(l_unaligned);
    __ beq(end_count, R0, l_exit);
    __ delayed()->nop();

    __ andi(AT, end_from, 3);
    __ bne(AT, R0, l_from_unaligned);
    __ delayed()->nop();

    __ andi(AT, end_to, 3);
    __ beq(AT, R0, l_4_bytes_aligned);
    __ delayed()->nop();

    __ bind(l_from_unaligned);
    __ lb(AT, end_from, -1);
    __ sb(AT, end_to, -1);
    __ daddi(end_from, end_from, -1);
    __ daddi(end_to, end_to, -1);
    __ daddi(end_count, end_count, -1);
    __ b(l_unaligned);
    __ delayed()->nop();

    // now end_to, end_from point to 4-byte aligned high-ends
    //     end_count contains byte count that is not copied.
    // copy 4 bytes at a time
    __ bind(l_4_bytes_aligned);

    __ move(T8, end_count);
    __ daddi(AT, end_count, -3);
    __ blez(AT, l_copy_suffix);
    __ delayed()->nop();

    //__ andi(T8, T8, 3);
    __ lea(end_from, Address(end_from, -4));
    __ lea(end_to, Address(end_to, -4));

    __ dsrl(end_count, end_count, 2);
    __ align(16);
    __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes
    __ lw(AT, end_from, 0);
    __ sw(AT, end_to, 0);
    __ addi(end_from, end_from, -4);
    __ addi(end_to, end_to, -4);
    __ addi(end_count, end_count, -1);
    __ bne(end_count, R0, l_copy_4_bytes_loop);
    __ delayed()->nop();

    __ b(l_copy_suffix);
    __ delayed()->nop();
    // copy dwords aligned or not with repeat move
    // l_copy_suffix
    // copy suffix (0-3 bytes)
    __ bind(l_copy_suffix);
    __ andi(T8, T8, 3);
    __ beq(T8, R0, l_exit);
    __ delayed()->nop();
    __ addi(end_from, end_from, 3);
    __ addi(end_to, end_to, 3);
    __ bind(l_copy_suffix_loop);
    __ lb(AT, end_from, 0);
    __ sb(AT, end_to, 0);
    __ addi(end_from, end_from, -1);
    __ addi(end_to, end_to, -1);
    __ addi(T8, T8, -1);
    __ bne(T8, R0, l_copy_suffix_loop);
    __ delayed()->nop();

    __ bind(l_copy_byte);
    __ beq(end_count, R0, l_exit);
    __ delayed()->nop();
    __ lb(AT, end_from, -1);
    __ sb(AT, end_to, -1);
    __ daddi(end_from, end_from, -1);
    __ daddi(end_to, end_to, -1);
    __ daddi(end_count, end_count, -1);
    __ b(l_copy_byte);
    __ delayed()->nop();

    __ bind(l_exit);
    __ pop(T8);
    __ pop(end_count);
    __ pop(end_to);
    __ pop(end_from);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Generate stub for disjoint short copy.  If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  A0
  //      to:    A1
  //  elm.count: A2 treated as signed
  //  one element: 2 bytes
  //
  // Strategy for aligned==true:
  //
  //  If length <= 9:
  //     1. copy 1 element at a time (l_5)
  //
  //  If length > 9:
  //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  //     3. copy last element if one was left in step 2. (l_1)
  //
  //
  // Strategy for aligned==false:
  //
  //  If length <= 9: same as aligned==true case
  //
  //  If length > 9:
  //     1. continue with step 7. if the alignment of from and to mod 4
  //        is different.
  //     2. align from and to to 4 bytes by copying 1 element if necessary
  //     3. at l_2 from and to are 4 byte aligned; continue with
  //        6. if they cannot be aligned to 8 bytes because they have
  //        got different alignment mod 8.
  //     4. at this point we know that both, from and to, have the same
  //        alignment mod 8, now copy one element if necessary to get
  //        8 byte alignment of from and to.
  //     5. copy 4 elements at a time until less than 4 elements are
  //        left; depending on step 3. all load/stores are aligned.
  //     6. copy 2 elements at a time until less than 2 elements are
  //        left. (l_6)
  //     7. copy 1 element at a time. (l_5)
  //     8. copy last element if one was left in step 6. (l_1)
  address generate_disjoint_short_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;
    Register tmp4 = T8;
    Register tmp5 = T9;
    Register tmp6 = T2;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);
    __ move(tmp2, A1);
    __ move(tmp3, A2);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11, l_12, l_13, l_14;
    Label l_debug;
    // don't try anything fancy if arrays don't have many elements
    __ daddi(AT, tmp3, -23);
    __ blez(AT, l_14);
    __ delayed()->nop();
    // move push here
    __ push(tmp4);
    __ push(tmp5);
    __ push(tmp6);

    if (!aligned) {
      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
      __ delayed()->nop();

      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
      __ delayed()->nop();

      // At this point it is guaranteed that both, from and to, have the same alignment mod 4.

      // Copy 1 element if necessary to align to 4 bytes.
      __ andi(AT, A0, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_2);

      // At this point the positions of both, from and to, are at least 4 byte aligned.

      // Copy 4 elements at a time.
      // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 4 bytes at a time; either from or to will be unaligned
      __ delayed()->nop();

      // Copy a 2-element word if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
    } // end of if (!aligned)

    __ bind(l_7);
    // At this point the positions of both, from and to, are at least 8 byte aligned.
    // Copy 8 elements at a time.
    // Align to 16 bytes, but only if both, from and to, have the same alignment mod 16.
    __ xorr(AT, tmp1, tmp2);
    __ andi(AT, AT, 15);
    __ bne(AT, R0, l_9);
    __ delayed()->nop();

    // Copy a 4-element word if necessary to align to 16 bytes.
    __ andi(AT, tmp1, 15);
    __ beq(AT, R0, l_10);
    __ delayed()->nop();

    __ ld(AT, tmp1, 0);
    __ daddi(tmp3, tmp3, -4);
    __ sd(AT, tmp2, 0);
    __ daddi(tmp1, tmp1, 8);
    __ daddi(tmp2, tmp2, 8);

    __ bind(l_10);

    // Copy 16 elements (32 bytes) at a time; either the loads or the stores
    // can be unaligned if aligned == false.
    { // FasterArrayCopy
      __ bind(l_11);
      // For Loongson, the 128-bit memory access instructions gslq/gssq
      // load/store a pair of 64-bit registers per access.
      if (UseLoongsonISA) {
        __ gslq(AT, tmp4, tmp1, 0);
        __ gslq(tmp5, tmp6, tmp1, 16);
        __ daddi(tmp1, tmp1, 32);
        __ daddi(tmp2, tmp2, 32);
        __ gssq(AT, tmp4, tmp2, -32);
        __ gssq(tmp5, tmp6, tmp2, -16);
      } else {
        __ ld(AT, tmp1, 0);
        __ ld(tmp4, tmp1, 8);
        __ ld(tmp5, tmp1, 16);
        __ ld(tmp6, tmp1, 24);
        __ daddi(tmp1, tmp1, 32);
        __ sd(AT, tmp2, 0);
        __ sd(tmp4, tmp2, 8);
        __ sd(tmp5, tmp2, 16);
        __ sd(tmp6, tmp2, 24);
        __ daddi(tmp2, tmp2, 32);
      }
      __ daddi(tmp3, tmp3, -16);
      __ daddi(AT, tmp3, -16);
      __ bgez(AT, l_11);
      __ delayed()->nop();
    }

    __ bind(l_9);

    // Copy 16 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -15); // the loop is unrolled, so at least 16 elements must remain
      __ blez(AT, l_4); // fewer than 16 elements remain
      __ delayed()->nop();

      __ bind(l_8);
      __ ld(AT, tmp1, 0);
      __ ld(tmp4, tmp1, 8);
      __ ld(tmp5, tmp1, 16);
      __ ld(tmp6, tmp1, 24);
      __ sd(AT, tmp2, 0);
      __ sd(tmp4, tmp2, 8);
      __ sd(tmp5, tmp2, 16);
      __ daddi(tmp1, tmp1, 32);
      __ daddi(tmp2, tmp2, 32);
      __ daddi(tmp3, tmp3, -16);
      __ daddi(AT, tmp3, -16);
      __ bgez(AT, l_8);
      __ sd(tmp6, tmp2, -8); // (in the branch delay slot)
    }

    __ bind(l_6);

    // copy 8 elements at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -7);
      __ blez(AT, l_4);
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ lw(tmp4, tmp1, 4);
      __ lw(tmp5, tmp1, 8);
      __ lw(tmp6, tmp1, 12);
      __ sw(AT, tmp2, 0);
      __ sw(tmp4, tmp2, 4);
      __ sw(tmp5, tmp2, 8);
      __ daddi(tmp1, tmp1, 16);
      __ daddi(tmp2, tmp2, 16);
      __ daddi(tmp3, tmp3, -8);
      __ daddi(AT, tmp3, -8);
      __ bgez(AT, l_3);
      __ sw(tmp6, tmp2, -4); // (in the branch delay slot)
    }

    __ bind(l_1);
    // copy 4 elements at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -3);
      __ blez(AT, l_4);
      __ delayed()->nop();

      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ lhu(tmp4, tmp1, 2);
      __ lhu(tmp5, tmp1, 4);
      __ lhu(tmp6, tmp1, 6);
      __ sh(AT, tmp2, 0);
      __ sh(tmp4, tmp2, 2);
      __ sh(tmp5, tmp2, 4);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);
      __ bgez(AT, l_5);
      __ sh(tmp6, tmp2, -2); // (in the branch delay slot)
    }

    // single element
    __ bind(l_4);

    __ pop(tmp6);
    __ pop(tmp5);
    __ pop(tmp4);

    __ bind(l_14);
    { // FasterArrayCopy
      __ beq(R0, tmp3, l_13);
      __ delayed()->nop();

      __ bind(l_12);
      __ lhu(AT, tmp1, 0);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -1);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_12);
      __ delayed()->nop();
    }

    __ bind(l_13);
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    __ bind(l_debug);
    __ stop("generate_disjoint_short_copy should not reach here");
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, const char *name) {
    Label l_1, l_2, l_3, l_4, l_5;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target = aligned ?
      StubRoutines::arrayof_jshort_disjoint_arraycopy() :
      StubRoutines::jshort_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 1);

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    /*
       __ pushl(esi);
       __ movl(ecx, Address(esp, 4+12));      // count
       __ pushl(edi);
       __ movl(esi, Address(esp, 8+ 4));      // from
       __ movl(edi, Address(esp, 8+ 8));      // to
     */
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // copy dwords from high to low
    // __ leal(esi, Address(esi, ecx, Address::times_2, -4)); // from + count*2 - 4
    __ sll(AT, T1, Address::times_2);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -4));
    //__ std();
    //__ leal(edi, Address(edi, ecx, Address::times_2, -4)); // to + count*2 - 4
    __ sll(AT, T1, Address::times_2);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -4));
    //  __ movl(eax, ecx);
    __ move(T8, T1);
    __ bind(l_1);
    //   __ sarl(ecx, 1);              // dword count
    __ sra(T1, T1, 1);
    //__ jcc(Assembler::equal, l_4);                   // no dwords to move
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    /*    __ cmpl(ecx, 32);
          __ jcc(Assembler::above, l_3);                   // > 32 dwords
    // copy dwords with loop
    __ subl(edi, esi);
     */
    __ align(16);
    __ bind(l_2);
    //__ movl(edx, Address(esi));
    __ lw(AT, T3, 0);
    //__ movl(Address(edi, esi, Address::times_1), edx);
    __ sw(AT, T0, 0);
    //__ subl(esi, 4);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    //__ decl(ecx);
    __ addi(T1, T1, -1);
    //  __ jcc(Assembler::notEqual, l_2);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();
    //  __ addl(edi, esi);
    // __ jmp(l_4);
    __ b(l_4);
    __ delayed()->nop();
    // copy dwords with repeat move
    __ bind(l_3);
    //   __ rep_movl();
    __ bind(l_4);
    //  __ andl(eax, 1);              // suffix count
    __ andi(T8, T8, 1);              // suffix count
    //__ jcc(Assembler::equal, l_5);                   // no suffix
    __ beq(T8, R0, l_5);
    __ delayed()->nop();
    // copy suffix
    //   __ movw(edx, Address(esi, 2));
    __ lh(AT, T3, 2);
    //  __ movw(Address(edi, 2), edx);
    __ sh(AT, T0, 2);
    __ bind(l_5);
    //    __ cld();
    //    __ popl(edi);
    //    __ popl(esi);
    //   __ ret(0);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_3, l_4, l_5, l_6, l_7;
    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();
    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    if (is_oop) {
      if (Use3A2000) __ sync();
    }

    if (!aligned) {
      __ xorr(AT, T3, T0);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_5); // not same alignment mod 8 -> copy 1 element each time
      __ delayed()->nop();

      __ andi(AT, T3, 7);
      __ beq(AT, R0, l_6); // copy 2 elements each time
      __ delayed()->nop();

      __ lw(AT, T3, 0);
      __ daddi(T1, T1, -1);
      __ sw(AT, T0, 0);
      __ daddi(T3, T3, 4);
      __ daddi(T0, T0, 4);

      __ bind(l_6);
      __ daddi(AT, T1, -1);
      __ blez(AT, l_5);
      __ delayed()->nop();

      __ bind(l_7);
      __ ld(AT, T3, 0);
      __ sd(AT, T0, 0);
      __ daddi(T3, T3, 8);
      __ daddi(T0, T0, 8);
      __ daddi(T1, T1, -2);
      __ daddi(AT, T1, -2);
      __ bgez(AT, l_7);
      __ delayed()->nop();
    }

    __ bind(l_5);
    __ beq(T1, R0, l_4);
    __ delayed()->nop();

    __ align(16);
    __ bind(l_3);
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0);
    __ addi(T3, T3, 4);
    __ addi(T0, T0, 4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();

    if (is_oop) {
      __ move(T0, A1);
      __ move(T1, A2);
      array_store_check();
    }

    // exit
    __ bind(l_4);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_2, l_4;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target;

    if (is_oop) {
      nooverlap_target = aligned ?
        StubRoutines::arrayof_oop_disjoint_arraycopy() :
        StubRoutines::oop_disjoint_arraycopy();
    } else {
      nooverlap_target = aligned ?
        StubRoutines::arrayof_jint_disjoint_arraycopy() :
        StubRoutines::jint_disjoint_arraycopy();
    }

    array_overlap_test(nooverlap_target, 2);

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);
    // T3: source array address
    // T0: destination array address
    // T1: element count

    if (is_oop) {
      if (Use3A2000) __ sync();
    }

    __ sll(AT, T1, Address::times_4);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -4));
    __ sll(AT, T1, Address::times_4);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -4));

    __ beq(T1, R0, l_4);
    __ delayed()->nop();

    __ align(16);
    __ bind(l_2);
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();

    if (is_oop) {
      __ move(T0, A1);
      __ move(T1, A2);
      array_store_check();
    }

    __ bind(l_4);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
    Label l_3, l_4;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);
    // T3: source array address
    // T0: destination array address
    // T1: element count

    if (is_oop) {
      if (Use3A2000) __ sync();
    }

    __ beq(T1, R0, l_4);
    __ delayed()->nop();

    __ align(16);
    __ bind(l_3);
    __ ld(AT, T3, 0);
    __ sd(AT, T0, 0);
    __ addi(T3, T3, 8);
    __ addi(T0, T0, 8);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();

    if (is_oop) {
      __ move(T0, A1);
      __ move(T1, A2);
      array_store_check();
    }

    // exit
    __ bind(l_4);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  1579   // Arguments:
  1580   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1581   //             ignored
  1582   //   is_oop  - true => oop array, so generate store check code
  1583   //   name    - stub name string
  1584   //
  1585   // Inputs:
  1586   //   c_rarg0   - source array address
  1587   //   c_rarg1   - destination array address
  1588   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1589   //
  1590   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1591   // the hardware handle it.  The two dwords within qwords that span
  1592   // cache line boundaries will still be loaded and stored atomicly.
  1593   //
  1594   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  1595 		Label l_2, l_4;
  1596 		StubCodeMark mark(this, "StubRoutines", name);
  1597 		__ align(CodeEntryAlignment);
  1598 		address start = __ pc();
  1599 		address nooverlap_target;
  1601 		if (is_oop) {
  1602 			nooverlap_target = aligned ?
  1603 							StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1604 							StubRoutines::oop_disjoint_arraycopy();
  1605 		}else {
  1606 			nooverlap_target = aligned ?
  1607 							StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1608 							StubRoutines::jlong_disjoint_arraycopy();
  1611 		array_overlap_test(nooverlap_target, 3);
  1613 		__ push(T3);
  1614 		__ push(T0);
  1615 		__ push(T1);
  1616 		__ push(T8);
  1618 		__ move(T1, A2);  
  1619 		__ move(T3, A0); 
  1620 		__ move(T0, A1);
  1622 		if (is_oop) {
  1623 			if (Use3A2000) __ sync();
  1624 		}
  1626 		__ sll(AT, T1, Address::times_8);   // point T3 and T0 at the last element of each range
  1627 		__ add(AT, T3, AT);
  1628 		__ lea(T3, Address(AT, -8));
  1629 		__ sll(AT, T1, Address::times_8);
  1630 		__ add(AT, T0, AT);
  1631 		__ lea(T0, Address(AT, -8));
  1633 		__ beq(T1, R0, l_4); 
  1634 		__ delayed()->nop();  
  1636 		__ align(16);
  1637 		__ bind(l_2);
  1638 		__ ld(AT, T3, 0);    // copy one 64-bit element per iteration, walking backward
  1639 		__ sd(AT, T0, 0);
  1640 		__ addi(T3, T3, -8);
  1641 		__ addi(T0, T0, -8);
  1642 		__ addi(T1, T1, -1);
  1643 		__ bne(T1, R0, l_2);
  1644 		__ delayed()->nop();
  1646 		if (is_oop) {
  1647 			__ move(T0, A1);   // restore destination and count for the store check
  1648 			__ move(T1, A2);
  1649 			array_store_check();
  1650 		}
  1651 		__ bind(l_4);
  1652 		__ pop(T8);
  1653 		__ pop(T1);
  1654 		__ pop(T0);
  1655 		__ pop(T3);
  1656 		__ jr(RA);
  1657 		__ delayed()->nop();
  1658 		return start;
  1659   }
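
  // Editor's sketch (never called by the VM; name invented for
  // exposition): C-level model of the conjoint stub above. Copying from
  // the last element downward keeps an overlapping copy with to > from
  // correct, which is why overlapping requests fall through to this stub.
  static void conjoint_long_copy_model(const jlong* from, jlong* to,
                                       ssize_t count) {
    for (ssize_t i = count - 1; i >= 0; i--) {
      to[i] = from[i];   // mirrors the backward ld/sd loop l_2 above
    }
  }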
  1661 //FIXME
  1662   address generate_disjoint_long_copy(bool aligned, const char *name) {
  1663 	  Label l_1, l_2;
  1664 	  StubCodeMark mark(this, "StubRoutines", name);
  1665 	  __ align(CodeEntryAlignment);
  1666 	  address start = __ pc();
  1671 	  __ move(T1, A2);   // T1: element count
  1672 	  __ move(T3, A0);   // T3: source array address
  1673 	  __ move(T0, A1);   // T0: destination array address
  1674 	  __ push(T3);
  1675 	  __ push(T0);
  1676 	  __ push(T1);
  1679 	  __ b(l_2);         // enter the loop at the count check
  1680 	  __ delayed()->nop();
  1681 	  __ align(16);
  1682 	  __ bind(l_1);
  1688 	  __ ld(AT, T3, 0);  // copy one 64-bit element per iteration
  1690 	  __ sd(AT, T0, 0);
  1693 	  __ addi(T3, T3, 8);
  1694 	  __ addi(T0, T0, 8);
  1695 	  __ bind(l_2);
  1697 	  __ addi(T1, T1, -1);   // decrement count; loop while it stays >= 0
  1699 	  __ bgez(T1, l_1);
  1700 	  __ delayed()->nop();
  1705 	  __ pop(T1);
  1706 	  __ pop(T0);
  1707 	  __ pop(T3);
  1708 	  __ jr(RA);
  1709 	  __ delayed()->nop();
  1710 	  return start;
  1711   }
  1714   address generate_conjoint_long_copy(bool aligned, const char *name) {
  1715 	  Label l_1, l_2;
  1716 	  StubCodeMark mark(this, "StubRoutines", name);
  1717 	  __ align(CodeEntryAlignment);
  1718 	  address start = __ pc();
  1719 	  address nooverlap_target = aligned ?
  1720 		  StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1721 		  StubRoutines::jlong_disjoint_arraycopy();
  1722 	  array_overlap_test(nooverlap_target, 3);
  1724 	  __ push(T3);
  1725 	  __ push(T0);
  1726 	  __ push(T1);
  1734 	  __ move(T1, A2);   // T1: element count
  1735 	  __ move(T3, A0);   // T3: source array address
  1736 	  __ move(T0, A1);   // T0: destination array address
  1737 	  __ sll(AT, T1, Address::times_8);   // point T3 and T0 at the last element of each range
  1738 	  __ add(AT, T3, AT);
  1739 	  __ lea(T3, Address(AT, -8));
  1740 	  __ sll(AT, T1, Address::times_8);
  1741 	  __ add(AT, T0, AT);
  1742 	  __ lea(T0, Address(AT, -8));
  1746 	  __ b(l_2);         // enter the loop at the count check
  1747 	  __ delayed()->nop();
  1748 	  __ align(16);
  1749 	  __ bind(l_1);
  1758 	  __ ld(AT, T3, 0);  // copy one 64-bit element per iteration, walking backward
  1759 	  __ sd(AT, T0, 0);
  1760 	  __ addi(T3, T3, -8);
  1761 	  __ addi(T0, T0, -8);
  1762 	  __ bind(l_2);
  1764 	  __ addi(T1, T1, -1);   // decrement count; loop while it stays >= 0
  1766 	  __ bgez(T1, l_1);
  1767 	  __ delayed()->nop();
  1772 	  __ pop(T1);
  1773 	  __ pop(T0);
  1774 	  __ pop(T3);
  1775 	  __ jr(RA);
  1776 	  __ delayed()->nop();
  1777 	  return start;
  1778   }
  1780   void generate_arraycopy_stubs() {
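    // Editor's note: with compressed oops an oop element is a 32-bit
    // narrowOop, so the int copy stubs are reused for oop arrays;
    // otherwise oops are full 64-bit words and the long copy stubs apply.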
  1781     if (UseCompressedOops) {
  1782       StubRoutines::_oop_disjoint_arraycopy          = generate_disjoint_int_oop_copy(false, true,
  1783                                                                                       "oop_disjoint_arraycopy");
  1784       StubRoutines::_oop_arraycopy                   = generate_conjoint_int_oop_copy(false, true,
  1785                                                                                       "oop_arraycopy");
  1786       StubRoutines::_oop_disjoint_arraycopy_uninit   = generate_disjoint_int_oop_copy(false, true,
  1787                                                                                       "oop_disjoint_arraycopy_uninit");
  1788       StubRoutines::_oop_arraycopy_uninit            = generate_conjoint_int_oop_copy(false, true,
  1789                                                                                       "oop_arraycopy_uninit");
  1790     } else {
  1791       StubRoutines::_oop_disjoint_arraycopy          = generate_disjoint_long_oop_copy(false, true,
  1792                                                                                        "oop_disjoint_arraycopy");
  1793       StubRoutines::_oop_arraycopy                   = generate_conjoint_long_oop_copy(false, true,
  1794                                                                                        "oop_arraycopy");
  1795       StubRoutines::_oop_disjoint_arraycopy_uninit   = generate_disjoint_long_oop_copy(false, true,
  1796                                                                                        "oop_disjoint_arraycopy_uninit");
  1797       StubRoutines::_oop_arraycopy_uninit            = generate_conjoint_long_oop_copy(false, true,
  1798                                                                                        "oop_arraycopy_uninit");
  1799     }
  1801     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  1802     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  1803     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
  1804     StubRoutines::_jlong_disjoint_arraycopy          = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  1806     StubRoutines::_jbyte_arraycopy  = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  1807     StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
  1808     StubRoutines::_jint_arraycopy   = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
  1809     StubRoutines::_jlong_arraycopy  = generate_conjoint_long_copy(false, "jlong_arraycopy");
  1811     // We don't generate specialized code for HeapWord-aligned source
  1812     // arrays, so just use the code we've already generated
  1813     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  1814     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
  1816     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  1817     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
  1819     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
  1820     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
  1822     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  1823     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
  1825     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  1826     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
  1828     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
  1829     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
  1830   }
  1832   // Wang: stub generator implementing SafeFetch32 and SafeFetchN.
  1833   void generate_safefetch(const char* name, int size, address* entry,
  1834                           address* fault_pc, address* continuation_pc) {
  1835     // safefetch signatures:
  1836     //   int      SafeFetch32(int*      adr, int      errValue);
  1837     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  1838     //
  1839     // arguments:
  1840     //   A0 = adr
  1841     //   A1 = errValue
  1842     //
  1843     // result:
  1844     //   V0 = *adr or errValue
  1846     StubCodeMark mark(this, "StubRoutines", name);
  1848     // Entry point, pc or function descriptor.
  1849     *entry = __ pc();
  1851     // Load *adr into A1, may fault.
  1852     *fault_pc = __ pc();
  1853     switch (size) {
  1854       case 4:
  1855         // int32_t
  1856         __ lw(A1, A0, 0); 
  1857         break;
  1858       case 8:
  1859         // int64_t
  1860         __ ld(A1, A0, 0); 
  1861         break;
  1862       default:
  1863         ShouldNotReachHere();
  1864     }
  1866     // return errValue or *adr
  1867     *continuation_pc = __ pc();
  1868     __ addu(V0, A1, R0);   // V0 = A1 + 0, i.e. move the result into the return register
  1869     __ jr(RA);
  1870     __ delayed()->nop();
  1871   }
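
  // Editor's note: callers reach these stubs through the SafeFetch32/
  // SafeFetchN inlines in stubRoutines.hpp. If the load at *fault_pc
  // faults, the VM's signal handler redirects execution to
  // *continuation_pc with A1 still holding errValue, so the caller
  // observes errValue instead of a crash, e.g.
  //
  //   intptr_t v = SafeFetchN(adr, -1);   // -1 if adr is unmapped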
  1874 #undef __
  1875 #define __ masm->
  1877   // Continuation point for throwing of implicit exceptions that are
  1878   // not handled in the current activation. Fabricates an exception
  1879   // oop and initiates normal exception dispatching in this
  1880   // frame. Since we need to preserve callee-saved values (currently
  1881   // only for C2, but done for C1 as well) we need a callee-saved oop
  1882   // map and therefore have to make these stubs into RuntimeStubs
  1883   // rather than BufferBlobs.  If the compiler needs all registers to
  1884   // be preserved between the fault point and the exception handler
  1885   // then it must assume responsibility for that in
  1886   // AbstractCompiler::continuation_for_implicit_null_exception or
  1887   // continuation_for_implicit_division_by_zero_exception. All other
  1888   // implicit exceptions (e.g., NullPointerException or
  1889   // AbstractMethodError on entry) are either at call sites or
  1890   // otherwise assume that stack unwinding will be initiated, so
  1891   // caller saved registers were assumed volatile in the compiler.
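  //
  // Editor's outline of the stub generated below (a summary, not new code):
  //   1. save the callee-saved registers and the JavaThread* argument
  //   2. record last_Java_sp/fp/pc so the runtime can walk the stack
  //   3. call runtime_entry(thread), which posts the pending exception
  //   4. restore registers and jump to StubRoutines::forward_exception_entry()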
  1892   address generate_throw_exception(const char* name,
  1893                                    address runtime_entry,
  1894                                    bool restore_saved_exception_pc) {
  1895     // Information about frame layout at time of blocking runtime call.
  1896     // Note that we only have to preserve callee-saved registers since
  1897     // the compilers are responsible for supplying a continuation point
  1898     // if they expect all registers to be preserved.
  1903 		enum layout {
  1904 			thread_off,    // last_java_sp                
  1905 			S7_off,        // callee saved register      sp + 1
  1906 			S6_off,        // callee saved register      sp + 2
  1907 			S5_off,        // callee saved register      sp + 3
  1908 			S4_off,        // callee saved register      sp + 4
  1909 			S3_off,        // callee saved register      sp + 5
  1910 			S2_off,        // callee saved register      sp + 6
  1911 			S1_off,        // callee saved register      sp + 7
  1912 			S0_off,        // callee saved register      sp + 8
  1913 			FP_off,
  1914 			ret_address,
  1915 			framesize
  1916 		};
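		// Editor's sketch of the frame built by the prolog below (slot
		// offsets from SP in words, taken from the enum above):
		//   SP +  0 : JavaThread*   (thread_off, becomes the first C argument)
		//   SP +  1 : S7  ...  SP + 8 : S0   (callee-saved registers)
		//   SP +  9 : saved FP            (FP_off, stored by enter())
		//   SP + 10 : return address      (ret_address, stored by enter())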
  1918 		int insts_size = 2048;
  1919 		int locs_size  = 32;
  1923 		CodeBuffer code(name, insts_size, locs_size);
  1927 		OopMapSet* oop_maps = new OopMapSet();
  1931 		MacroAssembler* masm = new MacroAssembler(&code);
  1936 		address start = __ pc();
  1968 		// This is an inlined and slightly modified version of call_VM
  1969 		// which has the ability to fetch the return PC out of
  1970 		// thread-local storage and also sets up last_Java_sp slightly
  1971 		// differently than the real call_VM
  1972 #ifndef OPT_THREAD	
  1973 		Register java_thread = TREG;
  1974 		__ get_thread(java_thread);
  1975 #else
  1976 		Register java_thread = TREG;
  1977 #endif
  1981 		if (restore_saved_exception_pc) {
  1982 			__ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset()));
  1983 		}
  1985 		__ enter(); // required for proper stackwalking of RuntimeStub frame
  1987 		__ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
  1988 		__ sd(S0, SP, S0_off * wordSize);
  1989 		__ sd(S1, SP, S1_off * wordSize);
  1990 		__ sd(S2, SP, S2_off * wordSize);
  1991 		__ sd(S3, SP, S3_off * wordSize);
  1992 		__ sd(S4, SP, S4_off * wordSize);
  1993 		__ sd(S5, SP, S5_off * wordSize);
  1994 		__ sd(S6, SP, S6_off * wordSize);
  1995 		__ sd(S7, SP, S7_off * wordSize);
  1997 		int frame_complete = __ pc() - start;
  1998 		// push java thread (becomes first argument of C function)
  1999 		__ sd(java_thread, SP, thread_off * wordSize);
  2000 		if (java_thread != A0)
  2001 			__ move(A0, java_thread);
  2003 		// Set up last_Java_sp and last_Java_fp
  2004 		__ set_last_Java_frame(java_thread, SP, FP, NULL);
  2005 		__ relocate(relocInfo::internal_pc_type);
  2006 		{
  2007 			intptr_t save_pc = (intptr_t)__ pc() + NativeMovConstReg::instruction_size + 28;
  2008 			__ patchable_set48(AT, save_pc);
  2009 		}
  2010 		__ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
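		// Editor's note: save_pc precomputes the address that will follow
		// the runtime call below, so last_Java_pc points into this stub
		// when the runtime walks the stack; the constant 28 appears to
		// cover the instructions between the patchable_set48 sequence and
		// that point (an assumption, not verified against encodings).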
  2012 		// Call runtime
  2013 		__ call(runtime_entry);
  2014 		__ delayed()->nop();
  2015 		// Generate oop map
  2016 		OopMap* map = new OopMap(framesize, 0);
  2017 		oop_maps->add_gc_map(__ offset(), map);
  2019 		// restore the thread (cannot use the pushed argument since arguments
  2020 		// may be overwritten by C code generated by an optimizing compiler);
  2021 		// however can use the register value directly if it is callee saved.
  2022 #ifndef OPT_THREAD
  2023 		__ get_thread(java_thread);
  2024 #endif
  2026 		__ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  2028 		__ reset_last_Java_frame(java_thread, true, true);
  2030 		// Restore callee save registers.  This must be done after resetting the Java frame
  2031 		__ ld(S0, SP, S0_off * wordSize);
  2032 		__ ld(S1, SP, S1_off * wordSize);
  2033 		__ ld(S2, SP, S2_off * wordSize);
  2034 		__ ld(S3, SP, S3_off * wordSize);
  2035 		__ ld(S4, SP, S4_off * wordSize);
  2036 		__ ld(S5, SP, S5_off * wordSize);
  2037 		__ ld(S6, SP, S6_off * wordSize);
  2038 		__ ld(S7, SP, S7_off * wordSize);
  2040 		// discard arguments
  2041 		__ addi(SP, SP, (framesize-2) * wordSize); // epilog
  2043 		__ addi(SP, FP, wordSize);    // inline leave(): required for proper stackwalking of RuntimeStub frame
  2044 		__ ld(FP, SP, -1 * wordSize);
  2045 		// check for pending exceptions
  2046 #ifdef ASSERT
  2047 		Label L;
  2048 		__ ld(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  2049 		__ bne(AT, R0, L);
  2050 		__ delayed()->nop();
  2051 		__ should_not_reach_here();
  2052 		__ bind(L);
  2053 #endif //ASSERT
  2054 		__ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  2055 		__ delayed()->nop();
  2059 		RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete,
  2060 										framesize, oop_maps, false);
  2064 		return stub->entry_point();
  2065   }
  2067   // Initialization
  2068   void generate_initial() {
  2069 /*
  2070 		// Generates all stubs and initializes the entry points
  2072     // This platform-specific stub is needed by generate_call_stub()
  2073     StubRoutines::mips::_mxcsr_std        = generate_fp_mask("mxcsr_std",        0x0000000000001F80);
  2075     // entry points that exist in all platforms Note: This is code
  2076     // that could be shared among different platforms - however the
  2077     // benefit seems to be smaller than the disadvantage of having a
  2078     // much more complicated generator structure. See also comment in
  2079     // stubRoutines.hpp.
  2081     StubRoutines::_forward_exception_entry = generate_forward_exception();
  2083     StubRoutines::_call_stub_entry =
  2084       generate_call_stub(StubRoutines::_call_stub_return_address);
  2086     // is referenced by megamorphic call
  2087     StubRoutines::_catch_exception_entry = generate_catch_exception();
  2089     // atomic calls
  2090     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
  2091     StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
  2092     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
  2093     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  2094     StubRoutines::_atomic_add_entry          = generate_atomic_add();
  2095     StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
  2096     StubRoutines::_fence_entry               = generate_orderaccess_fence();
  2098     StubRoutines::_handler_for_unsafe_access_entry =
  2099       generate_handler_for_unsafe_access();
  2101     // platform dependent
  2102     StubRoutines::mips::_get_previous_fp_entry = generate_get_previous_fp();
  2104     StubRoutines::mips::_verify_mxcsr_entry    = generate_verify_mxcsr();
  2105 */
  2106 		// Generates all stubs and initializes the entry points
  2109 		//------------------------------------------------------------------
  2110 		// entry points that exist in all platforms
  2111 		// Note: This is code that could be shared among different platforms - however the benefit seems to be smaller 
  2112 		// than the disadvantage of having a much more complicated generator structure. 
  2113 		// See also comment in stubRoutines.hpp.
  2114 		StubRoutines::_forward_exception_entry = generate_forward_exception();    
  2115 		StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
  2116 		// is referenced by megamorphic call    
  2117 		StubRoutines::_catch_exception_entry = generate_catch_exception();    
  2119 		StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
  2121 		// platform dependent
  2122 		StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
  2123   }
  2125   void generate_all() {
  2129     // Generates all stubs and initializes the entry points
  2131     // These entry points require SharedInfo::stack0 to be set up in
  2132     // non-core builds and need to be relocatable, so they each
  2133     // fabricate a RuntimeStub internally.
  2134 	/*
  2135     StubRoutines::_throw_AbstractMethodError_entry =
  2136       generate_throw_exception("AbstractMethodError throw_exception",
  2137                                CAST_FROM_FN_PTR(address,
  2138                                                 SharedRuntime::
  2139                                                 throw_AbstractMethodError),
  2140                                false);
  2142     StubRoutines::_throw_IncompatibleClassChangeError_entry =
  2143       generate_throw_exception("IncompatibleClassChangeError throw_exception",
  2144                                CAST_FROM_FN_PTR(address,
  2145                                                 SharedRuntime::
  2146                                                 throw_IncompatibleClassChangeError),
  2147                                false);
  2149     StubRoutines::_throw_ArithmeticException_entry =
  2150       generate_throw_exception("ArithmeticException throw_exception",
  2151                                CAST_FROM_FN_PTR(address,
  2152                                                 SharedRuntime::
  2153                                                 throw_ArithmeticException),
  2154                                true);
  2156     StubRoutines::_throw_NullPointerException_entry =
  2157       generate_throw_exception("NullPointerException throw_exception",
  2158                                CAST_FROM_FN_PTR(address,
  2159                                                 SharedRuntime::
  2160                                                 throw_NullPointerException),
  2161                                true);
  2163     StubRoutines::_throw_NullPointerException_at_call_entry =
  2164       generate_throw_exception("NullPointerException at call throw_exception",
  2165                                CAST_FROM_FN_PTR(address,
  2166                                                 SharedRuntime::
  2167                                                 throw_NullPointerException_at_call),
  2168                                false);
  2170     StubRoutines::_throw_StackOverflowError_entry =
  2171       generate_throw_exception("StackOverflowError throw_exception",
  2172                                CAST_FROM_FN_PTR(address,
  2173                                                 SharedRuntime::
  2174                                                 throw_StackOverflowError),
  2175                                false);
  2177     // entry points that are platform specific
  2178     StubRoutines::mips::_f2i_fixup = generate_f2i_fixup();
  2179     StubRoutines::mips::_f2l_fixup = generate_f2l_fixup();
  2180     StubRoutines::mips::_d2i_fixup = generate_d2i_fixup();
  2181     StubRoutines::mips::_d2l_fixup = generate_d2l_fixup();
  2183     StubRoutines::mips::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
  2184     StubRoutines::mips::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
  2185     StubRoutines::mips::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
  2186     StubRoutines::mips::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
  2188     // support for verify_oop (must happen after universe_init)
  2189     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  2191     // arraycopy stubs used by compilers
  2192     generate_arraycopy_stubs();
  2193 	*/
  2197 		StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
  2201 //		StubRoutines::_throw_ArithmeticException_entry         = generate_throw_exception("ArithmeticException throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException),  true);
  2205 //		StubRoutines::_throw_NullPointerException_entry        = generate_throw_exception("NullPointerException throw_exception",         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
  2209 		StubRoutines::_throw_NullPointerException_at_call_entry = generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
  2213 		StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
  2219 		//------------------------------------------------------------------
  2220 		// entry points that are platform specific
  2222 		// support for verify_oop (must happen after universe_init)
  2226 		StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  2230 #ifndef CORE
  2231 		// arraycopy stubs used by compilers
  2232 		generate_arraycopy_stubs();
  2236 #endif
  2238     // Safefetch stubs.
  2239     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
  2240                                                        &StubRoutines::_safefetch32_fault_pc,
  2241                                                        &StubRoutines::_safefetch32_continuation_pc);
  2242     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
  2243                                                        &StubRoutines::_safefetchN_fault_pc,
  2244                                                        &StubRoutines::_safefetchN_continuation_pc);
  2245   }
  2247  public:
  2248   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  2249     if (all) {
  2250       generate_all();
  2251     } else {
  2252       generate_initial();
  2253     }
  2254   }
  2255 }; // end class declaration
  2256 /*
  2257 address StubGenerator::disjoint_byte_copy_entry  = NULL;
  2258 address StubGenerator::disjoint_short_copy_entry = NULL;
  2259 address StubGenerator::disjoint_int_copy_entry   = NULL;
  2260 address StubGenerator::disjoint_long_copy_entry  = NULL;
  2261 address StubGenerator::disjoint_oop_copy_entry   = NULL;
  2263 address StubGenerator::byte_copy_entry  = NULL;
  2264 address StubGenerator::short_copy_entry = NULL;
  2265 address StubGenerator::int_copy_entry   = NULL;
  2266 address StubGenerator::long_copy_entry  = NULL;
  2267 address StubGenerator::oop_copy_entry   = NULL;
  2269 address StubGenerator::checkcast_copy_entry = NULL;
  2270 */
  2271 void StubGenerator_generate(CodeBuffer* code, bool all) {
  2272   StubGenerator g(code, all);
  2273 }
