Tue, 12 Jun 2018 13:58:17 +0800
#7157 Fix all places that fill a delay slot without saying delayed()
Summary: enable check_delay and guarantee that delay_state is at_delay_slot when filling a delay slot
Reviewed-by: aoqi
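The pattern this patch enforces, sketched below: every MIPS branch has one architectural
delay slot, and the port fills it through delayed() (e.g. "__ beq(AT, R0, L);
__ delayed()->nop();"). A minimal model of the bookkeeping follows; the member names
(delay_state, at_delay_slot, delayed(), check_delay()) are taken from the summary and
modeled on HotSpot's SPARC assembler — this is an illustrative sketch, not the patched
sources:

#include <cassert>

class DelaySlotAssembler {
 public:
  enum DelayState { no_delay, at_delay_slot, filling_delay_slot };

  DelaySlotAssembler() : _delay_state(no_delay) {}

  // Every branch emitter calls this: the next instruction emitted
  // will occupy the branch's delay slot.
  void emit_branch() {
    check_delay();                  // a branch may not itself sit in a slot
    _delay_state = at_delay_slot;
  }

  // Must be chained before the instruction that fills the slot:
  //   a.emit_branch(); a.delayed()->emit_nop();
  DelaySlotAssembler* delayed() {
    assert(_delay_state == at_delay_slot,
           "delayed() called with no branch pending");
    _delay_state = filling_delay_slot;
    return this;
  }

  // Runs before every emitted instruction; with checking enabled this
  // is what catches a slot being filled without saying delayed().
  void check_delay() {
    assert(_delay_state != at_delay_slot,
           "filling delay slot without delayed()");
    _delay_state = no_delay;        // the slot, if any, is now filled
  }

  void emit_nop() { check_delay(); /* emit the nop encoding here */ }

 private:
  DelayState _delay_state;
};

With check_delay enabled, a bare instruction emitted right after a branch trips the
assert, which is how such missed-delayed() sites get caught.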
/*
 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_mips.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
//#define a__ ((Assembler*)_masm)->

//#ifdef PRODUCT
//#define BLOCK_COMMENT(str) /* nothing */
//#else
//#define BLOCK_COMMENT(str) __ block_comment(str)
//#endif

//#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
// Stub Code definitions

static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc = thread->saved_exception_pc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine:  return garbage from the load
  // therefore, compute npc
  address npc = (address)((unsigned long)pc + sizeof(unsigned long));

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}
class StubGenerator: public StubCodeGenerator {
 private:

  // ABI mips n64
  // This figure does not describe the MIPS ABI; it shows the call-Java-from-C ABI.
  // Call stubs are used to call Java from C.
  //
  //     [ return_from_Java      ]
  //     [ argument word n-1     ] <--- sp
  //       ...
  //     [ argument word 0       ]
  //       ...
  // -10 [ S6                    ]
  //  -9 [ S5                    ]
  //  -8 [ S4                    ]
  //  -7 [ S3                    ]
  //  -6 [ S0                    ]
  //  -5 [ TSR(S2)               ]
  //  -4 [ LVP(S7)               ]
  //  -3 [ BCP(S1)               ]
  //  -2 [ saved fp              ] <--- fp_after_call
  //  -1 [ return address        ]
  //   0 [ ptr. to call wrapper  ] <--- a0 (old sp --> fp)
  //   1 [ result                ] <--- a1
  //   2 [ result_type           ] <--- a2
  //   3 [ method                ] <--- a3
  //   4 [ entry_point           ] <--- a4
  //   5 [ parameters            ] <--- a5
  //   6 [ parameter_size        ] <--- a6
  //   7 [ thread                ] <--- a7
  //
  // _LP64: n64 does not pass parameters on the stack.
  //
  //     [ return_from_Java      ]
  //     [ argument word n-1     ] <--- sp
  //       ...
  //     [ argument word 0       ]
  //       ...
  // -14 [ thread                ]
  // -13 [ result_type           ] <--- a2
  // -12 [ result                ] <--- a1
  // -11 [ ptr. to call wrapper  ] <--- a0
  // -10 [ S6                    ]
  //  -9 [ S5                    ]
  //  -8 [ S4                    ]
  //  -7 [ S3                    ]
  //  -6 [ S0                    ]
  //  -5 [ TSR(S2)               ]
  //  -4 [ LVP(S7)               ]
  //  -3 [ BCP(S1)               ]
  //  -2 [ saved fp              ] <--- fp_after_call
  //  -1 [ return address        ]
  //   0 [                       ] <--- old sp
  /*
   * 2014/01/16 Fu: Find the right place in the call_stub for GP.
   * GP will point to the starting point of Interpreter::dispatch_table(itos).
   * It should be saved/restored before/after Java calls.
   *
   */
  enum call_stub_layout {
    RA_off          = -1,
    FP_off          = -2,
    BCP_off         = -3,
    LVP_off         = -4,
    TSR_off         = -5,
    S1_off          = -6,
    S3_off          = -7,
    S4_off          = -8,
    S5_off          = -9,
    S6_off          = -10,
    result_off      = -11,
    result_type_off = -12,
    thread_off      = -13,
    total_off       = thread_off - 3,
    GP_off          = -16,
  };
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!

    // stub code
    // save ra and fp
    __ sd(RA, SP, RA_off * wordSize);
    __ sd(FP, SP, FP_off * wordSize);
    __ sd(BCP, SP, BCP_off * wordSize);
    __ sd(LVP, SP, LVP_off * wordSize);
    __ sd(GP, SP, GP_off * wordSize);
    __ sd(TSR, SP, TSR_off * wordSize);
    __ sd(S1, SP, S1_off * wordSize);
    __ sd(S3, SP, S3_off * wordSize);
    __ sd(S4, SP, S4_off * wordSize);
    __ sd(S5, SP, S5_off * wordSize);
    __ sd(S6, SP, S6_off * wordSize);

    __ set64(GP, (long)Interpreter::dispatch_table(itos));

    // I think 14 is the maximum gap between an argument and a callee-saved register.
    __ daddi(FP, SP, (-2) * wordSize);
    __ daddi(SP, SP, total_off * wordSize);
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, result_off * wordSize);
    __ sd(A2, FP, result_type_off * wordSize);
    __ sd(A7, FP, thread_off * wordSize);

#ifdef OPT_THREAD
    __ move(TREG, A7);
#endif
    // added for compressed oops
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
      __ beq(AT, R0, L);
      __ delayed()->nop();
      /* FIXME: stop() is not fully implemented on the MIPS port yet; revisit in the future. */
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif
    // pass parameters if any
    // A5: parameter
    // A6: parameter_size
    // T0: parameter_size_tmp(--)
    // T2: offset(++)
    // T3: tmp
    Label parameters_done;
    // check whether parameter_size is 0
    __ beq(A6, R0, parameters_done);
    __ delayed()->nop();
    __ dsll(AT, A6, Interpreter::logStackElementSize);
    __ dsub(SP, SP, AT);
    __ move(AT, -StackAlignmentInBytes);
    __ andr(SP, SP, AT);
    // Copy Java parameters in reverse order (receiver last)
    // Note that the argument order is inverted in the process
    // source is edx[ecx: N-1..0]
    // dest is esp[ebx: 0..N-1]
    Label loop;
    __ move(T0, A6);
    __ move(T2, R0);
    __ bind(loop);

    // get parameter
    __ dsll(T3, T0, LogBytesPerWord);
    __ dadd(T3, T3, A5);
    __ ld(AT, T3, -wordSize);
    __ dsll(T3, T2, LogBytesPerWord);
    __ dadd(T3, T3, SP);
    __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
    __ daddi(T2, T2, 1);
    __ daddi(T0, T0, -1);
    __ bne(T0, R0, loop);
    __ delayed()->nop();
    // advance to next parameter

    // call Java function
    __ bind(parameters_done);

    // receiver in V0, methodOop in Rmethod
    __ move(Rmethod, A3);
    __ move(Rsender, SP);  // set sender sp
    __ jalr(A4);
    __ delayed()->nop();
    return_address = __ pc();
    Label common_return;
    __ bind(common_return);

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ ld(T0, FP, result_off * wordSize);        // result --> T0
    Label is_long, is_float, is_double, exit;
    __ ld(T2, FP, result_type_off * wordSize);   // result_type --> T2
    __ daddi(T3, T2, (-1) * T_LONG);
    __ beq(T3, R0, is_long);
    __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
    __ beq(T3, R0, is_float);
    __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
    __ beq(T3, R0, is_double);
    __ delayed()->nop();

    // handle T_INT case
    __ sd(V0, T0, 0 * wordSize);
    __ bind(exit);

    // restore
    __ daddi(SP, FP, 2 * wordSize);
    __ ld(RA, SP, RA_off * wordSize);
    __ ld(FP, SP, FP_off * wordSize);
    __ ld(BCP, SP, BCP_off * wordSize);
    __ ld(LVP, SP, LVP_off * wordSize);
    __ ld(GP, SP, GP_off * wordSize);
    __ ld(TSR, SP, TSR_off * wordSize);

    __ ld(S1, SP, S1_off * wordSize);
    __ ld(S3, SP, S3_off * wordSize);
    __ ld(S4, SP, S4_off * wordSize);
    __ ld(S5, SP, S5_off * wordSize);
    __ ld(S6, SP, S6_off * wordSize);

    // return
    __ jr(RA);
    __ delayed()->nop();

    // handle return types different from T_INT
    __ bind(is_long);
    __ sd(V0, T0, 0 * wordSize);
    //__ sd(V1, T0, 1 * wordSize);
    //__ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_float);
    __ swc1(F0, T0, 0 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_double);
    __ sdc1(F0, T0, 0 * wordSize);
    __ b(exit);
    __ delayed()->nop();
    // FIXME: the 1.6 MIPS version adds FPU handling here.
    StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
    __ b(common_return);
    __ delayed()->nop();
    return start;
  }
  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    Register thread = TREG;

    // get thread directly
#ifndef OPT_THREAD
    __ ld(thread, FP, thread_off * wordSize);
#endif

#ifdef ASSERT
    // verify that threads correspond
    { Label L;
      __ get_thread(T8);
      __ beq(T8, thread, L);
      __ delayed()->nop();
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    // set pending exception
    __ verify_oop(V0);
    __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ li(AT, (long)__FILE__);
    __ sd(AT, thread, in_bytes(Thread::exception_file_offset()));
    __ li(AT, (long)__LINE__);
    __ sd(AT, thread, in_bytes(Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
    __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
    __ delayed()->nop();

    return start;
  }
  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    //Register thread = TREG;
    Register thread = TREG;
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
      __ bne(AT, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into T9
    __ ld(A1, SP, 0);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
    __ move(T9, V0);
    __ pop(V1);

#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ bne(V0, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // V0: exception
    // T9: exception handler
    // V1: throwing pc
    __ verify_oop(V0);
    __ jr(T9);
    __ delayed()->nop();

    return start;
  }
  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp(FP, 0);
    const Address older_fp(V0, 0);
    address start = __ pc();
    __ enter();
    __ lw(V0, old_fp);    // caller's fp
    __ lw(V0, older_fp);  // the frame for ps()
    __ leave();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // The following routine generates a subroutine to throw an
  // asynchronous UnknownError when an unsafe access gets a fault that
  // could not be reasonably prevented by the programmer.  (Example:
  // SIGBUS/OBJERR.)
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();
    __ pushad();  // push registers
    // Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
    __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
    __ delayed()->nop();
    __ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord);
    __ popad();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //   all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();
    __ reinit_heapbase();
    __ verify_oop_subroutine();
    address end = __ pc();
    return start;
  }
  //
  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //    A0  - array1
  //    A1  - array2
  //    A2  - element count
  //
  //  Note: this code can only use %eax, %ecx, and %edx
  //

  // use T9 as temp
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    int elem_size = 1 << log2_elem_size;
    Address::ScaleFactor sf = Address::times_1;

    switch (log2_elem_size) {
      case 0: sf = Address::times_1; break;
      case 1: sf = Address::times_2; break;
      case 2: sf = Address::times_4; break;
      case 3: sf = Address::times_8; break;
    }

    __ dsll(AT, A2, sf);
    __ dadd(AT, AT, A0);
    __ lea(T9, Address(AT, -elem_size));
    __ dsub(AT, A1, A0);
    __ blez(AT, no_overlap_target);
    __ delayed()->nop();
    __ dsub(AT, A1, T9);
    __ bgtz(AT, no_overlap_target);
    __ delayed()->nop();

    // 2016/05/10 aoqi: If A0 = 0xf... and A1 = 0x0..., then goto no_overlap_target
    Label L;
    __ bgez(A0, L);
    __ delayed()->nop();
    __ bgtz(A1, no_overlap_target);
    __ delayed()->nop();
    __ bind(L);
  }
  //
  //  Generate store check for array
  //
  //  Input:
  //    T0 - starting address (edi)
  //    T1 - element count (ecx)
  //
  //  The 2 input registers are overwritten
  //

  void array_store_check(Register tmp) {
    assert_different_registers(tmp, AT, T0, T1);
    BarrierSet* bs = Universe::heap()->barrier_set();
    assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
    CardTableModRefBS* ct = (CardTableModRefBS*)bs;
    assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
    Label l_0;

    if (UseConcMarkSweepGC) __ sync();

    __ set64(tmp, (long)ct->byte_map_base);

    __ dsll(AT, T1, TIMES_OOP);
    __ dadd(AT, T0, AT);
    __ daddiu(T1, AT, -BytesPerHeapOop);

    __ shr(T0, CardTableModRefBS::card_shift);
    __ shr(T1, CardTableModRefBS::card_shift);

    __ dsub(T1, T1, T0);  // end --> cards count
    __ bind(l_0);

    __ dadd(AT, tmp, T0);
    if (UseLoongsonISA) {
      __ gssbx(R0, AT, T1, 0);
    } else {
      __ dadd(AT, AT, T1);
      __ sb(R0, AT, 0);
    }

    __ bgtz(T1, l_0);
    __ delayed()->daddi(T1, T1, -1);
  }
  // Generate code for an array write pre barrier
  //
  //     addr  - starting address
  //     count - element count
  //     tmp   - scratch register
  //
  //     Destroy no registers!
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
        if (!dest_uninitialized) {
          __ pushad();  // push registers
          if (count == A0) {
            if (addr == A1) {
              // exactly backwards!!
              //__ xchgptr(c_rarg1, c_rarg0);
              __ move(AT, A0);
              __ move(A0, A1);
              __ move(A1, AT);
            } else {
              __ move(A1, count);
              __ move(A0, addr);
            }
          } else {
            __ move(A0, addr);
            __ move(A1, count);
          }
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
          __ popad();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //    start   - register containing starting address of destination array
  //    count   - elements count
  //    scratch - scratch register
  //
  //  The input registers are overwritten.
  //
  void gen_write_ref_array_post_barrier(Register start, Register count, Register scratch) {
    assert_different_registers(start, count, scratch, AT);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        {
          __ pushad();  // push registers (overkill)
          if (count == A0) {
            if (start == A1) {
              // exactly backwards!!
              //__ xchgptr(c_rarg1, c_rarg0);
              __ move(AT, A0);
              __ move(A0, A1);
              __ move(A1, AT);
            } else {
              __ move(A1, count);
              __ move(A0, start);
            }
          } else {
            __ move(A0, start);
            __ move(A1, count);
          }
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ popad();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

          Label L_loop;
          const Register end = count;

          if (UseConcMarkSweepGC) __ sync();

          int64_t disp = (int64_t) ct->byte_map_base;
          __ set64(scratch, disp);

          __ lea(end, Address(start, count, TIMES_OOP, 0));  // end == start+count*oop_size
          __ daddiu(end, end, -BytesPerHeapOop);             // end - 1 to make inclusive
          __ shr(start, CardTableModRefBS::card_shift);
          __ shr(end, CardTableModRefBS::card_shift);
          __ dsubu(end, end, start);                         // end --> cards count

          __ daddu(start, start, scratch);

          __ bind(L_loop);
          if (UseLoongsonISA) {
            __ gssbx(R0, start, count, 0);
          } else {
            __ daddu(AT, start, count);
            __ sb(R0, AT, 0);
          }
          __ daddiu(count, count, -1);
          __ slt(AT, count, R0);
          __ beq(AT, R0, L_loop);
          __ delayed()->nop();
        }
        break;
      default:
        ShouldNotReachHere();
    }
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);
    __ move(tmp2, A1);
    __ move(tmp3, A2);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11;
    Label l_debug;

    __ daddi(AT, tmp3, -9);  // why is the number 9?
    __ blez(AT, l_9);
    __ delayed()->nop();

    if (!aligned) {
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_9);  // if arrays don't have the same alignment mod 2, do 1 element copy
      __ delayed()->nop();

      __ andi(AT, tmp1, 1);
      __ beq(AT, R0, l_10);  // copy 1 element if necessary to align to 2 bytes
      __ delayed()->nop();

      __ lb(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 1);
      __ sb(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 1);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_10);

      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1);  // if arrays don't have the same alignment mod 4, do 2 elements copy
      __ delayed()->nop();

      // At this point it is guaranteed that both, from and to, have the same alignment mod 4.

      // Copy 2 elements if necessary to align to 4 bytes.
      __ andi(AT, tmp1, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -2);
      __ bind(l_2);

      // At this point the positions of both, from and to, are at least 4 byte aligned.

      // Copy 4 elements at a time.
      // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6);  // not same alignment mod 8 -> copy 4 bytes at a time, either from or to will be unaligned
      __ delayed()->nop();

      // Copy 4 elements if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -4);
      __ sw(AT, tmp2, 0);
      { // FasterArrayCopy
        __ daddi(tmp1, tmp1, 4);
        __ daddi(tmp2, tmp2, 4);
      }
    }

    __ bind(l_7);

    // Copy 8 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.

    { // FasterArrayCopy
      __ daddi(AT, tmp3, -7);
      __ blez(AT, l_6);  // copy 4 bytes at a time if fewer than 8 elements remain
      __ delayed()->nop();

      __ bind(l_8);
      // For Loongson, there is 128-bit memory access. TODO
      __ ld(AT, tmp1, 0);
      __ sd(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -8);
      __ daddi(AT, tmp3, -8);
      __ bgez(AT, l_8);
      __ delayed()->nop();
    }
    __ bind(l_6);

    // copy 4 bytes at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -3);
      __ blez(AT, l_1);
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);
      __ bgez(AT, l_3);
      __ delayed()->nop();
    }

    // do 2 bytes copy
    __ bind(l_1);
    {
      __ daddi(AT, tmp3, -1);
      __ blez(AT, l_9);
      __ delayed()->nop();

      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(AT, tmp3, -2);
      __ bgez(AT, l_5);
      __ delayed()->nop();
    }

    // do 1 element copy -- byte
    __ bind(l_9);
    __ beq(R0, tmp3, l_4);
    __ delayed()->nop();

    {
      __ bind(l_11);
      __ lb(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -1);
      __ sb(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 1);
      __ daddi(tmp2, tmp2, 1);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_11);
      __ delayed()->nop();
    }

    __ bind(l_4);
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   A0 - source array address
  //   A1 - destination array address
  //   A2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
    Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;

    address nooverlap_target = aligned ?
            StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
            StubRoutines::jbyte_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 0);

    const Register from      = A0;  // source array address
    const Register to        = A1;  // destination array address
    const Register count     = A2;  // elements count
    const Register end_from  = T3;  // source array end address
    const Register end_to    = T0;  // destination array end address
    const Register end_count = T1;  // remaining elements count

    __ push(end_from);
    __ push(end_to);
    __ push(end_count);
    __ push(T8);

    // copy from high to low
    __ move(end_count, count);
    __ dadd(end_from, from, end_count);
    __ dadd(end_to, to, end_count);

    // 2016/05/08 aoqi: If end_from and end_to have different alignments, an unaligned copy is performed.
    __ andi(AT, end_from, 3);
    __ andi(T8, end_to, 3);
    __ bne(AT, T8, l_copy_byte);
    __ delayed()->nop();

    // First deal with the unaligned data at the top.
    __ bind(l_unaligned);
    __ beq(end_count, R0, l_exit);
    __ delayed()->nop();

    __ andi(AT, end_from, 3);
    __ bne(AT, R0, l_from_unaligned);
    __ delayed()->nop();

    __ andi(AT, end_to, 3);
    __ beq(AT, R0, l_4_bytes_aligned);
    __ delayed()->nop();

    __ bind(l_from_unaligned);
    __ lb(AT, end_from, -1);
    __ sb(AT, end_to, -1);
    __ daddi(end_from, end_from, -1);
    __ daddi(end_to, end_to, -1);
    __ daddi(end_count, end_count, -1);
    __ b(l_unaligned);
    __ delayed()->nop();

    // now end_to and end_from point to 4-byte aligned high-ends;
    // end_count contains the byte count that is not yet copied.
    // copy 4 bytes at a time
    __ bind(l_4_bytes_aligned);

    __ move(T8, end_count);
    __ daddi(AT, end_count, -3);
    __ blez(AT, l_copy_suffix);
    __ delayed()->nop();

    //__ andi(T8, T8, 3);
    __ lea(end_from, Address(end_from, -4));
    __ lea(end_to, Address(end_to, -4));

    __ dsrl(end_count, end_count, 2);
    __ align(16);
    __ bind(l_copy_4_bytes_loop);  // l_copy_4_bytes
    __ lw(AT, end_from, 0);
    __ sw(AT, end_to, 0);
    __ addi(end_from, end_from, -4);
    __ addi(end_to, end_to, -4);
    __ addi(end_count, end_count, -1);
    __ bne(end_count, R0, l_copy_4_bytes_loop);
    __ delayed()->nop();

    __ b(l_copy_suffix);
    __ delayed()->nop();
    // copy dwords aligned or not with repeat move
    // l_copy_suffix
    // copy suffix (0-3 bytes)
    __ bind(l_copy_suffix);
    __ andi(T8, T8, 3);
    __ beq(T8, R0, l_exit);
    __ delayed()->nop();
    __ addi(end_from, end_from, 3);
    __ addi(end_to, end_to, 3);
    __ bind(l_copy_suffix_loop);
    __ lb(AT, end_from, 0);
    __ sb(AT, end_to, 0);
    __ addi(end_from, end_from, -1);
    __ addi(end_to, end_to, -1);
    __ addi(T8, T8, -1);
    __ bne(T8, R0, l_copy_suffix_loop);
    __ delayed()->nop();

    __ bind(l_copy_byte);
    __ beq(end_count, R0, l_exit);
    __ delayed()->nop();
    __ lb(AT, end_from, -1);
    __ sb(AT, end_to, -1);
    __ daddi(end_from, end_from, -1);
    __ daddi(end_to, end_to, -1);
    __ daddi(end_count, end_count, -1);
    __ b(l_copy_byte);
    __ delayed()->nop();

    __ bind(l_exit);
    __ pop(T8);
    __ pop(end_count);
    __ pop(end_to);
    __ pop(end_from);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Generate stub for disjoint short copy.  If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   from:        A0
  //   to:          A1
  //   elm.count:   A2 treated as signed
  //   one element: 2 bytes
  //
  // Strategy for aligned==true:
  //
  //  If length <= 9:
  //     1. copy 1 element at a time (l_5)
  //
  //  If length > 9:
  //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  //     3. copy last element if one was left in step 2. (l_1)
  //
  //
  // Strategy for aligned==false:
  //
  //  If length <= 9: same as aligned==true case
  //
  //  If length > 9:
  //     1. continue with step 7. if the alignment of from and to mod 4
  //        is different.
  //     2. align from and to to 4 bytes by copying 1 element if necessary
  //     3. at l_2 from and to are 4 byte aligned; continue with
  //        6. if they cannot be aligned to 8 bytes because they have
  //        got different alignment mod 8.
  //     4. at this point we know that both, from and to, have the same
  //        alignment mod 8, now copy one element if necessary to get
  //        8 byte alignment of from and to.
  //     5. copy 4 elements at a time until less than 4 elements are
  //        left; depending on step 3. all load/stores are aligned.
  //     6. copy 2 elements at a time until less than 2 elements are
  //        left. (l_6)
  //     7. copy 1 element at a time. (l_5)
  //     8. copy last element if one was left in step 6. (l_1)
  address generate_disjoint_short_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;
    Register tmp4 = T8;
    Register tmp5 = T9;
    Register tmp6 = T2;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);
    __ move(tmp2, A1);
    __ move(tmp3, A2);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11, l_12, l_13, l_14;
    Label l_debug;
    // don't try anything fancy if arrays don't have many elements
    __ daddi(AT, tmp3, -23);
    __ blez(AT, l_14);
    __ delayed()->nop();
    // move push here
    __ push(tmp4);
    __ push(tmp5);
    __ push(tmp6);

    if (!aligned) {
      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_debug);  // if arrays don't have the same alignment mod 2, can this happen?
      __ delayed()->nop();

      __ xorr(AT, A0, A1);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1);  // if arrays don't have the same alignment mod 4, do 1 element copy
      __ delayed()->nop();

      // At this point it is guaranteed that both, from and to, have the same alignment mod 4.

      // Copy 1 element if necessary to align to 4 bytes.
      __ andi(AT, A0, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_2);

      // At this point the positions of both, from and to, are at least 4 byte aligned.

      // Copy 4 elements at a time.
      // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6);  // not same alignment mod 8 -> copy 2, either from or to will be unaligned
      __ delayed()->nop();

      // Copy a 2-element word if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
    }  // end of if (!aligned)

    __ bind(l_7);
    // At this point the positions of both, from and to, are at least 8 byte aligned.
    // Copy 8 elements at a time.
    // Align to 16 bytes, but only if both, from and to, have same alignment mod 16.
    __ xorr(AT, tmp1, tmp2);
    __ andi(AT, AT, 15);
    __ bne(AT, R0, l_9);
    __ delayed()->nop();

    // Copy a 4-element word if necessary to align to 16 bytes.
    __ andi(AT, tmp1, 15);
    __ beq(AT, R0, l_10);
    __ delayed()->nop();

    __ ld(AT, tmp1, 0);
    __ daddi(tmp3, tmp3, -4);
    __ sd(AT, tmp2, 0);
    __ daddi(tmp1, tmp1, 8);
    __ daddi(tmp2, tmp2, 8);

    __ bind(l_10);

    // Copy 8 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false

    { // FasterArrayCopy
      __ bind(l_11);
      // For Loongson, the 128-bit memory access instructions are gslq/gssq
      if (UseLoongsonISA) {
        __ gslq(AT, tmp4, tmp1, 0);
        __ gslq(tmp5, tmp6, tmp1, 16);
        __ daddi(tmp1, tmp1, 32);
        __ daddi(tmp2, tmp2, 32);
        __ gssq(AT, tmp4, tmp2, -32);
        __ gssq(tmp5, tmp6, tmp2, -16);
      } else {
        __ ld(AT, tmp1, 0);
        __ ld(tmp4, tmp1, 8);
        __ ld(tmp5, tmp1, 16);
        __ ld(tmp6, tmp1, 24);
        __ daddi(tmp1, tmp1, 32);
        __ sd(AT, tmp2, 0);
        __ sd(tmp4, tmp2, 8);
        __ sd(tmp5, tmp2, 16);
        __ sd(tmp6, tmp2, 24);
        __ daddi(tmp2, tmp2, 32);
      }
      __ daddi(tmp3, tmp3, -16);
      __ daddi(AT, tmp3, -16);
      __ bgez(AT, l_11);
      __ delayed()->nop();
    }
    __ bind(l_9);

    // Copy 4 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -15);  // loop unrolled 4 times, so the element count should not be less than 16
      __ blez(AT, l_4);         // copy 2 at a time if fewer than 16 elements remain
      __ delayed()->nop();

      __ bind(l_8);
      __ ld(AT, tmp1, 0);
      __ ld(tmp4, tmp1, 8);
      __ ld(tmp5, tmp1, 16);
      __ ld(tmp6, tmp1, 24);
      __ sd(AT, tmp2, 0);
      __ sd(tmp4, tmp2, 8);
      __ sd(tmp5, tmp2, 16);
      __ daddi(tmp1, tmp1, 32);
      __ daddi(tmp2, tmp2, 32);
      __ daddi(tmp3, tmp3, -16);
      __ daddi(AT, tmp3, -16);
      __ bgez(AT, l_8);
      __ delayed()->sd(tmp6, tmp2, -8);
    }
    __ bind(l_6);

    // copy 2 elements at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -7);
      __ blez(AT, l_4);
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ lw(tmp4, tmp1, 4);
      __ lw(tmp5, tmp1, 8);
      __ lw(tmp6, tmp1, 12);
      __ sw(AT, tmp2, 0);
      __ sw(tmp4, tmp2, 4);
      __ sw(tmp5, tmp2, 8);
      __ daddi(tmp1, tmp1, 16);
      __ daddi(tmp2, tmp2, 16);
      __ daddi(tmp3, tmp3, -8);
      __ daddi(AT, tmp3, -8);
      __ bgez(AT, l_3);
      __ delayed()->sw(tmp6, tmp2, -4);
    }

    __ bind(l_1);
    // copy 1 element (2 bytes) at a time, unrolled 4 times
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -3);
      __ blez(AT, l_4);
      __ delayed()->nop();

      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ lhu(tmp4, tmp1, 2);
      __ lhu(tmp5, tmp1, 4);
      __ lhu(tmp6, tmp1, 6);
      __ sh(AT, tmp2, 0);
      __ sh(tmp4, tmp2, 2);
      __ sh(tmp5, tmp2, 4);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);
      __ bgez(AT, l_5);
      __ delayed()->sh(tmp6, tmp2, -2);
    }
    // single element
    __ bind(l_4);

    __ pop(tmp6);
    __ pop(tmp5);
    __ pop(tmp4);

    __ bind(l_14);
    { // FasterArrayCopy
      __ beq(R0, tmp3, l_13);
      __ delayed()->nop();

      __ bind(l_12);
      __ lhu(AT, tmp1, 0);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -1);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_12);
      __ delayed()->nop();
    }

    __ bind(l_13);
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    __ bind(l_debug);
    __ stop("generate_disjoint_short_copy should not reach here");
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, const char *name) {
    Label l_1, l_2, l_3, l_4, l_5;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target = aligned ?
            StubRoutines::arrayof_jshort_disjoint_arraycopy() :
            StubRoutines::jshort_disjoint_arraycopy();

    array_overlap_test(nooverlap_target, 1);

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);

    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // copy dwords from high to low
    __ sll(AT, T1, Address::times_2);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -4));
    __ sll(AT, T1, Address::times_2);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -4));
    __ move(T8, T1);
    __ bind(l_1);
    __ sra(T1, T1, 1);
    __ beq(T1, R0, l_4);
    __ delayed()->nop();
    __ align(16);
    __ bind(l_2);
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();
    __ b(l_4);
    __ delayed()->nop();
    // copy dwords with repeat move
    __ bind(l_3);
    __ bind(l_4);
    __ andi(T8, T8, 1);  // suffix count
    __ beq(T8, R0, l_5);
    __ delayed()->nop();
    // copy suffix
    __ lh(AT, T3, 2);
    __ sh(AT, T0, 2);
    __ bind(l_5);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name, bool dest_uninitialized = false) {
    Label l_3, l_4, l_5, l_6, l_7;
    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();
    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ push(T9);
    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    if (is_oop) {
      gen_write_ref_array_pre_barrier(A1, A2, dest_uninitialized);
    }

    if (!aligned) {
      __ xorr(AT, T3, T0);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_5);  // not same alignment mod 8 -> copy 1 element each time
      __ delayed()->nop();

      __ andi(AT, T3, 7);
      __ beq(AT, R0, l_6);  // copy 2 elements each time
      __ delayed()->nop();

      __ lw(AT, T3, 0);
      __ daddi(T1, T1, -1);
      __ sw(AT, T0, 0);
      __ daddi(T3, T3, 4);
      __ daddi(T0, T0, 4);
    }

    {
      __ bind(l_6);
      __ daddi(AT, T1, -1);
      __ blez(AT, l_5);
      __ delayed()->nop();

      __ bind(l_7);
      __ ld(AT, T3, 0);
      __ sd(AT, T0, 0);
      __ daddi(T3, T3, 8);
      __ daddi(T0, T0, 8);
      __ daddi(T1, T1, -2);
      __ daddi(AT, T1, -2);
      __ bgez(AT, l_7);
      __ delayed()->nop();
    }

    __ bind(l_5);
    __ beq(T1, R0, l_4);
    __ delayed()->nop();

    __ align(16);
    __ bind(l_3);
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0);
    __ addi(T3, T3, 4);
    __ addi(T0, T0, 4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();

    // exit
    __ bind(l_4);
    if (is_oop) {
      gen_write_ref_array_post_barrier(A1, A2, T1);
    }
    __ pop(T9);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name, bool dest_uninitialized = false) {
    Label l_2, l_4;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target;

    if (is_oop) {
      nooverlap_target = aligned ?
              StubRoutines::arrayof_oop_disjoint_arraycopy() :
              StubRoutines::oop_disjoint_arraycopy();
    } else {
      nooverlap_target = aligned ?
              StubRoutines::arrayof_jint_disjoint_arraycopy() :
              StubRoutines::jint_disjoint_arraycopy();
    }

    array_overlap_test(nooverlap_target, 2);

    if (is_oop) {
      gen_write_ref_array_pre_barrier(A1, A2, dest_uninitialized);
    }

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ push(T9);

    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // T3: source array address
    // T0: destination array address
    // T1: element count

    __ sll(AT, T1, Address::times_4);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -4));
    __ sll(AT, T1, Address::times_4);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -4));

    __ beq(T1, R0, l_4);
    __ delayed()->nop();

    __ align(16);
    __ bind(l_2);
    __ lw(AT, T3, 0);
    __ sw(AT, T0, 0);
    __ addi(T3, T3, -4);
    __ addi(T0, T0, -4);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();

    __ bind(l_4);
    if (is_oop) {
      gen_write_ref_array_post_barrier(A1, A2, T1);
    }
    __ pop(T9);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name, bool dest_uninitialized = false) {
    Label l_3, l_4;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();

    if (is_oop) {
      gen_write_ref_array_pre_barrier(A1, A2, dest_uninitialized);
    }

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ push(T9);

    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    // T3: source array address
    // T0: destination array address
    // T1: element count

    __ beq(T1, R0, l_4);
    __ delayed()->nop();

    __ align(16);
    __ bind(l_3);
    __ ld(AT, T3, 0);
    __ sd(AT, T0, 0);
    __ addi(T3, T3, 8);
    __ addi(T0, T0, 8);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_3);
    __ delayed()->nop();

    // exit
    __ bind(l_4);
    if (is_oop) {
      gen_write_ref_array_post_barrier(A1, A2, T1);
    }
    __ pop(T9);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name, bool dest_uninitialized = false) {
    Label l_2, l_4;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target;

    if (is_oop) {
      nooverlap_target = aligned ?
              StubRoutines::arrayof_oop_disjoint_arraycopy() :
              StubRoutines::oop_disjoint_arraycopy();
    } else {
      nooverlap_target = aligned ?
              StubRoutines::arrayof_jlong_disjoint_arraycopy() :
              StubRoutines::jlong_disjoint_arraycopy();
    }

    array_overlap_test(nooverlap_target, 3);

    if (is_oop) {
      gen_write_ref_array_pre_barrier(A1, A2, dest_uninitialized);
    }

    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ push(T8);
    __ push(T9);

    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);

    __ sll(AT, T1, Address::times_8);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -8));
    __ sll(AT, T1, Address::times_8);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -8));

    __ beq(T1, R0, l_4);
    __ delayed()->nop();

    __ align(16);
    __ bind(l_2);
    __ ld(AT, T3, 0);
    __ sd(AT, T0, 0);
    __ addi(T3, T3, -8);
    __ addi(T0, T0, -8);
    __ addi(T1, T1, -1);
    __ bne(T1, R0, l_2);
    __ delayed()->nop();

    // exit
    __ bind(l_4);
    if (is_oop) {
      gen_write_ref_array_post_barrier(A1, A2, T1);
    }
    __ pop(T9);
    __ pop(T8);
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  //FIXME
  address generate_disjoint_long_copy(bool aligned, const char *name) {
    Label l_1, l_2;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);
    __ push(T3);
    __ push(T0);
    __ push(T1);
    __ b(l_2);
    __ delayed()->nop();
    __ align(16);
    __ bind(l_1);
    __ ld(AT, T3, 0);
    __ sd(AT, T0, 0);
    __ addi(T3, T3, 8);
    __ addi(T0, T0, 8);
    __ bind(l_2);
    __ addi(T1, T1, -1);
    __ bgez(T1, l_1);
    __ delayed()->nop();
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  address generate_conjoint_long_copy(bool aligned, const char *name) {
    Label l_1, l_2;
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);
    address start = __ pc();
    address nooverlap_target = aligned ?
            StubRoutines::arrayof_jlong_disjoint_arraycopy() :
            StubRoutines::jlong_disjoint_arraycopy();
    array_overlap_test(nooverlap_target, 3);

    __ push(T3);
    __ push(T0);
    __ push(T1);

    __ move(T1, A2);
    __ move(T3, A0);
    __ move(T0, A1);
    __ sll(AT, T1, Address::times_8);
    __ add(AT, T3, AT);
    __ lea(T3, Address(AT, -8));
    __ sll(AT, T1, Address::times_8);
    __ add(AT, T0, AT);
    __ lea(T0, Address(AT, -8));

    __ b(l_2);
    __ delayed()->nop();
    __ align(16);
    __ bind(l_1);
    __ ld(AT, T3, 0);
    __ sd(AT, T0, 0);
    __ addi(T3, T3, -8);
    __ addi(T0, T0, -8);
    __ bind(l_2);
    __ addi(T1, T1, -1);
    __ bgez(T1, l_1);
    __ delayed()->nop();
    __ pop(T1);
    __ pop(T0);
    __ pop(T3);
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
  void generate_arraycopy_stubs() {
    if (UseCompressedOops) {
      StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_int_oop_copy(false, true,
                                                       "oop_disjoint_arraycopy");
      StubRoutines::_oop_arraycopy                 = generate_conjoint_int_oop_copy(false, true,
                                                       "oop_arraycopy");
      StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true,
                                                       "oop_disjoint_arraycopy_uninit", true);
      StubRoutines::_oop_arraycopy_uninit          = generate_conjoint_int_oop_copy(false, true,
                                                       "oop_arraycopy_uninit", true);
    } else {
      StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_long_oop_copy(false, true,
                                                       "oop_disjoint_arraycopy");
      StubRoutines::_oop_arraycopy                 = generate_conjoint_long_oop_copy(false, true,
                                                       "oop_arraycopy");
      StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true,
                                                       "oop_disjoint_arraycopy_uninit", true);
      StubRoutines::_oop_arraycopy_uninit          = generate_conjoint_long_oop_copy(false, true,
                                                       "oop_arraycopy_uninit", true);
    }

    StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
    StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
    StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");

    StubRoutines::_jbyte_arraycopy  = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
    StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
    StubRoutines::_jint_arraycopy   = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
    StubRoutines::_jlong_arraycopy  = generate_conjoint_long_copy(false, "jlong_arraycopy");

    // We don't generate specialized code for HeapWord-aligned source
    // arrays, so just use the code we've already generated
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
    StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

    StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
    StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

    StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
    StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

    StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
    StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

    StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
    StubRoutines::_arrayof_oop_arraycopy_uninit          = StubRoutines::_oop_arraycopy_uninit;
  }
  // Wang: add a function to implement SafeFetch32 and SafeFetchN
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   A0 = adr
    //   A1 = errValue
    //
    // result:
    //   V0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into A1, may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ lw(A1, A0, 0);
        break;
      case 8:
        // int64_t
        __ ld(A1, A0, 0);
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ addu(V0, A1, R0);
    __ jr(RA);
    __ delayed()->nop();
  }
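  // Usage sketch (illustrative, not part of this file): the runtime calls
  // the generated stub as
  //   int v = SafeFetch32(addr, 0xDEADBEEF);
  // If the load at *fault_pc faults, the signal handler resumes execution
  // at *continuation_pc, so the stub falls through to "return errValue"
  // (A1, copied into V0) instead of crashing the VM.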
#undef __
#define __ masm->

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.
1934 address generate_throw_exception(const char* name,
1935 address runtime_entry,
1936 bool restore_saved_exception_pc) {
1937 // Information about frame layout at time of blocking runtime call.
1938 // Note that we only have to preserve callee-saved registers since
1939 // the compilers are responsible for supplying a continuation point
1940 // if they expect all registers to be preserved.
1941 enum layout {
1942 thread_off, // last_java_sp
1943 S7_off, // callee saved register sp + 1
1944 S6_off, // callee saved register sp + 2
1945 S5_off, // callee saved register sp + 3
1946 S4_off, // callee saved register sp + 4
1947 S3_off, // callee saved register sp + 5
1948 S2_off, // callee saved register sp + 6
1949 S1_off, // callee saved register sp + 7
1950 S0_off, // callee saved register sp + 8
1951 FP_off,
1952 ret_address,
1953 framesize
1954 };
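    // Frame sketch implied by the enum above (word offsets from SP after
    // the prolog; the stack grows toward lower addresses):
    //   SP + 10 : return address      (ret_address)
    //   SP +  9 : saved FP            (FP_off)
    //   SP + 1-8: S7 .. S0            (callee-saved)
    //   SP +  0 : thread              (last_java_sp / first C argument)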
1956 int insts_size = 2048;
1957 int locs_size = 32;
1961 CodeBuffer code(name, insts_size, locs_size);
1962 OopMapSet* oop_maps = new OopMapSet();
1963 MacroAssembler* masm = new MacroAssembler(&code);
1965 address start = __ pc();
1967 // This is an inlined and slightly modified version of call_VM
1968 // which has the ability to fetch the return PC out of
1969 // thread-local storage and also sets up last_Java_sp slightly
1970 // differently than the real call_VM
1971 #ifndef OPT_THREAD
1972 Register java_thread = TREG;
1973 __ get_thread(java_thread);
1974 #else
1975 Register java_thread = TREG;
1976 #endif
1977 if (restore_saved_exception_pc) {
1978 __ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset())); // load the saved exception pc into RA
1979 }
1981 __ enter(); // required for proper stackwalking of RuntimeStub frame
1983 __ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
1984 __ sd(S0, SP, S0_off * wordSize);
1985 __ sd(S1, SP, S1_off * wordSize);
1986 __ sd(S2, SP, S2_off * wordSize);
1987 __ sd(S3, SP, S3_off * wordSize);
1988 __ sd(S4, SP, S4_off * wordSize);
1989 __ sd(S5, SP, S5_off * wordSize);
1990 __ sd(S6, SP, S6_off * wordSize);
1991 __ sd(S7, SP, S7_off * wordSize);
1993 int frame_complete = __ pc() - start;
1994 // push java thread (becomes first argument of C function)
1995 __ sd(java_thread, SP, thread_off * wordSize);
1996 if (java_thread != A0)
1997 __ move(A0, java_thread);
1999 // Set up last_Java_sp and last_Java_fp
2000 __ set_last_Java_frame(java_thread, SP, FP, NULL);
2001 // Align stack
2002 __ set64(AT, -(StackAlignmentInBytes));
2003 __ andr(SP, SP, AT);
2005 __ relocate(relocInfo::internal_pc_type);
2006 {
2007 intptr_t save_pc = (intptr_t)__ pc() + NativeMovConstReg::instruction_size + 28;
2008 __ patchable_set48(AT, save_pc);
2009 }
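    // Hedged assumption: the extra 28 bytes are taken to cover the sd
    // below plus the expansion of call() (target load, jalr and its
    // delay-slot nop), so save_pc is the address just past the call.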
2010 __ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
2012 // Call runtime
2013 __ call(runtime_entry);
2014 __ delayed()->nop();
2015 // Generate oop map
2016 OopMap* map = new OopMap(framesize, 0);
2017 oop_maps->add_gc_map(__ offset(), map);
2019 // restore the thread (cannot use the pushed argument since arguments
2020 // may be overwritten by C code generated by an optimizing compiler);
2021 // however can use the register value directly if it is callee saved.
2022 #ifndef OPT_THREAD
2023 __ get_thread(java_thread);
2024 #endif
2026 __ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
2027 __ reset_last_Java_frame(java_thread, true, true);
2029 // Restore callee-saved registers. This must be done after resetting the Java frame.
2030 __ ld(S0, SP, S0_off * wordSize);
2031 __ ld(S1, SP, S1_off * wordSize);
2032 __ ld(S2, SP, S2_off * wordSize);
2033 __ ld(S3, SP, S3_off * wordSize);
2034 __ ld(S4, SP, S4_off * wordSize);
2035 __ ld(S5, SP, S5_off * wordSize);
2036 __ ld(S6, SP, S6_off * wordSize);
2037 __ ld(S7, SP, S7_off * wordSize);
2039 // discard arguments
2040 __ addi(SP, SP, (framesize-2) * wordSize); // epilog
2041 __ addi(SP, FP, wordSize);
2042 __ ld(FP, SP, -1 * wordSize);
2043 // check for pending exceptions
2044 #ifdef ASSERT
2045 Label L;
2046 __ lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
2047 __ bne(AT, R0, L);
2048 __ delayed()->nop();
2049 __ should_not_reach_here();
2050 __ bind(L);
2051 #endif //ASSERT
2052 __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
2053 __ delayed()->nop();
2054 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name,
2055 &code,
2056 frame_complete,
2057 framesize,
2058 oop_maps, false);
2059 return stub->entry_point();
2060 }
2062 // Initialization
2063 void generate_initial() {
2064 // Generates all stubs and initializes the entry points
2066 //-------------------------------------------------------------
2068 // entry points that exist in all platforms
2069 // Note: this code could be shared among platforms; however, the benefit seems smaller
2070 // than the cost of a much more complicated generator structure.
2071 // See also comment in stubRoutines.hpp.
2072 StubRoutines::_forward_exception_entry = generate_forward_exception();
2073 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
2074 // is referenced by megamorphic call
2075 StubRoutines::_catch_exception_entry = generate_catch_exception();
2077 StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
2079 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception",
2080 CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
2081 // platform dependent
2082 StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
2083 }
2085 void generate_all() {
2086 // Generates all stubs and initializes the entry points
2088 // These entry points require SharedInfo::stack0 to be set up in
2089 // non-core builds and need to be relocatable, so they each
2090 // fabricate a RuntimeStub internally.
2091 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception",
2092 CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
2094 StubRoutines::_throw_IncompatibleClassChangeError_entry = generate_throw_exception("IncompatibleClassChangeError throw_exception",
2095 CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
2097 StubRoutines::_throw_NullPointerException_at_call_entry = generate_throw_exception("NullPointerException at call throw_exception",
2098 CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2100 //------------------------------------------------------------------
2101 // entry points that are platform specific
2103 // support for verify_oop (must happen after universe_init)
2104 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
2105 #ifndef CORE
2106 // arraycopy stubs used by compilers
2107 generate_arraycopy_stubs();
2108 #endif
2110 // Safefetch stubs.
2111 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
2112 &StubRoutines::_safefetch32_fault_pc,
2113 &StubRoutines::_safefetch32_continuation_pc);
2114 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
2115 &StubRoutines::_safefetchN_fault_pc,
2116 &StubRoutines::_safefetchN_continuation_pc);
2117 }
2119 public:
2120 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2121 if (all) {
2122 generate_all();
2123 } else {
2124 generate_initial();
2125 }
2126 }
2127 }; // end class declaration
2129 void StubGenerator_generate(CodeBuffer* code, bool all) {
2130 StubGenerator g(code, all);
2131 }