src/cpu/mips/vm/stubGenerator_mips_64.cpp

author:     aoqi
date:       Mon, 13 Nov 2017 15:49:42 +0800
changeset:  8009:0477693968a6
parent:     8005:b5abf640a085
child:      9144:cecfc245b19a

#5963 wrong frame offset (SP) in StackOverflowError handler
Summary: push/pop before/after bang_stack_with_offset is removed. compiler/6865265/StackOverflowBug.java passed.
This patch also includes code cleanup and code style fixes.

     1 /*
     2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/macroAssembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "interpreter/interpreter.hpp"
    30 #include "nativeInst_mips.hpp"
    31 #include "oops/instanceOop.hpp"
    32 #include "oops/method.hpp"
    33 #include "oops/objArrayKlass.hpp"
    34 #include "oops/oop.inline.hpp"
    35 #include "prims/methodHandles.hpp"
    36 #include "runtime/frame.inline.hpp"
    37 #include "runtime/handles.inline.hpp"
    38 #include "runtime/sharedRuntime.hpp"
    39 #include "runtime/stubCodeGenerator.hpp"
    40 #include "runtime/stubRoutines.hpp"
    41 #include "runtime/thread.inline.hpp"
    42 #include "utilities/top.hpp"
    43 #ifdef COMPILER2
    44 #include "opto/runtime.hpp"
    45 #endif
    47 // Declaration and definition of StubGenerator (no .hpp file).
    48 // For a more detailed description of the stub routine structure
    49 // see the comment in stubRoutines.hpp
    51 #define __ _masm->
    52 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
    53 //#define a__ ((Assembler*)_masm)->
    55 //#ifdef PRODUCT
    56 //#define BLOCK_COMMENT(str) /* nothing */
    57 //#else
    58 //#define BLOCK_COMMENT(str) __ block_comment(str)
    59 //#endif
    61 //#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
    62 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
    64 // Stub Code definitions
    66 static address handle_unsafe_access() {
    67   JavaThread* thread = JavaThread::current();
    68   address pc = thread->saved_exception_pc();
    69   // pc is the instruction which we must emulate
    70   // doing a no-op is fine:  return garbage from the load
    71   // therefore, compute npc
    72   address npc = (address)((unsigned long)pc + sizeof(unsigned long));
    74   // request an async exception
    75   thread->set_pending_unsafe_access_error();
    77   // return address of next instruction to execute
    78   return npc;
    79 }
    81 class StubGenerator: public StubCodeGenerator {
    82  private:
    84   // ABI mips n64
     85   // This figure is not the MIPS ABI; it is the layout used when calling Java from C.
    86   // Call stubs are used to call Java from C
    87   //
    88   //    [ return_from_Java     ]
    89   //    [ argument word n-1    ] <--- sp
    90   //      ...
    91   //    [ argument word 0      ]
    92   //      ...
     93   //-10 [ S6                   ]
     94   // -9 [ S5                   ]
     95   // -8 [ S4                   ]
     96   // -7 [ S3                   ]
     97   // -6 [ S0                   ]
     98   // -5 [ TSR(S2)              ]
    99   // -4 [ LVP(S7)              ]
   100   // -3 [ BCP(S1)              ]
   101   // -2 [ saved fp             ] <--- fp_after_call
   102   // -1 [ return address       ]
   103   //  0 [ ptr. to call wrapper ] <--- a0 (old sp -->)fp
   104   //  1 [ result               ] <--- a1
   105   //  2 [ result_type          ] <--- a2
   106   //  3 [ method               ] <--- a3
   107   //  4 [ entry_point          ] <--- a4
   108   //  5 [ parameters           ] <--- a5
   109   //  6 [ parameter_size       ] <--- a6
   110   //  7 [ thread               ] <--- a7
   112   //
    113   // _LP64: n64 does not save parameters on the stack.
   114   //
   115   //    [ return_from_Java     ]
   116   //    [ argument word n-1    ] <--- sp
   117   //      ...
   118   //    [ argument word 0      ]
   119   //      ...
   120   //-14 [ thread               ]
   121   //-13 [ result_type          ] <--- a2
   122   //-12 [ result               ] <--- a1
   123   //-11 [ ptr. to call wrapper ] <--- a0
    124   //-10 [ S6                   ]
    125   // -9 [ S5                   ]
    126   // -8 [ S4                   ]
    127   // -7 [ S3                   ]
    128   // -6 [ S0                   ]
    129   // -5 [ TSR(S2)              ]
   130   // -4 [ LVP(S7)              ]
   131   // -3 [ BCP(S1)              ]
   132   // -2 [ saved fp             ] <--- fp_after_call
   133   // -1 [ return address       ]
    134   //  0 [                      ] <--- old sp
   135   /*
   136    * 2014/01/16 Fu: Find a right place in the call_stub for GP.
   137    * GP will point to the starting point of Interpreter::dispatch_table(itos).
   138    * It should be saved/restored before/after Java calls.
   139    *
   140    */
   141   enum call_stub_layout {
   142     RA_off             = -1,
   143     FP_off             = -2,
   144     BCP_off            = -3,
   145     LVP_off            = -4,
   146     TSR_off            = -5,
   147     S1_off             = -6,
   148     S3_off             = -7,
   149     S4_off             = -8,
   150     S5_off             = -9,
   151     S6_off             = -10,
   152     result_off         = -11,
   153     result_type_off    = -12,
   154     thread_off         = -13,
   155     total_off          = thread_off - 3,
   156     GP_off             = -16,
   157  };
   159   address generate_call_stub(address& return_address) {
   161     StubCodeMark mark(this, "StubRoutines", "call_stub");
   162     address start = __ pc();
   164     // same as in generate_catch_exception()!
   166     // stub code
   167     // save ra and fp
   168     __ sd(RA, SP, RA_off * wordSize);
   169     __ sd(FP, SP, FP_off * wordSize);
   170     __ sd(BCP, SP, BCP_off * wordSize);
   171     __ sd(LVP, SP, LVP_off * wordSize);
   172     __ sd(GP, SP, GP_off * wordSize);
   173     __ sd(TSR, SP, TSR_off * wordSize);
   174     __ sd(S1, SP, S1_off * wordSize);
   175     __ sd(S3, SP, S3_off * wordSize);
   176     __ sd(S4, SP, S4_off * wordSize);
   177     __ sd(S5, SP, S5_off * wordSize);
   178     __ sd(S6, SP, S6_off * wordSize);
   181     __ set64(GP, (long)Interpreter::dispatch_table(itos));
   183     // I think 14 is the max gap between argument and callee saved register
   184     __ daddi(FP, SP, (-2) * wordSize);
   185     __ daddi(SP, SP, total_off * wordSize);
   186     __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
   187     __ sd(A1, FP, result_off * wordSize);
   188     __ sd(A2, FP, result_type_off * wordSize);
   189     __ sd(A7, FP, thread_off * wordSize);
   191 #ifdef OPT_THREAD
   192     __ move(TREG, A7);
   193 #endif
   194     //add for compressedoops
   195     __ reinit_heapbase();
   197 #ifdef ASSERT
   198     // make sure we have no pending exceptions
   199     {
   200       Label L;
   201       __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
   202       __ beq(AT, R0, L);
   203       __ delayed()->nop();
    204       /* FIXME: it is not clear how to implement stop() on the MIPS arch; do it in the future */
   205       __ stop("StubRoutines::call_stub: entered with pending exception");
   206       __ bind(L);
   207     }
   208 #endif
   210     // pass parameters if any
   211     // A5: parameter
   212     // A6: parameter_size
   213     // T0: parameter_size_tmp(--)
   214     // T2: offset(++)
   215     // T3: tmp
   216     Label parameters_done;
    217     // check whether parameter_size is zero
   218     __ beq(A6, R0, parameters_done);
   219     __ delayed()->nop();
   220     __ dsll(AT, A6, Interpreter::logStackElementSize);
   221     __ dsub(SP, SP, AT);
   222     __ move(AT, -StackAlignmentInBytes);
   223     __ andr(SP, SP , AT);
   224     // Copy Java parameters in reverse order (receiver last)
   225     // Note that the argument order is inverted in the process
    226     // source is A5[T0: N-1..0]
    227     // dest   is SP[T2: 0..N-1]
   228     Label loop;
   229     __ move(T0, A6);
   230     __ move(T2, R0);
   231     __ bind(loop);
   233     // get parameter
   234     __ dsll(T3, T0, LogBytesPerWord);
   235     __ dadd(T3, T3, A5);
   236     __ ld(AT, T3,  -wordSize);
   237     __ dsll(T3, T2, LogBytesPerWord);
   238     __ dadd(T3, T3, SP);
   239     __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
   240     __ daddi(T2, T2, 1);
   241     __ daddi(T0, T0, -1);
   242     __ bne(T0, R0, loop);
   243     __ delayed()->nop();
   244     // advance to next parameter
   246     // call Java function
   247     __ bind(parameters_done);
   249     // receiver in V0, methodOop in Rmethod
   251     __ move(Rmethod, A3);
   252     __ move(Rsender, SP);             //set sender sp
   253     __ jalr(A4);
   254     __ delayed()->nop();
   255     return_address = __ pc();
   257     Label common_return;
   258     __ bind(common_return);
   260     // store result depending on type
   261     // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
   262     __ ld(T0, FP, result_off * wordSize);   // result --> T0
   263     Label is_long, is_float, is_double, exit;
   264     __ ld(T2, FP, result_type_off * wordSize);  // result_type --> T2
   265     __ daddi(T3, T2, (-1) * T_LONG);
   266     __ beq(T3, R0, is_long);
   267     __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
   268     __ beq(T3, R0, is_float);
   269     __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
   270     __ beq(T3, R0, is_double);
   271     __ delayed()->nop();
   273     // handle T_INT case
   274     __ sd(V0, T0, 0 * wordSize);
   275     __ bind(exit);
   277     // restore
   278     __ daddi(SP, FP, 2 * wordSize );
   279     __ ld(RA, SP, RA_off * wordSize);
   280     __ ld(FP, SP, FP_off * wordSize);
   281     __ ld(BCP, SP, BCP_off * wordSize);
   282     __ ld(LVP, SP, LVP_off * wordSize);
   283     __ ld(GP, SP, GP_off * wordSize);
   284     __ ld(TSR, SP, TSR_off * wordSize);
   286     __ ld(S1, SP, S1_off * wordSize);
   287     __ ld(S3, SP, S3_off * wordSize);
   288     __ ld(S4, SP, S4_off * wordSize);
   289     __ ld(S5, SP, S5_off * wordSize);
   290     __ ld(S6, SP, S6_off * wordSize);
   292     // return
   293     __ jr(RA);
   294     __ delayed()->nop();
   296     // handle return types different from T_INT
   297     __ bind(is_long);
   298     __ sd(V0, T0, 0 * wordSize);
   299     //__ sd(V1, T0, 1 * wordSize);
   300     //__ sd(R0, T0, 1 * wordSize);
   301     __ b(exit);
   302     __ delayed()->nop();
   304     __ bind(is_float);
   305     __ swc1(F0, T0, 0 * wordSize);
   306     __ b(exit);
   307     __ delayed()->nop();
   309     __ bind(is_double);
   310     __ sdc1(F0, T0, 0 * wordSize);
   311     __ b(exit);
   312     __ delayed()->nop();
    313     // FIXME: the 1.6 MIPS version adds an FPU operation here
   314     StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
   315     __ b(common_return);
   316     __ delayed()->nop();
   317     return start;
   318   }
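  // A minimal sketch (not part of the generated stub) of how the entry above is
  // reached from C++, assuming the standard CallStub declaration in stubRoutines.hpp;
  // the parameter order matches the A0..A7 assignments in the layout comment above:
  //
  //   typedef void (*CallStub)(address   link,               // A0: ptr. to call wrapper
  //                            intptr_t* result,             // A1
  //                            BasicType result_type,        // A2
  //                            Method*   method,             // A3
  //                            address   entry_point,        // A4
  //                            intptr_t* parameters,         // A5
  //                            int       size_of_parameters, // A6
  //                            TRAPS);                       // A7: thread
  //
  //   // e.g. JavaCalls::call_helper() effectively does:
  //   StubRoutines::call_stub()(link, result, result_type, method,
  //                             entry_point, parameters, size_of_parameters, thread);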
   320   // Return point for a Java call if there's an exception thrown in
   321   // Java code.  The exception is caught and transformed into a
   322   // pending exception stored in JavaThread that can be tested from
   323   // within the VM.
   324   //
   325   // Note: Usually the parameters are removed by the callee. In case
   326   // of an exception crossing an activation frame boundary, that is
   327   // not the case if the callee is compiled code => need to setup the
    328   // SP.
    329   //
    330   // V0: exception oop
   332   address generate_catch_exception() {
   333     StubCodeMark mark(this, "StubRoutines", "catch_exception");
   334     address start = __ pc();
   336     Register thread = TREG;
   338     // get thread directly
   339 #ifndef OPT_THREAD
   340     __ ld(thread, FP, thread_off * wordSize);
   341 #endif
   343 #ifdef ASSERT
   344     // verify that threads correspond
   345     { Label L;
   346       __ get_thread(T8);
   347       __ beq(T8, thread, L);
   348       __ delayed()->nop();
   349       __ stop("StubRoutines::catch_exception: threads must correspond");
   350       __ bind(L);
   351     }
   352 #endif
   353     // set pending exception
   354     __ verify_oop(V0);
   355     __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
   356     __ li(AT, (long)__FILE__);
   357     __ sd(AT, thread, in_bytes(Thread::exception_file_offset   ()));
   358     __ li(AT, (long)__LINE__);
   359     __ sd(AT, thread, in_bytes(Thread::exception_line_offset   ()));
   361     // complete return to VM
   362     assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
   363     __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
   364     __ delayed()->nop();
   366     return start;
   367   }
   369   // Continuation point for runtime calls returning with a pending
   370   // exception.  The pending exception check happened in the runtime
   371   // or native call stub.  The pending exception in Thread is
   372   // converted into a Java-level exception.
   373   //
   374   // Contract with Java-level exception handlers:
    375   // V0: exception
    376   // V1: throwing pc
   377   //
   378   // NOTE: At entry of this stub, exception-pc must be on stack !!
   380   address generate_forward_exception() {
   381     StubCodeMark mark(this, "StubRoutines", "forward exception");
    383     Register thread = TREG;
   384     address start = __ pc();
   386     // Upon entry, the sp points to the return address returning into
   387     // Java (interpreted or compiled) code; i.e., the return address
   388     // throwing pc.
   389     //
   390     // Arguments pushed before the runtime call are still on the stack
   391     // but the exception handler will reset the stack pointer ->
   392     // ignore them.  A potential result in registers can be ignored as
   393     // well.
   395 #ifndef OPT_THREAD
   396     __ get_thread(thread);
   397 #endif
   398 #ifdef ASSERT
   399     // make sure this code is only executed if there is a pending exception
   400     {
   401       Label L;
   402       __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
   403       __ bne(AT, R0, L);
   404       __ delayed()->nop();
   405       __ stop("StubRoutines::forward exception: no pending exception (1)");
   406       __ bind(L);
   407     }
   408 #endif
   410     // compute exception handler into T9
   411     __ ld(A1, SP, 0);
   412     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
   413     __ move(T9, V0);
   414     __ pop(V1);
   416 #ifndef OPT_THREAD
   417     __ get_thread(thread);
   418 #endif
   419     __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
   420     __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));
   422 #ifdef ASSERT
   423     // make sure exception is set
   424     {
   425       Label L;
   426       __ bne(V0, R0, L);
   427       __ delayed()->nop();
   428       __ stop("StubRoutines::forward exception: no pending exception (2)");
   429       __ bind(L);
   430     }
   431 #endif
   433     // continue at exception handler (return address removed)
   434     // V0: exception
   435     // T9: exception handler
   436     // V1: throwing pc
   437     __ verify_oop(V0);
   438     __ jr(T9);
   439     __ delayed()->nop();
   441     return start;
   442   }
   444   // Support for intptr_t get_previous_fp()
   445   //
   446   // This routine is used to find the previous frame pointer for the
    447   // caller (current_frame_guess). This is used as part of debugging
    448   // when ps() is seemingly lost trying to find frames.
    449   // This code assumes that the caller (current_frame_guess) has a frame.
   450   address generate_get_previous_fp() {
   451     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
   452     const Address old_fp       (FP,  0);
   453     const Address older_fp       (V0,  0);
   454     address start = __ pc();
   455     __ enter();
    456     __ lw(V0, old_fp); // caller's fp
   457     __ lw(V0, older_fp); // the frame for ps()
   458     __ leave();
   459     __ jr(RA);
   460     __ delayed()->nop();
   461     return start;
   462   }
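  // Rough C-level equivalent of the stub above (hypothetical helper, for
  // illustration only), assuming the caller's saved FP lives at offset 0 of the
  // current frame pointer:
  //
  //   intptr_t get_previous_fp_sketch(intptr_t** current_fp) {
  //     intptr_t** callers_fp = (intptr_t**) current_fp[0]; // [FP + 0] -> caller's fp
  //     return (intptr_t) callers_fp[0];                    // one more level, for ps()
  //   }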
   464   // The following routine generates a subroutine to throw an
   465   // asynchronous UnknownError when an unsafe access gets a fault that
   466   // could not be reasonably prevented by the programmer.  (Example:
   467   // SIGBUS/OBJERR.)
   468   address generate_handler_for_unsafe_access() {
   469     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
   470     address start = __ pc();
   471     __ pushad();                      // push registers
   472     //  Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
   473     __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
   474     __ delayed()->nop();
   475     __ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord);
   476     __ popad();
   477     __ jr(RA);
   478     __ delayed()->nop();
   479     return start;
   480   }
   482   // Non-destructive plausibility checks for oops
   483   //
   484   // Arguments:
   485   //    all args on stack!
   486   //
   487   // Stack after saving c_rarg3:
   488   //    [tos + 0]: saved c_rarg3
   489   //    [tos + 1]: saved c_rarg2
   490   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
   491   //    [tos + 3]: saved flags
   492   //    [tos + 4]: return address
   493   //  * [tos + 5]: error message (char*)
   494   //  * [tos + 6]: object to verify (oop)
   495   //  * [tos + 7]: saved rax - saved by caller and bashed
   496   //  * = popped on exit
   497   address generate_verify_oop() {
   498     StubCodeMark mark(this, "StubRoutines", "verify_oop");
   499     address start = __ pc();
   500     __ reinit_heapbase();
   501     __ verify_oop_subroutine();
   502     address end = __ pc();
   503     return start;
   504   }
   506   //
   507   //  Generate overlap test for array copy stubs
   508   //
   509   //  Input:
   510   //     A0    -  array1
   511   //     A1    -  array2
   512   //     A2    -  element count
   513   //
    514   //  Note: this code may only use AT and T9 as temporaries
   515   //
   517  // use T9 as temp
   518   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
   519     int elem_size = 1 << log2_elem_size;
   520     Address::ScaleFactor sf = Address::times_1;
   522     switch (log2_elem_size) {
   523       case 0: sf = Address::times_1; break;
   524       case 1: sf = Address::times_2; break;
   525       case 2: sf = Address::times_4; break;
   526       case 3: sf = Address::times_8; break;
   527     }
   529     __ dsll(AT, A2, sf);
   530     __ dadd(AT, AT, A0);
   531     __ lea(T9, Address(AT, -elem_size));
   532     __ dsub(AT, A1, A0);
   533     __ blez(AT, no_overlap_target);
   534     __ delayed()->nop();
   535     __ dsub(AT, A1, T9);
   536     __ bgtz(AT, no_overlap_target);
   537     __ delayed()->nop();
    539     // 2016/05/10 aoqi: If A0 = 0xf... and A1 = 0x0..., then goto no_overlap_target
   540     Label L;
   541     __ bgez(A0, L);
   542     __ delayed()->nop();
   543     __ bgtz(A1, no_overlap_target);
   544     __ delayed()->nop();
   545     __ bind(L);
   547   }
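  // A sketch of the condition tested above; both branches to no_overlap_target
  // mean the forward (disjoint) copy is safe:
  //
  //   last_src = A0 + (A2 << log2_elem_size) - elem_size;   // T9: last source element
  //   if (A1 <= A0)       goto no_overlap_target;           // dest at or below source
  //   if (A1 >  last_src) goto no_overlap_target;           // dest above last source element
  //   // otherwise the regions overlap: fall through to the conjoint (backward) copy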
   549   //
   550   //  Generate store check for array
   551   //
   552   //  Input:
    553   //     T0    -  starting address
    554   //     T1    -  element count
   555   //
   556   //  The 2 input registers are overwritten
   557   //
   560   void array_store_check(Register tmp) {
   561     assert_different_registers(tmp, AT, T0, T1);
   562     BarrierSet* bs = Universe::heap()->barrier_set();
   563     assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
   564     CardTableModRefBS* ct = (CardTableModRefBS*)bs;
   565     assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
   566     Label l_0;
   568     if (UseConcMarkSweepGC) __ sync();
   570     __ set64(tmp, (long)ct->byte_map_base);
   572     __ dsll(AT, T1, TIMES_OOP);
   573     __ dadd(AT, T0, AT);
   574     __ daddiu(T1, AT, - BytesPerHeapOop);
   576     __ shr(T0, CardTableModRefBS::card_shift);
   577     __ shr(T1, CardTableModRefBS::card_shift);
   579     __ dsub(T1, T1, T0);   // end --> cards count
   580     __ bind(l_0);
   582     __ dadd(AT, tmp, T0);
   583     if (UseLoongsonISA) {
   584       __ gssbx(R0, AT, T1, 0);
   585     } else {
   586       __ dadd(AT, AT, T1);
   587       __ sb(R0, AT, 0);
   588     }
   590     __ bgtz(T1, l_0);
   591     __ delayed()->daddi(T1, T1, - 1);
   592   }
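  // The loop above is the usual card-table post barrier; a minimal sketch of the
  // same logic in C++, assuming dirty_card_val() == 0:
  //
  //   jbyte*    base  = ct->byte_map_base;
  //   uintptr_t first = (uintptr_t) start >> CardTableModRefBS::card_shift;
  //   uintptr_t last  = ((uintptr_t) start + count * BytesPerHeapOop - BytesPerHeapOop)
  //                     >> CardTableModRefBS::card_shift;
  //   for (uintptr_t c = first; c <= last; c++) {
  //     base[c] = 0;  // mark card dirty
  //   }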
   594   // Generate code for an array write pre barrier
   595   //
   596   //     addr    -  starting address
   597   //     count   -  element count
   598   //     tmp     - scratch register
   599   //
   600   //     Destroy no registers!
   601   //
   602   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
   603     BarrierSet* bs = Universe::heap()->barrier_set();
   604     switch (bs->kind()) {
   605       case BarrierSet::G1SATBCT:
   606       case BarrierSet::G1SATBCTLogging:
    607         // With G1, don't generate the call if we statically know that the target is uninitialized
   608         if (!dest_uninitialized) {
   609            __ pushad();                      // push registers
   610            if (count == A0) {
   611              if (addr == A1) {
   612                // exactly backwards!!
   613                //__ xchgptr(c_rarg1, c_rarg0);
   614                __ move(AT, A0);
   615                __ move(A0, A1);
   616                __ move(A1, AT);
   617              } else {
   618                __ move(A1, count);
   619                __ move(A0, addr);
   620              }
   621            } else {
   622              __ move(A0, addr);
   623              __ move(A1, count);
   624            }
   625            __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
   626            __ popad();
   627         }
   628         break;
   629       case BarrierSet::CardTableModRef:
   630       case BarrierSet::CardTableExtension:
   631       case BarrierSet::ModRef:
   632         break;
   633       default:
   634         ShouldNotReachHere();
   636     }
   637   }
   639   //
   640   // Generate code for an array write post barrier
   641   //
   642   //  Input:
   643   //     start    - register containing starting address of destination array
   644   //     count    - elements count
   645   //     scratch  - scratch register
   646   //
   647   //  The input registers are overwritten.
   648   //
   649   void  gen_write_ref_array_post_barrier(Register start, Register count, Register scratch) {
   650     assert_different_registers(start, count, scratch, AT);
   651     BarrierSet* bs = Universe::heap()->barrier_set();
   652     switch (bs->kind()) {
   653       case BarrierSet::G1SATBCT:
   654       case BarrierSet::G1SATBCTLogging:
   655         {
   656           __ pushad();             // push registers (overkill)
   657           if (count == A0) {
   658             if (start == A1) {
   659               // exactly backwards!!
   660               //__ xchgptr(c_rarg1, c_rarg0);
   661               __ move(AT, A0);
   662               __ move(A0, A1);
   663               __ move(A1, AT);
   664             } else {
   665               __ move(A1, count);
   666               __ move(A0, start);
   667             }
   668           } else {
   669             __ move(A0, start);
   670             __ move(A1, count);
   671           }
   672           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
   673           __ popad();
   674         }
   675         break;
   676       case BarrierSet::CardTableModRef:
   677       case BarrierSet::CardTableExtension:
   678         {
   679           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
   680           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
   682           Label L_loop;
   683           const Register end = count;
   685           if (UseConcMarkSweepGC) __ sync();
   687           int64_t disp = (int64_t) ct->byte_map_base;
   688           __ set64(scratch, disp);
   690           __ lea(end, Address(start, count, TIMES_OOP, 0));  // end == start+count*oop_size
   691           __ daddiu(end, end, -BytesPerHeapOop); // end - 1 to make inclusive
   692           __ shr(start, CardTableModRefBS::card_shift);
   693           __ shr(end,   CardTableModRefBS::card_shift);
   694           __ dsubu(end, end, start); // end --> cards count
   696           __ daddu(start, start, scratch);
   698           __ bind(L_loop);
   699           if (UseLoongsonISA) {
   700             __ gssbx(R0, start, count, 0);
   701           } else {
   702             __ daddu(AT, start, count);
   703             __ sb(R0, AT, 0);
   704           }
   705           __ daddiu(count, count, -1);
   706           __ slt(AT, count, R0);
   707           __ beq(AT, R0, L_loop);
   708           __ nop();
   709         }
   710         break;
   711       default:
   712         ShouldNotReachHere();
   713     }
   714   }
   716   // Arguments:
   717   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   718   //             ignored
   719   //   name    - stub name string
   720   //
   721   // Inputs:
   722   //   c_rarg0   - source array address
   723   //   c_rarg1   - destination array address
   724   //   c_rarg2   - element count, treated as ssize_t, can be zero
   725   //
   726   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
   727   // we let the hardware handle it.  The one to eight bytes within words,
   728   // dwords or qwords that span cache line boundaries will still be loaded
   729   // and stored atomically.
   730   //
   731   // Side Effects:
   732   //   disjoint_byte_copy_entry is set to the no-overlap entry point
   733   //   used by generate_conjoint_byte_copy().
   734   //
   735   address generate_disjoint_byte_copy(bool aligned, const char * name) {
   736     StubCodeMark mark(this, "StubRoutines", name);
   737     __ align(CodeEntryAlignment);
   740     Register tmp1 = T0;
   741     Register tmp2 = T1;
   742     Register tmp3 = T3;
   744     address start = __ pc();
   746     __ push(tmp1);
   747     __ push(tmp2);
   748     __ push(tmp3);
   749     __ move(tmp1, A0);
   750     __ move(tmp2, A1);
   751     __ move(tmp3, A2);
   754     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11;
   755     Label l_debug;
    757     __ daddi(AT, tmp3, -9); // why is the number 9?
   758     __ blez(AT, l_9);
   759     __ delayed()->nop();
   761     if (!aligned) {
   762       __ xorr(AT, tmp1, tmp2);
   763       __ andi(AT, AT, 1);
   764       __ bne(AT, R0, l_9); // if arrays don't have the same alignment mod 2, do 1 element copy
   765       __ delayed()->nop();
   767       __ andi(AT, tmp1, 1);
    768       __ beq(AT, R0, l_10); // copy 1 element if necessary to align to 2 bytes
   769       __ delayed()->nop();
   771       __ lb(AT, tmp1, 0);
   772       __ daddi(tmp1, tmp1, 1);
   773       __ sb(AT, tmp2, 0);
   774       __ daddi(tmp2, tmp2, 1);
   775       __ daddi(tmp3, tmp3, -1);
   776       __ bind(l_10);
   778       __ xorr(AT, tmp1, tmp2);
   779       __ andi(AT, AT, 3);
   780       __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 2 elements copy
   781       __ delayed()->nop();
   783       // At this point it is guaranteed that both, from and to have the same alignment mod 4.
   785       // Copy 2 elements if necessary to align to 4 bytes.
   786       __ andi(AT, tmp1, 3);
   787       __ beq(AT, R0, l_2);
   788       __ delayed()->nop();
   790       __ lhu(AT, tmp1, 0);
   791       __ daddi(tmp1, tmp1, 2);
   792       __ sh(AT, tmp2, 0);
   793       __ daddi(tmp2, tmp2, 2);
   794       __ daddi(tmp3, tmp3, -2);
   795       __ bind(l_2);
   797       // At this point the positions of both, from and to, are at least 4 byte aligned.
   799       // Copy 4 elements at a time.
   800       // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
   801       __ xorr(AT, tmp1, tmp2);
   802       __ andi(AT, AT, 7);
   803       __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned
   804       __ delayed()->nop();
    806       // Copy 4 elements if necessary to align to 8 bytes.
   807       __ andi(AT, tmp1, 7);
   808       __ beq(AT, R0, l_7);
   809       __ delayed()->nop();
   811       __ lw(AT, tmp1, 0);
   812       __ daddi(tmp3, tmp3, -4);
   813       __ sw(AT, tmp2, 0);
   814       { // FasterArrayCopy
   815         __ daddi(tmp1, tmp1, 4);
   816         __ daddi(tmp2, tmp2, 4);
   817       }
   818     }
   820     __ bind(l_7);
    822     // Copy 8 elements at a time; either the loads or the stores can
    823     // be unaligned if aligned == false.
   825     { // FasterArrayCopy
   826       __ daddi(AT, tmp3, -7);
    827       __ blez(AT, l_6); // copy 4 at a time if fewer than 8 elements remain
   828       __ delayed()->nop();
   830       __ bind(l_8);
   831       // For Loongson, there is 128-bit memory access. TODO
   832       __ ld(AT, tmp1, 0);
   833       __ sd(AT, tmp2, 0);
   834       __ daddi(tmp1, tmp1, 8);
   835       __ daddi(tmp2, tmp2, 8);
   836       __ daddi(tmp3, tmp3, -8);
   837       __ daddi(AT, tmp3, -8);
   838       __ bgez(AT, l_8);
   839       __ delayed()->nop();
   840     }
   841     __ bind(l_6);
   843     // copy 4 bytes at a time
   844     { // FasterArrayCopy
   845       __ daddi(AT, tmp3, -3);
   846       __ blez(AT, l_1);
   847       __ delayed()->nop();
   849       __ bind(l_3);
   850       __ lw(AT, tmp1, 0);
   851       __ sw(AT, tmp2, 0);
   852       __ daddi(tmp1, tmp1, 4);
   853       __ daddi(tmp2, tmp2, 4);
   854       __ daddi(tmp3, tmp3, -4);
   855       __ daddi(AT, tmp3, -4);
   856       __ bgez(AT, l_3);
   857       __ delayed()->nop();
   859     }
    861     // copy 2 bytes at a time
   862     __ bind(l_1);
   863     {
   864       __ daddi(AT, tmp3, -1);
   865       __ blez(AT, l_9);
   866       __ delayed()->nop();
   868       __ bind(l_5);
   869       __ lhu(AT, tmp1, 0);
   870       __ daddi(tmp3, tmp3, -2);
   871       __ sh(AT, tmp2, 0);
   872       __ daddi(tmp1, tmp1, 2);
   873       __ daddi(tmp2, tmp2, 2);
   874       __ daddi(AT, tmp3, -2);
   875       __ bgez(AT, l_5);
   876       __ delayed()->nop();
   877     }
    879     // do 1 element copy -- byte
   880     __ bind(l_9);
   881     __ beq(R0, tmp3, l_4);
   882     __ delayed()->nop();
   884     {
   885       __ bind(l_11);
   886       __ lb(AT, tmp1, 0);
   887       __ daddi(tmp3, tmp3, -1);
   888       __ sb(AT, tmp2, 0);
   889       __ daddi(tmp1, tmp1, 1);
   890       __ daddi(tmp2, tmp2, 1);
   891       __ daddi(AT, tmp3, -1);
   892       __ bgez(AT, l_11);
   893       __ delayed()->nop();
   894     }
   896     __ bind(l_4);
   897     __ pop(tmp3);
   898     __ pop(tmp2);
   899     __ pop(tmp1);
   901     __ jr(RA);
   902     __ delayed()->nop();
   904     return start;
   905   }
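  // Ignoring the alignment cascade, the stub above is semantically a plain
  // forward byte copy of non-overlapping ranges (sketch):
  //
  //   for (ssize_t i = 0; i < count; i++) to[i] = from[i];
  //
  // the 2-/4-/8-byte paths only exist so that wider aligned loads and stores
  // can be used when 'from' and 'to' allow it.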
   907   // Arguments:
   908   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   909   //             ignored
   910   //   name    - stub name string
   911   //
   912   // Inputs:
   913   //   A0   - source array address
   914   //   A1   - destination array address
   915   //   A2   - element count, treated as ssize_t, can be zero
   916   //
   917   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
   918   // we let the hardware handle it.  The one to eight bytes within words,
   919   // dwords or qwords that span cache line boundaries will still be loaded
   920   // and stored atomically.
   921   //
   922   address generate_conjoint_byte_copy(bool aligned, const char *name) {
   923     __ align(CodeEntryAlignment);
   924     StubCodeMark mark(this, "StubRoutines", name);
   925     address start = __ pc();
   927     Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
   928     Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;
   930     address nooverlap_target = aligned ?
   931       StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
   932       StubRoutines::jbyte_disjoint_arraycopy();
   934     array_overlap_test(nooverlap_target, 0);
   936     const Register from      = A0;   // source array address
   937     const Register to        = A1;   // destination array address
   938     const Register count     = A2;   // elements count
   939     const Register end_from  = T3;   // source array end address
   940     const Register end_to    = T0;   // destination array end address
    941     const Register end_count = T1;   // remaining byte count
   943     __ push(end_from);
   944     __ push(end_to);
   945     __ push(end_count);
   946     __ push(T8);
   948     // copy from high to low
   949     __ move(end_count, count);
   950     __ dadd(end_from, from, end_count);
   951     __ dadd(end_to, to, end_count);
    953     // 2016/05/08 aoqi: If end_from and end_to have different alignment, unaligned copy is performed.
   954     __ andi(AT, end_from, 3);
   955     __ andi(T8, end_to, 3);
   956     __ bne(AT, T8, l_copy_byte);
   957     __ delayed()->nop();
   959     // First deal with the unaligned data at the top.
   960     __ bind(l_unaligned);
   961     __ beq(end_count, R0, l_exit);
   962     __ delayed()->nop();
   964     __ andi(AT, end_from, 3);
   965     __ bne(AT, R0, l_from_unaligned);
   966     __ delayed()->nop();
   968     __ andi(AT, end_to, 3);
   969     __ beq(AT, R0, l_4_bytes_aligned);
   970     __ delayed()->nop();
   972     __ bind(l_from_unaligned);
   973     __ lb(AT, end_from, -1);
   974     __ sb(AT, end_to, -1);
   975     __ daddi(end_from, end_from, -1);
   976     __ daddi(end_to, end_to, -1);
   977     __ daddi(end_count, end_count, -1);
   978     __ b(l_unaligned);
   979     __ delayed()->nop();
   981     // now end_to, end_from point to 4-byte aligned high-ends
   982     //     end_count contains byte count that is not copied.
   983     // copy 4 bytes at a time
   984     __ bind(l_4_bytes_aligned);
   986     __ move(T8, end_count);
   987     __ daddi(AT, end_count, -3);
   988     __ blez(AT, l_copy_suffix);
   989     __ delayed()->nop();
   991     //__ andi(T8, T8, 3);
   992     __ lea(end_from, Address(end_from, -4));
   993     __ lea(end_to, Address(end_to, -4));
   995     __ dsrl(end_count, end_count, 2);
   996     __ align(16);
   997     __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes
   998     __ lw(AT, end_from, 0);
   999     __ sw(AT, end_to, 0);
  1000     __ addi(end_from, end_from, -4);
  1001     __ addi(end_to, end_to, -4);
  1002     __ addi(end_count, end_count, -1);
  1003     __ bne(end_count, R0, l_copy_4_bytes_loop);
  1004     __ delayed()->nop();
  1006     __ b(l_copy_suffix);
  1007     __ delayed()->nop();
  1008     // copy dwords aligned or not with repeat move
  1009     // l_copy_suffix
  1010     // copy suffix (0-3 bytes)
  1011     __ bind(l_copy_suffix);
  1012     __ andi(T8, T8, 3);
  1013     __ beq(T8, R0, l_exit);
  1014     __ delayed()->nop();
  1015     __ addi(end_from, end_from, 3);
  1016     __ addi(end_to, end_to, 3);
  1017     __ bind(l_copy_suffix_loop);
  1018     __ lb(AT, end_from, 0);
  1019     __ sb(AT, end_to, 0);
  1020     __ addi(end_from, end_from, -1);
  1021     __ addi(end_to, end_to, -1);
  1022     __ addi(T8, T8, -1);
  1023     __ bne(T8, R0, l_copy_suffix_loop);
  1024     __ delayed()->nop();
  1026     __ bind(l_copy_byte);
  1027     __ beq(end_count, R0, l_exit);
  1028     __ delayed()->nop();
  1029     __ lb(AT, end_from, -1);
  1030     __ sb(AT, end_to, -1);
  1031     __ daddi(end_from, end_from, -1);
  1032     __ daddi(end_to, end_to, -1);
  1033     __ daddi(end_count, end_count, -1);
  1034     __ b(l_copy_byte);
  1035     __ delayed()->nop();
  1037     __ bind(l_exit);
  1038     __ pop(T8);
  1039     __ pop(end_count);
  1040     __ pop(end_to);
  1041     __ pop(end_from);
  1042     __ jr(RA);
  1043     __ delayed()->nop();
   1044     return start;
   1045   }
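  // For the overlapping case handled here (to > from), the stub is in effect a
  // backward byte copy (sketch):
  //
  //   for (ssize_t i = count - 1; i >= 0; i--) to[i] = from[i];
  //
  // so every source byte is read before it can be overwritten.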
  1047   // Generate stub for disjoint short copy.  If "aligned" is true, the
  1048   // "from" and "to" addresses are assumed to be heapword aligned.
  1049   //
  1050   // Arguments for generated stub:
  1051   //      from:  A0
  1052   //      to:    A1
  1053   //  elm.count: A2 treated as signed
  1054   //  one element: 2 bytes
  1055   //
  1056   // Strategy for aligned==true:
  1057   //
  1058   //  If length <= 9:
   1059   //     1. copy 1 element at a time (l_5)
  1060   //
  1061   //  If length > 9:
  1062   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  1063   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  1064   //     3. copy last element if one was left in step 2. (l_1)
  1065   //
  1066   //
  1067   // Strategy for aligned==false:
  1068   //
  1069   //  If length <= 9: same as aligned==true case
  1070   //
  1071   //  If length > 9:
  1072   //     1. continue with step 7. if the alignment of from and to mod 4
  1073   //        is different.
  1074   //     2. align from and to to 4 bytes by copying 1 element if necessary
  1075   //     3. at l_2 from and to are 4 byte aligned; continue with
  1076   //        6. if they cannot be aligned to 8 bytes because they have
  1077   //        got different alignment mod 8.
  1078   //     4. at this point we know that both, from and to, have the same
  1079   //        alignment mod 8, now copy one element if necessary to get
  1080   //        8 byte alignment of from and to.
  1081   //     5. copy 4 elements at a time until less than 4 elements are
  1082   //        left; depending on step 3. all load/stores are aligned.
  1083   //     6. copy 2 elements at a time until less than 2 elements are
  1084   //        left. (l_6)
  1085   //     7. copy 1 element at a time. (l_5)
  1086   //     8. copy last element if one was left in step 6. (l_1)
  1088   address generate_disjoint_short_copy(bool aligned, const char * name) {
  1089     StubCodeMark mark(this, "StubRoutines", name);
  1090     __ align(CodeEntryAlignment);
  1092     Register tmp1 = T0;
  1093     Register tmp2 = T1;
  1094     Register tmp3 = T3;
  1095     Register tmp4 = T8;
  1096     Register tmp5 = T9;
  1097     Register tmp6 = T2;
  1099     address start = __ pc();
  1101     __ push(tmp1);
  1102     __ push(tmp2);
  1103     __ push(tmp3);
  1104     __ move(tmp1, A0);
  1105     __ move(tmp2, A1);
  1106     __ move(tmp3, A2);
  1108     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11, l_12, l_13, l_14;
  1109     Label l_debug;
  1110     // don't try anything fancy if arrays don't have many elements
  1111     __ daddi(AT, tmp3, -23);
  1112     __ blez(AT, l_14);
  1113     __ delayed()->nop();
  1114     // move push here
  1115     __ push(tmp4);
  1116     __ push(tmp5);
  1117     __ push(tmp6);
  1119     if (!aligned) {
  1120       __ xorr(AT, A0, A1);
  1121       __ andi(AT, AT, 1);
  1122       __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
  1123       __ delayed()->nop();
  1125       __ xorr(AT, A0, A1);
  1126       __ andi(AT, AT, 3);
  1127       __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
  1128       __ delayed()->nop();
  1130       // At this point it is guaranteed that both, from and to have the same alignment mod 4.
  1132       // Copy 1 element if necessary to align to 4 bytes.
  1133       __ andi(AT, A0, 3);
  1134       __ beq(AT, R0, l_2);
  1135       __ delayed()->nop();
  1137       __ lhu(AT, tmp1, 0);
  1138       __ daddi(tmp1, tmp1, 2);
  1139       __ sh(AT, tmp2, 0);
  1140       __ daddi(tmp2, tmp2, 2);
  1141       __ daddi(tmp3, tmp3, -1);
  1142       __ bind(l_2);
  1144       // At this point the positions of both, from and to, are at least 4 byte aligned.
  1146       // Copy 4 elements at a time.
  1147       // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
  1148       __ xorr(AT, tmp1, tmp2);
  1149       __ andi(AT, AT, 7);
  1150       __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned
  1151       __ delayed()->nop();
  1153       // Copy a 2-element word if necessary to align to 8 bytes.
  1154       __ andi(AT, tmp1, 7);
  1155       __ beq(AT, R0, l_7);
  1156       __ delayed()->nop();
  1158       __ lw(AT, tmp1, 0);
  1159       __ daddi(tmp3, tmp3, -2);
  1160       __ sw(AT, tmp2, 0);
  1161       __ daddi(tmp1, tmp1, 4);
  1162       __ daddi(tmp2, tmp2, 4);
  1163     }// end of if (!aligned)
  1165     __ bind(l_7);
   1166     // At this point the positions of both, from and to, are at least 8 byte aligned.
   1167     // Copy 8 elements at a time.
   1168     // Align to 16 bytes, but only if both from and to have the same alignment mod 16.
  1169     __ xorr(AT, tmp1, tmp2);
  1170     __ andi(AT, AT, 15);
  1171     __ bne(AT, R0, l_9);
  1172     __ delayed()->nop();
  1174     // Copy 4-element word if necessary to align to 16 bytes,
  1175     __ andi(AT, tmp1, 15);
  1176     __ beq(AT, R0, l_10);
  1177     __ delayed()->nop();
  1179     __ ld(AT, tmp1, 0);
  1180     __ daddi(tmp3, tmp3, -4);
  1181     __ sd(AT, tmp2, 0);
  1182     __ daddi(tmp1, tmp1, 8);
  1183     __ daddi(tmp2, tmp2, 8);
  1185     __ bind(l_10);
  1187     // Copy 8 elements at a time; either the loads or the stores can
   1188     // be unaligned if aligned == false
  1190     { // FasterArrayCopy
  1191       __ bind(l_11);
   1192       // For Loongson, the 128-bit memory access instructions are gslq/gssq
  1193       if (UseLoongsonISA) {
  1194         __ gslq(AT, tmp4, tmp1, 0);
  1195         __ gslq(tmp5, tmp6, tmp1, 16);
  1196         __ daddi(tmp1, tmp1, 32);
  1197         __ daddi(tmp2, tmp2, 32);
  1198         __ gssq(AT, tmp4, tmp2, -32);
  1199         __ gssq(tmp5, tmp6, tmp2, -16);
  1200       } else {
  1201         __ ld(AT, tmp1, 0);
  1202         __ ld(tmp4, tmp1, 8);
  1203         __ ld(tmp5, tmp1, 16);
  1204         __ ld(tmp6, tmp1, 24);
  1205         __ daddi(tmp1, tmp1, 32);
  1206         __ sd(AT, tmp2, 0);
  1207         __ sd(tmp4, tmp2, 8);
  1208         __ sd(tmp5, tmp2, 16);
  1209         __ sd(tmp6, tmp2, 24);
  1210         __ daddi(tmp2, tmp2, 32);
   1211       }
  1212       __ daddi(tmp3, tmp3, -16);
  1213       __ daddi(AT, tmp3, -16);
  1214       __ bgez(AT, l_11);
  1215       __ delayed()->nop();
   1216     }
  1217     __ bind(l_9);
  1219     // Copy 4 elements at a time; either the loads or the stores can
  1220     // be unaligned if aligned == false.
  1221     { // FasterArrayCopy
   1222       __ daddi(AT, tmp3, -15);// loop unrolled 4 times, so at least 16 elements must remain
  1223       __ blez(AT, l_4); // copy 2 at a time if less than 16 elements remain
  1224       __ delayed()->nop();
  1226       __ bind(l_8);
  1227       __ ld(AT, tmp1, 0);
  1228       __ ld(tmp4, tmp1, 8);
  1229       __ ld(tmp5, tmp1, 16);
  1230       __ ld(tmp6, tmp1, 24);
  1231       __ sd(AT, tmp2, 0);
  1232       __ sd(tmp4, tmp2, 8);
  1233       __ sd(tmp5, tmp2,16);
  1234       __ daddi(tmp1, tmp1, 32);
  1235       __ daddi(tmp2, tmp2, 32);
  1236       __ daddi(tmp3, tmp3, -16);
  1237       __ daddi(AT, tmp3, -16);
  1238       __ bgez(AT, l_8);
  1239       __ sd(tmp6, tmp2, -8);
   1240     }
  1241     __ bind(l_6);
   1243     // copy 2 elements at a time
  1244     { // FasterArrayCopy
  1245       __ daddi(AT, tmp3, -7);
  1246       __ blez(AT, l_4);
  1247       __ delayed()->nop();
  1249       __ bind(l_3);
  1250       __ lw(AT, tmp1, 0);
  1251       __ lw(tmp4, tmp1, 4);
  1252       __ lw(tmp5, tmp1, 8);
  1253       __ lw(tmp6, tmp1, 12);
  1254       __ sw(AT, tmp2, 0);
  1255       __ sw(tmp4, tmp2, 4);
  1256       __ sw(tmp5, tmp2, 8);
  1257       __ daddi(tmp1, tmp1, 16);
  1258       __ daddi(tmp2, tmp2, 16);
  1259       __ daddi(tmp3, tmp3, -8);
  1260       __ daddi(AT, tmp3, -8);
  1261       __ bgez(AT, l_3);
  1262       __ sw(tmp6, tmp2, -4);
   1263     }
  1265     __ bind(l_1);
  1266     // do single element copy (8 bit), can this happen?
  1267     { // FasterArrayCopy
  1268       __ daddi(AT, tmp3, -3);
  1269       __ blez(AT, l_4);
  1270       __ delayed()->nop();
  1272       __ bind(l_5);
  1273       __ lhu(AT, tmp1, 0);
  1274       __ lhu(tmp4, tmp1, 2);
  1275       __ lhu(tmp5, tmp1, 4);
  1276       __ lhu(tmp6, tmp1, 6);
  1277       __ sh(AT, tmp2, 0);
  1278       __ sh(tmp4, tmp2, 2);
  1279       __ sh(tmp5, tmp2, 4);
  1280       __ daddi(tmp1, tmp1, 8);
  1281       __ daddi(tmp2, tmp2, 8);
  1282       __ daddi(tmp3, tmp3, -4);
  1283       __ daddi(AT, tmp3, -4);
  1284       __ bgez(AT, l_5);
  1285       __ sh(tmp6, tmp2, -2);
   1286     }
  1287     // single element
  1288     __ bind(l_4);
  1290     __ pop(tmp6);
  1291     __ pop(tmp5);
  1292     __ pop(tmp4);
  1294     __ bind(l_14);
  1295     { // FasterArrayCopy
  1296       __ beq(R0, tmp3, l_13);
  1297       __ delayed()->nop();
  1299       __ bind(l_12);
  1300       __ lhu(AT, tmp1, 0);
  1301       __ sh(AT, tmp2, 0);
  1302       __ daddi(tmp1, tmp1, 2);
  1303       __ daddi(tmp2, tmp2, 2);
  1304       __ daddi(tmp3, tmp3, -1);
  1305       __ daddi(AT, tmp3, -1);
  1306       __ bgez(AT, l_12);
  1307       __ delayed()->nop();
   1308     }
  1310     __ bind(l_13);
  1311     __ pop(tmp3);
  1312     __ pop(tmp2);
  1313     __ pop(tmp1);
  1315     __ jr(RA);
  1316     __ delayed()->nop();
  1318     __ bind(l_debug);
  1319     __ stop("generate_disjoint_short_copy should not reach here");
  1320     return start;
   1321   }
  1323   // Arguments:
  1324   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1325   //             ignored
  1326   //   name    - stub name string
  1327   //
  1328   // Inputs:
  1329   //   c_rarg0   - source array address
  1330   //   c_rarg1   - destination array address
  1331   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1332   //
  1333   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  1334   // let the hardware handle it.  The two or four words within dwords
  1335   // or qwords that span cache line boundaries will still be loaded
  1336   // and stored atomically.
  1337   //
  1338   address generate_conjoint_short_copy(bool aligned, const char *name) {
  1339     Label l_1, l_2, l_3, l_4, l_5;
  1340     StubCodeMark mark(this, "StubRoutines", name);
  1341     __ align(CodeEntryAlignment);
  1342     address start = __ pc();
  1343     address nooverlap_target = aligned ?
  1344             StubRoutines::arrayof_jshort_disjoint_arraycopy() :
  1345             StubRoutines::jshort_disjoint_arraycopy();
  1347     array_overlap_test(nooverlap_target, 1);
  1349     __ push(T3);
  1350     __ push(T0);
  1351     __ push(T1);
  1352     __ push(T8);
  1354     __ move(T1, A2);
  1355     __ move(T3, A0);
  1356     __ move(T0, A1);
  1359     // copy dwords from high to low
  1360     __ sll(AT, T1, Address::times_2);
  1361     __ add(AT, T3, AT);
  1362     __ lea(T3, Address( AT, -4));
  1363     __ sll(AT,T1 , Address::times_2);
  1364     __ add(AT, T0, AT);
  1365     __ lea(T0, Address( AT, -4));
  1366     __ move(T8, T1);
  1367     __ bind(l_1);
  1368     __ sra(T1,T1, 1);
  1369     __ beq(T1, R0, l_4);
  1370     __ delayed()->nop();
  1371     __ align(16);
  1372     __ bind(l_2);
  1373     __ lw(AT, T3, 0);
  1374     __ sw(AT, T0, 0);
  1375     __ addi(T3, T3, -4);
  1376     __ addi(T0, T0, -4);
  1377     __ addi(T1, T1, -1);
  1378     __ bne(T1, R0, l_2);
  1379     __ delayed()->nop();
  1380     __ b(l_4);
  1381     __ delayed()->nop();
  1382     // copy dwords with repeat move
  1383     __ bind(l_3);
  1384     __ bind(l_4);
  1385     __ andi(T8, T8, 1);              // suffix count
  1386     __ beq(T8, R0, l_5 );
  1387     __ delayed()->nop();
  1388     // copy suffix
  1389     __ lh(AT, T3, 2);
  1390     __ sh(AT, T0, 2);
  1391     __ bind(l_5);
  1392     __ pop(T8);
  1393     __ pop(T1);
  1394     __ pop(T0);
  1395     __ pop(T3);
  1396     __ jr(RA);
  1397     __ delayed()->nop();
  1398     return start;
   1399   }
  1401   // Arguments:
  1402   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1403   //             ignored
  1404   //   is_oop  - true => oop array, so generate store check code
  1405   //   name    - stub name string
  1406   //
  1407   // Inputs:
  1408   //   c_rarg0   - source array address
  1409   //   c_rarg1   - destination array address
  1410   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1411   //
  1412   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1413   // the hardware handle it.  The two dwords within qwords that span
   1414   // cache line boundaries will still be loaded and stored atomically.
  1415   //
  1416   // Side Effects:
  1417   //   disjoint_int_copy_entry is set to the no-overlap entry point
  1418   //   used by generate_conjoint_int_oop_copy().
  1419   //
  1420   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name, bool dest_uninitialized = false) {
  1421     Label l_3, l_4, l_5, l_6, l_7;
  1422     StubCodeMark mark(this, "StubRoutines", name);
  1424     __ align(CodeEntryAlignment);
  1425     address start = __ pc();
  1426     __ push(T3);
  1427     __ push(T0);
  1428     __ push(T1);
  1429     __ push(T8);
  1430     __ push(T9);
  1431     __ move(T1, A2);
  1432     __ move(T3, A0);
  1433     __ move(T0, A1);
  1435     if (is_oop) {
  1436       gen_write_ref_array_pre_barrier(A1, A2, dest_uninitialized);
   1437     }
  1439     if(!aligned) {
  1440       __ xorr(AT, T3, T0);
  1441       __ andi(AT, AT, 7);
  1442       __ bne(AT, R0, l_5); // not same alignment mod 8 -> copy 1 element each time
  1443       __ delayed()->nop();
  1445       __ andi(AT, T3, 7);
  1446       __ beq(AT, R0, l_6); //copy 2 elements each time
  1447       __ delayed()->nop();
  1449       __ lw(AT, T3, 0);
  1450       __ daddi(T1, T1, -1);
  1451       __ sw(AT, T0, 0);
  1452       __ daddi(T3, T3, 4);
  1453       __ daddi(T0, T0, 4);
  1457       __ bind(l_6);
  1458       __ daddi(AT, T1, -1);
  1459       __ blez(AT, l_5);
  1460       __ delayed()->nop();
  1462       __ bind(l_7);
  1463       __ ld(AT, T3, 0);
  1464       __ sd(AT, T0, 0);
  1465       __ daddi(T3, T3, 8);
  1466       __ daddi(T0, T0, 8);
  1467       __ daddi(T1, T1, -2);
  1468       __ daddi(AT, T1, -2);
  1469       __ bgez(AT, l_7);
  1470       __ delayed()->nop();
   1471     }
  1473     __ bind(l_5);
  1474     __ beq(T1, R0, l_4);
  1475     __ delayed()->nop();
  1477     __ align(16);
  1478     __ bind(l_3);
  1479     __ lw(AT, T3, 0);
  1480     __ sw(AT, T0, 0);
  1481     __ addi(T3, T3, 4);
  1482     __ addi(T0, T0, 4);
  1483     __ addi(T1, T1, -1);
  1484     __ bne(T1, R0, l_3);
  1485     __ delayed()->nop();
  1487     // exit
  1488     __ bind(l_4);
  1489     if (is_oop) {
  1490       gen_write_ref_array_post_barrier(A1, A2, T1);
   1491     }
  1492     __ pop(T9);
  1493     __ pop(T8);
  1494     __ pop(T1);
  1495     __ pop(T0);
  1496     __ pop(T3);
  1497     __ jr(RA);
  1498     __ delayed()->nop();
  1500     return start;
   1501   }
  1503   // Arguments:
  1504   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1505   //             ignored
  1506   //   is_oop  - true => oop array, so generate store check code
  1507   //   name    - stub name string
  1508   //
  1509   // Inputs:
  1510   //   c_rarg0   - source array address
  1511   //   c_rarg1   - destination array address
  1512   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1513   //
  1514   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1515   // the hardware handle it.  The two dwords within qwords that span
   1516   // cache line boundaries will still be loaded and stored atomically.
  1517   //
  1518   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name, bool dest_uninitialized = false) {
  1519     Label l_2, l_4;
  1520     StubCodeMark mark(this, "StubRoutines", name);
  1521     __ align(CodeEntryAlignment);
  1522     address start = __ pc();
  1523     address nooverlap_target;
  1525     if (is_oop) {
  1526       nooverlap_target = aligned ?
  1527               StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1528               StubRoutines::oop_disjoint_arraycopy();
   1529     } else {
  1530       nooverlap_target = aligned ?
  1531               StubRoutines::arrayof_jint_disjoint_arraycopy() :
  1532               StubRoutines::jint_disjoint_arraycopy();
   1533     }
  1535     array_overlap_test(nooverlap_target, 2);
  1537     if (is_oop) {
  1538       gen_write_ref_array_pre_barrier(A1, A2, dest_uninitialized);
   1539     }
  1541     __ push(T3);
  1542     __ push(T0);
  1543     __ push(T1);
  1544     __ push(T8);
  1545     __ push(T9);
  1547     __ move(T1, A2);
  1548     __ move(T3, A0);
  1549     __ move(T0, A1);
  1551     // T3: source array address
  1552     // T0: destination array address
  1553     // T1: element count
  1555     __ sll(AT, T1, Address::times_4);
  1556     __ add(AT, T3, AT);
  1557     __ lea(T3 , Address(AT, -4));
  1558     __ sll(AT, T1, Address::times_4);
  1559     __ add(AT, T0, AT);
  1560     __ lea(T0 , Address(AT, -4));
  1562     __ beq(T1, R0, l_4);
  1563     __ delayed()->nop();
  1565     __ align(16);
  1566     __ bind(l_2);
  1567     __ lw(AT, T3, 0);
  1568     __ sw(AT, T0, 0);
  1569     __ addi(T3, T3, -4);
  1570     __ addi(T0, T0, -4);
  1571     __ addi(T1, T1, -1);
  1572     __ bne(T1, R0, l_2);
  1573     __ delayed()->nop();
  1575     __ bind(l_4);
  1576     if (is_oop) {
  1577       gen_write_ref_array_post_barrier(A1, A2, T1);
   1578     }
  1579     __ pop(T9);
  1580     __ pop(T8);
  1581     __ pop(T1);
  1582     __ pop(T0);
  1583     __ pop(T3);
  1584     __ jr(RA);
  1585     __ delayed()->nop();
  1587     return start;
   1588   }
  1590   // Arguments:
  1591   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1592   //             ignored
  1593   //   is_oop  - true => oop array, so generate store check code
  1594   //   name    - stub name string
  1595   //
  1596   // Inputs:
  1597   //   c_rarg0   - source array address
  1598   //   c_rarg1   - destination array address
  1599   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1600   //
  1601   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1602   // the hardware handle it.  The two dwords within qwords that span
   1603   // cache line boundaries will still be loaded and stored atomically.
  1604   //
  1605   // Side Effects:
  1606   //   disjoint_long_copy_entry is set to the no-overlap entry point
  1607   //   used by generate_conjoint_long_oop_copy().
  1608   //
  1609   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name, bool dest_uninitialized = false) {
  1610     Label l_3, l_4;
  1611     StubCodeMark mark(this, "StubRoutines", name);
  1612     __ align(CodeEntryAlignment);
  1613     address start = __ pc();
  1615     if (is_oop) {
  1616       gen_write_ref_array_pre_barrier(A1, A2, dest_uninitialized);
  1617     }
  1619     __ push(T3);
  1620     __ push(T0);
  1621     __ push(T1);
  1622     __ push(T8);
  1623     __ push(T9);
  1625     __ move(T1, A2);
  1626     __ move(T3, A0);
  1627     __ move(T0, A1);
  1629     // T3: source array address
  1630     // T0: destination array address
  1631     // T1: element count
  1633     __ beq(T1, R0, l_4);
  1634     __ delayed()->nop();
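           // Forward copy loop: one 8-byte element per iteration until the
           // count in T1 reaches zero.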
  1636     __ align(16);
  1637     __ bind(l_3);
  1638     __ ld(AT, T3, 0);
  1639     __ sd(AT, T0, 0);
  1640     __ addi(T3, T3, 8);
  1641     __ addi(T0, T0, 8);
  1642     __ addi(T1, T1, -1);
  1643     __ bne(T1, R0, l_3);
  1644     __ delayed()->nop();
  1646     // exit
  1647     __ bind(l_4);
  1648     if (is_oop) {
  1649       gen_write_ref_array_post_barrier(A1, A2, T1);
  1650     }
  1651     __ pop(T9);
  1652     __ pop(T8);
  1653     __ pop(T1);
  1654     __ pop(T0);
  1655     __ pop(T3);
  1656     __ jr(RA);
  1657     __ delayed()->nop();
  1658     return start;
  1659   }
  1661   // Arguments:
  1662   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  1663   //             ignored
  1664   //   is_oop  - true => oop array, so generate store check code
  1665   //   name    - stub name string
  1666   //
  1667   // Inputs:
  1668   //   c_rarg0   - source array address
  1669   //   c_rarg1   - destination array address
  1670   //   c_rarg2   - element count, treated as ssize_t, can be zero
  1671   //
  1672   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  1673   // the hardware handle it.  The two dwords within qwords that span
  1674   // cache line boundaries will still be loaded and stored atomically.
  1675   //
  1676   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name, bool dest_uninitialized = false) {
  1677     Label l_2, l_4;
  1678     StubCodeMark mark(this, "StubRoutines", name);
  1679     __ align(CodeEntryAlignment);
  1680     address start = __ pc();
  1681     address nooverlap_target;
  1683     if (is_oop) {
  1684       nooverlap_target = aligned ?
  1685               StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1686               StubRoutines::oop_disjoint_arraycopy();
  1687     } else {
  1688       nooverlap_target = aligned ?
  1689               StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1690               StubRoutines::jlong_disjoint_arraycopy();
  1691     }
  1693     array_overlap_test(nooverlap_target, 3);
  1695     if (is_oop) {
  1696       gen_write_ref_array_pre_barrier(A1, A2, dest_uninitialized);
  1697     }
  1699     __ push(T3);
  1700     __ push(T0);
  1701     __ push(T1);
  1702     __ push(T8);
  1703     __ push(T9);
  1705     __ move(T1, A2);
  1706     __ move(T3, A0);
  1707     __ move(T0, A1);
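           // Overlap-safe direction: point T3/T0 at the last 8-byte element
           // (base + count*8 - 8) and copy from high to low addresses.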
  1709     __ sll(AT, T1, Address::times_8);
  1710     __ add(AT, T3, AT);
  1711     __ lea(T3, Address(AT, -8));
  1712     __ sll(AT, T1, Address::times_8);
  1713     __ add(AT, T0, AT);
  1714     __ lea(T0, Address(AT, -8));
  1716     __ beq(T1, R0, l_4);
  1717     __ delayed()->nop();
  1719     __ align(16);
  1720     __ bind(l_2);
  1721     __ ld(AT, T3, 0);
  1722     __ sd(AT, T0, 0);
  1723     __ addi(T3, T3, -8);
  1724     __ addi(T0, T0, -8);
  1725     __ addi(T1, T1, -1);
  1726     __ bne(T1, R0, l_2);
  1727     __ delayed()->nop();
  1729     // exit
  1730     __ bind(l_4);
  1731     if (is_oop) {
  1732       gen_write_ref_array_post_barrier(A1, A2, T1);
  1733     }
  1734     __ pop(T9);
  1735     __ pop(T8);
  1736     __ pop(T1);
  1737     __ pop(T0);
  1738     __ pop(T3);
  1739     __ jr(RA);
  1740     __ delayed()->nop();
  1741     return start;
  1742   }
  1744   //FIXME
  1745   address generate_disjoint_long_copy(bool aligned, const char *name) {
  1746     Label l_1, l_2;
  1747     StubCodeMark mark(this, "StubRoutines", name);
  1748     __ align(CodeEntryAlignment);
  1749     address start = __ pc();
  1751     __ move(T1, A2);
  1752     __ move(T3, A0);
  1753     __ move(T0, A1);
  1754     __ push(T3);
  1755     __ push(T0);
  1756     __ push(T1);
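           // Bottom-tested loop: branch to the count check at l_2 first; l_1
           // copies one 8-byte word per pass and bgez loops until T1 goes
           // negative, so a zero-length copy falls straight through.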
  1757     __ b(l_2);
  1758     __ delayed()->nop();
  1759     __ align(16);
  1760     __ bind(l_1);
  1761     __ ld(AT, T3, 0);
  1762     __ sd(AT, T0, 0);
  1763     __ addi(T3, T3, 8);
  1764     __ addi(T0, T0, 8);
  1765     __ bind(l_2);
  1766     __ addi(T1, T1, -1);
  1767     __ bgez(T1, l_1);
  1768     __ delayed()->nop();
  1769     __ pop(T1);
  1770     __ pop(T0);
  1771     __ pop(T3);
  1772     __ jr(RA);
  1773     __ delayed()->nop();
  1774     return start;
  1775   }
  1778   address generate_conjoint_long_copy(bool aligned, const char *name) {
  1779     Label l_1, l_2;
  1780     StubCodeMark mark(this, "StubRoutines", name);
  1781     __ align(CodeEntryAlignment);
  1782     address start = __ pc();
  1783     address nooverlap_target = aligned ?
  1784       StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1785       StubRoutines::jlong_disjoint_arraycopy();
  1786     array_overlap_test(nooverlap_target, 3);
  1788     __ push(T3);
  1789     __ push(T0);
  1790     __ push(T1);
  1792     __ move(T1, A2);
  1793     __ move(T3, A0);
  1794     __ move(T0, A1);
  1795     __ sll(AT, T1, Address::times_8);
  1796     __ add(AT, T3, AT);
  1797     __ lea(T3, Address(AT, -8));
  1798     __ sll(AT, T1, Address::times_8);
  1799     __ add(AT, T0, AT);
  1800     __ lea(T0, Address(AT, -8));
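           // T3/T0 now address the last element; the loop below walks from high
           // to low addresses so an overlapping destination is copied safely.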
  1802     __ b(l_2);
  1803     __ delayed()->nop();
  1804     __ align(16);
  1805     __ bind(l_1);
  1806     __ ld(AT, T3, 0);
  1807     __ sd(AT, T0, 0);
  1808     __ addi(T3, T3, -8);
  1809     __ addi(T0, T0, -8);
  1810     __ bind(l_2);
  1811     __ addi(T1, T1, -1);
  1812     __ bgez(T1, l_1);
  1813     __ delayed()->nop();
  1814     __ pop(T1);
  1815     __ pop(T0);
  1816     __ pop(T3);
  1817     __ jr(RA);
  1818     __ delayed()->nop();
  1819     return start;
  1820   }
  1822   void generate_arraycopy_stubs() {
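           // With UseCompressedOops each element is a 32-bit narrowOop, so the
           // int copy stubs serve for oop arrays; otherwise oops are 64 bits
           // wide and the long copy stubs are used instead.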
  1823     if (UseCompressedOops) {
  1824       StubRoutines::_oop_disjoint_arraycopy          = generate_disjoint_int_oop_copy(false, true,
  1825                                                                                       "oop_disjoint_arraycopy");
  1826       StubRoutines::_oop_arraycopy                   = generate_conjoint_int_oop_copy(false, true,
  1827                                                                                       "oop_arraycopy");
  1828       StubRoutines::_oop_disjoint_arraycopy_uninit   = generate_disjoint_int_oop_copy(false, true,
  1829                                                                                       "oop_disjoint_arraycopy_uninit", true);
  1830       StubRoutines::_oop_arraycopy_uninit            = generate_conjoint_int_oop_copy(false, true,
  1831                                                                                       "oop_arraycopy_uninit", true);
  1832     } else {
  1833       StubRoutines::_oop_disjoint_arraycopy          = generate_disjoint_long_oop_copy(false, true,
  1834                                                                                        "oop_disjoint_arraycopy");
  1835       StubRoutines::_oop_arraycopy                   = generate_conjoint_long_oop_copy(false, true,
  1836                                                                                        "oop_arraycopy");
  1837       StubRoutines::_oop_disjoint_arraycopy_uninit   = generate_disjoint_long_oop_copy(false, true,
  1838                                                                                        "oop_disjoint_arraycopy_uninit", true);
  1839       StubRoutines::_oop_arraycopy_uninit            = generate_conjoint_long_oop_copy(false, true,
  1840                                                                                        "oop_arraycopy_uninit", true);
  1841     }
  1843     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  1844     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  1845     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
  1846     StubRoutines::_jlong_disjoint_arraycopy          = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  1848     StubRoutines::_jbyte_arraycopy  = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  1849     StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
  1850     StubRoutines::_jint_arraycopy   = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
  1851     StubRoutines::_jlong_arraycopy  = generate_conjoint_long_copy(false, "jlong_arraycopy");
  1853     // We don't generate specialized code for HeapWord-aligned source
  1854     // arrays, so just use the code we've already generated
  1855     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  1856     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
  1858     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  1859     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
  1861     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
  1862     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
  1864     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  1865     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
  1867     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  1868     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
  1870     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
  1871     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
  1872   }
  1874   // Wang: add a function to implement SafeFetch32 and SafeFetchN
  1875   void generate_safefetch(const char* name, int size, address* entry,
  1876                           address* fault_pc, address* continuation_pc) {
  1877     // safefetch signatures:
  1878     //   int      SafeFetch32(int*      adr, int      errValue);
  1879     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  1880     //
  1881     // arguments:
  1882     //   A0 = adr
  1883     //   A1 = errValue
  1884     //
  1885     // result:
  1886   //   V0 = *adr or errValue
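         //
         // If the load at *fault_pc faults, the VM signal handler is expected to
         // resume at *continuation_pc with A1 (the errValue) untouched, so the
         // stub returns errValue instead of crashing.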
  1888     StubCodeMark mark(this, "StubRoutines", name);
  1890     // Entry point, pc or function descriptor.
  1891     *entry = __ pc();
  1893     // Load *adr into A1, may fault.
  1894     *fault_pc = __ pc();
  1895     switch (size) {
  1896       case 4:
  1897         // int32_t
  1898         __ lw(A1, A0, 0);
  1899         break;
  1900       case 8:
  1901         // int64_t
  1902         __ ld(A1, A0, 0);
  1903         break;
  1904       default:
  1905         ShouldNotReachHere();
  1906     }
  1908     // return errValue or *adr
  1909     *continuation_pc = __ pc();
  1910     __ addu(V0, A1, R0);
  1911     __ jr(RA);
  1912     __ delayed()->nop();
  1913   }
  1916 #undef __
  1917 #define __ masm->
  1919   // Continuation point for throwing of implicit exceptions that are
  1920   // not handled in the current activation. Fabricates an exception
  1921   // oop and initiates normal exception dispatching in this
  1922   // frame. Since we need to preserve callee-saved values (currently
  1923   // only for C2, but done for C1 as well) we need a callee-saved oop
  1924   // map and therefore have to make these stubs into RuntimeStubs
  1925   // rather than BufferBlobs.  If the compiler needs all registers to
  1926   // be preserved between the fault point and the exception handler
  1927   // then it must assume responsibility for that in
  1928   // AbstractCompiler::continuation_for_implicit_null_exception or
  1929   // continuation_for_implicit_division_by_zero_exception. All other
  1930   // implicit exceptions (e.g., NullPointerException or
  1931   // AbstractMethodError on entry) are either at call sites or
  1932   // otherwise assume that stack unwinding will be initiated, so
  1933   // caller saved registers were assumed volatile in the compiler.
  1934   address generate_throw_exception(const char* name,
  1935                                    address runtime_entry,
  1936                                    bool restore_saved_exception_pc) {
  1937     // Information about frame layout at time of blocking runtime call.
  1938     // Note that we only have to preserve callee-saved registers since
  1939     // the compilers are responsible for supplying a continuation point
  1940     // if they expect all registers to be preserved.
  1941     enum layout {
  1942       thread_off,    // last_java_sp
  1943       S7_off,        // callee saved register      sp + 1
  1944       S6_off,        // callee saved register      sp + 2
  1945       S5_off,        // callee saved register      sp + 3
  1946       S4_off,        // callee saved register      sp + 4
  1947       S3_off,        // callee saved register      sp + 5
  1948       S2_off,        // callee saved register      sp + 6
  1949       S1_off,        // callee saved register      sp + 7
  1950       S0_off,        // callee saved register      sp + 8
  1951       FP_off,
  1952       ret_address,
  1953       framesize
  1954     };
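           // framesize counts the slots laid out above. The FP_off and
           // ret_address slots appear to be created by enter(), so the explicit
           // prolog below only allocates (framesize - 2) words.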
  1956     int insts_size = 2048;
  1957     int locs_size  = 32;
  1959     //  CodeBuffer* code     = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false,
  1960     //  NULL, NULL, NULL, false, NULL, name, false);
  1961     CodeBuffer code (name , insts_size, locs_size);
  1962     OopMapSet* oop_maps  = new OopMapSet();
  1963     MacroAssembler* masm = new MacroAssembler(&code);
  1965     address start = __ pc();
  1967     // This is an inlined and slightly modified version of call_VM
  1968     // which has the ability to fetch the return PC out of
  1969     // thread-local storage and also sets up last_Java_sp slightly
  1970     // differently than the real call_VM
  1971 #ifndef OPT_THREAD
  1972     Register java_thread = TREG;
  1973     __ get_thread(java_thread);
  1974 #else
  1975     Register java_thread = TREG;
  1976 #endif
  1977     if (restore_saved_exception_pc) {
  1978       __ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset()));
  1979     }
  1981     __ enter(); // required for proper stackwalking of RuntimeStub frame
  1983     __ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
  1984     __ sd(S0, SP, S0_off * wordSize);
  1985     __ sd(S1, SP, S1_off * wordSize);
  1986     __ sd(S2, SP, S2_off * wordSize);
  1987     __ sd(S3, SP, S3_off * wordSize);
  1988     __ sd(S4, SP, S4_off * wordSize);
  1989     __ sd(S5, SP, S5_off * wordSize);
  1990     __ sd(S6, SP, S6_off * wordSize);
  1991     __ sd(S7, SP, S7_off * wordSize);
  1993     int frame_complete = __ pc() - start;
  1994     // push java thread (becomes first argument of C function)
  1995     __ sd(java_thread, SP, thread_off * wordSize);
  1996     if (java_thread != A0)
  1997       __ move(A0, java_thread);
  1999     // Set up last_Java_sp and last_Java_fp
  2000     __ set_last_Java_frame(java_thread, SP, FP, NULL);
  2001     // Align stack
  2002     __ set64(AT, -(StackAlignmentInBytes));
  2003     __ andr(SP, SP, AT);
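           // AND-ing SP with -(StackAlignmentInBytes) clears the low bits,
           // rounding the stack pointer down to the C ABI alignment required
           // for the runtime call.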
  2005     __ relocate(relocInfo::internal_pc_type);
  2006     {
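             // Precompute the pc to record as last_Java_pc for stack walking;
             // the 28-byte offset presumably covers the instructions emitted
             // between this constant load and the runtime call's return point.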
  2007       intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 28;
  2008       __ patchable_set48(AT, save_pc);
  2009     }
  2010     __ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
  2012     // Call runtime
  2013     __ call(runtime_entry);
  2014     __ delayed()->nop();
  2015     // Generate oop map
  2016     OopMap* map =  new OopMap(framesize, 0);
  2017     oop_maps->add_gc_map(__ offset(),  map);
  2019     // restore the thread (cannot use the pushed argument since arguments
  2020     // may be overwritten by C code generated by an optimizing compiler);
  2021     // however can use the register value directly if it is callee saved.
  2022 #ifndef OPT_THREAD
  2023     __ get_thread(java_thread);
  2024 #endif
  2026     __ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  2027     __ reset_last_Java_frame(java_thread, true, true);
  2029     // Restore callee save registers.  This must be done after resetting the Java frame
  2030     __ ld(S0, SP, S0_off * wordSize);
  2031     __ ld(S1, SP, S1_off * wordSize);
  2032     __ ld(S2, SP, S2_off * wordSize);
  2033     __ ld(S3, SP, S3_off * wordSize);
  2034     __ ld(S4, SP, S4_off * wordSize);
  2035     __ ld(S5, SP, S5_off * wordSize);
  2036     __ ld(S6, SP, S6_off * wordSize);
  2037     __ ld(S7, SP, S7_off * wordSize);
  2039     // discard arguments
  2040     __ addi(SP, SP, (framesize-2) * wordSize); // epilog
  2041     __ addi(SP, FP, wordSize);
  2042     __ ld(FP, SP, -1*wordSize);
  2043     // check for pending exceptions
  2044 #ifdef ASSERT
  2045     Label L;
  2046     __ lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  2047     __ bne(AT, R0, L);
  2048     __ delayed()->nop();
  2049     __ should_not_reach_here();
  2050     __ bind(L);
  2051 #endif //ASSERT
  2052     __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  2053     __ delayed()->nop();
  2054     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name,
  2055                                                       &code,
  2056                                                       frame_complete,
  2057                                                       framesize,
  2058                                                       oop_maps, false);
  2059     return stub->entry_point();
  2060   }
  2062   // Initialization
  2063   void generate_initial() {
  2064     // Generates all stubs and initializes the entry points
  2066     //-------------------------------------------------------------
  2067     //-----------------------------------------------------------
  2068     // entry points that exist in all platforms
  2069     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller
  2070     // than the disadvantage of having a much more complicated generator structure.
  2071     // See also comment in stubRoutines.hpp.
  2072     StubRoutines::_forward_exception_entry = generate_forward_exception();
  2073     StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
  2074     // is referenced by megamorphic call
  2075     StubRoutines::_catch_exception_entry = generate_catch_exception();
  2077     StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
  2079     StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception",
  2080                                                                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
  2081     // platform dependent
  2082     StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
  2083   }
  2085   void generate_all() {
  2086     // Generates all stubs and initializes the entry points
  2088     // These entry points require SharedInfo::stack0 to be set up in
  2089     // non-core builds and need to be relocatable, so they each
  2090     // fabricate a RuntimeStub internally.
  2091     StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception",
  2092                                                                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
  2094     StubRoutines::_throw_IncompatibleClassChangeError_entry = generate_throw_exception("IncompatibleClassChangeError throw_exception",
  2095                                                                                CAST_FROM_FN_PTR(address, SharedRuntime:: throw_IncompatibleClassChangeError), false);
  2097     StubRoutines::_throw_NullPointerException_at_call_entry = generate_throw_exception("NullPointerException at call throw_exception",
  2098                                                                                         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
  2100     //------------------------------------------------------------------
  2101     // entry points that are platform specific
  2103     // support for verify_oop (must happen after universe_init)
  2104     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
  2105 #ifndef CORE
  2106     // arraycopy stubs used by compilers
  2107     generate_arraycopy_stubs();
  2108 #endif
  2110     // Safefetch stubs.
  2111     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
  2112                                                        &StubRoutines::_safefetch32_fault_pc,
  2113                                                        &StubRoutines::_safefetch32_continuation_pc);
  2114     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
  2115                                                        &StubRoutines::_safefetchN_fault_pc,
  2116                                                        &StubRoutines::_safefetchN_continuation_pc);
  2117   }
  2119  public:
  2120   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  2121     if (all) {
  2122       generate_all();
  2123     } else {
  2124       generate_initial();
  2125     }
  2126   }
  2127 }; // end class declaration
  2129 void StubGenerator_generate(CodeBuffer* code, bool all) {
  2130   StubGenerator g(code, all);
  2131 }
