src/cpu/ppc/vm/stubGenerator_ppc.cpp

author:      goetz
date:        Wed, 27 Nov 2013 16:16:21 -0800
changeset:   6490:41b780b43b74
parent:      6458:ec28f9c041ff
child:       6495:67fa91961822
permissions: -rw-r--r--

8029015: PPC64 (part 216): opto: trap based null and range checks
Summary: On PPC64, use the tdi instruction, which does a compare and raises SIGTRAP, for NULL and range checks.
Reviewed-by: kvn

     1 /*
     2  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright 2012, 2013 SAP AG. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/assembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "interpreter/interpreter.hpp"
    30 #include "nativeInst_ppc.hpp"
    31 #include "oops/instanceOop.hpp"
    32 #include "oops/method.hpp"
    33 #include "oops/objArrayKlass.hpp"
    34 #include "oops/oop.inline.hpp"
    35 #include "prims/methodHandles.hpp"
    36 #include "runtime/frame.inline.hpp"
    37 #include "runtime/handles.inline.hpp"
    38 #include "runtime/sharedRuntime.hpp"
    39 #include "runtime/stubCodeGenerator.hpp"
    40 #include "runtime/stubRoutines.hpp"
    41 #include "utilities/top.hpp"
    42 #ifdef TARGET_OS_FAMILY_aix
    43 # include "thread_aix.inline.hpp"
    44 #endif
    45 #ifdef TARGET_OS_FAMILY_linux
    46 # include "thread_linux.inline.hpp"
    47 #endif
    48 #ifdef COMPILER2
    49 #include "opto/runtime.hpp"
    50 #endif
    52 #define __ _masm->
    54 #ifdef PRODUCT
    55 #define BLOCK_COMMENT(str) // nothing
    56 #else
    57 #define BLOCK_COMMENT(str) __ block_comment(str)
    58 #endif
    60 class StubGenerator: public StubCodeGenerator {
    61  private:
    63   // Call stubs are used to call Java from C
    64   //
    65   // Arguments:
    66   //
    67   //   R3  - call wrapper address     : address
    68   //   R4  - result                   : intptr_t*
    69   //   R5  - result type              : BasicType
    70   //   R6  - method                   : Method
    71   //   R7  - frame mgr entry point    : address
    72   //   R8  - parameter block          : intptr_t*
    73   //   R9  - parameter count in words : int
    74   //   R10 - thread                   : Thread*
    75   //
    76   address generate_call_stub(address& return_address) {
    77     // Set up a new C frame, copy Java arguments, call frame manager or
    78     // native_entry, and process result.
    80     StubCodeMark mark(this, "StubRoutines", "call_stub");
    82     address start = __ emit_fd();
    84     // some sanity checks
    85     assert((sizeof(frame::abi_48) % 16) == 0,                 "unaligned");
    86     assert((sizeof(frame::abi_112) % 16) == 0,                "unaligned");
    87     assert((sizeof(frame::spill_nonvolatiles) % 16) == 0,     "unaligned");
    88     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
    89     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
    91     Register r_arg_call_wrapper_addr        = R3;
    92     Register r_arg_result_addr              = R4;
    93     Register r_arg_result_type              = R5;
    94     Register r_arg_method                   = R6;
    95     Register r_arg_entry                    = R7;
    96     Register r_arg_thread                   = R10;
    98     Register r_temp                         = R24;
    99     Register r_top_of_arguments_addr        = R25;
   100     Register r_entryframe_fp                = R26;
   102     {
   103       // Stack on entry to call_stub:
   104       //
   105       //      F1      [C_FRAME]
   106       //              ...
   108       Register r_arg_argument_addr          = R8;
   109       Register r_arg_argument_count         = R9;
   110       Register r_frame_alignment_in_bytes   = R27;
   111       Register r_argument_addr              = R28;
   112       Register r_argumentcopy_addr          = R29;
   113       Register r_argument_size_in_bytes     = R30;
   114       Register r_frame_size                 = R23;
   116       Label arguments_copied;
   118       // Save LR/CR to caller's C_FRAME.
   119       __ save_LR_CR(R0);
   121       // Zero extend arg_argument_count.
   122       __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
   124       // Save non-volatiles GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
   125       __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
   127       // Keep copy of our frame pointer (caller's SP).
   128       __ mr(r_entryframe_fp, R1_SP);
   130       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
   131       // Push ENTRY_FRAME including arguments:
   132       //
   133       //      F0      [TOP_IJAVA_FRAME_ABI]
   134       //              alignment (optional)
   135       //              [outgoing Java arguments]
   136       //              [ENTRY_FRAME_LOCALS]
   137       //      F1      [C_FRAME]
   138       //              ...
   140       // calculate frame size
   142       // unaligned size of arguments
   143       __ sldi(r_argument_size_in_bytes,
   144                   r_arg_argument_count, Interpreter::logStackElementSize);
   145       // arguments alignment (max 1 slot)
   146       // FIXME: use round_to() here
   147       __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
   148       __ sldi(r_frame_alignment_in_bytes,
   149                   r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
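             // (The andi_/sldi pair above reserves one extra 8-byte stack slot when the
             //  argument count is odd, keeping the total frame size a multiple of 16.)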
   151       // size = unaligned size of arguments + top abi's size
   152       __ addi(r_frame_size, r_argument_size_in_bytes,
   153               frame::top_ijava_frame_abi_size);
   154       // size += arguments alignment
   155       __ add(r_frame_size,
   156                  r_frame_size, r_frame_alignment_in_bytes);
   157       // size += size of call_stub locals
   158       __ addi(r_frame_size,
   159               r_frame_size, frame::entry_frame_locals_size);
   161       // push ENTRY_FRAME
   162       __ push_frame(r_frame_size, r_temp);
   164       // initialize call_stub locals (step 1)
   165       __ std(r_arg_call_wrapper_addr,
   166              _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
   167       __ std(r_arg_result_addr,
   168              _entry_frame_locals_neg(result_address), r_entryframe_fp);
   169       __ std(r_arg_result_type,
   170              _entry_frame_locals_neg(result_type), r_entryframe_fp);
   171       // we will save arguments_tos_address later
   174       BLOCK_COMMENT("Copy Java arguments");
   175       // copy Java arguments
   177       // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
   178       // FIXME: why not simply use SP+frame::top_ijava_frame_size?
   179       __ addi(r_top_of_arguments_addr,
   180               R1_SP, frame::top_ijava_frame_abi_size);
   181       __ add(r_top_of_arguments_addr,
   182                  r_top_of_arguments_addr, r_frame_alignment_in_bytes);
   184       // any arguments to copy?
   185       __ cmpdi(CCR0, r_arg_argument_count, 0);
   186       __ beq(CCR0, arguments_copied);
   188       // prepare loop and copy arguments in reverse order
   189       {
   190         // init CTR with arg_argument_count
   191         __ mtctr(r_arg_argument_count);
   193         // let r_argumentcopy_addr point to last outgoing Java arguments
   194         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
   196         // let r_argument_addr point to last incoming java argument
   197         __ add(r_argument_addr,
   198                    r_arg_argument_addr, r_argument_size_in_bytes);
   199         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
   201         // now loop while CTR > 0 and copy arguments
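               // (mtctr above loaded the count into the CTR register; the bdnz at the
               //  bottom decrements CTR and branches while it is non-zero, so the body
               //  runs exactly arg_argument_count times.)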
   202         {
   203           Label next_argument;
   204           __ bind(next_argument);
   206           __ ld(r_temp, 0, r_argument_addr);
   207           // argument_addr--;
   208           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
   209           __ std(r_temp, 0, r_argumentcopy_addr);
   210           // argumentcopy_addr++;
   211           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
   213           __ bdnz(next_argument);
   214         }
   215       }
   217       // Arguments copied, continue.
   218       __ bind(arguments_copied);
   219     }
   221     {
   222       BLOCK_COMMENT("Call frame manager or native entry.");
   223       // Call frame manager or native entry.
   224       Register r_new_arg_entry = R14_state;
   225       assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
   226                                  r_arg_method, r_arg_thread);
   228       __ mr(r_new_arg_entry, r_arg_entry);
   230       // Register state on entry to frame manager / native entry:
   231       //
   232       //   R17_tos     -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
   233       //   R19_method  -  Method
   234       //   R16_thread  -  JavaThread*
   236       // R17_tos must point to last argument - element_size.
   237       __ addi(R17_tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
   239       // initialize call_stub locals (step 2)
   240       // now save R17_tos as arguments_tos_address
   241       __ std(R17_tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
   243       // load argument registers for call
   244       __ mr(R19_method, r_arg_method);
   245       __ mr(R16_thread, r_arg_thread);
   246       assert(R17_tos != r_arg_method, "trashed r_arg_method");
   247       assert(R17_tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
   249       // Set R15_prev_state to 0 for simplifying checks in callee.
   250       __ li(R15_prev_state, 0);
   252       // Stack on entry to frame manager / native entry:
   253       //
   254       //      F0      [TOP_IJAVA_FRAME_ABI]
   255       //              alignment (optional)
   256       //              [outgoing Java arguments]
   257       //              [ENTRY_FRAME_LOCALS]
   258       //      F1      [C_FRAME]
   259       //              ...
   260       //
   262       // global toc register
   263       __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
   265       // Load narrow oop base.
   266       __ reinit_heapbase(R30, R11_scratch1);
   268       // Remember the senderSP so the interpreter can pop c2i arguments off the stack
   269       // when called via a c2i.
   271       // Pass initial_caller_sp to framemanager.
   272       __ mr(R21_tmp1, R1_SP);
   274       // Do a light-weight C-call here, r_new_arg_entry holds the address
   275       // of the interpreter entry point (frame manager or native entry)
   276       // and save runtime-value of LR in return_address.
   277       assert(r_new_arg_entry != R17_tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
   278              "trashed r_new_arg_entry");
   279       return_address = __ call_stub(r_new_arg_entry);
   280     }
   282     {
   283       BLOCK_COMMENT("Returned from frame manager or native entry.");
   284       // Returned from frame manager or native entry.
   285       // Now pop frame, process result, and return to caller.
   287       // Stack on exit from frame manager / native entry:
   288       //
   289       //      F0      [ABI]
   290       //              ...
   291       //              [ENTRY_FRAME_LOCALS]
   292       //      F1      [C_FRAME]
   293       //              ...
   294       //
   295       // Just pop the topmost frame ...
   296       //
   298       Label ret_is_object;
   299       Label ret_is_long;
   300       Label ret_is_float;
   301       Label ret_is_double;
   303       Register r_entryframe_fp = R30;
   304       Register r_lr            = R7_ARG5;
   305       Register r_cr            = R8_ARG6;
   307       // Reload some volatile registers which we've spilled before the call
   308       // to frame manager / native entry.
   309       // Access all locals via frame pointer, because we know nothing about
   310       // the topmost frame's size.
   311       __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
   312       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
   313       __ ld(r_arg_result_addr,
   314             _entry_frame_locals_neg(result_address), r_entryframe_fp);
   315       __ ld(r_arg_result_type,
   316             _entry_frame_locals_neg(result_type), r_entryframe_fp);
   317       __ ld(r_cr, _abi(cr), r_entryframe_fp);
   318       __ ld(r_lr, _abi(lr), r_entryframe_fp);
   320       // pop frame and restore non-volatiles, LR and CR
   321       __ mr(R1_SP, r_entryframe_fp);
   322       __ mtcr(r_cr);
   323       __ mtlr(r_lr);
   325       // Store result depending on type. Everything that is not
   326       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
   327       __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
   328       __ cmpwi(CCR1, r_arg_result_type, T_LONG);
   329       __ cmpwi(CCR5,  r_arg_result_type, T_FLOAT);
   330       __ cmpwi(CCR6,  r_arg_result_type, T_DOUBLE);
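             // Each compare above targets its own condition register field (CCR0, CCR1,
             // CCR5, CCR6), so the type dispatch below can still branch on the results
             // after the non-volatile registers have been restored.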
   332       // restore non-volatile registers
   333       __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
   336       // Stack on exit from call_stub:
   337       //
   338       //      0       [C_FRAME]
   339       //              ...
   340       //
   341       //  no call_stub frames left.
   343       // All non-volatiles have been restored at this point!!
   344       assert(R3_RET == R3, "R3_RET should be R3");
   346       __ beq(CCR0, ret_is_object);
   347       __ beq(CCR1, ret_is_long);
   348       __ beq(CCR5,  ret_is_float);
   349       __ beq(CCR6,  ret_is_double);
   351       // default:
   352       __ stw(R3_RET, 0, r_arg_result_addr);
   353       __ blr(); // return to caller
   355       // case T_OBJECT:
   356       __ bind(ret_is_object);
   357       __ std(R3_RET, 0, r_arg_result_addr);
   358       __ blr(); // return to caller
   360       // case T_LONG:
   361       __ bind(ret_is_long);
   362       __ std(R3_RET, 0, r_arg_result_addr);
   363       __ blr(); // return to caller
   365       // case T_FLOAT:
   366       __ bind(ret_is_float);
   367       __ stfs(F1_RET, 0, r_arg_result_addr);
   368       __ blr(); // return to caller
   370       // case T_DOUBLE:
   371       __ bind(ret_is_double);
   372       __ stfd(F1_RET, 0, r_arg_result_addr);
   373       __ blr(); // return to caller
   374     }
   376     return start;
   377   }
   379   // Return point for a Java call if there's an exception thrown in
   380   // Java code.  The exception is caught and transformed into a
   381   // pending exception stored in JavaThread that can be tested from
   382   // within the VM.
   383   //
   384   address generate_catch_exception() {
   385     StubCodeMark mark(this, "StubRoutines", "catch_exception");
   387     address start = __ pc();
   389     // Registers alive
   390     //
   391     //  R16_thread
   392     //  R3_ARG1 - address of pending exception
   393     //  R4_ARG2 - return address in call stub
   395     const Register exception_file = R21_tmp1;
   396     const Register exception_line = R22_tmp2;
   398     __ load_const(exception_file, (void*)__FILE__);
   399     __ load_const(exception_line, (void*)__LINE__);
   401     __ std(R3_ARG1, thread_(pending_exception));
   402     // store into `char *'
   403     __ std(exception_file, thread_(exception_file));
   404     // store into `int'
   405     __ stw(exception_line, thread_(exception_line));
   407     // complete return to VM
   408     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
   410     __ mtlr(R4_ARG2);
   411     // continue in call stub
   412     __ blr();
   414     return start;
   415   }
   417   // Continuation point for runtime calls returning with a pending
   418   // exception.  The pending exception check happened in the runtime
   419   // or native call stub.  The pending exception in Thread is
   420   // converted into a Java-level exception.
   421   //
   422   address generate_forward_exception() {
   423     StubCodeMark mark(this, "StubRoutines", "forward_exception");
   424     address start = __ pc();
   426 #if !defined(PRODUCT)
   427     if (VerifyOops) {
   428       // Get pending exception oop.
   429       __ ld(R3_ARG1,
   430                 in_bytes(Thread::pending_exception_offset()),
   431                 R16_thread);
   432       // Make sure that this code is only executed if there is a pending exception.
   433       {
   434         Label L;
   435         __ cmpdi(CCR0, R3_ARG1, 0);
   436         __ bne(CCR0, L);
   437         __ stop("StubRoutines::forward exception: no pending exception (1)");
   438         __ bind(L);
   439       }
   440       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
   441     }
   442 #endif
   444     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
   445     __ save_LR_CR(R4_ARG2);
   446     __ push_frame_abi112(0, R0);
   447     // Find exception handler.
   448     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
   449                      SharedRuntime::exception_handler_for_return_address),
   450                     R16_thread,
   451                     R4_ARG2);
   452     // Copy handler's address.
   453     __ mtctr(R3_RET);
   454     __ pop_frame();
   455     __ restore_LR_CR(R0);
   457     // Set up the arguments for the exception handler:
   458     //  - R3_ARG1: exception oop
   459     //  - R4_ARG2: exception pc.
   461     // Load pending exception oop.
   462     __ ld(R3_ARG1,
   463               in_bytes(Thread::pending_exception_offset()),
   464               R16_thread);
   466     // The exception pc is the return address in the caller.
   467     // Must load it into R4_ARG2.
   468     __ mflr(R4_ARG2);
   470 #ifdef ASSERT
   471     // Make sure exception is set.
   472     {
   473       Label L;
   474       __ cmpdi(CCR0, R3_ARG1, 0);
   475       __ bne(CCR0, L);
   476       __ stop("StubRoutines::forward exception: no pending exception (2)");
   477       __ bind(L);
   478     }
   479 #endif
   481     // Clear the pending exception.
   482     __ li(R0, 0);
   483     __ std(R0,
   484                in_bytes(Thread::pending_exception_offset()),
   485                R16_thread);
   486     // Jump to exception handler.
   487     __ bctr();
   489     return start;
   490   }
   492 #undef __
   493 #define __ masm->
   494   // Continuation point for throwing of implicit exceptions that are
   495   // not handled in the current activation. Fabricates an exception
   496   // oop and initiates normal exception dispatching in this
   497   // frame. Only callee-saved registers are preserved (through the
   498   // normal register window / RegisterMap handling).  If the compiler
   499   // needs all registers to be preserved between the fault point and
   500   // the exception handler then it must assume responsibility for that
   501   // in AbstractCompiler::continuation_for_implicit_null_exception or
   502   // continuation_for_implicit_division_by_zero_exception. All other
   503   // implicit exceptions (e.g., NullPointerException or
   504   // AbstractMethodError on entry) are either at call sites or
   505   // otherwise assume that stack unwinding will be initiated, so
   506   // caller saved registers were assumed volatile in the compiler.
   507   //
   508   // Note that we generate only this stub into a RuntimeStub, because
   509   // it needs to be properly traversed and ignored during GC, so we
   510   // change the meaning of the "__" macro within this method.
   511   //
   512   // Note: the routine set_pc_not_at_call_for_caller in
   513   // SharedRuntime.cpp requires that this code be generated into a
   514   // RuntimeStub.
   515   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
   516                                    Register arg1 = noreg, Register arg2 = noreg) {
   517     CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
   518     MacroAssembler* masm = new MacroAssembler(&code);
   520     OopMapSet* oop_maps  = new OopMapSet();
   521     int frame_size_in_bytes = frame::abi_112_size;
   522     OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
   524     StubCodeMark mark(this, "StubRoutines", "throw_exception");
   526     address start = __ pc();
   528     __ save_LR_CR(R11_scratch1);
   530     // Push a frame.
   531     __ push_frame_abi112(0, R11_scratch1);
   533     address frame_complete_pc = __ pc();
   535     if (restore_saved_exception_pc) {
   536       __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
   537     }
   539     // Note that we always have a runtime stub frame on the top of
   540     // stack by this point. Remember the offset of the instruction
   541     // whose address will be moved to R11_scratch1.
   542     address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
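           // (gc_map_pc - start) is the offset used when registering the oop map
           // with add_gc_map below.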
   544     __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
   546     __ mr(R3_ARG1, R16_thread);
   547     if (arg1 != noreg) {
   548       __ mr(R4_ARG2, arg1);
   549     }
   550     if (arg2 != noreg) {
   551       __ mr(R5_ARG3, arg2);
   552     }
   553     __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry),
   554               relocInfo::none);
   556     // Set an oopmap for the call site.
   557     oop_maps->add_gc_map((int)(gc_map_pc - start), map);
   559     __ reset_last_Java_frame();
   561 #ifdef ASSERT
   562     // Make sure that this code is only executed if there is a pending
   563     // exception.
   564     {
   565       Label L;
   566       __ ld(R0,
   567                 in_bytes(Thread::pending_exception_offset()),
   568                 R16_thread);
   569       __ cmpdi(CCR0, R0, 0);
   570       __ bne(CCR0, L);
   571       __ stop("StubRoutines::throw_exception: no pending exception");
   572       __ bind(L);
   573     }
   574 #endif
   576     // Pop frame.
   577     __ pop_frame();
   579     __ restore_LR_CR(R11_scratch1);
   581     __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
   582     __ mtctr(R11_scratch1);
   583     __ bctr();
   585     // Create runtime stub with OopMap.
   586     RuntimeStub* stub =
   587       RuntimeStub::new_runtime_stub(name, &code,
   588                                     /*frame_complete=*/ (int)(frame_complete_pc - start),
   589                                     frame_size_in_bytes/wordSize,
   590                                     oop_maps,
   591                                     false);
   592     return stub->entry_point();
   593   }
   594 #undef __
   595 #define __ _masm->
   597   //  Generate G1 pre-write barrier for array.
   598   //
   599   //  Input:
   600   //     from     - register containing src address (only needed for spilling)
   601   //     to       - register containing starting address
   602   //     count    - register containing element count
   603   //     tmp      - scratch register
   604   //
   605   //  Kills:
   606   //     nothing
   607   //
   608   void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
   609     BarrierSet* const bs = Universe::heap()->barrier_set();
   610     switch (bs->kind()) {
   611       case BarrierSet::G1SATBCT:
   612       case BarrierSet::G1SATBCTLogging:
   613         // With G1, don't generate the call if we statically know that the target is uninitialized.
   614         if (!dest_uninitialized) {
   615           const int spill_slots = 4 * wordSize;
   616           const int frame_size  = frame::abi_112_size + spill_slots;
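                 // The extra spill area holds from, to and count across the
                 // call_VM_leaf below; they are reloaded right after the call.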
   618           __ save_LR_CR(R0);
   619           __ push_frame_abi112(spill_slots, R0);
   620           __ std(from,  frame_size - 1 * wordSize, R1_SP);
   621           __ std(to,    frame_size - 2 * wordSize, R1_SP);
   622           __ std(count, frame_size - 3 * wordSize, R1_SP);
   624           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
   626           __ ld(from,  frame_size - 1 * wordSize, R1_SP);
   627           __ ld(to,    frame_size - 2 * wordSize, R1_SP);
   628           __ ld(count, frame_size - 3 * wordSize, R1_SP);
   629           __ pop_frame();
   630           __ restore_LR_CR(R0);
   631         }
   632         break;
   633       case BarrierSet::CardTableModRef:
   634       case BarrierSet::CardTableExtension:
   635       case BarrierSet::ModRef:
   636         break;
   637       default:
   638         ShouldNotReachHere();
   639     }
   640   }
   642   //  Generate CMS/G1 post-write barrier for array.
   643   //
   644   //  Input:
   645   //     addr     - register containing starting address
   646   //     count    - register containing element count
   647   //     tmp      - scratch register
   648   //
   649   //  The input registers and R0 are overwritten.
   650   //
   651   void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp) {
   652     BarrierSet* const bs = Universe::heap()->barrier_set();
   654     switch (bs->kind()) {
   655       case BarrierSet::G1SATBCT:
   656       case BarrierSet::G1SATBCTLogging:
   657         {
   658           __ save_LR_CR(R0);
   659           // We need this frame only so that the callee can spill LR/CR.
   660           __ push_frame_abi112(0, R0);
   662           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
   664           __ pop_frame();
   665           __ restore_LR_CR(R0);
   666         }
   667         break;
   668       case BarrierSet::CardTableModRef:
   669       case BarrierSet::CardTableExtension:
   670         {
   671           Label Lskip_loop, Lstore_loop;
   672           if (UseConcMarkSweepGC) {
   673             // TODO PPC port: contribute optimization / requires shared changes
   674             __ release();
   675           }
   677           CardTableModRefBS* const ct = (CardTableModRefBS*)bs;
   678           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
   679           assert_different_registers(addr, count, tmp);
   681           __ sldi(count, count, LogBytesPerHeapOop);
   682           __ addi(count, count, -BytesPerHeapOop);
   683           __ add(count, addr, count);
   684           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
   685           __ srdi(addr, addr, CardTableModRefBS::card_shift);
   686           __ srdi(count, count, CardTableModRefBS::card_shift);
   687           __ subf(count, addr, count);
   688           assert_different_registers(R0, addr, count, tmp);
   689           __ load_const(tmp, (address)ct->byte_map_base);
   690           __ addic_(count, count, 1);
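                 // count now holds the number of card-table bytes to dirty
                 // (last card index - first card index + 1); the beq below skips
                 // the loop when that number is zero.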
   691           __ beq(CCR0, Lskip_loop);
   692           __ li(R0, 0);
   693           __ mtctr(count);
   694           // Byte store loop
   695           __ bind(Lstore_loop);
   696           __ stbx(R0, tmp, addr);
   697           __ addi(addr, addr, 1);
   698           __ bdnz(Lstore_loop);
   699           __ bind(Lskip_loop);
   700         }
   701       break;
   702       case BarrierSet::ModRef:
   703         break;
   704       default:
   705         ShouldNotReachHere();
   706     }
   707   }
   709   // Support for void zero_words_aligned8(HeapWord* to, size_t count)
   710   //
   711   // Arguments:
   712   //   to:
   713   //   count:
   714   //
   715   // Destroys:
   716   //
   717   address generate_zero_words_aligned8() {
   718     StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
   720     // Implemented as in ClearArray.
   721     address start = __ emit_fd();
   723     Register base_ptr_reg   = R3_ARG1; // tohw (needs to be 8b aligned)
   724     Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
   725     Register tmp1_reg       = R5_ARG3;
   726     Register tmp2_reg       = R6_ARG4;
   727     Register zero_reg       = R7_ARG5;
   729     // Procedure for large arrays (uses data cache block zero instruction).
   730     Label dwloop, fast, fastloop, restloop, lastdword, done;
   731     int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
   732     int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
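           // dcbz zeroes a whole data cache line at once, so the code below first clears
           // dword-by-dword up to a cache-line boundary, uses dcbz for the bulk of the
           // range, and then clears the remaining dwords individually.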
   734     // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
   735     __ dcbtst(base_ptr_reg);                    // Indicate write access to first cache line ...
   736     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if number of dwords is even.
   737     __ srdi_(tmp1_reg, cnt_dwords_reg, 1);      // number of double dwords
   738     __ load_const_optimized(zero_reg, 0L);      // Use as zero register.
   740     __ cmpdi(CCR1, tmp2_reg, 0);                // cnt_dwords even?
   741     __ beq(CCR0, lastdword);                    // size <= 1
   742     __ mtctr(tmp1_reg);                         // Speculatively preload counter for rest loop (>0).
   743     __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
   744     __ neg(tmp1_reg, base_ptr_reg);             // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
   746     __ blt(CCR0, restloop);                     // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
   747     __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
   749     __ beq(CCR0, fast);                         // already 128byte aligned
   750     __ mtctr(tmp1_reg);                         // Set ctr to hit 128byte boundary (0<ctr<cnt).
   751     __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
   753     // Clear in first cache line dword-by-dword if not already 128byte aligned.
   754     __ bind(dwloop);
   755       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
   756       __ addi(base_ptr_reg, base_ptr_reg, 8);
   757     __ bdnz(dwloop);
   759     // clear 128byte blocks
   760     __ bind(fast);
   761     __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
   762     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if rest even
   764     __ mtctr(tmp1_reg);                         // load counter
   765     __ cmpdi(CCR1, tmp2_reg, 0);                // rest even?
   766     __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
   768     __ bind(fastloop);
   769       __ dcbz(base_ptr_reg);                    // Clear 128byte aligned block.
   770       __ addi(base_ptr_reg, base_ptr_reg, cl_size);
   771     __ bdnz(fastloop);
   773     //__ dcbtst(base_ptr_reg);                  // Indicate write access to last cache line.
   774     __ beq(CCR0, lastdword);                    // rest<=1
   775     __ mtctr(tmp1_reg);                         // load counter
   777     // Clear rest.
   778     __ bind(restloop);
   779       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
   780       __ std(zero_reg, 8, base_ptr_reg);        // Clear 8byte aligned block.
   781       __ addi(base_ptr_reg, base_ptr_reg, 16);
   782     __ bdnz(restloop);
   784     __ bind(lastdword);
   785     __ beq(CCR1, done);
   786     __ std(zero_reg, 0, base_ptr_reg);
   787     __ bind(done);
   788     __ blr();                                   // return
   790     return start;
   791   }
   793   // The following routine generates a subroutine to throw an asynchronous
   794   // UnknownError when an unsafe access gets a fault that could not be
   795   // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
   796   //
   797   address generate_handler_for_unsafe_access() {
   798     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
   799     address start = __ emit_fd();
   800     __ unimplemented("StubRoutines::handler_for_unsafe_access", 93);
   801     return start;
   802   }
   804 #if !defined(PRODUCT)
   805   // Wrapper which calls oopDesc::is_oop_or_null()
   806   // Only called by MacroAssembler::verify_oop
   807   static void verify_oop_helper(const char* message, oop o) {
   808     if (!o->is_oop_or_null()) {
   809       fatal(message);
   810     }
   811     ++ StubRoutines::_verify_oop_count;
   812   }
   813 #endif
   815   // Return address of code to be called from code generated by
   816   // MacroAssembler::verify_oop.
   817   //
   818   // Don't generate, rather use C++ code.
   819   address generate_verify_oop() {
   820     StubCodeMark mark(this, "StubRoutines", "verify_oop");
   822     // this is actually a `FunctionDescriptor*'.
   823     address start = 0;
   825 #if !defined(PRODUCT)
   826     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
   827 #endif
   829     return start;
   830   }
   832   // Fairer handling of safepoints for native methods.
   833   //
   834   // Generate code which reads from the polling page. This special handling is needed as the
   835   // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
   836   // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
   837   // to read from the safepoint polling page.
   838   address generate_load_from_poll() {
   839     StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
   840     address start = __ emit_fd();
   841     __ unimplemented("StubRoutines::verify_oop", 95);  // TODO PPC port
   842     return start;
   843   }
   845   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
   846   //
   847   // The code is implemented (ported from sparc) as we believe it benefits JVM98; however,
   848   // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
   849   //
   850   // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
   851   // for turning on the loop predication optimization, and hence the behavior of "array range check"
   852   // and "loop invariant check" could be influenced, which potentially boosted JVM98.
   853   //
   854   // We leave the code here and see if Oracle has updates in later releases (later than HS20).
   855   //
   856   //  Generate stub for disjoint short fill.  If "aligned" is true, the
   857   //  "to" address is assumed to be heapword aligned.
   858   //
   859   // Arguments for generated stub:
   860   //      to:    R3_ARG1
   861   //      value: R4_ARG2
   862   //      count: R5_ARG3 treated as signed
   863   //
   864   address generate_fill(BasicType t, bool aligned, const char* name) {
   865     StubCodeMark mark(this, "StubRoutines", name);
   866     address start = __ emit_fd();
   868     const Register to        = R3_ARG1;   // source array address
   869     const Register value     = R4_ARG2;   // fill value
   870     const Register count     = R5_ARG3;   // elements count
   871     const Register temp      = R6_ARG4;   // temp register
   873     //assert_clean_int(count, O3);     // Make sure 'count' is clean int.
   875     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
   876     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
   878     int shift = -1;
   879     switch (t) {
   880        case T_BYTE:
   881         shift = 2;
   882         // clone bytes (zero extend not needed because store instructions below ignore high order bytes)
   883         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
   884         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element
   885         __ blt(CCR0, L_fill_elements);
   886         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
   887         break;
   888        case T_SHORT:
   889         shift = 1;
   890         // clone bytes (zero extend not needed because store instructions below ignore high order bytes)
   891         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
   892         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element
   893         __ blt(CCR0, L_fill_elements);
   894         break;
   895       case T_INT:
   896         shift = 0;
   897         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element
   898         __ blt(CCR0, L_fill_4_bytes);
   899         break;
   900       default: ShouldNotReachHere();
   901     }
   903     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
   904       // align source address at 4 bytes address boundary
   905       if (t == T_BYTE) {
   906         // One byte misalignment happens only for byte arrays
   907         __ andi_(temp, to, 1);
   908         __ beq(CCR0, L_skip_align1);
   909         __ stb(value, 0, to);
   910         __ addi(to, to, 1);
   911         __ addi(count, count, -1);
   912         __ bind(L_skip_align1);
   913       }
   914       // Two bytes misalignment happens only for byte and short (char) arrays.
   915       __ andi_(temp, to, 2);
   916       __ beq(CCR0, L_skip_align2);
   917       __ sth(value, 0, to);
   918       __ addi(to, to, 2);
   919       __ addi(count, count, -(1 << (shift - 1)));
   920       __ bind(L_skip_align2);
   921     }
   923     if (!aligned) {
   924       // Align to 8 bytes, we know we are 4 byte aligned to start.
   925       __ andi_(temp, to, 7);
   926       __ beq(CCR0, L_fill_32_bytes);
   927       __ stw(value, 0, to);
   928       __ addi(to, to, 4);
   929       __ addi(count, count, -(1 << shift));
   930       __ bind(L_fill_32_bytes);
   931     }
   933     __ li(temp, 8<<shift);              // prepare for 32 byte loop
   934     // clone bytes int->long as above
   935     __ rldimi(value, value, 32, 0);     // 32 bit -> 64 bit
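           // value now holds the fill pattern replicated across all 64 bits, so each
           // 8-byte std in the loops below stores a full doubleword of fill data.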
   937     Label L_check_fill_8_bytes;
   938     // Fill 32-byte chunks
   939     __ subf_(count, temp, count);
   940     __ blt(CCR0, L_check_fill_8_bytes);
   942     Label L_fill_32_bytes_loop;
   943     __ align(32);
   944     __ bind(L_fill_32_bytes_loop);
   946     __ std(value, 0, to);
   947     __ std(value, 8, to);
   948     __ subf_(count, temp, count); // update count
   949     __ std(value, 16, to);
   950     __ std(value, 24, to);
   952     __ addi(to, to, 32);
   953     __ bge(CCR0, L_fill_32_bytes_loop);
   955     __ bind(L_check_fill_8_bytes);
   956     __ add_(count, temp, count);
   957     __ beq(CCR0, L_exit);
   958     __ addic_(count, count, -(2 << shift));
   959     __ blt(CCR0, L_fill_4_bytes);
   961     //
   962     // Length is too short, just fill 8 bytes at a time.
   963     //
   964     Label L_fill_8_bytes_loop;
   965     __ bind(L_fill_8_bytes_loop);
   966     __ std(value, 0, to);
   967     __ addic_(count, count, -(2 << shift));
   968     __ addi(to, to, 8);
   969     __ bge(CCR0, L_fill_8_bytes_loop);
   971     // fill trailing 4 bytes
   972     __ bind(L_fill_4_bytes);
   973     __ andi_(temp, count, 1<<shift);
   974     __ beq(CCR0, L_fill_2_bytes);
   976     __ stw(value, 0, to);
   977     if (t == T_BYTE || t == T_SHORT) {
   978       __ addi(to, to, 4);
   979       // fill trailing 2 bytes
   980       __ bind(L_fill_2_bytes);
   981       __ andi_(temp, count, 1<<(shift-1));
   982       __ beq(CCR0, L_fill_byte);
   983       __ sth(value, 0, to);
   984       if (t == T_BYTE) {
   985         __ addi(to, to, 2);
   986         // fill trailing byte
   987         __ bind(L_fill_byte);
   988         __ andi_(count, count, 1);
   989         __ beq(CCR0, L_exit);
   990         __ stb(value, 0, to);
   991       } else {
   992         __ bind(L_fill_byte);
   993       }
   994     } else {
   995       __ bind(L_fill_2_bytes);
   996     }
   997     __ bind(L_exit);
   998     __ blr();
  1000     // Handle copies less than 8 bytes.  Int is handled elsewhere.
  1001     if (t == T_BYTE) {
  1002       __ bind(L_fill_elements);
  1003       Label L_fill_2, L_fill_4;
  1004       __ andi_(temp, count, 1);
  1005       __ beq(CCR0, L_fill_2);
  1006       __ stb(value, 0, to);
  1007       __ addi(to, to, 1);
  1008       __ bind(L_fill_2);
  1009       __ andi_(temp, count, 2);
  1010       __ beq(CCR0, L_fill_4);
  1011       __ stb(value, 0, to);
  1012       __ stb(value, 0, to);
  1013       __ addi(to, to, 2);
  1014       __ bind(L_fill_4);
  1015       __ andi_(temp, count, 4);
  1016       __ beq(CCR0, L_exit);
  1017       __ stb(value, 0, to);
  1018       __ stb(value, 1, to);
  1019       __ stb(value, 2, to);
  1020       __ stb(value, 3, to);
  1021       __ blr();
  1022     }
  1024     if (t == T_SHORT) {
  1025       Label L_fill_2;
  1026       __ bind(L_fill_elements);
  1027       __ andi_(temp, count, 1);
  1028       __ beq(CCR0, L_fill_2);
  1029       __ sth(value, 0, to);
  1030       __ addi(to, to, 2);
  1031       __ bind(L_fill_2);
  1032       __ andi_(temp, count, 2);
  1033       __ beq(CCR0, L_exit);
  1034       __ sth(value, 0, to);
  1035       __ sth(value, 2, to);
  1036       __ blr();
  1037     }
  1038     return start;
  1039   }
  1042   // Generate overlap test for array copy stubs
  1043   //
  1044   // Input:
  1045   //   R3_ARG1    -  from
  1046   //   R4_ARG2    -  to
  1047   //   R5_ARG3    -  element count
  1048   //
  1049   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
  1050     Register tmp1 = R6_ARG4;
  1051     Register tmp2 = R7_ARG5;
  1053     Label l_overlap;
  1054 #ifdef ASSERT
  1055     __ srdi_(tmp2, R5_ARG3, 31);
  1056     __ asm_assert_eq("missing zero extend", 0xAFFE);
  1057 #endif
  1059     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
  1060     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
  1061     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
  1062     __ cmpld(CCR1, tmp1, tmp2);
  1063     __ crand(/*CCR0 lt*/0, /*CCR1 lt*/4+0, /*CCR0 lt*/0);
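           // CCR0.lt is now (from < to) && (to - from < size in bytes), i.e. the
           // destination starts inside the source range and a forward copy would
           // overwrite elements that have not been copied yet.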
  1064     __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
  1066     // need to copy forwards
  1067     if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
  1068       __ b(no_overlap_target);
  1069     } else {
  1070       __ load_const(tmp1, no_overlap_target, tmp2);
  1071       __ mtctr(tmp1);
  1072       __ bctr();
  1073     }
  1075     __ bind(l_overlap);
  1076     // need to copy backwards
  1077   }
  1079   // The guideline in the implementations of generate_disjoint_xxx_copy
  1080   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
  1081   // single instructions, but to avoid alignment interrupts (see subsequent
  1082   // comment). Furthermore, we try to minimize misaligned access, even
  1083   // though they cause no alignment interrupt.
  1084   //
  1085   // In Big-Endian mode, the PowerPC architecture requires implementations to
  1086   // handle automatically misaligned integer halfword and word accesses,
  1087   // word-aligned integer doubleword accesses, and word-aligned floating-point
  1088   // accesses. Other accesses may or may not generate an Alignment interrupt
  1089   // depending on the implementation.
  1090   // Alignment interrupt handling may require on the order of hundreds of cycles,
  1091   // so every effort should be made to avoid misaligned memory values.
  1092   //
  1093   //
  1094   // Generate stub for disjoint byte copy.  If "aligned" is true, the
  1095   // "from" and "to" addresses are assumed to be heapword aligned.
  1096   //
  1097   // Arguments for generated stub:
  1098   //      from:  R3_ARG1
  1099   //      to:    R4_ARG2
  1100   //      count: R5_ARG3 treated as signed
  1101   //
  1102   address generate_disjoint_byte_copy(bool aligned, const char * name) {
  1103     StubCodeMark mark(this, "StubRoutines", name);
  1104     address start = __ emit_fd();
  1106     Register tmp1 = R6_ARG4;
  1107     Register tmp2 = R7_ARG5;
  1108     Register tmp3 = R8_ARG6;
  1109     Register tmp4 = R9_ARG7;
  1112     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
  1113     // Don't try anything fancy if arrays don't have many elements.
  1114     __ li(tmp3, 0);
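           // tmp3 is zeroed here to serve as the index register for the indexed
           // lwzx/stwx accesses below (it is reused as a plain temp later on).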
  1115     __ cmpwi(CCR0, R5_ARG3, 17);
  1116     __ ble(CCR0, l_6); // copy 4 at a time
  1118     if (!aligned) {
  1119       __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1120       __ andi_(tmp1, tmp1, 3);
  1121       __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
  1123       // Copy elements if necessary to align to 4 bytes.
  1124       __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
  1125       __ andi_(tmp1, tmp1, 3);
  1126       __ beq(CCR0, l_2);
  1128       __ subf(R5_ARG3, tmp1, R5_ARG3);
  1129       __ bind(l_9);
  1130       __ lbz(tmp2, 0, R3_ARG1);
  1131       __ addic_(tmp1, tmp1, -1);
  1132       __ stb(tmp2, 0, R4_ARG2);
  1133       __ addi(R3_ARG1, R3_ARG1, 1);
  1134       __ addi(R4_ARG2, R4_ARG2, 1);
  1135       __ bne(CCR0, l_9);
  1137       __ bind(l_2);
  1138     }
  1140     // copy 8 elements at a time
  1141     __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
  1142     __ andi_(tmp1, tmp2, 7);
  1143     __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
  1145     // copy a 2-element word if necessary to align to 8 bytes
  1146     __ andi_(R0, R3_ARG1, 7);
  1147     __ beq(CCR0, l_7);
  1149     __ lwzx(tmp2, R3_ARG1, tmp3);
  1150     __ addi(R5_ARG3, R5_ARG3, -4);
  1151     __ stwx(tmp2, R4_ARG2, tmp3);
  1152     { // FasterArrayCopy
  1153       __ addi(R3_ARG1, R3_ARG1, 4);
  1154       __ addi(R4_ARG2, R4_ARG2, 4);
  1155     }
  1156     __ bind(l_7);
  1158     { // FasterArrayCopy
  1159       __ cmpwi(CCR0, R5_ARG3, 31);
  1160       __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
  1162       __ srdi(tmp1, R5_ARG3, 5);
  1163       __ andi_(R5_ARG3, R5_ARG3, 31);
  1164       __ mtctr(tmp1);
  1166       __ bind(l_8);
  1167       // Use unrolled version for mass copying (copy 32 elements at a time)
  1168       // Load feeding store gets zero latency on Power6, however not on Power5.
  1169       // Therefore, the following sequence is made for the good of both.
  1170       __ ld(tmp1, 0, R3_ARG1);
  1171       __ ld(tmp2, 8, R3_ARG1);
  1172       __ ld(tmp3, 16, R3_ARG1);
  1173       __ ld(tmp4, 24, R3_ARG1);
  1174       __ std(tmp1, 0, R4_ARG2);
  1175       __ std(tmp2, 8, R4_ARG2);
  1176       __ std(tmp3, 16, R4_ARG2);
  1177       __ std(tmp4, 24, R4_ARG2);
  1178       __ addi(R3_ARG1, R3_ARG1, 32);
  1179       __ addi(R4_ARG2, R4_ARG2, 32);
  1180       __ bdnz(l_8);
  1181     }
  1183     __ bind(l_6);
  1185     // copy 4 elements at a time
  1186     __ cmpwi(CCR0, R5_ARG3, 4);
  1187     __ blt(CCR0, l_1);
  1188     __ srdi(tmp1, R5_ARG3, 2);
  1189     __ mtctr(tmp1); // is > 0
  1190     __ andi_(R5_ARG3, R5_ARG3, 3);
  1192     { // FasterArrayCopy
  1193       __ addi(R3_ARG1, R3_ARG1, -4);
  1194       __ addi(R4_ARG2, R4_ARG2, -4);
  1195       __ bind(l_3);
  1196       __ lwzu(tmp2, 4, R3_ARG1);
  1197       __ stwu(tmp2, 4, R4_ARG2);
  1198       __ bdnz(l_3);
  1199       __ addi(R3_ARG1, R3_ARG1, 4);
  1200       __ addi(R4_ARG2, R4_ARG2, 4);
  1201     }
  1203     // do single element copy
  1204     __ bind(l_1);
  1205     __ cmpwi(CCR0, R5_ARG3, 0);
  1206     __ beq(CCR0, l_4);
  1208     { // FasterArrayCopy
  1209       __ mtctr(R5_ARG3);
  1210       __ addi(R3_ARG1, R3_ARG1, -1);
  1211       __ addi(R4_ARG2, R4_ARG2, -1);
  1213       __ bind(l_5);
  1214       __ lbzu(tmp2, 1, R3_ARG1);
  1215       __ stbu(tmp2, 1, R4_ARG2);
  1216       __ bdnz(l_5);
  1217     }
  1219     __ bind(l_4);
  1220     __ blr();
  1222     return start;
  1223   }
  1225   // Generate stub for conjoint byte copy.  If "aligned" is true, the
  1226   // "from" and "to" addresses are assumed to be heapword aligned.
  1227   //
  1228   // Arguments for generated stub:
  1229   //      from:  R3_ARG1
  1230   //      to:    R4_ARG2
  1231   //      count: R5_ARG3 treated as signed
  1232   //
  1233   address generate_conjoint_byte_copy(bool aligned, const char * name) {
  1234     StubCodeMark mark(this, "StubRoutines", name);
  1235     address start = __ emit_fd();
  1237     Register tmp1 = R6_ARG4;
  1238     Register tmp2 = R7_ARG5;
  1239     Register tmp3 = R8_ARG6;
  1241     address nooverlap_target = aligned ?
  1242       ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
  1243       ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
  1245     array_overlap_test(nooverlap_target, 0);
  1246     // Do reverse copy. We assume the case of actual overlap is rare enough
  1247     // that we don't have to optimize it.
  1248     Label l_1, l_2;
  1250     __ b(l_2);
  1251     __ bind(l_1);
  1252     __ stbx(tmp1, R4_ARG2, R5_ARG3);
  1253     __ bind(l_2);
  1254     __ addic_(R5_ARG3, R5_ARG3, -1);
  1255     __ lbzx(tmp1, R3_ARG1, R5_ARG3);
  1256     __ bge(CCR0, l_1);
  1258     __ blr();
  1260     return start;
  1261   }
  1263   // Generate stub for disjoint short copy.  If "aligned" is true, the
  1264   // "from" and "to" addresses are assumed to be heapword aligned.
  1265   //
  1266   // Arguments for generated stub:
  1267   //      from:  R3_ARG1
  1268   //      to:    R4_ARG2
  1269   //  elm.count: R5_ARG3 treated as signed
  1270   //
  1271   // Strategy for aligned==true:
  1272   //
  1273   //  If length <= 9:
  1274   //     1. copy 2 elements at a time (l_6)
  1275   //     2. copy last element if original element count was odd (l_1)
  1276   //
  1277   //  If length > 9:
  1278   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  1279   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  1280   //     3. copy last element if one was left in step 2. (l_1)
  1281   //
  1282   //
  1283   // Strategy for aligned==false:
  1284   //
  1285   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
  1286   //                  can be unaligned (see comment below)
  1287   //
  1288   //  If length > 9:
  1289   //     1. continue with step 6. if the alignment of from and to mod 4
  1290   //        is different.
  1291   //     2. align from and to to 4 bytes by copying 1 element if necessary
  1292   //     3. at l_2 from and to are 4 byte aligned; continue with
  1293   //        5. if they cannot be aligned to 8 bytes because they have
  1294   //        got different alignment mod 8.
  1295   //     4. at this point we know that both, from and to, have the same
  1296   //        alignment mod 8, now copy one element if necessary to get
  1297   //        8 byte alignment of from and to.
  1298   //     5. copy 4 elements at a time until less than 4 elements are
  1299   //        left; depending on step 3. all load/stores are aligned or
  1300   //        either all loads or all stores are unaligned.
  1301   //     6. copy 2 elements at a time until less than 2 elements are
  1302   //        left (l_6); arriving here from step 1., there is a chance
  1303   //        that all accesses are unaligned.
  1304   //     7. copy last element if one was left in step 6. (l_1)
  1305   //
  1306   //  There are unaligned data accesses using integer load/store
  1307   //  instructions in this stub. POWER allows such accesses.
  1308   //
  1309   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
  1310   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
  1311   //  integer load/stores have good performance. Only unaligned
  1312   //  floating point load/stores can have poor performance.
  1313   //
  1314   //  TODO:
  1315   //
  1316   //  1. check if aligning the backbranch target of loops is beneficial
  1317   //
  1318   address generate_disjoint_short_copy(bool aligned, const char * name) {
  1319     StubCodeMark mark(this, "StubRoutines", name);
  1321     Register tmp1 = R6_ARG4;
  1322     Register tmp2 = R7_ARG5;
  1323     Register tmp3 = R8_ARG6;
  1324     Register tmp4 = R9_ARG7;
  1326     address start = __ emit_fd();
  1328       Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
  1329     // don't try anything fancy if arrays don't have many elements
  1330     __ li(tmp3, 0);
  1331     __ cmpwi(CCR0, R5_ARG3, 9);
  1332     __ ble(CCR0, l_6); // copy 2 at a time
  1334     if (!aligned) {
  1335       __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1336       __ andi_(tmp1, tmp1, 3);
  1337       __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
  1339       // At this point it is guaranteed that both, from and to have the same alignment mod 4.
  1341       // Copy 1 element if necessary to align to 4 bytes.
  1342       __ andi_(tmp1, R3_ARG1, 3);
  1343       __ beq(CCR0, l_2);
  1345       __ lhz(tmp2, 0, R3_ARG1);
  1346       __ addi(R3_ARG1, R3_ARG1, 2);
  1347       __ sth(tmp2, 0, R4_ARG2);
  1348       __ addi(R4_ARG2, R4_ARG2, 2);
  1349       __ addi(R5_ARG3, R5_ARG3, -1);
  1350       __ bind(l_2);
  1352       // At this point the positions of both, from and to, are at least 4 byte aligned.
  1354       // Copy 4 elements at a time.
  1355       // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
  1356       __ xorr(tmp2, R3_ARG1, R4_ARG2);
  1357       __ andi_(tmp1, tmp2, 7);
  1358       __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
  1360       // Copy a 2-element word if necessary to align to 8 bytes.
  1361       __ andi_(R0, R3_ARG1, 7);
  1362       __ beq(CCR0, l_7);
  1364       __ lwzx(tmp2, R3_ARG1, tmp3);
  1365       __ addi(R5_ARG3, R5_ARG3, -2);
  1366       __ stwx(tmp2, R4_ARG2, tmp3);
  1367       { // FasterArrayCopy
  1368         __ addi(R3_ARG1, R3_ARG1, 4);
  1369         __ addi(R4_ARG2, R4_ARG2, 4);
  1370       }
  1371     }
  1373     __ bind(l_7);
  1375     // Copy 4 elements at a time; either the loads or the stores can
  1376     // be unaligned if aligned == false.
  1378     { // FasterArrayCopy
  1379       __ cmpwi(CCR0, R5_ARG3, 15);
  1380       __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
  1382       __ srdi(tmp1, R5_ARG3, 4);
  1383       __ andi_(R5_ARG3, R5_ARG3, 15);
  1384       __ mtctr(tmp1);
  1386       __ bind(l_8);
  1387       // Use unrolled version for mass copying (copy 16 elements at a time).
  1388       // Load feeding store gets zero latency on Power6, however not on Power5.
  1389       // Therefore, the following sequence is made for the good of both.
  1390       __ ld(tmp1, 0, R3_ARG1);
  1391       __ ld(tmp2, 8, R3_ARG1);
  1392       __ ld(tmp3, 16, R3_ARG1);
  1393       __ ld(tmp4, 24, R3_ARG1);
  1394       __ std(tmp1, 0, R4_ARG2);
  1395       __ std(tmp2, 8, R4_ARG2);
  1396       __ std(tmp3, 16, R4_ARG2);
  1397       __ std(tmp4, 24, R4_ARG2);
  1398       __ addi(R3_ARG1, R3_ARG1, 32);
  1399       __ addi(R4_ARG2, R4_ARG2, 32);
  1400       __ bdnz(l_8);
  1401     }
  1402     __ bind(l_6);
  1404     // copy 2 elements at a time
  1405     { // FasterArrayCopy
  1406       __ cmpwi(CCR0, R5_ARG3, 2);
  1407       __ blt(CCR0, l_1);
  1408       __ srdi(tmp1, R5_ARG3, 1);
  1409       __ andi_(R5_ARG3, R5_ARG3, 1);
  1411       __ addi(R3_ARG1, R3_ARG1, -4);
  1412       __ addi(R4_ARG2, R4_ARG2, -4);
  1413       __ mtctr(tmp1);
  1415       __ bind(l_3);
  1416       __ lwzu(tmp2, 4, R3_ARG1);
  1417       __ stwu(tmp2, 4, R4_ARG2);
  1418       __ bdnz(l_3);
  1420       __ addi(R3_ARG1, R3_ARG1, 4);
  1421       __ addi(R4_ARG2, R4_ARG2, 4);
  1422     }
  1424     // do single element copy
  1425     __ bind(l_1);
  1426     __ cmpwi(CCR0, R5_ARG3, 0);
  1427     __ beq(CCR0, l_4);
  1429     { // FasterArrayCopy
  1430       __ mtctr(R5_ARG3);
  1431       __ addi(R3_ARG1, R3_ARG1, -2);
  1432       __ addi(R4_ARG2, R4_ARG2, -2);
  1434       __ bind(l_5);
  1435       __ lhzu(tmp2, 2, R3_ARG1);
  1436       __ sthu(tmp2, 2, R4_ARG2);
  1437       __ bdnz(l_5);
  1438     }
  1439     __ bind(l_4);
  1440     __ blr();
  1442     return start;
  1443   }
  1445   // Generate stub for conjoint short copy.  If "aligned" is true, the
  1446   // "from" and "to" addresses are assumed to be heapword aligned.
  1447   //
  1448   // Arguments for generated stub:
  1449   //      from:  R3_ARG1
  1450   //      to:    R4_ARG2
  1451   //      count: R5_ARG3 treated as signed
  1452   //
  1453   address generate_conjoint_short_copy(bool aligned, const char * name) {
  1454     StubCodeMark mark(this, "StubRoutines", name);
  1455     address start = __ emit_fd();
  1457     Register tmp1 = R6_ARG4;
  1458     Register tmp2 = R7_ARG5;
  1459     Register tmp3 = R8_ARG6;
  1461     address nooverlap_target = aligned ?
  1462         ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
  1463         ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
  1465     array_overlap_test(nooverlap_target, 1);
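           // array_overlap_test branches to the disjoint (forward-copy) stub above whenever
           // the source and destination regions do not overlap destructively; only truly
           // overlapping requests fall through to the backward copy below, which counts a
           // byte offset in tmp1 down from 2*count, copying one jshort per iteration.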
  1467     Label l_1, l_2;
  1468     __ sldi(tmp1, R5_ARG3, 1);
  1469     __ b(l_2);
  1470     __ bind(l_1);
  1471     __ sthx(tmp2, R4_ARG2, tmp1);
  1472     __ bind(l_2);
  1473     __ addic_(tmp1, tmp1, -2);
  1474     __ lhzx(tmp2, R3_ARG1, tmp1);
  1475     __ bge(CCR0, l_1);
  1477     __ blr();
  1479     return start;
  1480   }
  1482   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
  1483   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
  1484   //
  1485   // Arguments:
  1486   //      from:  R3_ARG1
  1487   //      to:    R4_ARG2
  1488   //      count: R5_ARG3 treated as signed
  1489   //
  1490   void generate_disjoint_int_copy_core(bool aligned) {
  1491     Register tmp1 = R6_ARG4;
  1492     Register tmp2 = R7_ARG5;
  1493     Register tmp3 = R8_ARG6;
  1494     Register tmp4 = R0;
  1496     Label l_1, l_2, l_3, l_4, l_5, l_6;
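           // Strategy: for more than 5 elements, optionally copy one int so that 'from' and 'to'
           // become 8-byte aligned, then move 8 ints (32 bytes) per iteration using four
           // 8-byte ld/std pairs, and finish the remaining 0..7 elements one at a time.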
  1497     // for short arrays, just do single element copy
  1498     __ li(tmp3, 0);
  1499     __ cmpwi(CCR0, R5_ARG3, 5);
  1500     __ ble(CCR0, l_2);
  1502     if (!aligned) {
  1503         // check if arrays have same alignment mod 8.
  1504         __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1505         __ andi_(R0, tmp1, 7);
  1506         // Not the same alignment mod 8, but ld and std only need to be 4-byte aligned.
  1507         __ bne(CCR0, l_4); // alignments differ mod 8 -> skip the alignment copy; the wide loop moves 2 ints per ld/std anyway
  1509         // copy 1 element to align from and to on an 8-byte boundary
  1510         __ andi_(R0, R3_ARG1, 7);
  1511         __ beq(CCR0, l_4);
  1513         __ lwzx(tmp2, R3_ARG1, tmp3);
  1514         __ addi(R5_ARG3, R5_ARG3, -1);
  1515         __ stwx(tmp2, R4_ARG2, tmp3);
  1516         { // FasterArrayCopy
  1517           __ addi(R3_ARG1, R3_ARG1, 4);
  1518           __ addi(R4_ARG2, R4_ARG2, 4);
  1519         }
  1520         __ bind(l_4);
  1521     }
  1523     { // FasterArrayCopy
  1524       __ cmpwi(CCR0, R5_ARG3, 7);
  1525       __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
  1527       __ srdi(tmp1, R5_ARG3, 3);
  1528       __ andi_(R5_ARG3, R5_ARG3, 7);
  1529       __ mtctr(tmp1);
  1531       __ bind(l_6);
  1532       // Use the unrolled version for mass copying (copy 8 elements at a time).
  1533       // A load feeding a store has zero latency on Power6, but not on Power5;
  1534       // the following sequence therefore performs well on both.
  1535       __ ld(tmp1, 0, R3_ARG1);
  1536       __ ld(tmp2, 8, R3_ARG1);
  1537       __ ld(tmp3, 16, R3_ARG1);
  1538       __ ld(tmp4, 24, R3_ARG1);
  1539       __ std(tmp1, 0, R4_ARG2);
  1540       __ std(tmp2, 8, R4_ARG2);
  1541       __ std(tmp3, 16, R4_ARG2);
  1542       __ std(tmp4, 24, R4_ARG2);
  1543       __ addi(R3_ARG1, R3_ARG1, 32);
  1544       __ addi(R4_ARG2, R4_ARG2, 32);
  1545       __ bdnz(l_6);
  1546     }
  1548     // copy 1 element at a time
  1549     __ bind(l_2);
  1550     __ cmpwi(CCR0, R5_ARG3, 0);
  1551     __ beq(CCR0, l_1);
  1553     { // FasterArrayCopy
  1554       __ mtctr(R5_ARG3);
  1555       __ addi(R3_ARG1, R3_ARG1, -4);
  1556       __ addi(R4_ARG2, R4_ARG2, -4);
  1558       __ bind(l_3);
  1559       __ lwzu(tmp2, 4, R3_ARG1);
  1560       __ stwu(tmp2, 4, R4_ARG2);
  1561       __ bdnz(l_3);
  1562     }
  1564     __ bind(l_1);
  1565     return;
  1566   }
  1568   // Generate stub for disjoint int copy.  If "aligned" is true, the
  1569   // "from" and "to" addresses are assumed to be heapword aligned.
  1570   //
  1571   // Arguments for generated stub:
  1572   //      from:  R3_ARG1
  1573   //      to:    R4_ARG2
  1574   //      count: R5_ARG3 treated as signed
  1575   //
  1576   address generate_disjoint_int_copy(bool aligned, const char * name) {
  1577     StubCodeMark mark(this, "StubRoutines", name);
  1578     address start = __ emit_fd();
  1579     generate_disjoint_int_copy_core(aligned);
  1580     __ blr();
  1581     return start;
  1582   }
  1584   // Generate core code for conjoint int copy (and oop copy on
  1585   // 32-bit).  If "aligned" is true, the "from" and "to" addresses
  1586   // are assumed to be heapword aligned.
  1587   //
  1588   // Arguments:
  1589   //      from:  R3_ARG1
  1590   //      to:    R4_ARG2
  1591   //      count: R5_ARG3 treated as signed
  1592   //
  1593   void generate_conjoint_int_copy_core(bool aligned) {
  1594     // Do reverse copy.  We assume the case of actual overlap is rare enough
  1595     // that we don't have to optimize it.
  1597     Label l_1, l_2, l_3, l_4, l_5, l_6;
  1599     Register tmp1 = R6_ARG4;
  1600     Register tmp2 = R7_ARG5;
  1601     Register tmp3 = R8_ARG6;
  1602     Register tmp4 = R0;
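           // Strategy: advance 'from' and 'to' past the end of the arrays and copy backwards,
           // 8 ints (32 bytes) per iteration while at least 8 elements remain, then the
           // remaining 0..7 elements one at a time.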
  1604     { // FasterArrayCopy
  1605       __ cmpwi(CCR0, R5_ARG3, 0);
  1606       __ beq(CCR0, l_6);
  1608       __ sldi(R5_ARG3, R5_ARG3, 2);
  1609       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
  1610       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
  1611       __ srdi(R5_ARG3, R5_ARG3, 2);
  1613       __ cmpwi(CCR0, R5_ARG3, 7);
  1614       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
  1616       __ srdi(tmp1, R5_ARG3, 3);
  1617       __ andi(R5_ARG3, R5_ARG3, 7);
  1618       __ mtctr(tmp1);
  1620       __ bind(l_4);
  1621       // Use the unrolled version for mass copying (copy 8 elements at a time).
  1622       // A load feeding a store has zero latency on Power6, but not on Power5;
  1623       // the following sequence therefore performs well on both.
  1624       __ addi(R3_ARG1, R3_ARG1, -32);
  1625       __ addi(R4_ARG2, R4_ARG2, -32);
  1626       __ ld(tmp4, 24, R3_ARG1);
  1627       __ ld(tmp3, 16, R3_ARG1);
  1628       __ ld(tmp2, 8, R3_ARG1);
  1629       __ ld(tmp1, 0, R3_ARG1);
  1630       __ std(tmp4, 24, R4_ARG2);
  1631       __ std(tmp3, 16, R4_ARG2);
  1632       __ std(tmp2, 8, R4_ARG2);
  1633       __ std(tmp1, 0, R4_ARG2);
  1634       __ bdnz(l_4);
  1636       __ cmpwi(CCR0, R5_ARG3, 0);
  1637       __ beq(CCR0, l_6);
  1639       __ bind(l_5);
  1640       __ mtctr(R5_ARG3);
  1641       __ bind(l_3);
  1642       __ lwz(R0, -4, R3_ARG1);
  1643       __ stw(R0, -4, R4_ARG2);
  1644       __ addi(R3_ARG1, R3_ARG1, -4);
  1645       __ addi(R4_ARG2, R4_ARG2, -4);
  1646       __ bdnz(l_3);
  1648       __ bind(l_6);
  1649     }
  1650   }
  1652   // Generate stub for conjoint int copy.  If "aligned" is true, the
  1653   // "from" and "to" addresses are assumed to be heapword aligned.
  1654   //
  1655   // Arguments for generated stub:
  1656   //      from:  R3_ARG1
  1657   //      to:    R4_ARG2
  1658   //      count: R5_ARG3 treated as signed
  1659   //
  1660   address generate_conjoint_int_copy(bool aligned, const char * name) {
  1661     StubCodeMark mark(this, "StubRoutines", name);
  1662     address start = __ emit_fd();
  1664     address nooverlap_target = aligned ?
  1665       ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
  1666       ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
  1668     array_overlap_test(nooverlap_target, 2);
  1670     generate_conjoint_int_copy_core(aligned);
  1672     __ blr();
  1674     return start;
  1675   }
  1677   // Generate core code for disjoint long copy (and oop copy on
  1678   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
  1679   // are assumed to be heapword aligned.
  1680   //
  1681   // Arguments:
  1682   //      from:  R3_ARG1
  1683   //      to:    R4_ARG2
  1684   //      count: R5_ARG3 treated as signed
  1685   //
  1686   void generate_disjoint_long_copy_core(bool aligned) {
  1687     Register tmp1 = R6_ARG4;
  1688     Register tmp2 = R7_ARG5;
  1689     Register tmp3 = R8_ARG6;
  1690     Register tmp4 = R0;
  1692     Label l_1, l_2, l_3, l_4;
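           // Strategy: copy 4 longs (32 bytes) per iteration while at least 4 elements remain,
           // then finish the remaining 0..3 elements one at a time.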
  1694     { // FasterArrayCopy
  1695       __ cmpwi(CCR0, R5_ARG3, 3);
  1696       __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
  1698       __ srdi(tmp1, R5_ARG3, 2);
  1699       __ andi_(R5_ARG3, R5_ARG3, 3);
  1700       __ mtctr(tmp1);
  1702       __ bind(l_4);
  1703       // Use the unrolled version for mass copying (copy 4 elements at a time).
  1704       // A load feeding a store has zero latency on Power6, but not on Power5;
  1705       // the following sequence therefore performs well on both.
  1706       __ ld(tmp1, 0, R3_ARG1);
  1707       __ ld(tmp2, 8, R3_ARG1);
  1708       __ ld(tmp3, 16, R3_ARG1);
  1709       __ ld(tmp4, 24, R3_ARG1);
  1710       __ std(tmp1, 0, R4_ARG2);
  1711       __ std(tmp2, 8, R4_ARG2);
  1712       __ std(tmp3, 16, R4_ARG2);
  1713       __ std(tmp4, 24, R4_ARG2);
  1714       __ addi(R3_ARG1, R3_ARG1, 32);
  1715       __ addi(R4_ARG2, R4_ARG2, 32);
  1716       __ bdnz(l_4);
  1717     }
  1719     // copy 1 element at a time
  1720     __ bind(l_3);
  1721     __ cmpwi(CCR0, R5_ARG3, 0);
  1722     __ beq(CCR0, l_1);
  1724     { // FasterArrayCopy
  1725       __ mtctr(R5_ARG3);
  1726       __ addi(R3_ARG1, R3_ARG1, -8);
  1727       __ addi(R4_ARG2, R4_ARG2, -8);
  1729       __ bind(l_2);
  1730       __ ldu(R0, 8, R3_ARG1);
  1731       __ stdu(R0, 8, R4_ARG2);
  1732       __ bdnz(l_2);
  1733     }
  1735     __ bind(l_1);
  1736   }
  1738   // Generate stub for disjoint long copy.  If "aligned" is true, the
  1739   // "from" and "to" addresses are assumed to be heapword aligned.
  1740   //
  1741   // Arguments for generated stub:
  1742   //      from:  R3_ARG1
  1743   //      to:    R4_ARG2
  1744   //      count: R5_ARG3 treated as signed
  1745   //
  1746   address generate_disjoint_long_copy(bool aligned, const char * name) {
  1747     StubCodeMark mark(this, "StubRoutines", name);
  1748     address start = __ emit_fd();
  1749     generate_disjoint_long_copy_core(aligned);
  1750     __ blr();
  1752     return start;
  1753   }
  1755   // Generate core code for conjoint long copy (and oop copy on
  1756   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
  1757   // are assumed to be heapword aligned.
  1758   //
  1759   // Arguments:
  1760   //      from:  R3_ARG1
  1761   //      to:    R4_ARG2
  1762   //      count: R5_ARG3 treated as signed
  1763   //
  1764   void generate_conjoint_long_copy_core(bool aligned) {
  1765     Register tmp1 = R6_ARG4;
  1766     Register tmp2 = R7_ARG5;
  1767     Register tmp3 = R8_ARG6;
  1768     Register tmp4 = R0;
  1770     Label l_1, l_2, l_3, l_4, l_5;
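           // Strategy: advance 'from' and 'to' past the end of the arrays and copy backwards,
           // 4 longs (32 bytes) per iteration, then the remaining 0..3 elements one at a time.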
  1772     __ cmpwi(CCR0, R5_ARG3, 0);
  1773     __ beq(CCR0, l_1);
  1775     { // FasterArrayCopy
  1776       __ sldi(R5_ARG3, R5_ARG3, 3);
  1777       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
  1778       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
  1779       __ srdi(R5_ARG3, R5_ARG3, 3);
  1781       __ cmpwi(CCR0, R5_ARG3, 3);
  1782       __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
  1784       __ srdi(tmp1, R5_ARG3, 2);
  1785       __ andi(R5_ARG3, R5_ARG3, 3);
  1786       __ mtctr(tmp1);
  1788       __ bind(l_4);
  1789       // Use the unrolled version for mass copying (copy 4 elements at a time).
  1790       // A load feeding a store has zero latency on Power6, but not on Power5;
  1791       // the following sequence therefore performs well on both.
  1792       __ addi(R3_ARG1, R3_ARG1, -32);
  1793       __ addi(R4_ARG2, R4_ARG2, -32);
  1794       __ ld(tmp4, 24, R3_ARG1);
  1795       __ ld(tmp3, 16, R3_ARG1);
  1796       __ ld(tmp2, 8, R3_ARG1);
  1797       __ ld(tmp1, 0, R3_ARG1);
  1798       __ std(tmp4, 24, R4_ARG2);
  1799       __ std(tmp3, 16, R4_ARG2);
  1800       __ std(tmp2, 8, R4_ARG2);
  1801       __ std(tmp1, 0, R4_ARG2);
  1802       __ bdnz(l_4);
  1804       __ cmpwi(CCR0, R5_ARG3, 0);
  1805       __ beq(CCR0, l_1);
  1807       __ bind(l_5);
  1808       __ mtctr(R5_ARG3);
  1809       __ bind(l_3);
  1810       __ ld(R0, -8, R3_ARG1);
  1811       __ std(R0, -8, R4_ARG2);
  1812       __ addi(R3_ARG1, R3_ARG1, -8);
  1813       __ addi(R4_ARG2, R4_ARG2, -8);
  1814       __ bdnz(l_3);
  1815     }
  1817     __ bind(l_1);
  1818   }
  1820   // Generate stub for conjoint long copy.  If "aligned" is true, the
  1821   // "from" and "to" addresses are assumed to be heapword aligned.
  1822   //
  1823   // Arguments for generated stub:
  1824   //      from:  R3_ARG1
  1825   //      to:    R4_ARG2
  1826   //      count: R5_ARG3 treated as signed
  1827   //
  1828   address generate_conjoint_long_copy(bool aligned, const char * name) {
  1829     StubCodeMark mark(this, "StubRoutines", name);
  1830     address start = __ emit_fd();
  1832     address nooverlap_target = aligned ?
  1833       ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
  1834       ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
  1836     array_overlap_test(nooverlap_target, 3);
  1837     generate_conjoint_long_copy_core(aligned);
  1839     __ blr();
  1841     return start;
  1842   }
  1844   // Generate stub for conjoint oop copy.  If "aligned" is true, the
  1845   // "from" and "to" addresses are assumed to be heapword aligned.
  1846   //
  1847   // Arguments for generated stub:
  1848   //      from:  R3_ARG1
  1849   //      to:    R4_ARG2
  1850   //      count: R5_ARG3 treated as signed
  1851   //      dest_uninitialized: G1 support
  1852   //
  1853   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
  1854     StubCodeMark mark(this, "StubRoutines", name);
  1856     address start = __ emit_fd();
  1858     address nooverlap_target = aligned ?
  1859       ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
  1860       ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
  1862     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
  1864     // Save arguments.
  1865     __ mr(R9_ARG7, R4_ARG2);
  1866     __ mr(R10_ARG8, R5_ARG3);
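           // With compressed oops each element is a 32-bit narrow oop, so the int copy core is
           // reused; otherwise elements are 64-bit oops and the long copy core is used. The
           // second argument of array_overlap_test (2 resp. 3) is log2 of the element size.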
  1868     if (UseCompressedOops) {
  1869       array_overlap_test(nooverlap_target, 2);
  1870       generate_conjoint_int_copy_core(aligned);
  1871     } else {
  1872       array_overlap_test(nooverlap_target, 3);
  1873       generate_conjoint_long_copy_core(aligned);
  1874     }
  1876     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
  1878     __ blr();
  1880     return start;
  1881   }
  1883   // Generate stub for disjoint oop copy.  If "aligned" is true, the
  1884   // "from" and "to" addresses are assumed to be heapword aligned.
  1885   //
  1886   // Arguments for generated stub:
  1887   //      from:  R3_ARG1
  1888   //      to:    R4_ARG2
  1889   //      count: R5_ARG3 treated as signed
  1890   //      dest_uninitialized: G1 support
  1891   //
  1892   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
  1893     StubCodeMark mark(this, "StubRoutines", name);
  1894     address start = __ emit_fd();
  1896     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
  1898     // Save some arguments; the disjoint copy cores destroy them.
  1899     // They are needed for the post barrier.
  1900     __ mr(R9_ARG7, R4_ARG2);
  1901     __ mr(R10_ARG8, R5_ARG3);
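           // As in the conjoint case: narrow (32-bit) oops reuse the int copy core,
           // full-width (64-bit) oops the long copy core.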
  1903     if (UseCompressedOops) {
  1904       generate_disjoint_int_copy_core(aligned);
  1905     } else {
  1906       generate_disjoint_long_copy_core(aligned);
  1907     }
  1909     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
  1911     __ blr();
  1913     return start;
  1914   }
  1916   void generate_arraycopy_stubs() {
  1917     // Note: the disjoint stubs must be generated first, some of
  1918     // the conjoint stubs use them.
  1920     // non-aligned disjoint versions
  1921     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  1922     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  1923     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
  1924     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  1925     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
  1926     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
  1928     // aligned disjoint versions
  1929     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
  1930     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
  1931     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
  1932     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
  1933     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
  1934     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
  1936     // non-aligned conjoint versions
  1937     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  1938     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(false, "jshort_arraycopy");
  1939     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(false, "jint_arraycopy");
  1940     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(false, "jlong_arraycopy");
  1941     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
  1942     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
  1944     // aligned conjoint versions
  1945     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
  1946     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
  1947     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
  1948     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
  1949     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
  1950     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
  1952     // fill routines
  1953     StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
  1954     StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
  1955     StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
  1956     StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
  1957     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  1958     StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");
  1959   }
  1961   // Safefetch stubs.
  1962   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
  1963     // safefetch signatures:
  1964     //   int      SafeFetch32(int*      adr, int      errValue);
  1965     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  1966     //
  1967     // arguments:
  1968     //   R3_ARG1 = adr
  1969     //   R4_ARG2 = errValue
  1970     //
  1971     // result:
  1972     //   R3_RET  = *adr or errValue
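           //
           // If the load below faults, the VM's signal handler is expected to recognize the
           // faulting pc (*fault_pc) and resume execution at *continuation_pc with R4_ARG2
           // still holding errValue; in either case the value left in R4_ARG2 is returned.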
  1974     StubCodeMark mark(this, "StubRoutines", name);
  1976     // Entry point, pc or function descriptor.
  1977     *entry = __ emit_fd();
  1979     // Load *adr into R4_ARG2, may fault.
  1980     *fault_pc = __ pc();
  1981     switch (size) {
  1982       case 4:
  1983         // int32_t, sign-extended
  1984         __ lwa(R4_ARG2, 0, R3_ARG1);
  1985         break;
  1986       case 8:
  1987         // int64_t
  1988         __ ld(R4_ARG2, 0, R3_ARG1);
  1989         break;
  1990       default:
  1991         ShouldNotReachHere();
  1992     }
  1994     // return errValue or *adr
  1995     *continuation_pc = __ pc();
  1996     __ mr(R3_RET, R4_ARG2);
  1997     __ blr();
  1998   }
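         // Roughly, the generated stubs behave like the following C sketch (illustration only;
         // the fault recovery is performed by the signal handler, not by C code):
         //
         //   int SafeFetch32(int* adr, int errValue) {
         //     int v = errValue;
         //     v = *adr;       // may fault; execution then resumes at the continuation pc
         //     return v;       // returned in R3_RET
         //   }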
  2000   // Initialization
  2001   void generate_initial() {
  2002     // Generates the stubs needed early on and initializes their entry points
  2004     // Entry points that exist on all platforms.
  2005     // Note: This is code that could be shared among different platforms; however,
  2006     // the benefit seems to be smaller than the disadvantage of having a
  2007     // much more complicated generator structure. See also the comment in
  2008     // stubRoutines.hpp.
  2010     StubRoutines::_forward_exception_entry          = generate_forward_exception();
  2011     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
  2012     StubRoutines::_catch_exception_entry            = generate_catch_exception();
  2013   }
  2015   void generate_all() {
  2016     // Generates all stubs and initializes the entry points
  2018     // These entry points require SharedInfo::stack0 to be set up in
  2019     // non-core builds
  2020     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
  2021     // Handle IncompatibleClassChangeError in itable stubs.
  2022     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
  2023     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
  2024     StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
  2026     StubRoutines::_handler_for_unsafe_access_entry         = generate_handler_for_unsafe_access();
  2028     // support for verify_oop (must happen after universe_init)
  2029     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
  2031     // arraycopy stubs used by compilers
  2032     generate_arraycopy_stubs();
  2034     // PPC uses stubs for safefetch.
  2035     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
  2036                                                        &StubRoutines::_safefetch32_fault_pc,
  2037                                                        &StubRoutines::_safefetch32_continuation_pc);
  2038     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
  2039                                                        &StubRoutines::_safefetchN_fault_pc,
  2040                                                        &StubRoutines::_safefetchN_continuation_pc);
  2041   }
  2043  public:
  2044   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  2045     // replace the standard masm with a special one:
  2046     _masm = new MacroAssembler(code);
  2047     if (all) {
  2048       generate_all();
  2049     } else {
  2050       generate_initial();
  2051     }
  2052   }
  2053 };
  2055 void StubGenerator_generate(CodeBuffer* code, bool all) {
  2056   StubGenerator g(code, all);
  2057 }