src/cpu/ppc/vm/stubGenerator_ppc.cpp

author       goetz
date         Thu, 06 Mar 2014 10:55:28 -0800
changeset    6511:31e80afe3fed
parent       6508:c4178a748df9
child        6512:fd1b9f02cc91
permissions  -rw-r--r--

8035647: PPC64: Support for elf v2 abi.
Summary: ELFv2 ABI used by the little endian PowerPC64 on Linux.
Reviewed-by: kvn
Contributed-by: asmundak@google.com

     1 /*
     2  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright 2012, 2013 SAP AG. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/assembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "interpreter/interpreter.hpp"
    30 #include "nativeInst_ppc.hpp"
    31 #include "oops/instanceOop.hpp"
    32 #include "oops/method.hpp"
    33 #include "oops/objArrayKlass.hpp"
    34 #include "oops/oop.inline.hpp"
    35 #include "prims/methodHandles.hpp"
    36 #include "runtime/frame.inline.hpp"
    37 #include "runtime/handles.inline.hpp"
    38 #include "runtime/sharedRuntime.hpp"
    39 #include "runtime/stubCodeGenerator.hpp"
    40 #include "runtime/stubRoutines.hpp"
    41 #include "utilities/top.hpp"
    42 #ifdef TARGET_OS_FAMILY_aix
    43 # include "thread_aix.inline.hpp"
    44 #endif
    45 #ifdef TARGET_OS_FAMILY_linux
    46 # include "thread_linux.inline.hpp"
    47 #endif
    48 #ifdef COMPILER2
    49 #include "opto/runtime.hpp"
    50 #endif
    52 #define __ _masm->
    54 #ifdef PRODUCT
    55 #define BLOCK_COMMENT(str) // nothing
    56 #else
    57 #define BLOCK_COMMENT(str) __ block_comment(str)
    58 #endif
    60 class StubGenerator: public StubCodeGenerator {
    61  private:
    63   // Call stubs are used to call Java from C
    64   //
    65   // Arguments:
    66   //
    67   //   R3  - call wrapper address     : address
    68   //   R4  - result                   : intptr_t*
    69   //   R5  - result type              : BasicType
    70   //   R6  - method                   : Method
    71   //   R7  - frame mgr entry point    : address
    72   //   R8  - parameter block          : intptr_t*
    73   //   R9  - parameter count in words : int
    74   //   R10 - thread                   : Thread*
    75   //
    76   address generate_call_stub(address& return_address) {
    77     // Setup a new c frame, copy java arguments, call frame manager or
    78     // native_entry, and process result.
    80     StubCodeMark mark(this, "StubRoutines", "call_stub");
    82     address start = __ function_entry();
    84     // some sanity checks
    85     assert((sizeof(frame::abi_minframe) % 16) == 0,           "unaligned");
    86     assert((sizeof(frame::abi_reg_args) % 16) == 0,           "unaligned");
    87     assert((sizeof(frame::spill_nonvolatiles) % 16) == 0,     "unaligned");
    88     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
    89     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
    91     Register r_arg_call_wrapper_addr        = R3;
    92     Register r_arg_result_addr              = R4;
    93     Register r_arg_result_type              = R5;
    94     Register r_arg_method                   = R6;
    95     Register r_arg_entry                    = R7;
    96     Register r_arg_thread                   = R10;
    98     Register r_temp                         = R24;
    99     Register r_top_of_arguments_addr        = R25;
   100     Register r_entryframe_fp                = R26;
   102     {
   103       // Stack on entry to call_stub:
   104       //
   105       //      F1      [C_FRAME]
   106       //              ...
   108       Register r_arg_argument_addr          = R8;
   109       Register r_arg_argument_count         = R9;
   110       Register r_frame_alignment_in_bytes   = R27;
   111       Register r_argument_addr              = R28;
   112       Register r_argumentcopy_addr          = R29;
   113       Register r_argument_size_in_bytes     = R30;
   114       Register r_frame_size                 = R23;
   116       Label arguments_copied;
   118       // Save LR/CR to caller's C_FRAME.
   119       __ save_LR_CR(R0);
   121       // Zero extend arg_argument_count.
   122       __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
    124       // Save non-volatile GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
   125       __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
   127       // Keep copy of our frame pointer (caller's SP).
   128       __ mr(r_entryframe_fp, R1_SP);
   130       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
   131       // Push ENTRY_FRAME including arguments:
   132       //
   133       //      F0      [TOP_IJAVA_FRAME_ABI]
   134       //              alignment (optional)
   135       //              [outgoing Java arguments]
   136       //              [ENTRY_FRAME_LOCALS]
   137       //      F1      [C_FRAME]
   138       //              ...
   140       // calculate frame size
   142       // unaligned size of arguments
   143       __ sldi(r_argument_size_in_bytes,
   144                   r_arg_argument_count, Interpreter::logStackElementSize);
   145       // arguments alignment (max 1 slot)
   146       // FIXME: use round_to() here
   147       __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
   148       __ sldi(r_frame_alignment_in_bytes,
   149               r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
   151       // size = unaligned size of arguments + top abi's size
   152       __ addi(r_frame_size, r_argument_size_in_bytes,
   153               frame::top_ijava_frame_abi_size);
   154       // size += arguments alignment
   155       __ add(r_frame_size,
   156              r_frame_size, r_frame_alignment_in_bytes);
   157       // size += size of call_stub locals
   158       __ addi(r_frame_size,
   159               r_frame_size, frame::entry_frame_locals_size);
   161       // push ENTRY_FRAME
   162       __ push_frame(r_frame_size, r_temp);
   164       // initialize call_stub locals (step 1)
   165       __ std(r_arg_call_wrapper_addr,
   166              _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
   167       __ std(r_arg_result_addr,
   168              _entry_frame_locals_neg(result_address), r_entryframe_fp);
   169       __ std(r_arg_result_type,
   170              _entry_frame_locals_neg(result_type), r_entryframe_fp);
   171       // we will save arguments_tos_address later
   174       BLOCK_COMMENT("Copy Java arguments");
   175       // copy Java arguments
   177       // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
   178       // FIXME: why not simply use SP+frame::top_ijava_frame_size?
   179       __ addi(r_top_of_arguments_addr,
   180               R1_SP, frame::top_ijava_frame_abi_size);
   181       __ add(r_top_of_arguments_addr,
   182              r_top_of_arguments_addr, r_frame_alignment_in_bytes);
   184       // any arguments to copy?
   185       __ cmpdi(CCR0, r_arg_argument_count, 0);
   186       __ beq(CCR0, arguments_copied);
   188       // prepare loop and copy arguments in reverse order
   189       {
   190         // init CTR with arg_argument_count
   191         __ mtctr(r_arg_argument_count);
    193         // let r_argumentcopy_addr point to the last outgoing Java argument position
   194         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
   196         // let r_argument_addr point to last incoming java argument
   197         __ add(r_argument_addr,
   198                    r_arg_argument_addr, r_argument_size_in_bytes);
   199         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
   201         // now loop while CTR > 0 and copy arguments
   202         {
   203           Label next_argument;
   204           __ bind(next_argument);
   206           __ ld(r_temp, 0, r_argument_addr);
   207           // argument_addr--;
   208           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
   209           __ std(r_temp, 0, r_argumentcopy_addr);
   210           // argumentcopy_addr++;
   211           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
   213           __ bdnz(next_argument);
   214         }
   215       }
   217       // Arguments copied, continue.
   218       __ bind(arguments_copied);
   219     }
   221     {
   222       BLOCK_COMMENT("Call frame manager or native entry.");
   223       // Call frame manager or native entry.
   224       Register r_new_arg_entry = R14_state;
   225       assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
   226                                  r_arg_method, r_arg_thread);
   228       __ mr(r_new_arg_entry, r_arg_entry);
   230       // Register state on entry to frame manager / native entry:
   231       //
   232       //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
   233       //   R19_method  -  Method
   234       //   R16_thread  -  JavaThread*
   236       // Tos must point to last argument - element_size.
   237       const Register tos = R17_tos;
   238       __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
   240       // initialize call_stub locals (step 2)
   241       // now save tos as arguments_tos_address
   242       __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
   244       // load argument registers for call
   245       __ mr(R19_method, r_arg_method);
   246       __ mr(R16_thread, r_arg_thread);
   247       assert(tos != r_arg_method, "trashed r_arg_method");
   248       assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
   250       // Set R15_prev_state to 0 for simplifying checks in callee.
   251       __ li(R15_prev_state, 0);
   253       // Stack on entry to frame manager / native entry:
   254       //
   255       //      F0      [TOP_IJAVA_FRAME_ABI]
   256       //              alignment (optional)
   257       //              [outgoing Java arguments]
   258       //              [ENTRY_FRAME_LOCALS]
   259       //      F1      [C_FRAME]
   260       //              ...
   261       //
   263       // global toc register
   264       __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
   266       // Load narrow oop base.
   267       __ reinit_heapbase(R30, R11_scratch1);
    269       // Remember the senderSP so the interpreter can pop c2i arguments off the stack
   270       // when called via a c2i.
   272       // Pass initial_caller_sp to framemanager.
   273       __ mr(R21_tmp1, R1_SP);
   275       // Do a light-weight C-call here, r_new_arg_entry holds the address
   276       // of the interpreter entry point (frame manager or native entry)
   277       // and save runtime-value of LR in return_address.
   278       assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
   279              "trashed r_new_arg_entry");
   280       return_address = __ call_stub(r_new_arg_entry);
   281     }
   283     {
   284       BLOCK_COMMENT("Returned from frame manager or native entry.");
   285       // Returned from frame manager or native entry.
   286       // Now pop frame, process result, and return to caller.
   288       // Stack on exit from frame manager / native entry:
   289       //
   290       //      F0      [ABI]
   291       //              ...
   292       //              [ENTRY_FRAME_LOCALS]
   293       //      F1      [C_FRAME]
   294       //              ...
   295       //
   296       // Just pop the topmost frame ...
   297       //
   299       Label ret_is_object;
   300       Label ret_is_long;
   301       Label ret_is_float;
   302       Label ret_is_double;
   304       Register r_entryframe_fp = R30;
   305       Register r_lr            = R7_ARG5;
   306       Register r_cr            = R8_ARG6;
   308       // Reload some volatile registers which we've spilled before the call
   309       // to frame manager / native entry.
   310       // Access all locals via frame pointer, because we know nothing about
   311       // the topmost frame's size.
   312       __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
   313       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
   314       __ ld(r_arg_result_addr,
   315             _entry_frame_locals_neg(result_address), r_entryframe_fp);
   316       __ ld(r_arg_result_type,
   317             _entry_frame_locals_neg(result_type), r_entryframe_fp);
   318       __ ld(r_cr, _abi(cr), r_entryframe_fp);
   319       __ ld(r_lr, _abi(lr), r_entryframe_fp);
   321       // pop frame and restore non-volatiles, LR and CR
   322       __ mr(R1_SP, r_entryframe_fp);
   323       __ mtcr(r_cr);
   324       __ mtlr(r_lr);
   326       // Store result depending on type. Everything that is not
   327       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
   328       __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
   329       __ cmpwi(CCR1, r_arg_result_type, T_LONG);
   330       __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
   331       __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
   333       // restore non-volatile registers
   334       __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
   337       // Stack on exit from call_stub:
   338       //
   339       //      0       [C_FRAME]
   340       //              ...
   341       //
   342       //  no call_stub frames left.
   344       // All non-volatiles have been restored at this point!!
   345       assert(R3_RET == R3, "R3_RET should be R3");
   347       __ beq(CCR0, ret_is_object);
   348       __ beq(CCR1, ret_is_long);
   349       __ beq(CCR5, ret_is_float);
   350       __ beq(CCR6, ret_is_double);
   352       // default:
   353       __ stw(R3_RET, 0, r_arg_result_addr);
   354       __ blr(); // return to caller
   356       // case T_OBJECT:
   357       __ bind(ret_is_object);
   358       __ std(R3_RET, 0, r_arg_result_addr);
   359       __ blr(); // return to caller
   361       // case T_LONG:
   362       __ bind(ret_is_long);
   363       __ std(R3_RET, 0, r_arg_result_addr);
   364       __ blr(); // return to caller
   366       // case T_FLOAT:
   367       __ bind(ret_is_float);
   368       __ stfs(F1_RET, 0, r_arg_result_addr);
   369       __ blr(); // return to caller
   371       // case T_DOUBLE:
   372       __ bind(ret_is_double);
   373       __ stfd(F1_RET, 0, r_arg_result_addr);
   374       __ blr(); // return to caller
   375     }
   377     return start;
   378   }
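  // Editor's sketch (not part of this changeset): conceptually, the VM invokes the
  // stub generated above through a plain function pointer whose parameters mirror
  // R3..R10 as documented at the top of generate_call_stub. The typedef below is
  // illustrative only; the parameter types are assumptions taken from the register
  // comments, not the VM's exact declaration.
  typedef void (*call_stub_sketch_fn)(address   call_wrapper,    // R3
                                      intptr_t* result,          // R4
                                      int       result_type,     // R5  (BasicType)
                                      Method*   method,          // R6
                                      address   entry_point,     // R7
                                      intptr_t* parameters,      // R8
                                      int       parameter_words, // R9
                                      Thread*   thread);         // R10
  // Usage would look roughly like:
  //   ((call_stub_sketch_fn)StubRoutines::call_stub())(w, r, type, m, e, p, n, t);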
   380   // Return point for a Java call if there's an exception thrown in
   381   // Java code.  The exception is caught and transformed into a
   382   // pending exception stored in JavaThread that can be tested from
   383   // within the VM.
   384   //
   385   address generate_catch_exception() {
   386     StubCodeMark mark(this, "StubRoutines", "catch_exception");
   388     address start = __ pc();
   390     // Registers alive
   391     //
   392     //  R16_thread
   393     //  R3_ARG1 - address of pending exception
   394     //  R4_ARG2 - return address in call stub
   396     const Register exception_file = R21_tmp1;
   397     const Register exception_line = R22_tmp2;
   399     __ load_const(exception_file, (void*)__FILE__);
   400     __ load_const(exception_line, (void*)__LINE__);
   402     __ std(R3_ARG1, thread_(pending_exception));
   403     // store into `char *'
   404     __ std(exception_file, thread_(exception_file));
   405     // store into `int'
   406     __ stw(exception_line, thread_(exception_line));
   408     // complete return to VM
   409     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
   411     __ mtlr(R4_ARG2);
   412     // continue in call stub
   413     __ blr();
   415     return start;
   416   }
   418   // Continuation point for runtime calls returning with a pending
   419   // exception.  The pending exception check happened in the runtime
   420   // or native call stub.  The pending exception in Thread is
   421   // converted into a Java-level exception.
   422   //
   423   address generate_forward_exception() {
   424     StubCodeMark mark(this, "StubRoutines", "forward_exception");
   425     address start = __ pc();
   427 #if !defined(PRODUCT)
   428     if (VerifyOops) {
   429       // Get pending exception oop.
   430       __ ld(R3_ARG1,
   431                 in_bytes(Thread::pending_exception_offset()),
   432                 R16_thread);
   433       // Make sure that this code is only executed if there is a pending exception.
   434       {
   435         Label L;
   436         __ cmpdi(CCR0, R3_ARG1, 0);
   437         __ bne(CCR0, L);
   438         __ stop("StubRoutines::forward exception: no pending exception (1)");
   439         __ bind(L);
   440       }
   441       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
   442     }
   443 #endif
   445     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
   446     __ save_LR_CR(R4_ARG2);
   447     __ push_frame_reg_args(0, R0);
   448     // Find exception handler.
   449     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
   450                      SharedRuntime::exception_handler_for_return_address),
   451                     R16_thread,
   452                     R4_ARG2);
   453     // Copy handler's address.
   454     __ mtctr(R3_RET);
   455     __ pop_frame();
   456     __ restore_LR_CR(R0);
   458     // Set up the arguments for the exception handler:
   459     //  - R3_ARG1: exception oop
   460     //  - R4_ARG2: exception pc.
   462     // Load pending exception oop.
   463     __ ld(R3_ARG1,
   464               in_bytes(Thread::pending_exception_offset()),
   465               R16_thread);
   467     // The exception pc is the return address in the caller.
   468     // Must load it into R4_ARG2.
   469     __ mflr(R4_ARG2);
   471 #ifdef ASSERT
   472     // Make sure exception is set.
   473     {
   474       Label L;
   475       __ cmpdi(CCR0, R3_ARG1, 0);
   476       __ bne(CCR0, L);
   477       __ stop("StubRoutines::forward exception: no pending exception (2)");
   478       __ bind(L);
   479     }
   480 #endif
   482     // Clear the pending exception.
   483     __ li(R0, 0);
   484     __ std(R0,
   485                in_bytes(Thread::pending_exception_offset()),
   486                R16_thread);
   487     // Jump to exception handler.
   488     __ bctr();
   490     return start;
   491   }
   493 #undef __
   494 #define __ masm->
   495   // Continuation point for throwing of implicit exceptions that are
   496   // not handled in the current activation. Fabricates an exception
   497   // oop and initiates normal exception dispatching in this
   498   // frame. Only callee-saved registers are preserved (through the
   499   // normal register window / RegisterMap handling).  If the compiler
   500   // needs all registers to be preserved between the fault point and
   501   // the exception handler then it must assume responsibility for that
   502   // in AbstractCompiler::continuation_for_implicit_null_exception or
   503   // continuation_for_implicit_division_by_zero_exception. All other
   504   // implicit exceptions (e.g., NullPointerException or
   505   // AbstractMethodError on entry) are either at call sites or
   506   // otherwise assume that stack unwinding will be initiated, so
   507   // caller saved registers were assumed volatile in the compiler.
   508   //
   509   // Note that we generate only this stub into a RuntimeStub, because
   510   // it needs to be properly traversed and ignored during GC, so we
   511   // change the meaning of the "__" macro within this method.
   512   //
   513   // Note: the routine set_pc_not_at_call_for_caller in
   514   // SharedRuntime.cpp requires that this code be generated into a
   515   // RuntimeStub.
   516   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
   517                                    Register arg1 = noreg, Register arg2 = noreg) {
   518     CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
   519     MacroAssembler* masm = new MacroAssembler(&code);
   521     OopMapSet* oop_maps  = new OopMapSet();
   522     int frame_size_in_bytes = frame::abi_reg_args_size;
   523     OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
   525     StubCodeMark mark(this, "StubRoutines", "throw_exception");
   527     address start = __ pc();
   529     __ save_LR_CR(R11_scratch1);
   531     // Push a frame.
   532     __ push_frame_reg_args(0, R11_scratch1);
   534     address frame_complete_pc = __ pc();
   536     if (restore_saved_exception_pc) {
   537       __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
   538     }
   540     // Note that we always have a runtime stub frame on the top of
   541     // stack by this point. Remember the offset of the instruction
   542     // whose address will be moved to R11_scratch1.
   543     address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
   545     __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
   547     __ mr(R3_ARG1, R16_thread);
   548     if (arg1 != noreg) {
   549       __ mr(R4_ARG2, arg1);
   550     }
   551     if (arg2 != noreg) {
   552       __ mr(R5_ARG3, arg2);
   553     }
   554 #if defined(ABI_ELFv2)
   555     __ call_c(runtime_entry, relocInfo::none);
   556 #else
   557     __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
   558 #endif
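    // Editor's note (illustrative, not from the original change): under the ELFv1
    // ABI a C function pointer designates a function descriptor, roughly
    //   struct { address entry; address toc; address env; };
    // so the call must go through the descriptor's entry field, while under ELFv2
    // (little-endian PPC64 Linux) runtime_entry already is the code address and
    // can be called directly, which is what the #if above selects.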
   560     // Set an oopmap for the call site.
   561     oop_maps->add_gc_map((int)(gc_map_pc - start), map);
   563     __ reset_last_Java_frame();
   565 #ifdef ASSERT
   566     // Make sure that this code is only executed if there is a pending
   567     // exception.
   568     {
   569       Label L;
   570       __ ld(R0,
   571                 in_bytes(Thread::pending_exception_offset()),
   572                 R16_thread);
   573       __ cmpdi(CCR0, R0, 0);
   574       __ bne(CCR0, L);
   575       __ stop("StubRoutines::throw_exception: no pending exception");
   576       __ bind(L);
   577     }
   578 #endif
   580     // Pop frame.
   581     __ pop_frame();
   583     __ restore_LR_CR(R11_scratch1);
   585     __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
   586     __ mtctr(R11_scratch1);
   587     __ bctr();
   589     // Create runtime stub with OopMap.
   590     RuntimeStub* stub =
   591       RuntimeStub::new_runtime_stub(name, &code,
   592                                     /*frame_complete=*/ (int)(frame_complete_pc - start),
   593                                     frame_size_in_bytes/wordSize,
   594                                     oop_maps,
   595                                     false);
   596     return stub->entry_point();
   597   }
   598 #undef __
   599 #define __ _masm->
   601   //  Generate G1 pre-write barrier for array.
   602   //
   603   //  Input:
   604   //     from     - register containing src address (only needed for spilling)
   605   //     to       - register containing starting address
   606   //     count    - register containing element count
   607   //     tmp      - scratch register
   608   //
   609   //  Kills:
   610   //     nothing
   611   //
   612   void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
   613     BarrierSet* const bs = Universe::heap()->barrier_set();
   614     switch (bs->kind()) {
   615       case BarrierSet::G1SATBCT:
   616       case BarrierSet::G1SATBCTLogging:
    617         // With G1, don't generate the call if we statically know that the target is uninitialized
   618         if (!dest_uninitialized) {
   619           const int spill_slots = 4 * wordSize;
   620           const int frame_size  = frame::abi_reg_args_size + spill_slots;
   621           Label filtered;
   623           // Is marking active?
   624           if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
   625             __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
   626           } else {
   627             guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
   628             __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
   629           }
   630           __ cmpdi(CCR0, Rtmp1, 0);
   631           __ beq(CCR0, filtered);
   633           __ save_LR_CR(R0);
   634           __ push_frame_reg_args(spill_slots, R0);
   635           __ std(from,  frame_size - 1 * wordSize, R1_SP);
   636           __ std(to,    frame_size - 2 * wordSize, R1_SP);
   637           __ std(count, frame_size - 3 * wordSize, R1_SP);
   639           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
   641           __ ld(from,  frame_size - 1 * wordSize, R1_SP);
   642           __ ld(to,    frame_size - 2 * wordSize, R1_SP);
   643           __ ld(count, frame_size - 3 * wordSize, R1_SP);
   644           __ pop_frame();
   645           __ restore_LR_CR(R0);
   647           __ bind(filtered);
   648         }
   649         break;
   650       case BarrierSet::CardTableModRef:
   651       case BarrierSet::CardTableExtension:
   652       case BarrierSet::ModRef:
   653         break;
   654       default:
   655         ShouldNotReachHere();
   656     }
   657   }
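  // Editor's sketch (not part of this changeset): at run time the G1 branch above
  // reduces to a single filter in front of the shared pre-barrier call.
  static bool g1_array_pre_barrier_needed_sketch(unsigned char satb_active_flag,
                                                 bool dest_uninitialized) {
    // The call is emitted only for !dest_uninitialized and is skipped (label
    // 'filtered') whenever SATB marking is inactive for the current thread.
    return !dest_uninitialized && satb_active_flag != 0;
  }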
   659   //  Generate CMS/G1 post-write barrier for array.
   660   //
   661   //  Input:
   662   //     addr     - register containing starting address
   663   //     count    - register containing element count
   664   //     tmp      - scratch register
   665   //
   666   //  The input registers and R0 are overwritten.
   667   //
   668   void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) {
   669     BarrierSet* const bs = Universe::heap()->barrier_set();
   671     switch (bs->kind()) {
   672       case BarrierSet::G1SATBCT:
   673       case BarrierSet::G1SATBCTLogging:
   674         {
   675           if (branchToEnd) {
   676             __ save_LR_CR(R0);
   677             // We need this frame only to spill LR.
   678             __ push_frame_reg_args(0, R0);
   679             __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
   680             __ pop_frame();
   681             __ restore_LR_CR(R0);
   682           } else {
   683             // Tail call: fake call from stub caller by branching without linking.
   684             address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
   685             __ mr_if_needed(R3_ARG1, addr);
   686             __ mr_if_needed(R4_ARG2, count);
   687             __ load_const(R11, entry_point, R0);
   688             __ call_c_and_return_to_caller(R11);
   689           }
   690         }
   691         break;
   692       case BarrierSet::CardTableModRef:
   693       case BarrierSet::CardTableExtension:
   694         {
   695           Label Lskip_loop, Lstore_loop;
   696           if (UseConcMarkSweepGC) {
   697             // TODO PPC port: contribute optimization / requires shared changes
   698             __ release();
   699           }
   701           CardTableModRefBS* const ct = (CardTableModRefBS*)bs;
   702           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
   703           assert_different_registers(addr, count, tmp);
   705           __ sldi(count, count, LogBytesPerHeapOop);
   706           __ addi(count, count, -BytesPerHeapOop);
   707           __ add(count, addr, count);
   708           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
   709           __ srdi(addr, addr, CardTableModRefBS::card_shift);
   710           __ srdi(count, count, CardTableModRefBS::card_shift);
   711           __ subf(count, addr, count);
   712           assert_different_registers(R0, addr, count, tmp);
   713           __ load_const(tmp, (address)ct->byte_map_base);
   714           __ addic_(count, count, 1);
   715           __ beq(CCR0, Lskip_loop);
   716           __ li(R0, 0);
   717           __ mtctr(count);
   718           // Byte store loop
   719           __ bind(Lstore_loop);
   720           __ stbx(R0, tmp, addr);
   721           __ addi(addr, addr, 1);
   722           __ bdnz(Lstore_loop);
   723           __ bind(Lskip_loop);
   725           if (!branchToEnd) __ blr();
   726         }
   727       break;
   728       case BarrierSet::ModRef:
   729         if (!branchToEnd) __ blr();
   730         break;
   731       default:
   732         ShouldNotReachHere();
   733     }
   734   }
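  // Editor's sketch (not part of this changeset): the card-table branch above
  // dirties one card byte per 2^card_shift bytes covered by the stored oop range
  // [addr, addr + count*heapOopSize). Equivalent C logic, with byte_map_base,
  // card_shift and the zero "dirty" value taken from the code above:
  static void dirty_cards_sketch(unsigned char* byte_map_base,
                                 uintptr_t first_byte, uintptr_t last_byte,
                                 int card_shift) {
    uintptr_t first_card = first_byte >> card_shift;
    uintptr_t last_card  = last_byte  >> card_shift;
    for (uintptr_t card = first_card; card <= last_card; card++) {
      byte_map_base[card] = 0;  // 0 == dirty, as stored by the stbx(R0, ...) above
    }
  }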
   736   // Support for void zero_words_aligned8(HeapWord* to, size_t count)
   737   //
   738   // Arguments:
   739   //   to:
   740   //   count:
   741   //
   742   // Destroys:
   743   //
   744   address generate_zero_words_aligned8() {
   745     StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
   747     // Implemented as in ClearArray.
   748     address start = __ function_entry();
   750     Register base_ptr_reg   = R3_ARG1; // tohw (needs to be 8b aligned)
   751     Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
   752     Register tmp1_reg       = R5_ARG3;
   753     Register tmp2_reg       = R6_ARG4;
   754     Register zero_reg       = R7_ARG5;
   756     // Procedure for large arrays (uses data cache block zero instruction).
   757     Label dwloop, fast, fastloop, restloop, lastdword, done;
   758     int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
   759     int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
   761     // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
   762     __ dcbtst(base_ptr_reg);                    // Indicate write access to first cache line ...
   763     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if number of dwords is even.
   764     __ srdi_(tmp1_reg, cnt_dwords_reg, 1);      // number of double dwords
   765     __ load_const_optimized(zero_reg, 0L);      // Use as zero register.
   767     __ cmpdi(CCR1, tmp2_reg, 0);                // cnt_dwords even?
   768     __ beq(CCR0, lastdword);                    // size <= 1
   769     __ mtctr(tmp1_reg);                         // Speculatively preload counter for rest loop (>0).
   770     __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
   771     __ neg(tmp1_reg, base_ptr_reg);             // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
   773     __ blt(CCR0, restloop);                     // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
   774     __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
   776     __ beq(CCR0, fast);                         // already 128byte aligned
   777     __ mtctr(tmp1_reg);                         // Set ctr to hit 128byte boundary (0<ctr<cnt).
   778     __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
   780     // Clear in first cache line dword-by-dword if not already 128byte aligned.
   781     __ bind(dwloop);
   782       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
   783       __ addi(base_ptr_reg, base_ptr_reg, 8);
   784     __ bdnz(dwloop);
   786     // clear 128byte blocks
   787     __ bind(fast);
   788     __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
   789     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if rest even
   791     __ mtctr(tmp1_reg);                         // load counter
   792     __ cmpdi(CCR1, tmp2_reg, 0);                // rest even?
   793     __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
   795     __ bind(fastloop);
   796       __ dcbz(base_ptr_reg);                    // Clear 128byte aligned block.
   797       __ addi(base_ptr_reg, base_ptr_reg, cl_size);
   798     __ bdnz(fastloop);
   800     //__ dcbtst(base_ptr_reg);                  // Indicate write access to last cache line.
   801     __ beq(CCR0, lastdword);                    // rest<=1
   802     __ mtctr(tmp1_reg);                         // load counter
   804     // Clear rest.
   805     __ bind(restloop);
   806       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
   807       __ std(zero_reg, 8, base_ptr_reg);        // Clear 8byte aligned block.
   808       __ addi(base_ptr_reg, base_ptr_reg, 16);
   809     __ bdnz(restloop);
   811     __ bind(lastdword);
   812     __ beq(CCR1, done);
   813     __ std(zero_reg, 0, base_ptr_reg);
   814     __ bind(done);
   815     __ blr();                                   // return
   817     return start;
   818   }
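  // Editor's sketch (not part of this changeset): ignoring the small-array and
  // min_dcbz fast-path checks, the stub above is equivalent to clearing dwords up
  // to the next cache-line boundary, zeroing whole lines (dcbz), and then clearing
  // the remainder:
  static void zero_words_aligned8_sketch(unsigned long* to, size_t cnt_dwords,
                                         size_t cl_dwords /* cache line size / 8 */) {
    while (cnt_dwords > 0 && ((uintptr_t)to & (cl_dwords * 8 - 1)) != 0) {
      *to++ = 0; cnt_dwords--;                    // dword loop up to the line boundary
    }
    while (cnt_dwords >= cl_dwords) {             // dcbz equivalent: clear a full line
      for (size_t i = 0; i < cl_dwords; i++) to[i] = 0;
      to += cl_dwords; cnt_dwords -= cl_dwords;
    }
    while (cnt_dwords-- > 0) *to++ = 0;           // rest loop
  }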
   820   // The following routine generates a subroutine to throw an asynchronous
   821   // UnknownError when an unsafe access gets a fault that could not be
   822   // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
   823   //
   824   address generate_handler_for_unsafe_access() {
   825     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
   826     address start = __ function_entry();
   827     __ unimplemented("StubRoutines::handler_for_unsafe_access", 93);
   828     return start;
   829   }
   831 #if !defined(PRODUCT)
   832   // Wrapper which calls oopDesc::is_oop_or_null()
   833   // Only called by MacroAssembler::verify_oop
   834   static void verify_oop_helper(const char* message, oop o) {
   835     if (!o->is_oop_or_null()) {
   836       fatal(message);
   837     }
   838     ++ StubRoutines::_verify_oop_count;
   839   }
   840 #endif
   842   // Return address of code to be called from code generated by
   843   // MacroAssembler::verify_oop.
   844   //
   845   // Don't generate, rather use C++ code.
   846   address generate_verify_oop() {
   847     StubCodeMark mark(this, "StubRoutines", "verify_oop");
   849     // this is actually a `FunctionDescriptor*'.
   850     address start = 0;
   852 #if !defined(PRODUCT)
   853     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
   854 #endif
   856     return start;
   857   }
   859   // Fairer handling of safepoints for native methods.
   860   //
   861   // Generate code which reads from the polling page. This special handling is needed as the
   862   // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
   863   // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
   864   // to read from the safepoint polling page.
   865   address generate_load_from_poll() {
   866     StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
   867     address start = __ function_entry();
    868     __ unimplemented("StubRoutines::load_from_poll", 95);  // TODO PPC port
   869     return start;
   870   }
   872   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
   873   //
    874   // The code is implemented (ported from sparc) as we believe it benefits JVM98; however,
    875   // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
   876   //
   877   // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
   878   // for turning on loop predication optimization, and hence the behavior of "array range check"
   879   // and "loop invariant check" could be influenced, which potentially boosted JVM98.
   880   //
   881   // Generate stub for disjoint short fill. If "aligned" is true, the
   882   // "to" address is assumed to be heapword aligned.
   883   //
   884   // Arguments for generated stub:
   885   //   to:    R3_ARG1
   886   //   value: R4_ARG2
   887   //   count: R5_ARG3 treated as signed
   888   //
   889   address generate_fill(BasicType t, bool aligned, const char* name) {
   890     StubCodeMark mark(this, "StubRoutines", name);
   891     address start = __ function_entry();
   893     const Register to    = R3_ARG1;   // source array address
   894     const Register value = R4_ARG2;   // fill value
   895     const Register count = R5_ARG3;   // elements count
   896     const Register temp  = R6_ARG4;   // temp register
   898     //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
   900     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
   901     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
   903     int shift = -1;
   904     switch (t) {
   905        case T_BYTE:
   906         shift = 2;
   907         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
   908         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
   909         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
   910         __ blt(CCR0, L_fill_elements);
   911         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
   912         break;
   913        case T_SHORT:
   914         shift = 1;
   915         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
   916         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
   917         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
   918         __ blt(CCR0, L_fill_elements);
   919         break;
   920       case T_INT:
   921         shift = 0;
   922         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
   923         __ blt(CCR0, L_fill_4_bytes);
   924         break;
   925       default: ShouldNotReachHere();
   926     }
   928     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
   929       // Align source address at 4 bytes address boundary.
   930       if (t == T_BYTE) {
   931         // One byte misalignment happens only for byte arrays.
   932         __ andi_(temp, to, 1);
   933         __ beq(CCR0, L_skip_align1);
   934         __ stb(value, 0, to);
   935         __ addi(to, to, 1);
   936         __ addi(count, count, -1);
   937         __ bind(L_skip_align1);
   938       }
   939       // Two bytes misalignment happens only for byte and short (char) arrays.
   940       __ andi_(temp, to, 2);
   941       __ beq(CCR0, L_skip_align2);
   942       __ sth(value, 0, to);
   943       __ addi(to, to, 2);
   944       __ addi(count, count, -(1 << (shift - 1)));
   945       __ bind(L_skip_align2);
   946     }
   948     if (!aligned) {
   949       // Align to 8 bytes, we know we are 4 byte aligned to start.
   950       __ andi_(temp, to, 7);
   951       __ beq(CCR0, L_fill_32_bytes);
   952       __ stw(value, 0, to);
   953       __ addi(to, to, 4);
   954       __ addi(count, count, -(1 << shift));
   955       __ bind(L_fill_32_bytes);
   956     }
   958     __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
   959     // Clone bytes int->long as above.
   960     __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
   962     Label L_check_fill_8_bytes;
   963     // Fill 32-byte chunks.
   964     __ subf_(count, temp, count);
   965     __ blt(CCR0, L_check_fill_8_bytes);
   967     Label L_fill_32_bytes_loop;
   968     __ align(32);
   969     __ bind(L_fill_32_bytes_loop);
   971     __ std(value, 0, to);
   972     __ std(value, 8, to);
   973     __ subf_(count, temp, count);           // Update count.
   974     __ std(value, 16, to);
   975     __ std(value, 24, to);
   977     __ addi(to, to, 32);
   978     __ bge(CCR0, L_fill_32_bytes_loop);
   980     __ bind(L_check_fill_8_bytes);
   981     __ add_(count, temp, count);
   982     __ beq(CCR0, L_exit);
   983     __ addic_(count, count, -(2 << shift));
   984     __ blt(CCR0, L_fill_4_bytes);
   986     //
   987     // Length is too short, just fill 8 bytes at a time.
   988     //
   989     Label L_fill_8_bytes_loop;
   990     __ bind(L_fill_8_bytes_loop);
   991     __ std(value, 0, to);
   992     __ addic_(count, count, -(2 << shift));
   993     __ addi(to, to, 8);
   994     __ bge(CCR0, L_fill_8_bytes_loop);
   996     // Fill trailing 4 bytes.
   997     __ bind(L_fill_4_bytes);
   998     __ andi_(temp, count, 1<<shift);
   999     __ beq(CCR0, L_fill_2_bytes);
  1001     __ stw(value, 0, to);
  1002     if (t == T_BYTE || t == T_SHORT) {
  1003       __ addi(to, to, 4);
  1004       // Fill trailing 2 bytes.
  1005       __ bind(L_fill_2_bytes);
  1006       __ andi_(temp, count, 1<<(shift-1));
  1007       __ beq(CCR0, L_fill_byte);
  1008       __ sth(value, 0, to);
  1009       if (t == T_BYTE) {
  1010         __ addi(to, to, 2);
  1011         // Fill trailing byte.
  1012         __ bind(L_fill_byte);
  1013         __ andi_(count, count, 1);
  1014         __ beq(CCR0, L_exit);
  1015         __ stb(value, 0, to);
  1016       } else {
   1017         __ bind(L_fill_byte);
   1018       }
   1019     } else {
   1020       __ bind(L_fill_2_bytes);
   1021     }
  1022     __ bind(L_exit);
  1023     __ blr();
  1025     // Handle copies less than 8 bytes. Int is handled elsewhere.
  1026     if (t == T_BYTE) {
  1027       __ bind(L_fill_elements);
  1028       Label L_fill_2, L_fill_4;
  1029       __ andi_(temp, count, 1);
  1030       __ beq(CCR0, L_fill_2);
  1031       __ stb(value, 0, to);
  1032       __ addi(to, to, 1);
  1033       __ bind(L_fill_2);
  1034       __ andi_(temp, count, 2);
  1035       __ beq(CCR0, L_fill_4);
  1036       __ stb(value, 0, to);
  1037       __ stb(value, 0, to);
  1038       __ addi(to, to, 2);
  1039       __ bind(L_fill_4);
  1040       __ andi_(temp, count, 4);
  1041       __ beq(CCR0, L_exit);
  1042       __ stb(value, 0, to);
  1043       __ stb(value, 1, to);
  1044       __ stb(value, 2, to);
  1045       __ stb(value, 3, to);
   1046       __ blr();
   1047     }
  1049     if (t == T_SHORT) {
  1050       Label L_fill_2;
  1051       __ bind(L_fill_elements);
  1052       __ andi_(temp, count, 1);
  1053       __ beq(CCR0, L_fill_2);
  1054       __ sth(value, 0, to);
  1055       __ addi(to, to, 2);
  1056       __ bind(L_fill_2);
  1057       __ andi_(temp, count, 2);
  1058       __ beq(CCR0, L_exit);
  1059       __ sth(value, 0, to);
  1060       __ sth(value, 2, to);
   1061       __ blr();
   1062     }
   1063     return start;
   1064   }
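  // Editor's sketch (not part of this changeset): the rldimi sequences in
  // generate_fill replicate the fill value across a 64-bit register before the
  // wide store loops. Assuming 'v' holds the element value zero-extended:
  static unsigned long long replicate_fill_value_sketch(unsigned long long v,
                                                        int element_bits) {
    if (element_bits == 8)  v |= v << 8;   //  8 bit -> 16 bit (rldimi v,v,8,48)
    if (element_bits <= 16) v |= v << 16;  // 16 bit -> 32 bit (rldimi v,v,16,32)
    v |= v << 32;                          // 32 bit -> 64 bit (rldimi v,v,32,0)
    return v;
  }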
  1067   // Generate overlap test for array copy stubs.
  1068   //
  1069   // Input:
  1070   //   R3_ARG1    -  from
  1071   //   R4_ARG2    -  to
  1072   //   R5_ARG3    -  element count
  1073   //
  1074   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
  1075     Register tmp1 = R6_ARG4;
  1076     Register tmp2 = R7_ARG5;
  1078     Label l_overlap;
  1079 #ifdef ASSERT
  1080     __ srdi_(tmp2, R5_ARG3, 31);
  1081     __ asm_assert_eq("missing zero extend", 0xAFFE);
  1082 #endif
  1084     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
  1085     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
  1086     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
  1087     __ cmpld(CCR1, tmp1, tmp2);
  1088     __ crand(/*CCR0 lt*/0, /*CCR1 lt*/4+0, /*CCR0 lt*/0);
  1089     __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
  1091     // need to copy forwards
  1092     if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
  1093       __ b(no_overlap_target);
  1094     } else {
  1095       __ load_const(tmp1, no_overlap_target, tmp2);
  1096       __ mtctr(tmp1);
   1097       __ bctr();
   1098     }
   1100     __ bind(l_overlap);
   1101     // need to copy backwards
   1102   }
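  // Editor's sketch (not part of this changeset): the generated test above is
  // equivalent to the following C condition; a backward copy is needed only when
  // the destination lies inside the source range:
  static bool arraycopy_needs_backward_copy_sketch(const void* from, const void* to,
                                                   size_t count, int log2_elem_size) {
    size_t size_in_bytes = count << log2_elem_size;
    // Unsigned comparisons, as with cmpld above: from < to && (to - from) < size.
    return (const char*)from < (const char*)to &&
           (size_t)((const char*)to - (const char*)from) < size_in_bytes;
  }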
  1104   // The guideline in the implementations of generate_disjoint_xxx_copy
  1105   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
  1106   // single instructions, but to avoid alignment interrupts (see subsequent
  1107   // comment). Furthermore, we try to minimize misaligned access, even
  1108   // though they cause no alignment interrupt.
  1109   //
  1110   // In Big-Endian mode, the PowerPC architecture requires implementations to
  1111   // handle automatically misaligned integer halfword and word accesses,
  1112   // word-aligned integer doubleword accesses, and word-aligned floating-point
  1113   // accesses. Other accesses may or may not generate an Alignment interrupt
  1114   // depending on the implementation.
  1115   // Alignment interrupt handling may require on the order of hundreds of cycles,
  1116   // so every effort should be made to avoid misaligned memory values.
  1117   //
  1118   //
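  // Editor's note (illustrative, not part of this changeset): the copy stubs below
  // use an xor trick to decide whether 'from' and 'to' can be brought to the same
  // 8-byte alignment by copying a few leading elements:
  static bool same_alignment_mod8_sketch(const void* from, const void* to) {
    // Zero iff both addresses share the same alignment mod 8.
    return (((uintptr_t)from ^ (uintptr_t)to) & 7) == 0;
  }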
  1119   // Generate stub for disjoint byte copy.  If "aligned" is true, the
  1120   // "from" and "to" addresses are assumed to be heapword aligned.
  1121   //
  1122   // Arguments for generated stub:
  1123   //      from:  R3_ARG1
  1124   //      to:    R4_ARG2
  1125   //      count: R5_ARG3 treated as signed
  1126   //
  1127   address generate_disjoint_byte_copy(bool aligned, const char * name) {
  1128     StubCodeMark mark(this, "StubRoutines", name);
  1129     address start = __ function_entry();
  1131     Register tmp1 = R6_ARG4;
  1132     Register tmp2 = R7_ARG5;
  1133     Register tmp3 = R8_ARG6;
  1134     Register tmp4 = R9_ARG7;
  1137     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
  1138     // Don't try anything fancy if arrays don't have many elements.
  1139     __ li(tmp3, 0);
  1140     __ cmpwi(CCR0, R5_ARG3, 17);
  1141     __ ble(CCR0, l_6); // copy 4 at a time
  1143     if (!aligned) {
  1144       __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1145       __ andi_(tmp1, tmp1, 3);
  1146       __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
  1148       // Copy elements if necessary to align to 4 bytes.
  1149       __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
  1150       __ andi_(tmp1, tmp1, 3);
  1151       __ beq(CCR0, l_2);
  1153       __ subf(R5_ARG3, tmp1, R5_ARG3);
  1154       __ bind(l_9);
  1155       __ lbz(tmp2, 0, R3_ARG1);
  1156       __ addic_(tmp1, tmp1, -1);
  1157       __ stb(tmp2, 0, R4_ARG2);
  1158       __ addi(R3_ARG1, R3_ARG1, 1);
  1159       __ addi(R4_ARG2, R4_ARG2, 1);
  1160       __ bne(CCR0, l_9);
   1162       __ bind(l_2);
   1163     }
  1165     // copy 8 elements at a time
  1166     __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
  1167     __ andi_(tmp1, tmp2, 7);
  1168     __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
  1170     // copy a 2-element word if necessary to align to 8 bytes
  1171     __ andi_(R0, R3_ARG1, 7);
  1172     __ beq(CCR0, l_7);
  1174     __ lwzx(tmp2, R3_ARG1, tmp3);
  1175     __ addi(R5_ARG3, R5_ARG3, -4);
  1176     __ stwx(tmp2, R4_ARG2, tmp3);
  1177     { // FasterArrayCopy
  1178       __ addi(R3_ARG1, R3_ARG1, 4);
   1179       __ addi(R4_ARG2, R4_ARG2, 4);
   1180     }
  1181     __ bind(l_7);
  1183     { // FasterArrayCopy
  1184       __ cmpwi(CCR0, R5_ARG3, 31);
  1185       __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
  1187       __ srdi(tmp1, R5_ARG3, 5);
  1188       __ andi_(R5_ARG3, R5_ARG3, 31);
  1189       __ mtctr(tmp1);
  1191       __ bind(l_8);
   1192       // Use unrolled version for mass copying (copy 32 elements at a time)
  1193       // Load feeding store gets zero latency on Power6, however not on Power5.
  1194       // Therefore, the following sequence is made for the good of both.
  1195       __ ld(tmp1, 0, R3_ARG1);
  1196       __ ld(tmp2, 8, R3_ARG1);
  1197       __ ld(tmp3, 16, R3_ARG1);
  1198       __ ld(tmp4, 24, R3_ARG1);
  1199       __ std(tmp1, 0, R4_ARG2);
  1200       __ std(tmp2, 8, R4_ARG2);
  1201       __ std(tmp3, 16, R4_ARG2);
  1202       __ std(tmp4, 24, R4_ARG2);
  1203       __ addi(R3_ARG1, R3_ARG1, 32);
  1204       __ addi(R4_ARG2, R4_ARG2, 32);
   1205       __ bdnz(l_8);
   1206     }
  1208     __ bind(l_6);
  1210     // copy 4 elements at a time
  1211     __ cmpwi(CCR0, R5_ARG3, 4);
  1212     __ blt(CCR0, l_1);
  1213     __ srdi(tmp1, R5_ARG3, 2);
  1214     __ mtctr(tmp1); // is > 0
  1215     __ andi_(R5_ARG3, R5_ARG3, 3);
  1217     { // FasterArrayCopy
  1218       __ addi(R3_ARG1, R3_ARG1, -4);
  1219       __ addi(R4_ARG2, R4_ARG2, -4);
  1220       __ bind(l_3);
  1221       __ lwzu(tmp2, 4, R3_ARG1);
  1222       __ stwu(tmp2, 4, R4_ARG2);
  1223       __ bdnz(l_3);
  1224       __ addi(R3_ARG1, R3_ARG1, 4);
   1225       __ addi(R4_ARG2, R4_ARG2, 4);
   1226     }
  1228     // do single element copy
  1229     __ bind(l_1);
  1230     __ cmpwi(CCR0, R5_ARG3, 0);
  1231     __ beq(CCR0, l_4);
  1233     { // FasterArrayCopy
  1234       __ mtctr(R5_ARG3);
  1235       __ addi(R3_ARG1, R3_ARG1, -1);
  1236       __ addi(R4_ARG2, R4_ARG2, -1);
  1238       __ bind(l_5);
  1239       __ lbzu(tmp2, 1, R3_ARG1);
  1240       __ stbu(tmp2, 1, R4_ARG2);
   1241       __ bdnz(l_5);
   1242     }
  1244     __ bind(l_4);
  1245     __ blr();
   1247     return start;
   1248   }
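  // Editor's sketch (not part of this changeset): the unrolled l_8 loop above
  // corresponds to the following C shape; all four loads are issued before the
  // stores, which suits both Power5 and Power6 as noted in the comments:
  static void copy_32_byte_chunks_sketch(const unsigned long* from,
                                         unsigned long* to, size_t chunks) {
    while (chunks-- > 0) {
      unsigned long a = from[0], b = from[1], c = from[2], d = from[3];
      to[0] = a; to[1] = b; to[2] = c; to[3] = d;
      from += 4; to += 4;
    }
  }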
  1250   // Generate stub for conjoint byte copy.  If "aligned" is true, the
  1251   // "from" and "to" addresses are assumed to be heapword aligned.
  1252   //
  1253   // Arguments for generated stub:
  1254   //      from:  R3_ARG1
  1255   //      to:    R4_ARG2
  1256   //      count: R5_ARG3 treated as signed
  1257   //
  1258   address generate_conjoint_byte_copy(bool aligned, const char * name) {
  1259     StubCodeMark mark(this, "StubRoutines", name);
  1260     address start = __ function_entry();
  1262     Register tmp1 = R6_ARG4;
  1263     Register tmp2 = R7_ARG5;
  1264     Register tmp3 = R8_ARG6;
  1266 #if defined(ABI_ELFv2)
  1267      address nooverlap_target = aligned ?
  1268        StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
  1269        StubRoutines::jbyte_disjoint_arraycopy();
  1270 #else
  1271     address nooverlap_target = aligned ?
  1272       ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
  1273       ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
  1274 #endif
  1276     array_overlap_test(nooverlap_target, 0);
  1277     // Do reverse copy. We assume the case of actual overlap is rare enough
  1278     // that we don't have to optimize it.
  1279     Label l_1, l_2;
  1281     __ b(l_2);
  1282     __ bind(l_1);
  1283     __ stbx(tmp1, R4_ARG2, R5_ARG3);
  1284     __ bind(l_2);
  1285     __ addic_(R5_ARG3, R5_ARG3, -1);
  1286     __ lbzx(tmp1, R3_ARG1, R5_ARG3);
  1287     __ bge(CCR0, l_1);
  1289     __ blr();
   1291     return start;
   1292   }
  1294   // Generate stub for disjoint short copy.  If "aligned" is true, the
  1295   // "from" and "to" addresses are assumed to be heapword aligned.
  1296   //
  1297   // Arguments for generated stub:
  1298   //      from:  R3_ARG1
  1299   //      to:    R4_ARG2
  1300   //  elm.count: R5_ARG3 treated as signed
  1301   //
  1302   // Strategy for aligned==true:
  1303   //
  1304   //  If length <= 9:
  1305   //     1. copy 2 elements at a time (l_6)
  1306   //     2. copy last element if original element count was odd (l_1)
  1307   //
  1308   //  If length > 9:
  1309   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  1310   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  1311   //     3. copy last element if one was left in step 2. (l_1)
  1312   //
  1313   //
  1314   // Strategy for aligned==false:
  1315   //
  1316   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
  1317   //                  can be unaligned (see comment below)
  1318   //
  1319   //  If length > 9:
  1320   //     1. continue with step 6. if the alignment of from and to mod 4
  1321   //        is different.
  1322   //     2. align from and to to 4 bytes by copying 1 element if necessary
  1323   //     3. at l_2 from and to are 4 byte aligned; continue with
  1324   //        5. if they cannot be aligned to 8 bytes because they have
  1325   //        got different alignment mod 8.
  1326   //     4. at this point we know that both, from and to, have the same
  1327   //        alignment mod 8, now copy one element if necessary to get
  1328   //        8 byte alignment of from and to.
  1329   //     5. copy 4 elements at a time until less than 4 elements are
  1330   //        left; depending on step 3. all load/stores are aligned or
  1331   //        either all loads or all stores are unaligned.
  1332   //     6. copy 2 elements at a time until less than 2 elements are
  1333   //        left (l_6); arriving here from step 1., there is a chance
  1334   //        that all accesses are unaligned.
  1335   //     7. copy last element if one was left in step 6. (l_1)
  1336   //
  1337   //  There are unaligned data accesses using integer load/store
  1338   //  instructions in this stub. POWER allows such accesses.
  1339   //
  1340   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
  1341   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
  1342   //  integer load/stores have good performance. Only unaligned
  1343   //  floating point load/stores can have poor performance.
  1344   //
  1345   //  TODO:
  1346   //
  1347   //  1. check if aligning the backbranch target of loops is beneficial
  1348   //
  1349   address generate_disjoint_short_copy(bool aligned, const char * name) {
  1350     StubCodeMark mark(this, "StubRoutines", name);
  1352     Register tmp1 = R6_ARG4;
  1353     Register tmp2 = R7_ARG5;
  1354     Register tmp3 = R8_ARG6;
  1355     Register tmp4 = R9_ARG7;
  1357     address start = __ function_entry();
   1359     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
  1360     // don't try anything fancy if arrays don't have many elements
  1361     __ li(tmp3, 0);
  1362     __ cmpwi(CCR0, R5_ARG3, 9);
  1363     __ ble(CCR0, l_6); // copy 2 at a time
  1365     if (!aligned) {
  1366       __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1367       __ andi_(tmp1, tmp1, 3);
  1368       __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
  1370       // At this point it is guaranteed that both, from and to have the same alignment mod 4.
  1372       // Copy 1 element if necessary to align to 4 bytes.
  1373       __ andi_(tmp1, R3_ARG1, 3);
  1374       __ beq(CCR0, l_2);
  1376       __ lhz(tmp2, 0, R3_ARG1);
  1377       __ addi(R3_ARG1, R3_ARG1, 2);
  1378       __ sth(tmp2, 0, R4_ARG2);
  1379       __ addi(R4_ARG2, R4_ARG2, 2);
  1380       __ addi(R5_ARG3, R5_ARG3, -1);
  1381       __ bind(l_2);
  1383       // At this point the positions of both, from and to, are at least 4 byte aligned.
  1385       // Copy 4 elements at a time.
  1386       // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
  1387       __ xorr(tmp2, R3_ARG1, R4_ARG2);
  1388       __ andi_(tmp1, tmp2, 7);
  1389       __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
  1391       // Copy a 2-element word if necessary to align to 8 bytes.
  1392       __ andi_(R0, R3_ARG1, 7);
  1393       __ beq(CCR0, l_7);
  1395       __ lwzx(tmp2, R3_ARG1, tmp3);
  1396       __ addi(R5_ARG3, R5_ARG3, -2);
  1397       __ stwx(tmp2, R4_ARG2, tmp3);
  1398       { // FasterArrayCopy
  1399         __ addi(R3_ARG1, R3_ARG1, 4);
  1400         __ addi(R4_ARG2, R4_ARG2, 4);
  1401       }
  1402     }
  1404     __ bind(l_7);
  1406     // Copy 4 elements at a time; either the loads or the stores can
  1407     // be unaligned if aligned == false.
  1409     { // FasterArrayCopy
  1410       __ cmpwi(CCR0, R5_ARG3, 15);
  1411       __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
  1413       __ srdi(tmp1, R5_ARG3, 4);
  1414       __ andi_(R5_ARG3, R5_ARG3, 15);
  1415       __ mtctr(tmp1);
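             // ctr = count / 16 iterations of the 32-byte loop below;
             // R5_ARG3 keeps count % 16 for the 2-element and 1-element tails.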
  1417       __ bind(l_8);
  1418       // Use unrolled version for mass copying (copy 16 elements at a time).
  1419       // A load feeding a store has zero latency on Power6, but not on Power5.
  1420       // The following sequence therefore performs well on both.
  1421       __ ld(tmp1, 0, R3_ARG1);
  1422       __ ld(tmp2, 8, R3_ARG1);
  1423       __ ld(tmp3, 16, R3_ARG1);
  1424       __ ld(tmp4, 24, R3_ARG1);
  1425       __ std(tmp1, 0, R4_ARG2);
  1426       __ std(tmp2, 8, R4_ARG2);
  1427       __ std(tmp3, 16, R4_ARG2);
  1428       __ std(tmp4, 24, R4_ARG2);
  1429       __ addi(R3_ARG1, R3_ARG1, 32);
  1430       __ addi(R4_ARG2, R4_ARG2, 32);
  1431       __ bdnz(l_8);
  1432     }
  1433     __ bind(l_6);
  1435     // copy 2 elements at a time
  1436     { // FasterArrayCopy
  1437       __ cmpwi(CCR0, R5_ARG3, 2);
  1438       __ blt(CCR0, l_1);
  1439       __ srdi(tmp1, R5_ARG3, 1);
  1440       __ andi_(R5_ARG3, R5_ARG3, 1);
  1442       __ addi(R3_ARG1, R3_ARG1, -4);
  1443       __ addi(R4_ARG2, R4_ARG2, -4);
  1444       __ mtctr(tmp1);
  1446       __ bind(l_3);
  1447       __ lwzu(tmp2, 4, R3_ARG1);
  1448       __ stwu(tmp2, 4, R4_ARG2);
  1449       __ bdnz(l_3);
  1451       __ addi(R3_ARG1, R3_ARG1, 4);
  1452       __ addi(R4_ARG2, R4_ARG2, 4);
  1453     }
  1455     // do single element copy
  1456     __ bind(l_1);
  1457     __ cmpwi(CCR0, R5_ARG3, 0);
  1458     __ beq(CCR0, l_4);
  1460     { // FasterArrayCopy
  1461       __ mtctr(R5_ARG3);
  1462       __ addi(R3_ARG1, R3_ARG1, -2);
  1463       __ addi(R4_ARG2, R4_ARG2, -2);
  1465       __ bind(l_5);
  1466       __ lhzu(tmp2, 2, R3_ARG1);
  1467       __ sthu(tmp2, 2, R4_ARG2);
  1468       __ bdnz(l_5);
  1469     }
  1470     __ bind(l_4);
  1471     __ blr();
  1473     return start;
  1474   }
  1476   // Generate stub for conjoint short copy.  If "aligned" is true, the
  1477   // "from" and "to" addresses are assumed to be heapword aligned.
  1478   //
  1479   // Arguments for generated stub:
  1480   //      from:  R3_ARG1
  1481   //      to:    R4_ARG2
  1482   //      count: R5_ARG3 treated as signed
  1483   //
  1484   address generate_conjoint_short_copy(bool aligned, const char * name) {
  1485     StubCodeMark mark(this, "StubRoutines", name);
  1486     address start = __ function_entry();
  1488     Register tmp1 = R6_ARG4;
  1489     Register tmp2 = R7_ARG5;
  1490     Register tmp3 = R8_ARG6;
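           // With the ELFv2 ABI a stub address is the code entry point itself; with
           // the ELFv1 ABI a StubRoutines entry is a function descriptor, so the
           // actual code entry must be read from the descriptor.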
  1492 #if defined(ABI_ELFv2)
  1493     address nooverlap_target = aligned ?
  1494         StubRoutines::arrayof_jshort_disjoint_arraycopy() :
  1495         StubRoutines::jshort_disjoint_arraycopy();
  1496 #else
  1497     address nooverlap_target = aligned ?
  1498         ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
  1499         ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
  1500 #endif
  1502     array_overlap_test(nooverlap_target, 1);
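           // If the regions do not overlap, array_overlap_test branches to the
           // disjoint stub (second argument = log2 of the element size). Otherwise
           // fall through to a simple backward copy: tmp1 starts at the byte offset
           // of the last element (2 * count - 2 after the first decrement) and
           // walks down to 0.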
  1504     Label l_1, l_2;
  1505     __ sldi(tmp1, R5_ARG3, 1);
  1506     __ b(l_2);
  1507     __ bind(l_1);
  1508     __ sthx(tmp2, R4_ARG2, tmp1);
  1509     __ bind(l_2);
  1510     __ addic_(tmp1, tmp1, -2);
  1511     __ lhzx(tmp2, R3_ARG1, tmp1);
  1512     __ bge(CCR0, l_1);
  1514     __ blr();
  1516     return start;
  1517   }
  1519   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
  1520   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
  1521   //
  1522   // Arguments:
  1523   //      from:  R3_ARG1
  1524   //      to:    R4_ARG2
  1525   //      count: R5_ARG3 treated as signed
  1526   //
  1527   void generate_disjoint_int_copy_core(bool aligned) {
  1528     Register tmp1 = R6_ARG4;
  1529     Register tmp2 = R7_ARG5;
  1530     Register tmp3 = R8_ARG6;
  1531     Register tmp4 = R0;
  1533     Label l_1, l_2, l_3, l_4, l_5, l_6;
  1534     // for short arrays, just do single element copy
  1535     __ li(tmp3, 0);
  1536     __ cmpwi(CCR0, R5_ARG3, 5);
  1537     __ ble(CCR0, l_2);
  1539     if (!aligned) {
  1540         // check if arrays have same alignment mod 8.
  1541         __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1542         __ andi_(R0, tmp1, 7);
  1543         // Not the same alignment, but ld and std only need 4 byte alignment.
  1544         __ bne(CCR0, l_4); // different alignment mod 8 -> skip the 8 byte alignment step
  1546         // copy 1 element to align to and from on an 8 byte boundary
  1547         __ andi_(R0, R3_ARG1, 7);
  1548         __ beq(CCR0, l_4);
  1550         __ lwzx(tmp2, R3_ARG1, tmp3);
  1551         __ addi(R5_ARG3, R5_ARG3, -1);
  1552         __ stwx(tmp2, R4_ARG2, tmp3);
  1553         { // FasterArrayCopy
  1554           __ addi(R3_ARG1, R3_ARG1, 4);
  1555           __ addi(R4_ARG2, R4_ARG2, 4);
  1556         }
  1557         __ bind(l_4);
  1558     }
  1560     { // FasterArrayCopy
  1561       __ cmpwi(CCR0, R5_ARG3, 7);
  1562       __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
  1564       __ srdi(tmp1, R5_ARG3, 3);
  1565       __ andi_(R5_ARG3, R5_ARG3, 7);
  1566       __ mtctr(tmp1);
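             // ctr = count / 8 iterations of the 32-byte loop below;
             // R5_ARG3 keeps count % 8 for the single-element tail.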
  1568       __ bind(l_6);
  1569       // Use unrolled version for mass copying (copy 8 elements at a time).
  1570       // A load feeding a store has zero latency on Power6, but not on Power5.
  1571       // The following sequence therefore performs well on both.
  1572       __ ld(tmp1, 0, R3_ARG1);
  1573       __ ld(tmp2, 8, R3_ARG1);
  1574       __ ld(tmp3, 16, R3_ARG1);
  1575       __ ld(tmp4, 24, R3_ARG1);
  1576       __ std(tmp1, 0, R4_ARG2);
  1577       __ std(tmp2, 8, R4_ARG2);
  1578       __ std(tmp3, 16, R4_ARG2);
  1579       __ std(tmp4, 24, R4_ARG2);
  1580       __ addi(R3_ARG1, R3_ARG1, 32);
  1581       __ addi(R4_ARG2, R4_ARG2, 32);
  1582       __ bdnz(l_6);
  1583     }
  1585     // copy 1 element at a time
  1586     __ bind(l_2);
  1587     __ cmpwi(CCR0, R5_ARG3, 0);
  1588     __ beq(CCR0, l_1);
  1590     { // FasterArrayCopy
  1591       __ mtctr(R5_ARG3);
  1592       __ addi(R3_ARG1, R3_ARG1, -4);
  1593       __ addi(R4_ARG2, R4_ARG2, -4);
  1595       __ bind(l_3);
  1596       __ lwzu(tmp2, 4, R3_ARG1);
  1597       __ stwu(tmp2, 4, R4_ARG2);
  1598       __ bdnz(l_3);
  1599     }
  1601     __ bind(l_1);
  1602     return;
  1603   }
  1605   // Generate stub for disjoint int copy.  If "aligned" is true, the
  1606   // "from" and "to" addresses are assumed to be heapword aligned.
  1607   //
  1608   // Arguments for generated stub:
  1609   //      from:  R3_ARG1
  1610   //      to:    R4_ARG2
  1611   //      count: R5_ARG3 treated as signed
  1612   //
  1613   address generate_disjoint_int_copy(bool aligned, const char * name) {
  1614     StubCodeMark mark(this, "StubRoutines", name);
  1615     address start = __ function_entry();
  1616     generate_disjoint_int_copy_core(aligned);
  1617     __ blr();
  1618     return start;
  1619   }
  1621   // Generate core code for conjoint int copy (and oop copy on
  1622   // 32-bit).  If "aligned" is true, the "from" and "to" addresses
  1623   // are assumed to be heapword aligned.
  1624   //
  1625   // Arguments:
  1626   //      from:  R3_ARG1
  1627   //      to:    R4_ARG2
  1628   //      count: R5_ARG3 treated as signed
  1629   //
  1630   void generate_conjoint_int_copy_core(bool aligned) {
  1631     // Do reverse copy.  We assume the case of actual overlap is rare enough
  1632     // that we don't have to optimize it.
  1634     Label l_1, l_2, l_3, l_4, l_5, l_6;
  1636     Register tmp1 = R6_ARG4;
  1637     Register tmp2 = R7_ARG5;
  1638     Register tmp3 = R8_ARG6;
  1639     Register tmp4 = R0;
  1641     { // FasterArrayCopy
  1642       __ cmpwi(CCR0, R5_ARG3, 0);
  1643       __ beq(CCR0, l_6);
  1645       __ sldi(R5_ARG3, R5_ARG3, 2);
  1646       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
  1647       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
  1648       __ srdi(R5_ARG3, R5_ARG3, 2);
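             // from and to now point just past the last element (count * 4 bytes
             // were added and count restored); the copy below runs backwards.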
  1650       __ cmpwi(CCR0, R5_ARG3, 7);
  1651       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
  1653       __ srdi(tmp1, R5_ARG3, 3);
  1654       __ andi(R5_ARG3, R5_ARG3, 7);
  1655       __ mtctr(tmp1);
  1657       __ bind(l_4);
  1658       // Use unrolled version for mass copying (copy 8 elements at a time).
  1659       // A load feeding a store has zero latency on Power6, but not on Power5.
  1660       // The following sequence therefore performs well on both.
  1661       __ addi(R3_ARG1, R3_ARG1, -32);
  1662       __ addi(R4_ARG2, R4_ARG2, -32);
  1663       __ ld(tmp4, 24, R3_ARG1);
  1664       __ ld(tmp3, 16, R3_ARG1);
  1665       __ ld(tmp2, 8, R3_ARG1);
  1666       __ ld(tmp1, 0, R3_ARG1);
  1667       __ std(tmp4, 24, R4_ARG2);
  1668       __ std(tmp3, 16, R4_ARG2);
  1669       __ std(tmp2, 8, R4_ARG2);
  1670       __ std(tmp1, 0, R4_ARG2);
  1671       __ bdnz(l_4);
  1673       __ cmpwi(CCR0, R5_ARG3, 0);
  1674       __ beq(CCR0, l_6);
  1676       __ bind(l_5);
  1677       __ mtctr(R5_ARG3);
  1678       __ bind(l_3);
  1679       __ lwz(R0, -4, R3_ARG1);
  1680       __ stw(R0, -4, R4_ARG2);
  1681       __ addi(R3_ARG1, R3_ARG1, -4);
  1682       __ addi(R4_ARG2, R4_ARG2, -4);
  1683       __ bdnz(l_3);
  1685       __ bind(l_6);
  1686     }
  1687   }
  1689   // Generate stub for conjoint int copy.  If "aligned" is true, the
  1690   // "from" and "to" addresses are assumed to be heapword aligned.
  1691   //
  1692   // Arguments for generated stub:
  1693   //      from:  R3_ARG1
  1694   //      to:    R4_ARG2
  1695   //      count: R5_ARG3 treated as signed
  1696   //
  1697   address generate_conjoint_int_copy(bool aligned, const char * name) {
  1698     StubCodeMark mark(this, "StubRoutines", name);
  1699     address start = __ function_entry();
  1701 #if defined(ABI_ELFv2)
  1702     address nooverlap_target = aligned ?
  1703       StubRoutines::arrayof_jint_disjoint_arraycopy() :
  1704       StubRoutines::jint_disjoint_arraycopy();
  1705 #else
  1706     address nooverlap_target = aligned ?
  1707       ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
  1708       ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
  1709 #endif
  1711     array_overlap_test(nooverlap_target, 2);
  1713     generate_conjoint_int_copy_core(aligned);
  1715     __ blr();
  1717     return start;
  1718   }
  1720   // Generate core code for disjoint long copy (and oop copy on
  1721   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
  1722   // are assumed to be heapword aligned.
  1723   //
  1724   // Arguments:
  1725   //      from:  R3_ARG1
  1726   //      to:    R4_ARG2
  1727   //      count: R5_ARG3 treated as signed
  1728   //
  1729   void generate_disjoint_long_copy_core(bool aligned) {
  1730     Register tmp1 = R6_ARG4;
  1731     Register tmp2 = R7_ARG5;
  1732     Register tmp3 = R8_ARG6;
  1733     Register tmp4 = R0;
  1735     Label l_1, l_2, l_3, l_4;
  1737     { // FasterArrayCopy
  1738       __ cmpwi(CCR0, R5_ARG3, 3);
  1739       __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
  1741       __ srdi(tmp1, R5_ARG3, 2);
  1742       __ andi_(R5_ARG3, R5_ARG3, 3);
  1743       __ mtctr(tmp1);
  1745       __ bind(l_4);
  1746       // Use unrolled version for mass copying (copy 4 elements at a time).
  1747       // A load feeding a store has zero latency on Power6, but not on Power5.
  1748       // The following sequence therefore performs well on both.
  1749       __ ld(tmp1, 0, R3_ARG1);
  1750       __ ld(tmp2, 8, R3_ARG1);
  1751       __ ld(tmp3, 16, R3_ARG1);
  1752       __ ld(tmp4, 24, R3_ARG1);
  1753       __ std(tmp1, 0, R4_ARG2);
  1754       __ std(tmp2, 8, R4_ARG2);
  1755       __ std(tmp3, 16, R4_ARG2);
  1756       __ std(tmp4, 24, R4_ARG2);
  1757       __ addi(R3_ARG1, R3_ARG1, 32);
  1758       __ addi(R4_ARG2, R4_ARG2, 32);
  1759       __ bdnz(l_4);
  1760     }
  1762     // copy 1 element at a time
  1763     __ bind(l_3);
  1764     __ cmpwi(CCR0, R5_ARG3, 0);
  1765     __ beq(CCR0, l_1);
  1767     { // FasterArrayCopy
  1768       __ mtctr(R5_ARG3);
  1769       __ addi(R3_ARG1, R3_ARG1, -8);
  1770       __ addi(R4_ARG2, R4_ARG2, -8);
  1772       __ bind(l_2);
  1773       __ ldu(R0, 8, R3_ARG1);
  1774       __ stdu(R0, 8, R4_ARG2);
  1775       __ bdnz(l_2);
  1776     }
  1778     __ bind(l_1);
  1779   }
  1781   // Generate stub for disjoint long copy.  If "aligned" is true, the
  1782   // "from" and "to" addresses are assumed to be heapword aligned.
  1783   //
  1784   // Arguments for generated stub:
  1785   //      from:  R3_ARG1
  1786   //      to:    R4_ARG2
  1787   //      count: R5_ARG3 treated as signed
  1788   //
  1789   address generate_disjoint_long_copy(bool aligned, const char * name) {
  1790     StubCodeMark mark(this, "StubRoutines", name);
  1791     address start = __ function_entry();
  1792     generate_disjoint_long_copy_core(aligned);
  1793     __ blr();
  1795     return start;
  1796   }
  1798   // Generate core code for conjoint long copy (and oop copy on
  1799   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
  1800   // are assumed to be heapword aligned.
  1801   //
  1802   // Arguments:
  1803   //      from:  R3_ARG1
  1804   //      to:    R4_ARG2
  1805   //      count: R5_ARG3 treated as signed
  1806   //
  1807   void generate_conjoint_long_copy_core(bool aligned) {
  1808     Register tmp1 = R6_ARG4;
  1809     Register tmp2 = R7_ARG5;
  1810     Register tmp3 = R8_ARG6;
  1811     Register tmp4 = R0;
  1813     Label l_1, l_2, l_3, l_4, l_5;
  1815     __ cmpwi(CCR0, R5_ARG3, 0);
  1816     __ beq(CCR0, l_1);
  1818     { // FasterArrayCopy
  1819       __ sldi(R5_ARG3, R5_ARG3, 3);
  1820       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
  1821       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
  1822       __ srdi(R5_ARG3, R5_ARG3, 3);
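             // As in the int variant: from and to now point just past the last
             // element (count * 8 bytes), and the copy below runs backwards.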
  1824       __ cmpwi(CCR0, R5_ARG3, 3);
  1825       __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
  1827       __ srdi(tmp1, R5_ARG3, 2);
  1828       __ andi(R5_ARG3, R5_ARG3, 3);
  1829       __ mtctr(tmp1);
  1831       __ bind(l_4);
  1832       // Use unrolled version for mass copying (copy 4 elements at a time).
  1833       // A load feeding a store has zero latency on Power6, but not on Power5.
  1834       // The following sequence therefore performs well on both.
  1835       __ addi(R3_ARG1, R3_ARG1, -32);
  1836       __ addi(R4_ARG2, R4_ARG2, -32);
  1837       __ ld(tmp4, 24, R3_ARG1);
  1838       __ ld(tmp3, 16, R3_ARG1);
  1839       __ ld(tmp2, 8, R3_ARG1);
  1840       __ ld(tmp1, 0, R3_ARG1);
  1841       __ std(tmp4, 24, R4_ARG2);
  1842       __ std(tmp3, 16, R4_ARG2);
  1843       __ std(tmp2, 8, R4_ARG2);
  1844       __ std(tmp1, 0, R4_ARG2);
  1845       __ bdnz(l_4);
  1847       __ cmpwi(CCR0, R5_ARG3, 0);
  1848       __ beq(CCR0, l_1);
  1850       __ bind(l_5);
  1851       __ mtctr(R5_ARG3);
  1852       __ bind(l_3);
  1853       __ ld(R0, -8, R3_ARG1);
  1854       __ std(R0, -8, R4_ARG2);
  1855       __ addi(R3_ARG1, R3_ARG1, -8);
  1856       __ addi(R4_ARG2, R4_ARG2, -8);
  1857       __ bdnz(l_3);
  1858     }
  1860     __ bind(l_1);
  1861   }
  1863   // Generate stub for conjoint long copy.  If "aligned" is true, the
  1864   // "from" and "to" addresses are assumed to be heapword aligned.
  1865   //
  1866   // Arguments for generated stub:
  1867   //      from:  R3_ARG1
  1868   //      to:    R4_ARG2
  1869   //      count: R5_ARG3 treated as signed
  1870   //
  1871   address generate_conjoint_long_copy(bool aligned, const char * name) {
  1872     StubCodeMark mark(this, "StubRoutines", name);
  1873     address start = __ function_entry();
  1875 #if defined(ABI_ELFv2)
  1876     address nooverlap_target = aligned ?
  1877       StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1878       StubRoutines::jlong_disjoint_arraycopy();
  1879 #else
  1880     address nooverlap_target = aligned ?
  1881       ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
  1882       ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
  1883 #endif
  1885     array_overlap_test(nooverlap_target, 3);
  1886     generate_conjoint_long_copy_core(aligned);
  1888     __ blr();
  1890     return start;
  1891   }
  1893   // Generate stub for conjoint oop copy.  If "aligned" is true, the
  1894   // "from" and "to" addresses are assumed to be heapword aligned.
  1895   //
  1896   // Arguments for generated stub:
  1897   //      from:  R3_ARG1
  1898   //      to:    R4_ARG2
  1899   //      count: R5_ARG3 treated as signed
  1900   //      dest_uninitialized: G1 support
  1901   //
  1902   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
  1903     StubCodeMark mark(this, "StubRoutines", name);
  1905     address start = __ function_entry();
  1907 #if defined(ABI_ELFv2)
  1908     address nooverlap_target = aligned ?
  1909       StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1910       StubRoutines::oop_disjoint_arraycopy();
  1911 #else
  1912     address nooverlap_target = aligned ?
  1913       ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
  1914       ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
  1915 #endif
  1917     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
  1919     // Save arguments.
  1920     __ mr(R9_ARG7, R4_ARG2);
  1921     __ mr(R10_ARG8, R5_ARG3);
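           // With compressed oops each element is a 32-bit narrowOop, so the int
           // copy core is reused; otherwise elements are 64-bit oops and the long
           // copy core is used. The overlap test gets the matching log2 element
           // size (2 or 3).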
  1923     if (UseCompressedOops) {
  1924       array_overlap_test(nooverlap_target, 2);
  1925       generate_conjoint_int_copy_core(aligned);
  1926     } else {
  1927       array_overlap_test(nooverlap_target, 3);
  1928       generate_conjoint_long_copy_core(aligned);
  1929     }
  1931     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
  1932     return start;
  1933   }
  1935   // Generate stub for disjoint oop copy.  If "aligned" is true, the
  1936   // "from" and "to" addresses are assumed to be heapword aligned.
  1937   //
  1938   // Arguments for generated stub:
  1939   //      from:  R3_ARG1
  1940   //      to:    R4_ARG2
  1941   //      count: R5_ARG3 treated as signed
  1942   //      dest_uninitialized: G1 support
  1943   //
  1944   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
  1945     StubCodeMark mark(this, "StubRoutines", name);
  1946     address start = __ function_entry();
  1948     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
  1950     // Save some arguments; the copy core destroys them.
  1951     // They are needed for the post barrier.
  1952     __ mr(R9_ARG7, R4_ARG2);
  1953     __ mr(R10_ARG8, R5_ARG3);
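           // As above: narrowOops copy as 32-bit ints, uncompressed oops as 64-bit longs.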
  1955     if (UseCompressedOops) {
  1956       generate_disjoint_int_copy_core(aligned);
  1957     } else {
  1958       generate_disjoint_long_copy_core(aligned);
  1959     }
  1961     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
  1963     return start;
  1964   }
  1966   void generate_arraycopy_stubs() {
  1967     // Note: the disjoint stubs must be generated first, some of
  1968     // the conjoint stubs use them.
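           // (A conjoint stub performs the overlap test and branches to the
           // corresponding disjoint stub when the source and destination ranges
           // do not overlap.)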
  1970     // non-aligned disjoint versions
  1971     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  1972     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  1973     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
  1974     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  1975     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
  1976     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
  1978     // aligned disjoint versions
  1979     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
  1980     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
  1981     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
  1982     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
  1983     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
  1984     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
  1986     // non-aligned conjoint versions
  1987     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  1988     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(false, "jshort_arraycopy");
  1989     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(false, "jint_arraycopy");
  1990     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(false, "jlong_arraycopy");
  1991     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
  1992     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
  1994     // aligned conjoint versions
  1995     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
  1996     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
  1997     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
  1998     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
  1999     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
  2000     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
  2002     // fill routines
  2003     StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
  2004     StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
  2005     StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
  2006     StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
  2007     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  2008     StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");
  2009   }
  2011   // Safefetch stubs.
  2012   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
  2013     // safefetch signatures:
  2014     //   int      SafeFetch32(int*      adr, int      errValue);
  2015     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  2016     //
  2017     // arguments:
  2018     //   R3_ARG1 = adr
  2019     //   R4_ARG2 = errValue
  2020     //
  2021     // result:
  2022     //   R3_RET  = *adr or errValue
  2024     StubCodeMark mark(this, "StubRoutines", name);
  2026     // Entry point, pc or function descriptor.
  2027     *entry = __ function_entry();
  2029     // Load *adr into R4_ARG2, may fault.
  2030     *fault_pc = __ pc();
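           // If this load faults, the VM's signal handler is expected to resume
           // execution at *continuation_pc, where errValue (still in R4_ARG2) is
           // returned instead.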
  2031     switch (size) {
  2032       case 4:
  2033         // int32_t, sign extended
  2034         __ lwa(R4_ARG2, 0, R3_ARG1);
  2035         break;
  2036       case 8:
  2037         // int64_t
  2038         __ ld(R4_ARG2, 0, R3_ARG1);
  2039         break;
  2040       default:
  2041         ShouldNotReachHere();
  2042     }
  2044     // return errValue or *adr
  2045     *continuation_pc = __ pc();
  2046     __ mr(R3_RET, R4_ARG2);
  2047     __ blr();
  2048   }
  2050   // Initialization
  2051   void generate_initial() {
  2052     // Generates all stubs and initializes the entry points
  2054     // Entry points that exist in all platforms.
  2055     // Note: This is code that could be shared among different platforms - however the
  2056     // benefit seems to be smaller than the disadvantage of having a
  2057     // much more complicated generator structure. See also comment in
  2058     // stubRoutines.hpp.
  2060     StubRoutines::_forward_exception_entry          = generate_forward_exception();
  2061     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
  2062     StubRoutines::_catch_exception_entry            = generate_catch_exception();
  2064     // Build this early so it's available for the interpreter.
  2065     StubRoutines::_throw_StackOverflowError_entry   =
  2066       generate_throw_exception("StackOverflowError throw_exception",
  2067                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
  2068   }
  2070   void generate_all() {
  2071     // Generates all stubs and initializes the entry points
  2073     // These entry points require SharedInfo::stack0 to be set up in
  2074     // non-core builds
  2075     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
  2076     // Handle IncompatibleClassChangeError in itable stubs.
  2077     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
  2078     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
  2080     StubRoutines::_handler_for_unsafe_access_entry         = generate_handler_for_unsafe_access();
  2082     // support for verify_oop (must happen after universe_init)
  2083     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
  2085     // arraycopy stubs used by compilers
  2086     generate_arraycopy_stubs();
  2088     if (UseAESIntrinsics) {
  2089       guarantee(!UseAESIntrinsics, "not yet implemented.");
  2090     }
  2092     // PPC uses stubs for safefetch.
  2093     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
  2094                                                        &StubRoutines::_safefetch32_fault_pc,
  2095                                                        &StubRoutines::_safefetch32_continuation_pc);
  2096     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
  2097                                                        &StubRoutines::_safefetchN_fault_pc,
  2098                                                        &StubRoutines::_safefetchN_continuation_pc);
  2099   }
  2101  public:
  2102   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  2103     // replace the standard masm with a special one:
  2104     _masm = new MacroAssembler(code);
  2105     if (all) {
  2106       generate_all();
  2107     } else {
  2108       generate_initial();
  2109     }
  2110   }
  2111 };
  2113 void StubGenerator_generate(CodeBuffer* code, bool all) {
  2114   StubGenerator g(code, all);
  2115 }
