src/cpu/ppc/vm/stubGenerator_ppc.cpp

author:      goetz
date:        Wed, 27 Nov 2013 16:16:21 -0800
changeset:   6490:41b780b43b74
parent:      6458:ec28f9c041ff
child:       6495:67fa91961822
permissions: -rw-r--r--

8029015: PPC64 (part 216): opto: trap based null and range checks
Summary: On PPC64, use the tdi instruction, which does a compare and raises SIGTRAP, for NULL and range checks.
Reviewed-by: kvn

     1 /*
     2  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright 2012, 2013 SAP AG. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/assembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "interpreter/interpreter.hpp"
    30 #include "nativeInst_ppc.hpp"
    31 #include "oops/instanceOop.hpp"
    32 #include "oops/method.hpp"
    33 #include "oops/objArrayKlass.hpp"
    34 #include "oops/oop.inline.hpp"
    35 #include "prims/methodHandles.hpp"
    36 #include "runtime/frame.inline.hpp"
    37 #include "runtime/handles.inline.hpp"
    38 #include "runtime/sharedRuntime.hpp"
    39 #include "runtime/stubCodeGenerator.hpp"
    40 #include "runtime/stubRoutines.hpp"
    41 #include "utilities/top.hpp"
    42 #ifdef TARGET_OS_FAMILY_aix
    43 # include "thread_aix.inline.hpp"
    44 #endif
    45 #ifdef TARGET_OS_FAMILY_linux
    46 # include "thread_linux.inline.hpp"
    47 #endif
    48 #ifdef COMPILER2
    49 #include "opto/runtime.hpp"
    50 #endif
    52 #define __ _masm->
    54 #ifdef PRODUCT
    55 #define BLOCK_COMMENT(str) // nothing
    56 #else
    57 #define BLOCK_COMMENT(str) __ block_comment(str)
    58 #endif
    60 class StubGenerator: public StubCodeGenerator {
    61  private:
    63   // Call stubs are used to call Java from C
    64   //
    65   // Arguments:
    66   //
    67   //   R3  - call wrapper address     : address
    68   //   R4  - result                   : intptr_t*
    69   //   R5  - result type              : BasicType
    70   //   R6  - method                   : Method
    71   //   R7  - frame mgr entry point    : address
    72   //   R8  - parameter block          : intptr_t*
    73   //   R9  - parameter count in words : int
    74   //   R10 - thread                   : Thread*
    75   //
    76   address generate_call_stub(address& return_address) {
    77     // Set up a new C frame, copy Java arguments, call frame manager or
    78     // native_entry, and process result.
    80     StubCodeMark mark(this, "StubRoutines", "call_stub");
    82     address start = __ emit_fd();
    84     // some sanity checks
    85     assert((sizeof(frame::abi_48) % 16) == 0,                 "unaligned");
    86     assert((sizeof(frame::abi_112) % 16) == 0,                "unaligned");
    87     assert((sizeof(frame::spill_nonvolatiles) % 16) == 0,     "unaligned");
    88     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
    89     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
    91     Register r_arg_call_wrapper_addr        = R3;
    92     Register r_arg_result_addr              = R4;
    93     Register r_arg_result_type              = R5;
    94     Register r_arg_method                   = R6;
    95     Register r_arg_entry                    = R7;
    96     Register r_arg_thread                   = R10;
    98     Register r_temp                         = R24;
    99     Register r_top_of_arguments_addr        = R25;
   100     Register r_entryframe_fp                = R26;
   102     {
   103       // Stack on entry to call_stub:
   104       //
   105       //      F1      [C_FRAME]
   106       //              ...
   108       Register r_arg_argument_addr          = R8;
   109       Register r_arg_argument_count         = R9;
   110       Register r_frame_alignment_in_bytes   = R27;
   111       Register r_argument_addr              = R28;
   112       Register r_argumentcopy_addr          = R29;
   113       Register r_argument_size_in_bytes     = R30;
   114       Register r_frame_size                 = R23;
   116       Label arguments_copied;
   118       // Save LR/CR to caller's C_FRAME.
   119       __ save_LR_CR(R0);
   121       // Zero extend arg_argument_count.
   122       __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
   124       // Save non-volatiles GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
   125       __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
   127       // Keep copy of our frame pointer (caller's SP).
   128       __ mr(r_entryframe_fp, R1_SP);
   130       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
   131       // Push ENTRY_FRAME including arguments:
   132       //
   133       //      F0      [TOP_IJAVA_FRAME_ABI]
   134       //              alignment (optional)
   135       //              [outgoing Java arguments]
   136       //              [ENTRY_FRAME_LOCALS]
   137       //      F1      [C_FRAME]
   138       //              ...
   140       // calculate frame size
   142       // unaligned size of arguments
   143       __ sldi(r_argument_size_in_bytes,
   144                   r_arg_argument_count, Interpreter::logStackElementSize);
   145       // arguments alignment (max 1 slot)
   146       // FIXME: use round_to() here
   147       __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
   148       __ sldi(r_frame_alignment_in_bytes,
   149                   r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
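             // (The andi_/sldi pair above reserves one extra 8-byte stack slot when the
             //  argument count is odd, keeping the total frame size a multiple of 16.)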
   151       // size = unaligned size of arguments + top abi's size
   152       __ addi(r_frame_size, r_argument_size_in_bytes,
   153               frame::top_ijava_frame_abi_size);
   154       // size += arguments alignment
   155       __ add(r_frame_size,
   156                  r_frame_size, r_frame_alignment_in_bytes);
   157       // size += size of call_stub locals
   158       __ addi(r_frame_size,
   159               r_frame_size, frame::entry_frame_locals_size);
   161       // push ENTRY_FRAME
   162       __ push_frame(r_frame_size, r_temp);
   164       // initialize call_stub locals (step 1)
   165       __ std(r_arg_call_wrapper_addr,
   166              _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
   167       __ std(r_arg_result_addr,
   168              _entry_frame_locals_neg(result_address), r_entryframe_fp);
   169       __ std(r_arg_result_type,
   170              _entry_frame_locals_neg(result_type), r_entryframe_fp);
   171       // we will save arguments_tos_address later
   174       BLOCK_COMMENT("Copy Java arguments");
   175       // copy Java arguments
   177       // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
   178       // FIXME: why not simply use SP+frame::top_ijava_frame_size?
   179       __ addi(r_top_of_arguments_addr,
   180               R1_SP, frame::top_ijava_frame_abi_size);
   181       __ add(r_top_of_arguments_addr,
   182                  r_top_of_arguments_addr, r_frame_alignment_in_bytes);
   184       // any arguments to copy?
   185       __ cmpdi(CCR0, r_arg_argument_count, 0);
   186       __ beq(CCR0, arguments_copied);
   188       // prepare loop and copy arguments in reverse order
   189       {
   190         // init CTR with arg_argument_count
   191         __ mtctr(r_arg_argument_count);
   193         // let r_argumentcopy_addr point to last outgoing Java arguments
   194         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
   196         // let r_argument_addr point to last incoming java argument
   197         __ add(r_argument_addr,
   198                    r_arg_argument_addr, r_argument_size_in_bytes);
   199         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
   201         // now loop while CTR > 0 and copy arguments
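               // (mtctr above loaded the count into the CTR register; the bdnz at the
               //  bottom decrements CTR and branches while it is non-zero, so the body
               //  runs exactly arg_argument_count times.)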
   202         {
   203           Label next_argument;
   204           __ bind(next_argument);
   206           __ ld(r_temp, 0, r_argument_addr);
   207           // argument_addr--;
   208           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
   209           __ std(r_temp, 0, r_argumentcopy_addr);
   210           // argumentcopy_addr++;
   211           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
   213           __ bdnz(next_argument);
   214         }
   215       }
   217       // Arguments copied, continue.
   218       __ bind(arguments_copied);
   219     }
   221     {
   222       BLOCK_COMMENT("Call frame manager or native entry.");
   223       // Call frame manager or native entry.
   224       Register r_new_arg_entry = R14_state;
   225       assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
   226                                  r_arg_method, r_arg_thread);
   228       __ mr(r_new_arg_entry, r_arg_entry);
   230       // Register state on entry to frame manager / native entry:
   231       //
   232       //   R17_tos     -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
   233       //   R19_method  -  Method
   234       //   R16_thread  -  JavaThread*
   236       // R17_tos must point to last argument - element_size.
   237       __ addi(R17_tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
   239       // initialize call_stub locals (step 2)
   240       // now save R17_tos as arguments_tos_address
   241       __ std(R17_tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
   243       // load argument registers for call
   244       __ mr(R19_method, r_arg_method);
   245       __ mr(R16_thread, r_arg_thread);
   246       assert(R17_tos != r_arg_method, "trashed r_arg_method");
   247       assert(R17_tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
   249       // Set R15_prev_state to 0 for simplifying checks in callee.
   250       __ li(R15_prev_state, 0);
   252       // Stack on entry to frame manager / native entry:
   253       //
   254       //      F0      [TOP_IJAVA_FRAME_ABI]
   255       //              alignment (optional)
   256       //              [outgoing Java arguments]
   257       //              [ENTRY_FRAME_LOCALS]
   258       //      F1      [C_FRAME]
   259       //              ...
   260       //
   262       // global toc register
   263       __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
   265       // Load narrow oop base.
   266       __ reinit_heapbase(R30, R11_scratch1);
   268       // Remember the senderSP so the interpreter can pop c2i arguments off the stack
   269       // when called via a c2i.
   271       // Pass initial_caller_sp to framemanager.
   272       __ mr(R21_tmp1, R1_SP);
   274       // Do a light-weight C-call here, r_new_arg_entry holds the address
   275       // of the interpreter entry point (frame manager or native entry)
   276       // and save runtime-value of LR in return_address.
   277       assert(r_new_arg_entry != R17_tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
   278              "trashed r_new_arg_entry");
   279       return_address = __ call_stub(r_new_arg_entry);
   280     }
   282     {
   283       BLOCK_COMMENT("Returned from frame manager or native entry.");
   284       // Returned from frame manager or native entry.
   285       // Now pop frame, process result, and return to caller.
   287       // Stack on exit from frame manager / native entry:
   288       //
   289       //      F0      [ABI]
   290       //              ...
   291       //              [ENTRY_FRAME_LOCALS]
   292       //      F1      [C_FRAME]
   293       //              ...
   294       //
   295       // Just pop the topmost frame ...
   296       //
   298       Label ret_is_object;
   299       Label ret_is_long;
   300       Label ret_is_float;
   301       Label ret_is_double;
   303       Register r_entryframe_fp = R30;
   304       Register r_lr            = R7_ARG5;
   305       Register r_cr            = R8_ARG6;
   307       // Reload some volatile registers which we've spilled before the call
   308       // to frame manager / native entry.
   309       // Access all locals via frame pointer, because we know nothing about
   310       // the topmost frame's size.
   311       __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
   312       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
   313       __ ld(r_arg_result_addr,
   314             _entry_frame_locals_neg(result_address), r_entryframe_fp);
   315       __ ld(r_arg_result_type,
   316             _entry_frame_locals_neg(result_type), r_entryframe_fp);
   317       __ ld(r_cr, _abi(cr), r_entryframe_fp);
   318       __ ld(r_lr, _abi(lr), r_entryframe_fp);
   320       // pop frame and restore non-volatiles, LR and CR
   321       __ mr(R1_SP, r_entryframe_fp);
   322       __ mtcr(r_cr);
   323       __ mtlr(r_lr);
   325       // Store result depending on type. Everything that is not
   326       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
   327       __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
   328       __ cmpwi(CCR1, r_arg_result_type, T_LONG);
   329       __ cmpwi(CCR5,  r_arg_result_type, T_FLOAT);
   330       __ cmpwi(CCR6,  r_arg_result_type, T_DOUBLE);
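             // Each compare above targets its own condition register field (CCR0, CCR1,
             // CCR5, CCR6), so the type dispatch below can still branch on the results
             // after the non-volatile registers have been restored.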
   332       // restore non-volatile registers
   333       __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
   336       // Stack on exit from call_stub:
   337       //
   338       //      0       [C_FRAME]
   339       //              ...
   340       //
   341       //  no call_stub frames left.
   343       // All non-volatiles have been restored at this point!!
   344       assert(R3_RET == R3, "R3_RET should be R3");
   346       __ beq(CCR0, ret_is_object);
   347       __ beq(CCR1, ret_is_long);
   348       __ beq(CCR5,  ret_is_float);
   349       __ beq(CCR6,  ret_is_double);
   351       // default:
   352       __ stw(R3_RET, 0, r_arg_result_addr);
   353       __ blr(); // return to caller
   355       // case T_OBJECT:
   356       __ bind(ret_is_object);
   357       __ std(R3_RET, 0, r_arg_result_addr);
   358       __ blr(); // return to caller
   360       // case T_LONG:
   361       __ bind(ret_is_long);
   362       __ std(R3_RET, 0, r_arg_result_addr);
   363       __ blr(); // return to caller
   365       // case T_FLOAT:
   366       __ bind(ret_is_float);
   367       __ stfs(F1_RET, 0, r_arg_result_addr);
   368       __ blr(); // return to caller
   370       // case T_DOUBLE:
   371       __ bind(ret_is_double);
   372       __ stfd(F1_RET, 0, r_arg_result_addr);
   373       __ blr(); // return to caller
   374     }
   376     return start;
   377   }
   379   // Return point for a Java call if there's an exception thrown in
   380   // Java code.  The exception is caught and transformed into a
   381   // pending exception stored in JavaThread that can be tested from
   382   // within the VM.
   383   //
   384   address generate_catch_exception() {
   385     StubCodeMark mark(this, "StubRoutines", "catch_exception");
   387     address start = __ pc();
   389     // Registers alive
   390     //
   391     //  R16_thread
   392     //  R3_ARG1 - address of pending exception
   393     //  R4_ARG2 - return address in call stub
   395     const Register exception_file = R21_tmp1;
   396     const Register exception_line = R22_tmp2;
   398     __ load_const(exception_file, (void*)__FILE__);
   399     __ load_const(exception_line, (void*)__LINE__);
   401     __ std(R3_ARG1, thread_(pending_exception));
   402     // store into `char *'
   403     __ std(exception_file, thread_(exception_file));
   404     // store into `int'
   405     __ stw(exception_line, thread_(exception_line));
   407     // complete return to VM
   408     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
   410     __ mtlr(R4_ARG2);
   411     // continue in call stub
   412     __ blr();
   414     return start;
   415   }
   417   // Continuation point for runtime calls returning with a pending
   418   // exception.  The pending exception check happened in the runtime
   419   // or native call stub.  The pending exception in Thread is
   420   // converted into a Java-level exception.
   421   //
   422   address generate_forward_exception() {
   423     StubCodeMark mark(this, "StubRoutines", "forward_exception");
   424     address start = __ pc();
   426 #if !defined(PRODUCT)
   427     if (VerifyOops) {
   428       // Get pending exception oop.
   429       __ ld(R3_ARG1,
   430                 in_bytes(Thread::pending_exception_offset()),
   431                 R16_thread);
   432       // Make sure that this code is only executed if there is a pending exception.
   433       {
   434         Label L;
   435         __ cmpdi(CCR0, R3_ARG1, 0);
   436         __ bne(CCR0, L);
   437         __ stop("StubRoutines::forward exception: no pending exception (1)");
   438         __ bind(L);
   439       }
   440       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
   441     }
   442 #endif
   444     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
   445     __ save_LR_CR(R4_ARG2);
   446     __ push_frame_abi112(0, R0);
   447     // Find exception handler.
   448     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
   449                      SharedRuntime::exception_handler_for_return_address),
   450                     R16_thread,
   451                     R4_ARG2);
   452     // Copy handler's address.
   453     __ mtctr(R3_RET);
   454     __ pop_frame();
   455     __ restore_LR_CR(R0);
   457     // Set up the arguments for the exception handler:
   458     //  - R3_ARG1: exception oop
   459     //  - R4_ARG2: exception pc.
   461     // Load pending exception oop.
   462     __ ld(R3_ARG1,
   463               in_bytes(Thread::pending_exception_offset()),
   464               R16_thread);
   466     // The exception pc is the return address in the caller.
   467     // Must load it into R4_ARG2.
   468     __ mflr(R4_ARG2);
   470 #ifdef ASSERT
   471     // Make sure exception is set.
   472     {
   473       Label L;
   474       __ cmpdi(CCR0, R3_ARG1, 0);
   475       __ bne(CCR0, L);
   476       __ stop("StubRoutines::forward exception: no pending exception (2)");
   477       __ bind(L);
   478     }
   479 #endif
   481     // Clear the pending exception.
   482     __ li(R0, 0);
   483     __ std(R0,
   484                in_bytes(Thread::pending_exception_offset()),
   485                R16_thread);
   486     // Jump to exception handler.
   487     __ bctr();
   489     return start;
   490   }
   492 #undef __
   493 #define __ masm->
   494   // Continuation point for throwing of implicit exceptions that are
   495   // not handled in the current activation. Fabricates an exception
   496   // oop and initiates normal exception dispatching in this
   497   // frame. Only callee-saved registers are preserved (through the
   498   // normal register window / RegisterMap handling).  If the compiler
   499   // needs all registers to be preserved between the fault point and
   500   // the exception handler then it must assume responsibility for that
   501   // in AbstractCompiler::continuation_for_implicit_null_exception or
   502   // continuation_for_implicit_division_by_zero_exception. All other
   503   // implicit exceptions (e.g., NullPointerException or
   504   // AbstractMethodError on entry) are either at call sites or
   505   // otherwise assume that stack unwinding will be initiated, so
   506   // caller saved registers were assumed volatile in the compiler.
   507   //
   508   // Note that we generate only this stub into a RuntimeStub, because
   509   // it needs to be properly traversed and ignored during GC, so we
   510   // change the meaning of the "__" macro within this method.
   511   //
   512   // Note: the routine set_pc_not_at_call_for_caller in
   513   // SharedRuntime.cpp requires that this code be generated into a
   514   // RuntimeStub.
   515   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
   516                                    Register arg1 = noreg, Register arg2 = noreg) {
   517     CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
   518     MacroAssembler* masm = new MacroAssembler(&code);
   520     OopMapSet* oop_maps  = new OopMapSet();
   521     int frame_size_in_bytes = frame::abi_112_size;
   522     OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
   524     StubCodeMark mark(this, "StubRoutines", "throw_exception");
   526     address start = __ pc();
   528     __ save_LR_CR(R11_scratch1);
   530     // Push a frame.
   531     __ push_frame_abi112(0, R11_scratch1);
   533     address frame_complete_pc = __ pc();
   535     if (restore_saved_exception_pc) {
   536       __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
   537     }
   539     // Note that we always have a runtime stub frame on the top of
   540     // stack by this point. Remember the offset of the instruction
   541     // whose address will be moved to R11_scratch1.
   542     address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
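           // (gc_map_pc - start) is the offset used when registering the oop map
           // with add_gc_map below.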
   544     __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
   546     __ mr(R3_ARG1, R16_thread);
   547     if (arg1 != noreg) {
   548       __ mr(R4_ARG2, arg1);
   549     }
   550     if (arg2 != noreg) {
   551       __ mr(R5_ARG3, arg2);
   552     }
   553     __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry),
   554               relocInfo::none);
   556     // Set an oopmap for the call site.
   557     oop_maps->add_gc_map((int)(gc_map_pc - start), map);
   559     __ reset_last_Java_frame();
   561 #ifdef ASSERT
   562     // Make sure that this code is only executed if there is a pending
   563     // exception.
   564     {
   565       Label L;
   566       __ ld(R0,
   567                 in_bytes(Thread::pending_exception_offset()),
   568                 R16_thread);
   569       __ cmpdi(CCR0, R0, 0);
   570       __ bne(CCR0, L);
   571       __ stop("StubRoutines::throw_exception: no pending exception");
   572       __ bind(L);
   573     }
   574 #endif
   576     // Pop frame.
   577     __ pop_frame();
   579     __ restore_LR_CR(R11_scratch1);
   581     __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
   582     __ mtctr(R11_scratch1);
   583     __ bctr();
   585     // Create runtime stub with OopMap.
   586     RuntimeStub* stub =
   587       RuntimeStub::new_runtime_stub(name, &code,
   588                                     /*frame_complete=*/ (int)(frame_complete_pc - start),
   589                                     frame_size_in_bytes/wordSize,
   590                                     oop_maps,
   591                                     false);
   592     return stub->entry_point();
   593   }
   594 #undef __
   595 #define __ _masm->
   597   //  Generate G1 pre-write barrier for array.
   598   //
   599   //  Input:
   600   //     from     - register containing src address (only needed for spilling)
   601   //     to       - register containing starting address
   602   //     count    - register containing element count
   603   //     tmp      - scratch register
   604   //
   605   //  Kills:
   606   //     nothing
   607   //
   608   void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
   609     BarrierSet* const bs = Universe::heap()->barrier_set();
   610     switch (bs->kind()) {
   611       case BarrierSet::G1SATBCT:
   612       case BarrierSet::G1SATBCTLogging:
   613         // With G1, don't generate the call if we statically know that the target is uninitialized.
   614         if (!dest_uninitialized) {
   615           const int spill_slots = 4 * wordSize;
   616           const int frame_size  = frame::abi_112_size + spill_slots;
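                 // The extra spill area holds from, to and count across the
                 // call_VM_leaf below; they are reloaded right after the call.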
   618           __ save_LR_CR(R0);
   619           __ push_frame_abi112(spill_slots, R0);
   620           __ std(from,  frame_size - 1 * wordSize, R1_SP);
   621           __ std(to,    frame_size - 2 * wordSize, R1_SP);
   622           __ std(count, frame_size - 3 * wordSize, R1_SP);
   624           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
   626           __ ld(from,  frame_size - 1 * wordSize, R1_SP);
   627           __ ld(to,    frame_size - 2 * wordSize, R1_SP);
   628           __ ld(count, frame_size - 3 * wordSize, R1_SP);
   629           __ pop_frame();
   630           __ restore_LR_CR(R0);
   631         }
   632         break;
   633       case BarrierSet::CardTableModRef:
   634       case BarrierSet::CardTableExtension:
   635       case BarrierSet::ModRef:
   636         break;
   637       default:
   638         ShouldNotReachHere();
   639     }
   640   }
   642   //  Generate CMS/G1 post-write barrier for array.
   643   //
   644   //  Input:
   645   //     addr     - register containing starting address
   646   //     count    - register containing element count
   647   //     tmp      - scratch register
   648   //
   649   //  The input registers and R0 are overwritten.
   650   //
   651   void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp) {
   652     BarrierSet* const bs = Universe::heap()->barrier_set();
   654     switch (bs->kind()) {
   655       case BarrierSet::G1SATBCT:
   656       case BarrierSet::G1SATBCTLogging:
   657         {
   658           __ save_LR_CR(R0);
   659           // We need this frame only so that the callee can spill LR/CR.
   660           __ push_frame_abi112(0, R0);
   662           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
   664           __ pop_frame();
   665           __ restore_LR_CR(R0);
   666         }
   667         break;
   668       case BarrierSet::CardTableModRef:
   669       case BarrierSet::CardTableExtension:
   670         {
   671           Label Lskip_loop, Lstore_loop;
   672           if (UseConcMarkSweepGC) {
   673             // TODO PPC port: contribute optimization / requires shared changes
   674             __ release();
   675           }
   677           CardTableModRefBS* const ct = (CardTableModRefBS*)bs;
   678           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
   679           assert_different_registers(addr, count, tmp);
   681           __ sldi(count, count, LogBytesPerHeapOop);
   682           __ addi(count, count, -BytesPerHeapOop);
   683           __ add(count, addr, count);
   684           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
   685           __ srdi(addr, addr, CardTableModRefBS::card_shift);
   686           __ srdi(count, count, CardTableModRefBS::card_shift);
   687           __ subf(count, addr, count);
   688           assert_different_registers(R0, addr, count, tmp);
   689           __ load_const(tmp, (address)ct->byte_map_base);
   690           __ addic_(count, count, 1);
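                 // count now holds the number of card-table bytes to dirty
                 // (last card index - first card index + 1); the beq below skips
                 // the loop when that number is zero.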
   691           __ beq(CCR0, Lskip_loop);
   692           __ li(R0, 0);
   693           __ mtctr(count);
   694           // Byte store loop
   695           __ bind(Lstore_loop);
   696           __ stbx(R0, tmp, addr);
   697           __ addi(addr, addr, 1);
   698           __ bdnz(Lstore_loop);
   699           __ bind(Lskip_loop);
   700         }
   701       break;
   702       case BarrierSet::ModRef:
   703         break;
   704       default:
   705         ShouldNotReachHere();
   706     }
   707   }
   709   // Support for void zero_words_aligned8(HeapWord* to, size_t count)
   710   //
   711   // Arguments:
   712   //   to:
   713   //   count:
   714   //
   715   // Destroys:
   716   //
   717   address generate_zero_words_aligned8() {
   718     StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
   720     // Implemented as in ClearArray.
   721     address start = __ emit_fd();
   723     Register base_ptr_reg   = R3_ARG1; // tohw (needs to be 8b aligned)
   724     Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
   725     Register tmp1_reg       = R5_ARG3;
   726     Register tmp2_reg       = R6_ARG4;
   727     Register zero_reg       = R7_ARG5;
   729     // Procedure for large arrays (uses data cache block zero instruction).
   730     Label dwloop, fast, fastloop, restloop, lastdword, done;
   731     int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
   732     int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
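           // dcbz zeroes a whole data cache line at once, so the code below first clears
           // dword-by-dword up to a cache-line boundary, uses dcbz for the bulk of the
           // range, and then clears the remaining dwords individually.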
   734     // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
   735     __ dcbtst(base_ptr_reg);                    // Indicate write access to first cache line ...
   736     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if number of dwords is even.
   737     __ srdi_(tmp1_reg, cnt_dwords_reg, 1);      // number of double dwords
   738     __ load_const_optimized(zero_reg, 0L);      // Use as zero register.
   740     __ cmpdi(CCR1, tmp2_reg, 0);                // cnt_dwords even?
   741     __ beq(CCR0, lastdword);                    // size <= 1
   742     __ mtctr(tmp1_reg);                         // Speculatively preload counter for rest loop (>0).
   743     __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
   744     __ neg(tmp1_reg, base_ptr_reg);             // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
   746     __ blt(CCR0, restloop);                     // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
   747     __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
   749     __ beq(CCR0, fast);                         // already 128byte aligned
   750     __ mtctr(tmp1_reg);                         // Set ctr to hit 128byte boundary (0<ctr<cnt).
   751     __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
   753     // Clear in first cache line dword-by-dword if not already 128byte aligned.
   754     __ bind(dwloop);
   755       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
   756       __ addi(base_ptr_reg, base_ptr_reg, 8);
   757     __ bdnz(dwloop);
   759     // clear 128byte blocks
   760     __ bind(fast);
   761     __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
   762     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if rest even
   764     __ mtctr(tmp1_reg);                         // load counter
   765     __ cmpdi(CCR1, tmp2_reg, 0);                // rest even?
   766     __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
   768     __ bind(fastloop);
   769       __ dcbz(base_ptr_reg);                    // Clear 128byte aligned block.
   770       __ addi(base_ptr_reg, base_ptr_reg, cl_size);
   771     __ bdnz(fastloop);
   773     //__ dcbtst(base_ptr_reg);                  // Indicate write access to last cache line.
   774     __ beq(CCR0, lastdword);                    // rest<=1
   775     __ mtctr(tmp1_reg);                         // load counter
   777     // Clear rest.
   778     __ bind(restloop);
   779       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
   780       __ std(zero_reg, 8, base_ptr_reg);        // Clear 8byte aligned block.
   781       __ addi(base_ptr_reg, base_ptr_reg, 16);
   782     __ bdnz(restloop);
   784     __ bind(lastdword);
   785     __ beq(CCR1, done);
   786     __ std(zero_reg, 0, base_ptr_reg);
   787     __ bind(done);
   788     __ blr();                                   // return
   790     return start;
   791   }
   793   // The following routine generates a subroutine to throw an asynchronous
   794   // UnknownError when an unsafe access gets a fault that could not be
   795   // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
   796   //
   797   address generate_handler_for_unsafe_access() {
   798     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
   799     address start = __ emit_fd();
   800     __ unimplemented("StubRoutines::handler_for_unsafe_access", 93);
   801     return start;
   802   }
   804 #if !defined(PRODUCT)
   805   // Wrapper which calls oopDesc::is_oop_or_null()
   806   // Only called by MacroAssembler::verify_oop
   807   static void verify_oop_helper(const char* message, oop o) {
   808     if (!o->is_oop_or_null()) {
   809       fatal(message);
   810     }
   811     ++ StubRoutines::_verify_oop_count;
   812   }
   813 #endif
   815   // Return address of code to be called from code generated by
   816   // MacroAssembler::verify_oop.
   817   //
   818   // Don't generate, rather use C++ code.
   819   address generate_verify_oop() {
   820     StubCodeMark mark(this, "StubRoutines", "verify_oop");
   822     // this is actually a `FunctionDescriptor*'.
   823     address start = 0;
   825 #if !defined(PRODUCT)
   826     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
   827 #endif
   829     return start;
   830   }
   832   // Fairer handling of safepoints for native methods.
   833   //
   834   // Generate code which reads from the polling page. This special handling is needed as the
   835   // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
   836   // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
   837   // to read from the safepoint polling page.
   838   address generate_load_from_poll() {
   839     StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
   840     address start = __ emit_fd();
   841     __ unimplemented("StubRoutines::verify_oop", 95);  // TODO PPC port
   842     return start;
   843   }
   845   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
   846   //
   847   // The code is implemented (ported from sparc) as we believe it benefits JVM98; however,
   848   // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
   849   //
   850   // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
   851   // for turning on the loop predication optimization, and hence the behavior of "array range check"
   852   // and "loop invariant check" could be influenced, which potentially boosted JVM98.
   853   //
   854   // We leave the code here and see if Oracle has updates in later releases (later than HS20).
   855   //
   856   //  Generate stub for disjoint short fill.  If "aligned" is true, the
   857   //  "to" address is assumed to be heapword aligned.
   858   //
   859   // Arguments for generated stub:
   860   //      to:    R3_ARG1
   861   //      value: R4_ARG2
   862   //      count: R5_ARG3 treated as signed
   863   //
   864   address generate_fill(BasicType t, bool aligned, const char* name) {
   865     StubCodeMark mark(this, "StubRoutines", name);
   866     address start = __ emit_fd();
   868     const Register to        = R3_ARG1;   // source array address
   869     const Register value     = R4_ARG2;   // fill value
   870     const Register count     = R5_ARG3;   // elements count
   871     const Register temp      = R6_ARG4;   // temp register
   873     //assert_clean_int(count, O3);     // Make sure 'count' is clean int.
   875     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
   876     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
   878     int shift = -1;
   879     switch (t) {
   880        case T_BYTE:
   881         shift = 2;
   882         // clone bytes (zero extend not needed because store instructions below ignore high order bytes)
   883         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
   884         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element
   885         __ blt(CCR0, L_fill_elements);
   886         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
   887         break;
   888        case T_SHORT:
   889         shift = 1;
   890         // clone bytes (zero extend not needed because store instructions below ignore high order bytes)
   891         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
   892         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element
   893         __ blt(CCR0, L_fill_elements);
   894         break;
   895       case T_INT:
   896         shift = 0;
   897         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element
   898         __ blt(CCR0, L_fill_4_bytes);
   899         break;
   900       default: ShouldNotReachHere();
   901     }
   903     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
   904       // align source address at 4 bytes address boundary
   905       if (t == T_BYTE) {
   906         // One byte misalignment happens only for byte arrays
   907         __ andi_(temp, to, 1);
   908         __ beq(CCR0, L_skip_align1);
   909         __ stb(value, 0, to);
   910         __ addi(to, to, 1);
   911         __ addi(count, count, -1);
   912         __ bind(L_skip_align1);
   913       }
   914       // Two bytes misalignment happens only for byte and short (char) arrays.
   915       __ andi_(temp, to, 2);
   916       __ beq(CCR0, L_skip_align2);
   917       __ sth(value, 0, to);
   918       __ addi(to, to, 2);
   919       __ addi(count, count, -(1 << (shift - 1)));
   920       __ bind(L_skip_align2);
   921     }
   923     if (!aligned) {
   924       // Align to 8 bytes, we know we are 4 byte aligned to start.
   925       __ andi_(temp, to, 7);
   926       __ beq(CCR0, L_fill_32_bytes);
   927       __ stw(value, 0, to);
   928       __ addi(to, to, 4);
   929       __ addi(count, count, -(1 << shift));
   930       __ bind(L_fill_32_bytes);
   931     }
   933     __ li(temp, 8<<shift);              // prepare for 32 byte loop
   934     // clone bytes int->long as above
   935     __ rldimi(value, value, 32, 0);     // 32 bit -> 64 bit
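           // value now holds the fill pattern replicated across all 64 bits, so each
           // 8-byte std in the loops below stores a full doubleword of fill data.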
   937     Label L_check_fill_8_bytes;
   938     // Fill 32-byte chunks
   939     __ subf_(count, temp, count);
   940     __ blt(CCR0, L_check_fill_8_bytes);
   942     Label L_fill_32_bytes_loop;
   943     __ align(32);
   944     __ bind(L_fill_32_bytes_loop);
   946     __ std(value, 0, to);
   947     __ std(value, 8, to);
   948     __ subf_(count, temp, count); // update count
   949     __ std(value, 16, to);
   950     __ std(value, 24, to);
   952     __ addi(to, to, 32);
   953     __ bge(CCR0, L_fill_32_bytes_loop);
   955     __ bind(L_check_fill_8_bytes);
   956     __ add_(count, temp, count);
   957     __ beq(CCR0, L_exit);
   958     __ addic_(count, count, -(2 << shift));
   959     __ blt(CCR0, L_fill_4_bytes);
   961     //
   962     // Length is too short, just fill 8 bytes at a time.
   963     //
   964     Label L_fill_8_bytes_loop;
   965     __ bind(L_fill_8_bytes_loop);
   966     __ std(value, 0, to);
   967     __ addic_(count, count, -(2 << shift));
   968     __ addi(to, to, 8);
   969     __ bge(CCR0, L_fill_8_bytes_loop);
   971     // fill trailing 4 bytes
   972     __ bind(L_fill_4_bytes);
   973     __ andi_(temp, count, 1<<shift);
   974     __ beq(CCR0, L_fill_2_bytes);
   976     __ stw(value, 0, to);
   977     if (t == T_BYTE || t == T_SHORT) {
   978       __ addi(to, to, 4);
   979       // fill trailing 2 bytes
   980       __ bind(L_fill_2_bytes);
   981       __ andi_(temp, count, 1<<(shift-1));
   982       __ beq(CCR0, L_fill_byte);
   983       __ sth(value, 0, to);
   984       if (t == T_BYTE) {
   985         __ addi(to, to, 2);
   986         // fill trailing byte
   987         __ bind(L_fill_byte);
   988         __ andi_(count, count, 1);
   989         __ beq(CCR0, L_exit);
   990         __ stb(value, 0, to);
   991       } else {
   992         __ bind(L_fill_byte);
   993       }
   994     } else {
   995       __ bind(L_fill_2_bytes);
   996     }
   997     __ bind(L_exit);
   998     __ blr();
  1000     // Handle copies less than 8 bytes.  Int is handled elsewhere.
  1001     if (t == T_BYTE) {
  1002       __ bind(L_fill_elements);
  1003       Label L_fill_2, L_fill_4;
  1004       __ andi_(temp, count, 1);
  1005       __ beq(CCR0, L_fill_2);
  1006       __ stb(value, 0, to);
  1007       __ addi(to, to, 1);
  1008       __ bind(L_fill_2);
  1009       __ andi_(temp, count, 2);
  1010       __ beq(CCR0, L_fill_4);
  1011       __ stb(value, 0, to);
  1012       __ stb(value, 0, to);
  1013       __ addi(to, to, 2);
  1014       __ bind(L_fill_4);
  1015       __ andi_(temp, count, 4);
  1016       __ beq(CCR0, L_exit);
  1017       __ stb(value, 0, to);
  1018       __ stb(value, 1, to);
  1019       __ stb(value, 2, to);
  1020       __ stb(value, 3, to);
  1021       __ blr();
  1022     }
  1024     if (t == T_SHORT) {
  1025       Label L_fill_2;
  1026       __ bind(L_fill_elements);
  1027       __ andi_(temp, count, 1);
  1028       __ beq(CCR0, L_fill_2);
  1029       __ sth(value, 0, to);
  1030       __ addi(to, to, 2);
  1031       __ bind(L_fill_2);
  1032       __ andi_(temp, count, 2);
  1033       __ beq(CCR0, L_exit);
  1034       __ sth(value, 0, to);
  1035       __ sth(value, 2, to);
  1036       __ blr();
  1037     }
  1038     return start;
  1039   }
  1042   // Generate overlap test for array copy stubs
  1043   //
  1044   // Input:
  1045   //   R3_ARG1    -  from
  1046   //   R4_ARG2    -  to
  1047   //   R5_ARG3    -  element count
  1048   //
  1049   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
  1050     Register tmp1 = R6_ARG4;
  1051     Register tmp2 = R7_ARG5;
  1053     Label l_overlap;
  1054 #ifdef ASSERT
  1055     __ srdi_(tmp2, R5_ARG3, 31);
  1056     __ asm_assert_eq("missing zero extend", 0xAFFE);
  1057 #endif
  1059     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
  1060     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
  1061     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
  1062     __ cmpld(CCR1, tmp1, tmp2);
  1063     __ crand(/*CCR0 lt*/0, /*CCR1 lt*/4+0, /*CCR0 lt*/0);
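           // CCR0.lt is now (from < to) && (to - from < size in bytes), i.e. the
           // destination starts inside the source range and a forward copy would
           // overwrite elements that have not been copied yet.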
  1064     __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
  1066     // need to copy forwards
  1067     if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
  1068       __ b(no_overlap_target);
  1069     } else {
  1070       __ load_const(tmp1, no_overlap_target, tmp2);
  1071       __ mtctr(tmp1);
  1072       __ bctr();
  1073     }
  1075     __ bind(l_overlap);
  1076     // need to copy backwards
  1077   }
  1079   // The guideline in the implementations of generate_disjoint_xxx_copy
  1080   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
  1081   // single instructions, but to avoid alignment interrupts (see subsequent
  1082   // comment). Furthermore, we try to minimize misaligned access, even
  1083   // though they cause no alignment interrupt.
  1084   //
  1085   // In Big-Endian mode, the PowerPC architecture requires implementations to
  1086   // handle automatically misaligned integer halfword and word accesses,
  1087   // word-aligned integer doubleword accesses, and word-aligned floating-point
  1088   // accesses. Other accesses may or may not generate an Alignment interrupt
  1089   // depending on the implementation.
  1090   // Alignment interrupt handling may require on the order of hundreds of cycles,
  1091   // so every effort should be made to avoid misaligned memory values.
  1092   //
  1093   //
  1094   // Generate stub for disjoint byte copy.  If "aligned" is true, the
  1095   // "from" and "to" addresses are assumed to be heapword aligned.
  1096   //
  1097   // Arguments for generated stub:
  1098   //      from:  R3_ARG1
  1099   //      to:    R4_ARG2
  1100   //      count: R5_ARG3 treated as signed
  1101   //
  1102   address generate_disjoint_byte_copy(bool aligned, const char * name) {
  1103     StubCodeMark mark(this, "StubRoutines", name);
  1104     address start = __ emit_fd();
  1106     Register tmp1 = R6_ARG4;
  1107     Register tmp2 = R7_ARG5;
  1108     Register tmp3 = R8_ARG6;
  1109     Register tmp4 = R9_ARG7;
  1112     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
  1113     // Don't try anything fancy if arrays don't have many elements.
  1114     __ li(tmp3, 0);
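           // tmp3 is zeroed here to serve as the index register for the indexed
           // lwzx/stwx accesses below (it is reused as a plain temp later on).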
  1115     __ cmpwi(CCR0, R5_ARG3, 17);
  1116     __ ble(CCR0, l_6); // copy 4 at a time
  1118     if (!aligned) {
  1119       __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1120       __ andi_(tmp1, tmp1, 3);
  1121       __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
  1123       // Copy elements if necessary to align to 4 bytes.
  1124       __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
  1125       __ andi_(tmp1, tmp1, 3);
  1126       __ beq(CCR0, l_2);
  1128       __ subf(R5_ARG3, tmp1, R5_ARG3);
  1129       __ bind(l_9);
  1130       __ lbz(tmp2, 0, R3_ARG1);
  1131       __ addic_(tmp1, tmp1, -1);
  1132       __ stb(tmp2, 0, R4_ARG2);
  1133       __ addi(R3_ARG1, R3_ARG1, 1);
  1134       __ addi(R4_ARG2, R4_ARG2, 1);
  1135       __ bne(CCR0, l_9);
  1137       __ bind(l_2);
  1138     }
  1140     // copy 8 elements at a time
  1141     __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
  1142     __ andi_(tmp1, tmp2, 7);
  1143     __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
  1145     // copy a 2-element word if necessary to align to 8 bytes
  1146     __ andi_(R0, R3_ARG1, 7);
  1147     __ beq(CCR0, l_7);
  1149     __ lwzx(tmp2, R3_ARG1, tmp3);
  1150     __ addi(R5_ARG3, R5_ARG3, -4);
  1151     __ stwx(tmp2, R4_ARG2, tmp3);
  1152     { // FasterArrayCopy
  1153       __ addi(R3_ARG1, R3_ARG1, 4);
  1154       __ addi(R4_ARG2, R4_ARG2, 4);
  1155     }
  1156     __ bind(l_7);
  1158     { // FasterArrayCopy
  1159       __ cmpwi(CCR0, R5_ARG3, 31);
  1160       __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
  1162       __ srdi(tmp1, R5_ARG3, 5);
  1163       __ andi_(R5_ARG3, R5_ARG3, 31);
  1164       __ mtctr(tmp1);
  1166       __ bind(l_8);
  1167       // Use unrolled version for mass copying (copy 32 elements at a time)
  1168       // Load feeding store gets zero latency on Power6, however not on Power5.
  1169       // Therefore, the following sequence is made for the good of both.
  1170       __ ld(tmp1, 0, R3_ARG1);
  1171       __ ld(tmp2, 8, R3_ARG1);
  1172       __ ld(tmp3, 16, R3_ARG1);
  1173       __ ld(tmp4, 24, R3_ARG1);
  1174       __ std(tmp1, 0, R4_ARG2);
  1175       __ std(tmp2, 8, R4_ARG2);
  1176       __ std(tmp3, 16, R4_ARG2);
  1177       __ std(tmp4, 24, R4_ARG2);
  1178       __ addi(R3_ARG1, R3_ARG1, 32);
  1179       __ addi(R4_ARG2, R4_ARG2, 32);
  1180       __ bdnz(l_8);
  1181     }
  1183     __ bind(l_6);
  1185     // copy 4 elements at a time
  1186     __ cmpwi(CCR0, R5_ARG3, 4);
  1187     __ blt(CCR0, l_1);
  1188     __ srdi(tmp1, R5_ARG3, 2);
  1189     __ mtctr(tmp1); // is > 0
  1190     __ andi_(R5_ARG3, R5_ARG3, 3);
  1192     { // FasterArrayCopy
  1193       __ addi(R3_ARG1, R3_ARG1, -4);
  1194       __ addi(R4_ARG2, R4_ARG2, -4);
  1195       __ bind(l_3);
  1196       __ lwzu(tmp2, 4, R3_ARG1);
  1197       __ stwu(tmp2, 4, R4_ARG2);
  1198       __ bdnz(l_3);
  1199       __ addi(R3_ARG1, R3_ARG1, 4);
  1200       __ addi(R4_ARG2, R4_ARG2, 4);
  1201     }
  1203     // do single element copy
  1204     __ bind(l_1);
  1205     __ cmpwi(CCR0, R5_ARG3, 0);
  1206     __ beq(CCR0, l_4);
  1208     { // FasterArrayCopy
  1209       __ mtctr(R5_ARG3);
  1210       __ addi(R3_ARG1, R3_ARG1, -1);
  1211       __ addi(R4_ARG2, R4_ARG2, -1);
  1213       __ bind(l_5);
  1214       __ lbzu(tmp2, 1, R3_ARG1);
  1215       __ stbu(tmp2, 1, R4_ARG2);
  1216       __ bdnz(l_5);
  1217     }
  1219     __ bind(l_4);
  1220     __ blr();
  1222     return start;
  1223   }
  1225   // Generate stub for conjoint byte copy.  If "aligned" is true, the
  1226   // "from" and "to" addresses are assumed to be heapword aligned.
  1227   //
  1228   // Arguments for generated stub:
  1229   //      from:  R3_ARG1
  1230   //      to:    R4_ARG2
  1231   //      count: R5_ARG3 treated as signed
  1232   //
  1233   address generate_conjoint_byte_copy(bool aligned, const char * name) {
  1234     StubCodeMark mark(this, "StubRoutines", name);
  1235     address start = __ emit_fd();
  1237     Register tmp1 = R6_ARG4;
  1238     Register tmp2 = R7_ARG5;
  1239     Register tmp3 = R8_ARG6;
  1241     address nooverlap_target = aligned ?
  1242       ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
  1243       ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
  1245     array_overlap_test(nooverlap_target, 0);
  1246     // Do reverse copy. We assume the case of actual overlap is rare enough
  1247     // that we don't have to optimize it.
  1248     Label l_1, l_2;
  1250     __ b(l_2);
  1251     __ bind(l_1);
  1252     __ stbx(tmp1, R4_ARG2, R5_ARG3);
  1253     __ bind(l_2);
  1254     __ addic_(R5_ARG3, R5_ARG3, -1);
  1255     __ lbzx(tmp1, R3_ARG1, R5_ARG3);
  1256     __ bge(CCR0, l_1);
  1258     __ blr();
  1260     return start;
  1261   }
  1263   // Generate stub for disjoint short copy.  If "aligned" is true, the
  1264   // "from" and "to" addresses are assumed to be heapword aligned.
  1265   //
  1266   // Arguments for generated stub:
  1267   //      from:  R3_ARG1
  1268   //      to:    R4_ARG2
  1269   //  elm.count: R5_ARG3 treated as signed
  1270   //
  1271   // Strategy for aligned==true:
  1272   //
  1273   //  If length <= 9:
  1274   //     1. copy 2 elements at a time (l_6)
  1275   //     2. copy last element if original element count was odd (l_1)
  1276   //
  1277   //  If length > 9:
  1278   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  1279   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  1280   //     3. copy last element if one was left in step 2. (l_1)
  1281   //
  1282   //
  1283   // Strategy for aligned==false:
  1284   //
  1285   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
  1286   //                  can be unaligned (see comment below)
  1287   //
  1288   //  If length > 9:
  1289   //     1. continue with step 6. if the alignment of from and to mod 4
  1290   //        is different.
  1291   //     2. align from and to to 4 bytes by copying 1 element if necessary
  1292   //     3. at l_2 from and to are 4 byte aligned; continue with
  1293   //        5. if they cannot be aligned to 8 bytes because they have
  1294   //        got different alignment mod 8.
  1295   //     4. at this point we know that both, from and to, have the same
  1296   //        alignment mod 8, now copy one element if necessary to get
  1297   //        8 byte alignment of from and to.
  1298   //     5. copy 4 elements at a time until less than 4 elements are
  1299   //        left; depending on step 3. all load/stores are aligned or
  1300   //        either all loads or all stores are unaligned.
  1301   //     6. copy 2 elements at a time until less than 2 elements are
  1302   //        left (l_6); arriving here from step 1., there is a chance
  1303   //        that all accesses are unaligned.
  1304   //     7. copy last element if one was left in step 6. (l_1)
  1305   //
  1306   //  There are unaligned data accesses using integer load/store
  1307   //  instructions in this stub. POWER allows such accesses.
  1308   //
  1309   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
  1310   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
  1311   //  integer load/stores have good performance. Only unaligned
  1312   //  floating point load/stores can have poor performance.
  1313   //
  1314   //  TODO:
  1315   //
  1316   //  1. check if aligning the backbranch target of loops is beneficial
  1317   //
  1318   address generate_disjoint_short_copy(bool aligned, const char * name) {
  1319     StubCodeMark mark(this, "StubRoutines", name);
  1321     Register tmp1 = R6_ARG4;
  1322     Register tmp2 = R7_ARG5;
  1323     Register tmp3 = R8_ARG6;
  1324     Register tmp4 = R9_ARG7;
  1326     address start = __ emit_fd();
  1328       Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
  1329     // don't try anything fancy if arrays don't have many elements
  1330     __ li(tmp3, 0);
  1331     __ cmpwi(CCR0, R5_ARG3, 9);
  1332     __ ble(CCR0, l_6); // copy 2 at a time
  1334     if (!aligned) {
  1335       __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1336       __ andi_(tmp1, tmp1, 3);
  1337       __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
  1339       // At this point it is guaranteed that both, from and to have the same alignment mod 4.
  1341       // Copy 1 element if necessary to align to 4 bytes.
  1342       __ andi_(tmp1, R3_ARG1, 3);
  1343       __ beq(CCR0, l_2);
  1345       __ lhz(tmp2, 0, R3_ARG1);
  1346       __ addi(R3_ARG1, R3_ARG1, 2);
  1347       __ sth(tmp2, 0, R4_ARG2);
  1348       __ addi(R4_ARG2, R4_ARG2, 2);
  1349       __ addi(R5_ARG3, R5_ARG3, -1);
  1350       __ bind(l_2);
  1352       // At this point the positions of both, from and to, are at least 4 byte aligned.
  1354       // Copy 4 elements at a time.
  1355       // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
  1356       __ xorr(tmp2, R3_ARG1, R4_ARG2);
  1357       __ andi_(tmp1, tmp2, 7);
  1358       __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
  1360       // Copy a 2-element word if necessary to align to 8 bytes.
  1361       __ andi_(R0, R3_ARG1, 7);
  1362       __ beq(CCR0, l_7);
  1364       __ lwzx(tmp2, R3_ARG1, tmp3);
  1365       __ addi(R5_ARG3, R5_ARG3, -2);
  1366       __ stwx(tmp2, R4_ARG2, tmp3);
  1367       { // FasterArrayCopy
  1368         __ addi(R3_ARG1, R3_ARG1, 4);
  1369         __ addi(R4_ARG2, R4_ARG2, 4);
  1370       }
  1371     }
  1373     __ bind(l_7);
  1375     // Copy 4 elements at a time; either the loads or the stores can
  1376     // be unaligned if aligned == false.
  1378     { // FasterArrayCopy
  1379       __ cmpwi(CCR0, R5_ARG3, 15);
  1380       __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
  1382       __ srdi(tmp1, R5_ARG3, 4);
  1383       __ andi_(R5_ARG3, R5_ARG3, 15);
  1384       __ mtctr(tmp1);
  1386       __ bind(l_8);
  1387       // Use unrolled version for mass copying (copy 16 elements at a time).
  1388       // Load feeding store gets zero latency on Power6, however not on Power5.
  1389       // Therefore, the following sequence is made for the good of both.
  1390       __ ld(tmp1, 0, R3_ARG1);
  1391       __ ld(tmp2, 8, R3_ARG1);
  1392       __ ld(tmp3, 16, R3_ARG1);
  1393       __ ld(tmp4, 24, R3_ARG1);
  1394       __ std(tmp1, 0, R4_ARG2);
  1395       __ std(tmp2, 8, R4_ARG2);
  1396       __ std(tmp3, 16, R4_ARG2);
  1397       __ std(tmp4, 24, R4_ARG2);
  1398       __ addi(R3_ARG1, R3_ARG1, 32);
  1399       __ addi(R4_ARG2, R4_ARG2, 32);
  1400       __ bdnz(l_8);
  1401     }
  1402     __ bind(l_6);
  1404     // copy 2 elements at a time
  1405     { // FasterArrayCopy
  1406       __ cmpwi(CCR0, R5_ARG3, 2);
  1407       __ blt(CCR0, l_1);
  1408       __ srdi(tmp1, R5_ARG3, 1);
  1409       __ andi_(R5_ARG3, R5_ARG3, 1);
  1411       __ addi(R3_ARG1, R3_ARG1, -4);
  1412       __ addi(R4_ARG2, R4_ARG2, -4);
  1413       __ mtctr(tmp1);
  1415       __ bind(l_3);
  1416       __ lwzu(tmp2, 4, R3_ARG1);
  1417       __ stwu(tmp2, 4, R4_ARG2);
  1418       __ bdnz(l_3);
  1420       __ addi(R3_ARG1, R3_ARG1, 4);
  1421       __ addi(R4_ARG2, R4_ARG2, 4);
  1422     }
  1424     // do single element copy
  1425     __ bind(l_1);
  1426     __ cmpwi(CCR0, R5_ARG3, 0);
  1427     __ beq(CCR0, l_4);
  1429     { // FasterArrayCopy
  1430       __ mtctr(R5_ARG3);
  1431       __ addi(R3_ARG1, R3_ARG1, -2);
  1432       __ addi(R4_ARG2, R4_ARG2, -2);
  1434       __ bind(l_5);
  1435       __ lhzu(tmp2, 2, R3_ARG1);
  1436       __ sthu(tmp2, 2, R4_ARG2);
  1437       __ bdnz(l_5);
  1438     }
  1439     __ bind(l_4);
  1440     __ blr();
  1442     return start;
  1443   }
  1445   // Generate stub for conjoint short copy.  If "aligned" is true, the
  1446   // "from" and "to" addresses are assumed to be heapword aligned.
  1447   //
  1448   // Arguments for generated stub:
  1449   //      from:  R3_ARG1
  1450   //      to:    R4_ARG2
  1451   //      count: R5_ARG3 treated as signed
  1452   //
  1453   address generate_conjoint_short_copy(bool aligned, const char * name) {
  1454     StubCodeMark mark(this, "StubRoutines", name);
  1455     address start = __ emit_fd();
  1457     Register tmp1 = R6_ARG4;
  1458     Register tmp2 = R7_ARG5;
  1459     Register tmp3 = R8_ARG6;
  1461     address nooverlap_target = aligned ?
  1462         ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
  1463         ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
  1465     array_overlap_test(nooverlap_target, 1);
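           // array_overlap_test branches to the disjoint (forward-copy) stub above whenever
           // the source and destination regions do not overlap destructively; only truly
           // overlapping requests fall through to the backward copy below, which counts a
           // byte offset in tmp1 down from 2*count, copying one jshort per iteration.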
  1467     Label l_1, l_2;
  1468     __ sldi(tmp1, R5_ARG3, 1);
  1469     __ b(l_2);
  1470     __ bind(l_1);
  1471     __ sthx(tmp2, R4_ARG2, tmp1);
  1472     __ bind(l_2);
  1473     __ addic_(tmp1, tmp1, -2);
  1474     __ lhzx(tmp2, R3_ARG1, tmp1);
  1475     __ bge(CCR0, l_1);
  1477     __ blr();
  1479     return start;
  1480   }
  1482   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
  1483   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
  1484   //
  1485   // Arguments:
  1486   //      from:  R3_ARG1
  1487   //      to:    R4_ARG2
  1488   //      count: R5_ARG3 treated as signed
  1489   //
  1490   void generate_disjoint_int_copy_core(bool aligned) {
  1491     Register tmp1 = R6_ARG4;
  1492     Register tmp2 = R7_ARG5;
  1493     Register tmp3 = R8_ARG6;
  1494     Register tmp4 = R0;
  1496     Label l_1, l_2, l_3, l_4, l_5, l_6;
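           // Strategy: for more than 5 elements, optionally copy one int so that 'from' and 'to'
           // become 8-byte aligned, then move 8 ints (32 bytes) per iteration using four
           // 8-byte ld/std pairs, and finish the remaining 0..7 elements one at a time.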
  1497     // for short arrays, just do single element copy
  1498     __ li(tmp3, 0);
  1499     __ cmpwi(CCR0, R5_ARG3, 5);
  1500     __ ble(CCR0, l_2);
  1502     if (!aligned) {
  1503         // check if arrays have same alignment mod 8.
  1504         __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1505         __ andi_(R0, tmp1, 7);
  1506         // Not the same alignment mod 8, but ld and std only need to be 4-byte aligned.
  1507         __ bne(CCR0, l_4); // alignments differ mod 8 -> skip the alignment copy; the wide loop moves 2 ints per ld/std anyway
  1509         // copy 1 element to align from and to on an 8-byte boundary
  1510         __ andi_(R0, R3_ARG1, 7);
  1511         __ beq(CCR0, l_4);
  1513         __ lwzx(tmp2, R3_ARG1, tmp3);
  1514         __ addi(R5_ARG3, R5_ARG3, -1);
  1515         __ stwx(tmp2, R4_ARG2, tmp3);
  1516         { // FasterArrayCopy
  1517           __ addi(R3_ARG1, R3_ARG1, 4);
  1518           __ addi(R4_ARG2, R4_ARG2, 4);
  1519         }
  1520         __ bind(l_4);
  1521     }
  1523     { // FasterArrayCopy
  1524       __ cmpwi(CCR0, R5_ARG3, 7);
  1525       __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
  1527       __ srdi(tmp1, R5_ARG3, 3);
  1528       __ andi_(R5_ARG3, R5_ARG3, 7);
  1529       __ mtctr(tmp1);
  1531       __ bind(l_6);
  1532       // Use the unrolled version for mass copying (copy 8 elements at a time).
  1533       // A load feeding a store has zero latency on Power6, but not on Power5;
  1534       // the following sequence therefore performs well on both.
  1535       __ ld(tmp1, 0, R3_ARG1);
  1536       __ ld(tmp2, 8, R3_ARG1);
  1537       __ ld(tmp3, 16, R3_ARG1);
  1538       __ ld(tmp4, 24, R3_ARG1);
  1539       __ std(tmp1, 0, R4_ARG2);
  1540       __ std(tmp2, 8, R4_ARG2);
  1541       __ std(tmp3, 16, R4_ARG2);
  1542       __ std(tmp4, 24, R4_ARG2);
  1543       __ addi(R3_ARG1, R3_ARG1, 32);
  1544       __ addi(R4_ARG2, R4_ARG2, 32);
  1545       __ bdnz(l_6);
  1546     }
  1548     // copy 1 element at a time
  1549     __ bind(l_2);
  1550     __ cmpwi(CCR0, R5_ARG3, 0);
  1551     __ beq(CCR0, l_1);
  1553     { // FasterArrayCopy
  1554       __ mtctr(R5_ARG3);
  1555       __ addi(R3_ARG1, R3_ARG1, -4);
  1556       __ addi(R4_ARG2, R4_ARG2, -4);
  1558       __ bind(l_3);
  1559       __ lwzu(tmp2, 4, R3_ARG1);
  1560       __ stwu(tmp2, 4, R4_ARG2);
  1561       __ bdnz(l_3);
  1562     }
  1564     __ bind(l_1);
  1565     return;
  1566   }
  1568   // Generate stub for disjoint int copy.  If "aligned" is true, the
  1569   // "from" and "to" addresses are assumed to be heapword aligned.
  1570   //
  1571   // Arguments for generated stub:
  1572   //      from:  R3_ARG1
  1573   //      to:    R4_ARG2
  1574   //      count: R5_ARG3 treated as signed
  1575   //
  1576   address generate_disjoint_int_copy(bool aligned, const char * name) {
  1577     StubCodeMark mark(this, "StubRoutines", name);
  1578     address start = __ emit_fd();
  1579     generate_disjoint_int_copy_core(aligned);
  1580     __ blr();
  1581     return start;
  1582   }
  1584   // Generate core code for conjoint int copy (and oop copy on
  1585   // 32-bit).  If "aligned" is true, the "from" and "to" addresses
  1586   // are assumed to be heapword aligned.
  1587   //
  1588   // Arguments:
  1589   //      from:  R3_ARG1
  1590   //      to:    R4_ARG2
  1591   //      count: R5_ARG3 treated as signed
  1592   //
  1593   void generate_conjoint_int_copy_core(bool aligned) {
  1594     // Do reverse copy.  We assume the case of actual overlap is rare enough
  1595     // that we don't have to optimize it.
  1597     Label l_1, l_2, l_3, l_4, l_5, l_6;
  1599     Register tmp1 = R6_ARG4;
  1600     Register tmp2 = R7_ARG5;
  1601     Register tmp3 = R8_ARG6;
  1602     Register tmp4 = R0;
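           // Strategy: advance 'from' and 'to' past the end of the arrays and copy backwards,
           // 8 ints (32 bytes) per iteration while at least 8 elements remain, then the
           // remaining 0..7 elements one at a time.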
  1604     { // FasterArrayCopy
  1605       __ cmpwi(CCR0, R5_ARG3, 0);
  1606       __ beq(CCR0, l_6);
  1608       __ sldi(R5_ARG3, R5_ARG3, 2);
  1609       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
  1610       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
  1611       __ srdi(R5_ARG3, R5_ARG3, 2);
  1613       __ cmpwi(CCR0, R5_ARG3, 7);
  1614       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
  1616       __ srdi(tmp1, R5_ARG3, 3);
  1617       __ andi(R5_ARG3, R5_ARG3, 7);
  1618       __ mtctr(tmp1);
  1620       __ bind(l_4);
  1621       // Use the unrolled version for mass copying (copy 8 elements at a time).
  1622       // A load feeding a store has zero latency on Power6, but not on Power5;
  1623       // the following sequence therefore performs well on both.
  1624       __ addi(R3_ARG1, R3_ARG1, -32);
  1625       __ addi(R4_ARG2, R4_ARG2, -32);
  1626       __ ld(tmp4, 24, R3_ARG1);
  1627       __ ld(tmp3, 16, R3_ARG1);
  1628       __ ld(tmp2, 8, R3_ARG1);
  1629       __ ld(tmp1, 0, R3_ARG1);
  1630       __ std(tmp4, 24, R4_ARG2);
  1631       __ std(tmp3, 16, R4_ARG2);
  1632       __ std(tmp2, 8, R4_ARG2);
  1633       __ std(tmp1, 0, R4_ARG2);
  1634       __ bdnz(l_4);
  1636       __ cmpwi(CCR0, R5_ARG3, 0);
  1637       __ beq(CCR0, l_6);
  1639       __ bind(l_5);
  1640       __ mtctr(R5_ARG3);
  1641       __ bind(l_3);
  1642       __ lwz(R0, -4, R3_ARG1);
  1643       __ stw(R0, -4, R4_ARG2);
  1644       __ addi(R3_ARG1, R3_ARG1, -4);
  1645       __ addi(R4_ARG2, R4_ARG2, -4);
  1646       __ bdnz(l_3);
  1648       __ bind(l_6);
  1649     }
  1650   }
  1652   // Generate stub for conjoint int copy.  If "aligned" is true, the
  1653   // "from" and "to" addresses are assumed to be heapword aligned.
  1654   //
  1655   // Arguments for generated stub:
  1656   //      from:  R3_ARG1
  1657   //      to:    R4_ARG2
  1658   //      count: R5_ARG3 treated as signed
  1659   //
  1660   address generate_conjoint_int_copy(bool aligned, const char * name) {
  1661     StubCodeMark mark(this, "StubRoutines", name);
  1662     address start = __ emit_fd();
  1664     address nooverlap_target = aligned ?
  1665       ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
  1666       ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
  1668     array_overlap_test(nooverlap_target, 2);
  1670     generate_conjoint_int_copy_core(aligned);
  1672     __ blr();
  1674     return start;
  1675   }
  1677   // Generate core code for disjoint long copy (and oop copy on
  1678   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
  1679   // are assumed to be heapword aligned.
  1680   //
  1681   // Arguments:
  1682   //      from:  R3_ARG1
  1683   //      to:    R4_ARG2
  1684   //      count: R5_ARG3 treated as signed
  1685   //
  1686   void generate_disjoint_long_copy_core(bool aligned) {
  1687     Register tmp1 = R6_ARG4;
  1688     Register tmp2 = R7_ARG5;
  1689     Register tmp3 = R8_ARG6;
  1690     Register tmp4 = R0;
  1692     Label l_1, l_2, l_3, l_4;
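           // Strategy: copy 4 longs (32 bytes) per iteration while at least 4 elements remain,
           // then finish the remaining 0..3 elements one at a time.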
  1694     { // FasterArrayCopy
  1695       __ cmpwi(CCR0, R5_ARG3, 3);
  1696       __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
  1698       __ srdi(tmp1, R5_ARG3, 2);
  1699       __ andi_(R5_ARG3, R5_ARG3, 3);
  1700       __ mtctr(tmp1);
  1702       __ bind(l_4);
  1703       // Use the unrolled version for mass copying (copy 4 elements at a time).
  1704       // A load feeding a store has zero latency on Power6, but not on Power5;
  1705       // the following sequence therefore performs well on both.
  1706       __ ld(tmp1, 0, R3_ARG1);
  1707       __ ld(tmp2, 8, R3_ARG1);
  1708       __ ld(tmp3, 16, R3_ARG1);
  1709       __ ld(tmp4, 24, R3_ARG1);
  1710       __ std(tmp1, 0, R4_ARG2);
  1711       __ std(tmp2, 8, R4_ARG2);
  1712       __ std(tmp3, 16, R4_ARG2);
  1713       __ std(tmp4, 24, R4_ARG2);
  1714       __ addi(R3_ARG1, R3_ARG1, 32);
  1715       __ addi(R4_ARG2, R4_ARG2, 32);
  1716       __ bdnz(l_4);
  1717     }
  1719     // copy 1 element at a time
  1720     __ bind(l_3);
  1721     __ cmpwi(CCR0, R5_ARG3, 0);
  1722     __ beq(CCR0, l_1);
  1724     { // FasterArrayCopy
  1725       __ mtctr(R5_ARG3);
  1726       __ addi(R3_ARG1, R3_ARG1, -8);
  1727       __ addi(R4_ARG2, R4_ARG2, -8);
  1729       __ bind(l_2);
  1730       __ ldu(R0, 8, R3_ARG1);
  1731       __ stdu(R0, 8, R4_ARG2);
  1732       __ bdnz(l_2);
  1733     }
  1735     __ bind(l_1);
  1736   }
  1738   // Generate stub for disjoint long copy.  If "aligned" is true, the
  1739   // "from" and "to" addresses are assumed to be heapword aligned.
  1740   //
  1741   // Arguments for generated stub:
  1742   //      from:  R3_ARG1
  1743   //      to:    R4_ARG2
  1744   //      count: R5_ARG3 treated as signed
  1745   //
  1746   address generate_disjoint_long_copy(bool aligned, const char * name) {
  1747     StubCodeMark mark(this, "StubRoutines", name);
  1748     address start = __ emit_fd();
  1749     generate_disjoint_long_copy_core(aligned);
  1750     __ blr();
  1752     return start;
  1753   }
  1755   // Generate core code for conjoint long copy (and oop copy on
  1756   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
  1757   // are assumed to be heapword aligned.
  1758   //
  1759   // Arguments:
  1760   //      from:  R3_ARG1
  1761   //      to:    R4_ARG2
  1762   //      count: R5_ARG3 treated as signed
  1763   //
  1764   void generate_conjoint_long_copy_core(bool aligned) {
  1765     Register tmp1 = R6_ARG4;
  1766     Register tmp2 = R7_ARG5;
  1767     Register tmp3 = R8_ARG6;
  1768     Register tmp4 = R0;
  1770     Label l_1, l_2, l_3, l_4, l_5;
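           // Strategy: advance 'from' and 'to' past the end of the arrays and copy backwards,
           // 4 longs (32 bytes) per iteration, then the remaining 0..3 elements one at a time.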
  1772     __ cmpwi(CCR0, R5_ARG3, 0);
  1773     __ beq(CCR0, l_1);
  1775     { // FasterArrayCopy
  1776       __ sldi(R5_ARG3, R5_ARG3, 3);
  1777       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
  1778       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
  1779       __ srdi(R5_ARG3, R5_ARG3, 3);
  1781       __ cmpwi(CCR0, R5_ARG3, 3);
  1782       __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
  1784       __ srdi(tmp1, R5_ARG3, 2);
  1785       __ andi(R5_ARG3, R5_ARG3, 3);
  1786       __ mtctr(tmp1);
  1788       __ bind(l_4);
  1789       // Use the unrolled version for mass copying (copy 4 elements at a time).
  1790       // A load feeding a store has zero latency on Power6, but not on Power5;
  1791       // the following sequence therefore performs well on both.
  1792       __ addi(R3_ARG1, R3_ARG1, -32);
  1793       __ addi(R4_ARG2, R4_ARG2, -32);
  1794       __ ld(tmp4, 24, R3_ARG1);
  1795       __ ld(tmp3, 16, R3_ARG1);
  1796       __ ld(tmp2, 8, R3_ARG1);
  1797       __ ld(tmp1, 0, R3_ARG1);
  1798       __ std(tmp4, 24, R4_ARG2);
  1799       __ std(tmp3, 16, R4_ARG2);
  1800       __ std(tmp2, 8, R4_ARG2);
  1801       __ std(tmp1, 0, R4_ARG2);
  1802       __ bdnz(l_4);
  1804       __ cmpwi(CCR0, R5_ARG3, 0);
  1805       __ beq(CCR0, l_1);
  1807       __ bind(l_5);
  1808       __ mtctr(R5_ARG3);
  1809       __ bind(l_3);
  1810       __ ld(R0, -8, R3_ARG1);
  1811       __ std(R0, -8, R4_ARG2);
  1812       __ addi(R3_ARG1, R3_ARG1, -8);
  1813       __ addi(R4_ARG2, R4_ARG2, -8);
  1814       __ bdnz(l_3);
  1815     }
  1817     __ bind(l_1);
  1818   }
  1820   // Generate stub for conjoint long copy.  If "aligned" is true, the
  1821   // "from" and "to" addresses are assumed to be heapword aligned.
  1822   //
  1823   // Arguments for generated stub:
  1824   //      from:  R3_ARG1
  1825   //      to:    R4_ARG2
  1826   //      count: R5_ARG3 treated as signed
  1827   //
  1828   address generate_conjoint_long_copy(bool aligned, const char * name) {
  1829     StubCodeMark mark(this, "StubRoutines", name);
  1830     address start = __ emit_fd();
  1832     address nooverlap_target = aligned ?
  1833       ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
  1834       ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
  1836     array_overlap_test(nooverlap_target, 3);
  1837     generate_conjoint_long_copy_core(aligned);
  1839     __ blr();
  1841     return start;
  1842   }
  1844   // Generate stub for conjoint oop copy.  If "aligned" is true, the
  1845   // "from" and "to" addresses are assumed to be heapword aligned.
  1846   //
  1847   // Arguments for generated stub:
  1848   //      from:  R3_ARG1
  1849   //      to:    R4_ARG2
  1850   //      count: R5_ARG3 treated as signed
  1851   //      dest_uninitialized: G1 support
  1852   //
  1853   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
  1854     StubCodeMark mark(this, "StubRoutines", name);
  1856     address start = __ emit_fd();
  1858     address nooverlap_target = aligned ?
  1859       ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
  1860       ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
  1862     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
  1864     // Save arguments.
  1865     __ mr(R9_ARG7, R4_ARG2);
  1866     __ mr(R10_ARG8, R5_ARG3);
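           // With compressed oops each element is a 32-bit narrow oop, so the int copy core is
           // reused; otherwise elements are 64-bit oops and the long copy core is used. The
           // second argument of array_overlap_test (2 resp. 3) is log2 of the element size.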
  1868     if (UseCompressedOops) {
  1869       array_overlap_test(nooverlap_target, 2);
  1870       generate_conjoint_int_copy_core(aligned);
  1871     } else {
  1872       array_overlap_test(nooverlap_target, 3);
  1873       generate_conjoint_long_copy_core(aligned);
  1874     }
  1876     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
  1878     __ blr();
  1880     return start;
  1881   }
  1883   // Generate stub for disjoint oop copy.  If "aligned" is true, the
  1884   // "from" and "to" addresses are assumed to be heapword aligned.
  1885   //
  1886   // Arguments for generated stub:
  1887   //      from:  R3_ARG1
  1888   //      to:    R4_ARG2
  1889   //      count: R5_ARG3 treated as signed
  1890   //      dest_uninitialized: G1 support
  1891   //
  1892   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
  1893     StubCodeMark mark(this, "StubRoutines", name);
  1894     address start = __ emit_fd();
  1896     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
  1898     // Save some arguments; the disjoint copy cores destroy them.
  1899     // They are needed for the post barrier.
  1900     __ mr(R9_ARG7, R4_ARG2);
  1901     __ mr(R10_ARG8, R5_ARG3);
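           // As in the conjoint case: narrow (32-bit) oops reuse the int copy core,
           // full-width (64-bit) oops the long copy core.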
  1903     if (UseCompressedOops) {
  1904       generate_disjoint_int_copy_core(aligned);
  1905     } else {
  1906       generate_disjoint_long_copy_core(aligned);
  1907     }
  1909     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
  1911     __ blr();
  1913     return start;
  1914   }
  1916   void generate_arraycopy_stubs() {
  1917     // Note: the disjoint stubs must be generated first, some of
  1918     // the conjoint stubs use them.
  1920     // non-aligned disjoint versions
  1921     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  1922     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  1923     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
  1924     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  1925     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
  1926     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
  1928     // aligned disjoint versions
  1929     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
  1930     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
  1931     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
  1932     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
  1933     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
  1934     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
  1936     // non-aligned conjoint versions
  1937     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  1938     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(false, "jshort_arraycopy");
  1939     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(false, "jint_arraycopy");
  1940     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(false, "jlong_arraycopy");
  1941     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
  1942     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
  1944     // aligned conjoint versions
  1945     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
  1946     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
  1947     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
  1948     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
  1949     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
  1950     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
  1952     // fill routines
  1953     StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
  1954     StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
  1955     StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
  1956     StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
  1957     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  1958     StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");
  1959   }
  1961   // Safefetch stubs.
  1962   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
  1963     // safefetch signatures:
  1964     //   int      SafeFetch32(int*      adr, int      errValue);
  1965     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  1966     //
  1967     // arguments:
  1968     //   R3_ARG1 = adr
  1969     //   R4_ARG2 = errValue
  1970     //
  1971     // result:
  1972     //   R3_RET  = *adr or errValue
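           //
           // If the load below faults, the VM's signal handler is expected to recognize the
           // faulting pc (*fault_pc) and resume execution at *continuation_pc with R4_ARG2
           // still holding errValue; in either case the value left in R4_ARG2 is returned.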
  1974     StubCodeMark mark(this, "StubRoutines", name);
  1976     // Entry point, pc or function descriptor.
  1977     *entry = __ emit_fd();
  1979     // Load *adr into R4_ARG2, may fault.
  1980     *fault_pc = __ pc();
  1981     switch (size) {
  1982       case 4:
  1983         // int32_t, sign-extended
  1984         __ lwa(R4_ARG2, 0, R3_ARG1);
  1985         break;
  1986       case 8:
  1987         // int64_t
  1988         __ ld(R4_ARG2, 0, R3_ARG1);
  1989         break;
  1990       default:
  1991         ShouldNotReachHere();
  1992     }
  1994     // return errValue or *adr
  1995     *continuation_pc = __ pc();
  1996     __ mr(R3_RET, R4_ARG2);
  1997     __ blr();
  1998   }
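         // Roughly, the generated stubs behave like the following C sketch (illustration only;
         // the fault recovery is performed by the signal handler, not by C code):
         //
         //   int SafeFetch32(int* adr, int errValue) {
         //     int v = errValue;
         //     v = *adr;       // may fault; execution then resumes at the continuation pc
         //     return v;       // returned in R3_RET
         //   }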
  2000   // Initialization
  2001   void generate_initial() {
  2002     // Generates the stubs needed early on and initializes their entry points
  2004     // Entry points that exist on all platforms.
  2005     // Note: This is code that could be shared among different platforms; however,
  2006     // the benefit seems to be smaller than the disadvantage of having a
  2007     // much more complicated generator structure. See also the comment in
  2008     // stubRoutines.hpp.
  2010     StubRoutines::_forward_exception_entry          = generate_forward_exception();
  2011     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
  2012     StubRoutines::_catch_exception_entry            = generate_catch_exception();
  2013   }
  2015   void generate_all() {
  2016     // Generates all stubs and initializes the entry points
  2018     // These entry points require SharedInfo::stack0 to be set up in
  2019     // non-core builds
  2020     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
  2021     // Handle IncompatibleClassChangeError in itable stubs.
  2022     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
  2023     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
  2024     StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
  2026     StubRoutines::_handler_for_unsafe_access_entry         = generate_handler_for_unsafe_access();
  2028     // support for verify_oop (must happen after universe_init)
  2029     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
  2031     // arraycopy stubs used by compilers
  2032     generate_arraycopy_stubs();
  2034     // PPC uses stubs for safefetch.
  2035     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
  2036                                                        &StubRoutines::_safefetch32_fault_pc,
  2037                                                        &StubRoutines::_safefetch32_continuation_pc);
  2038     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
  2039                                                        &StubRoutines::_safefetchN_fault_pc,
  2040                                                        &StubRoutines::_safefetchN_continuation_pc);
  2041   }
  2043  public:
  2044   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  2045     // replace the standard masm with a special one:
  2046     _masm = new MacroAssembler(code);
  2047     if (all) {
  2048       generate_all();
  2049     } else {
  2050       generate_initial();
  2051     }
  2052   }
  2053 };
  2055 void StubGenerator_generate(CodeBuffer* code, bool all) {
  2056   StubGenerator g(code, all);
  2057 }