src/cpu/ppc/vm/stubGenerator_ppc.cpp

author       goetz
date         Thu, 06 Mar 2014 10:55:28 -0800
changeset    6511:31e80afe3fed
parent       6508:c4178a748df9
child        6512:fd1b9f02cc91
permissions  -rw-r--r--

8035647: PPC64: Support for elf v2 abi.
Summary: ELFv2 ABI used by the little endian PowerPC64 on Linux.
Reviewed-by: kvn
Contributed-by: asmundak@google.com

     1 /*
     2  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright 2012, 2013 SAP AG. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/assembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "interpreter/interpreter.hpp"
    30 #include "nativeInst_ppc.hpp"
    31 #include "oops/instanceOop.hpp"
    32 #include "oops/method.hpp"
    33 #include "oops/objArrayKlass.hpp"
    34 #include "oops/oop.inline.hpp"
    35 #include "prims/methodHandles.hpp"
    36 #include "runtime/frame.inline.hpp"
    37 #include "runtime/handles.inline.hpp"
    38 #include "runtime/sharedRuntime.hpp"
    39 #include "runtime/stubCodeGenerator.hpp"
    40 #include "runtime/stubRoutines.hpp"
    41 #include "utilities/top.hpp"
    42 #ifdef TARGET_OS_FAMILY_aix
    43 # include "thread_aix.inline.hpp"
    44 #endif
    45 #ifdef TARGET_OS_FAMILY_linux
    46 # include "thread_linux.inline.hpp"
    47 #endif
    48 #ifdef COMPILER2
    49 #include "opto/runtime.hpp"
    50 #endif
    52 #define __ _masm->
    54 #ifdef PRODUCT
    55 #define BLOCK_COMMENT(str) // nothing
    56 #else
    57 #define BLOCK_COMMENT(str) __ block_comment(str)
    58 #endif
    60 class StubGenerator: public StubCodeGenerator {
    61  private:
    63   // Call stubs are used to call Java from C
    64   //
    65   // Arguments:
    66   //
    67   //   R3  - call wrapper address     : address
    68   //   R4  - result                   : intptr_t*
    69   //   R5  - result type              : BasicType
    70   //   R6  - method                   : Method
    71   //   R7  - frame mgr entry point    : address
    72   //   R8  - parameter block          : intptr_t*
    73   //   R9  - parameter count in words : int
    74   //   R10 - thread                   : Thread*
    75   //
    76   address generate_call_stub(address& return_address) {
    77     // Setup a new c frame, copy java arguments, call frame manager or
    78     // native_entry, and process result.
    80     StubCodeMark mark(this, "StubRoutines", "call_stub");
    82     address start = __ function_entry();
    84     // some sanity checks
    85     assert((sizeof(frame::abi_minframe) % 16) == 0,           "unaligned");
    86     assert((sizeof(frame::abi_reg_args) % 16) == 0,           "unaligned");
    87     assert((sizeof(frame::spill_nonvolatiles) % 16) == 0,     "unaligned");
    88     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
    89     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
    91     Register r_arg_call_wrapper_addr        = R3;
    92     Register r_arg_result_addr              = R4;
    93     Register r_arg_result_type              = R5;
    94     Register r_arg_method                   = R6;
    95     Register r_arg_entry                    = R7;
    96     Register r_arg_thread                   = R10;
    98     Register r_temp                         = R24;
    99     Register r_top_of_arguments_addr        = R25;
   100     Register r_entryframe_fp                = R26;
   102     {
   103       // Stack on entry to call_stub:
   104       //
   105       //      F1      [C_FRAME]
   106       //              ...
   108       Register r_arg_argument_addr          = R8;
   109       Register r_arg_argument_count         = R9;
   110       Register r_frame_alignment_in_bytes   = R27;
   111       Register r_argument_addr              = R28;
   112       Register r_argumentcopy_addr          = R29;
   113       Register r_argument_size_in_bytes     = R30;
   114       Register r_frame_size                 = R23;
   116       Label arguments_copied;
   118       // Save LR/CR to caller's C_FRAME.
   119       __ save_LR_CR(R0);
   121       // Zero extend arg_argument_count.
   122       __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
    124       // Save non-volatile GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
   125       __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
   127       // Keep copy of our frame pointer (caller's SP).
   128       __ mr(r_entryframe_fp, R1_SP);
   130       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
   131       // Push ENTRY_FRAME including arguments:
   132       //
   133       //      F0      [TOP_IJAVA_FRAME_ABI]
   134       //              alignment (optional)
   135       //              [outgoing Java arguments]
   136       //              [ENTRY_FRAME_LOCALS]
   137       //      F1      [C_FRAME]
   138       //              ...
   140       // calculate frame size
   142       // unaligned size of arguments
   143       __ sldi(r_argument_size_in_bytes,
   144                   r_arg_argument_count, Interpreter::logStackElementSize);
   145       // arguments alignment (max 1 slot)
   146       // FIXME: use round_to() here
   147       __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
   148       __ sldi(r_frame_alignment_in_bytes,
   149               r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
   151       // size = unaligned size of arguments + top abi's size
   152       __ addi(r_frame_size, r_argument_size_in_bytes,
   153               frame::top_ijava_frame_abi_size);
   154       // size += arguments alignment
   155       __ add(r_frame_size,
   156              r_frame_size, r_frame_alignment_in_bytes);
   157       // size += size of call_stub locals
   158       __ addi(r_frame_size,
   159               r_frame_size, frame::entry_frame_locals_size);
   161       // push ENTRY_FRAME
   162       __ push_frame(r_frame_size, r_temp);
   164       // initialize call_stub locals (step 1)
   165       __ std(r_arg_call_wrapper_addr,
   166              _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
   167       __ std(r_arg_result_addr,
   168              _entry_frame_locals_neg(result_address), r_entryframe_fp);
   169       __ std(r_arg_result_type,
   170              _entry_frame_locals_neg(result_type), r_entryframe_fp);
   171       // we will save arguments_tos_address later
   174       BLOCK_COMMENT("Copy Java arguments");
   175       // copy Java arguments
   177       // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
   178       // FIXME: why not simply use SP+frame::top_ijava_frame_size?
   179       __ addi(r_top_of_arguments_addr,
   180               R1_SP, frame::top_ijava_frame_abi_size);
   181       __ add(r_top_of_arguments_addr,
   182              r_top_of_arguments_addr, r_frame_alignment_in_bytes);
   184       // any arguments to copy?
   185       __ cmpdi(CCR0, r_arg_argument_count, 0);
   186       __ beq(CCR0, arguments_copied);
   188       // prepare loop and copy arguments in reverse order
   189       {
   190         // init CTR with arg_argument_count
   191         __ mtctr(r_arg_argument_count);
    193         // let r_argumentcopy_addr point to the last outgoing Java argument position
   194         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
   196         // let r_argument_addr point to last incoming java argument
   197         __ add(r_argument_addr,
   198                    r_arg_argument_addr, r_argument_size_in_bytes);
   199         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
   201         // now loop while CTR > 0 and copy arguments
   202         {
   203           Label next_argument;
   204           __ bind(next_argument);
   206           __ ld(r_temp, 0, r_argument_addr);
   207           // argument_addr--;
   208           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
   209           __ std(r_temp, 0, r_argumentcopy_addr);
   210           // argumentcopy_addr++;
   211           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
   213           __ bdnz(next_argument);
   214         }
   215       }
   217       // Arguments copied, continue.
   218       __ bind(arguments_copied);
   219     }
   221     {
   222       BLOCK_COMMENT("Call frame manager or native entry.");
   223       // Call frame manager or native entry.
   224       Register r_new_arg_entry = R14_state;
   225       assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
   226                                  r_arg_method, r_arg_thread);
   228       __ mr(r_new_arg_entry, r_arg_entry);
   230       // Register state on entry to frame manager / native entry:
   231       //
   232       //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
   233       //   R19_method  -  Method
   234       //   R16_thread  -  JavaThread*
   236       // Tos must point to last argument - element_size.
   237       const Register tos = R17_tos;
   238       __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
   240       // initialize call_stub locals (step 2)
   241       // now save tos as arguments_tos_address
   242       __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
   244       // load argument registers for call
   245       __ mr(R19_method, r_arg_method);
   246       __ mr(R16_thread, r_arg_thread);
   247       assert(tos != r_arg_method, "trashed r_arg_method");
   248       assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
   250       // Set R15_prev_state to 0 for simplifying checks in callee.
   251       __ li(R15_prev_state, 0);
   253       // Stack on entry to frame manager / native entry:
   254       //
   255       //      F0      [TOP_IJAVA_FRAME_ABI]
   256       //              alignment (optional)
   257       //              [outgoing Java arguments]
   258       //              [ENTRY_FRAME_LOCALS]
   259       //      F1      [C_FRAME]
   260       //              ...
   261       //
   263       // global toc register
   264       __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
   266       // Load narrow oop base.
   267       __ reinit_heapbase(R30, R11_scratch1);
    269       // Remember the senderSP so the interpreter can pop c2i arguments off the stack
   270       // when called via a c2i.
   272       // Pass initial_caller_sp to framemanager.
   273       __ mr(R21_tmp1, R1_SP);
   275       // Do a light-weight C-call here, r_new_arg_entry holds the address
   276       // of the interpreter entry point (frame manager or native entry)
   277       // and save runtime-value of LR in return_address.
   278       assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
   279              "trashed r_new_arg_entry");
   280       return_address = __ call_stub(r_new_arg_entry);
   281     }
   283     {
   284       BLOCK_COMMENT("Returned from frame manager or native entry.");
   285       // Returned from frame manager or native entry.
   286       // Now pop frame, process result, and return to caller.
   288       // Stack on exit from frame manager / native entry:
   289       //
   290       //      F0      [ABI]
   291       //              ...
   292       //              [ENTRY_FRAME_LOCALS]
   293       //      F1      [C_FRAME]
   294       //              ...
   295       //
   296       // Just pop the topmost frame ...
   297       //
   299       Label ret_is_object;
   300       Label ret_is_long;
   301       Label ret_is_float;
   302       Label ret_is_double;
   304       Register r_entryframe_fp = R30;
   305       Register r_lr            = R7_ARG5;
   306       Register r_cr            = R8_ARG6;
   308       // Reload some volatile registers which we've spilled before the call
   309       // to frame manager / native entry.
   310       // Access all locals via frame pointer, because we know nothing about
   311       // the topmost frame's size.
   312       __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
   313       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
   314       __ ld(r_arg_result_addr,
   315             _entry_frame_locals_neg(result_address), r_entryframe_fp);
   316       __ ld(r_arg_result_type,
   317             _entry_frame_locals_neg(result_type), r_entryframe_fp);
   318       __ ld(r_cr, _abi(cr), r_entryframe_fp);
   319       __ ld(r_lr, _abi(lr), r_entryframe_fp);
   321       // pop frame and restore non-volatiles, LR and CR
   322       __ mr(R1_SP, r_entryframe_fp);
   323       __ mtcr(r_cr);
   324       __ mtlr(r_lr);
   326       // Store result depending on type. Everything that is not
   327       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
   328       __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
   329       __ cmpwi(CCR1, r_arg_result_type, T_LONG);
   330       __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
   331       __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
   333       // restore non-volatile registers
   334       __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
   337       // Stack on exit from call_stub:
   338       //
   339       //      0       [C_FRAME]
   340       //              ...
   341       //
   342       //  no call_stub frames left.
   344       // All non-volatiles have been restored at this point!!
   345       assert(R3_RET == R3, "R3_RET should be R3");
   347       __ beq(CCR0, ret_is_object);
   348       __ beq(CCR1, ret_is_long);
   349       __ beq(CCR5, ret_is_float);
   350       __ beq(CCR6, ret_is_double);
   352       // default:
   353       __ stw(R3_RET, 0, r_arg_result_addr);
   354       __ blr(); // return to caller
   356       // case T_OBJECT:
   357       __ bind(ret_is_object);
   358       __ std(R3_RET, 0, r_arg_result_addr);
   359       __ blr(); // return to caller
   361       // case T_LONG:
   362       __ bind(ret_is_long);
   363       __ std(R3_RET, 0, r_arg_result_addr);
   364       __ blr(); // return to caller
   366       // case T_FLOAT:
   367       __ bind(ret_is_float);
   368       __ stfs(F1_RET, 0, r_arg_result_addr);
   369       __ blr(); // return to caller
   371       // case T_DOUBLE:
   372       __ bind(ret_is_double);
   373       __ stfd(F1_RET, 0, r_arg_result_addr);
   374       __ blr(); // return to caller
   375     }
   377     return start;
   378   }
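  // Editor's sketch (not part of this changeset): conceptually, the VM invokes the
  // stub generated above through a plain function pointer whose parameters mirror
  // R3..R10 as documented at the top of generate_call_stub. The typedef below is
  // illustrative only; the parameter types are assumptions taken from the register
  // comments, not the VM's exact declaration.
  typedef void (*call_stub_sketch_fn)(address   call_wrapper,    // R3
                                      intptr_t* result,          // R4
                                      int       result_type,     // R5  (BasicType)
                                      Method*   method,          // R6
                                      address   entry_point,     // R7
                                      intptr_t* parameters,      // R8
                                      int       parameter_words, // R9
                                      Thread*   thread);         // R10
  // Usage would look roughly like:
  //   ((call_stub_sketch_fn)StubRoutines::call_stub())(w, r, type, m, e, p, n, t);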
   380   // Return point for a Java call if there's an exception thrown in
   381   // Java code.  The exception is caught and transformed into a
   382   // pending exception stored in JavaThread that can be tested from
   383   // within the VM.
   384   //
   385   address generate_catch_exception() {
   386     StubCodeMark mark(this, "StubRoutines", "catch_exception");
   388     address start = __ pc();
   390     // Registers alive
   391     //
   392     //  R16_thread
   393     //  R3_ARG1 - address of pending exception
   394     //  R4_ARG2 - return address in call stub
   396     const Register exception_file = R21_tmp1;
   397     const Register exception_line = R22_tmp2;
   399     __ load_const(exception_file, (void*)__FILE__);
   400     __ load_const(exception_line, (void*)__LINE__);
   402     __ std(R3_ARG1, thread_(pending_exception));
   403     // store into `char *'
   404     __ std(exception_file, thread_(exception_file));
   405     // store into `int'
   406     __ stw(exception_line, thread_(exception_line));
   408     // complete return to VM
   409     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
   411     __ mtlr(R4_ARG2);
   412     // continue in call stub
   413     __ blr();
   415     return start;
   416   }
   418   // Continuation point for runtime calls returning with a pending
   419   // exception.  The pending exception check happened in the runtime
   420   // or native call stub.  The pending exception in Thread is
   421   // converted into a Java-level exception.
   422   //
   423   address generate_forward_exception() {
   424     StubCodeMark mark(this, "StubRoutines", "forward_exception");
   425     address start = __ pc();
   427 #if !defined(PRODUCT)
   428     if (VerifyOops) {
   429       // Get pending exception oop.
   430       __ ld(R3_ARG1,
   431                 in_bytes(Thread::pending_exception_offset()),
   432                 R16_thread);
   433       // Make sure that this code is only executed if there is a pending exception.
   434       {
   435         Label L;
   436         __ cmpdi(CCR0, R3_ARG1, 0);
   437         __ bne(CCR0, L);
   438         __ stop("StubRoutines::forward exception: no pending exception (1)");
   439         __ bind(L);
   440       }
   441       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
   442     }
   443 #endif
   445     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
   446     __ save_LR_CR(R4_ARG2);
   447     __ push_frame_reg_args(0, R0);
   448     // Find exception handler.
   449     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
   450                      SharedRuntime::exception_handler_for_return_address),
   451                     R16_thread,
   452                     R4_ARG2);
   453     // Copy handler's address.
   454     __ mtctr(R3_RET);
   455     __ pop_frame();
   456     __ restore_LR_CR(R0);
   458     // Set up the arguments for the exception handler:
   459     //  - R3_ARG1: exception oop
   460     //  - R4_ARG2: exception pc.
   462     // Load pending exception oop.
   463     __ ld(R3_ARG1,
   464               in_bytes(Thread::pending_exception_offset()),
   465               R16_thread);
   467     // The exception pc is the return address in the caller.
   468     // Must load it into R4_ARG2.
   469     __ mflr(R4_ARG2);
   471 #ifdef ASSERT
   472     // Make sure exception is set.
   473     {
   474       Label L;
   475       __ cmpdi(CCR0, R3_ARG1, 0);
   476       __ bne(CCR0, L);
   477       __ stop("StubRoutines::forward exception: no pending exception (2)");
   478       __ bind(L);
   479     }
   480 #endif
   482     // Clear the pending exception.
   483     __ li(R0, 0);
   484     __ std(R0,
   485                in_bytes(Thread::pending_exception_offset()),
   486                R16_thread);
   487     // Jump to exception handler.
   488     __ bctr();
   490     return start;
   491   }
   493 #undef __
   494 #define __ masm->
   495   // Continuation point for throwing of implicit exceptions that are
   496   // not handled in the current activation. Fabricates an exception
   497   // oop and initiates normal exception dispatching in this
   498   // frame. Only callee-saved registers are preserved (through the
   499   // normal register window / RegisterMap handling).  If the compiler
   500   // needs all registers to be preserved between the fault point and
   501   // the exception handler then it must assume responsibility for that
   502   // in AbstractCompiler::continuation_for_implicit_null_exception or
   503   // continuation_for_implicit_division_by_zero_exception. All other
   504   // implicit exceptions (e.g., NullPointerException or
   505   // AbstractMethodError on entry) are either at call sites or
   506   // otherwise assume that stack unwinding will be initiated, so
   507   // caller saved registers were assumed volatile in the compiler.
   508   //
   509   // Note that we generate only this stub into a RuntimeStub, because
   510   // it needs to be properly traversed and ignored during GC, so we
   511   // change the meaning of the "__" macro within this method.
   512   //
   513   // Note: the routine set_pc_not_at_call_for_caller in
   514   // SharedRuntime.cpp requires that this code be generated into a
   515   // RuntimeStub.
   516   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
   517                                    Register arg1 = noreg, Register arg2 = noreg) {
   518     CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
   519     MacroAssembler* masm = new MacroAssembler(&code);
   521     OopMapSet* oop_maps  = new OopMapSet();
   522     int frame_size_in_bytes = frame::abi_reg_args_size;
   523     OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
   525     StubCodeMark mark(this, "StubRoutines", "throw_exception");
   527     address start = __ pc();
   529     __ save_LR_CR(R11_scratch1);
   531     // Push a frame.
   532     __ push_frame_reg_args(0, R11_scratch1);
   534     address frame_complete_pc = __ pc();
   536     if (restore_saved_exception_pc) {
   537       __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
   538     }
   540     // Note that we always have a runtime stub frame on the top of
   541     // stack by this point. Remember the offset of the instruction
   542     // whose address will be moved to R11_scratch1.
   543     address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
   545     __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
   547     __ mr(R3_ARG1, R16_thread);
   548     if (arg1 != noreg) {
   549       __ mr(R4_ARG2, arg1);
   550     }
   551     if (arg2 != noreg) {
   552       __ mr(R5_ARG3, arg2);
   553     }
   554 #if defined(ABI_ELFv2)
   555     __ call_c(runtime_entry, relocInfo::none);
   556 #else
   557     __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
   558 #endif
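    // Editor's note (illustrative, not from the original change): under the ELFv1
    // ABI a C function pointer designates a function descriptor, roughly
    //   struct { address entry; address toc; address env; };
    // so the call must go through the descriptor's entry field, while under ELFv2
    // (little-endian PPC64 Linux) runtime_entry already is the code address and
    // can be called directly, which is what the #if above selects.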
   560     // Set an oopmap for the call site.
   561     oop_maps->add_gc_map((int)(gc_map_pc - start), map);
   563     __ reset_last_Java_frame();
   565 #ifdef ASSERT
   566     // Make sure that this code is only executed if there is a pending
   567     // exception.
   568     {
   569       Label L;
   570       __ ld(R0,
   571                 in_bytes(Thread::pending_exception_offset()),
   572                 R16_thread);
   573       __ cmpdi(CCR0, R0, 0);
   574       __ bne(CCR0, L);
   575       __ stop("StubRoutines::throw_exception: no pending exception");
   576       __ bind(L);
   577     }
   578 #endif
   580     // Pop frame.
   581     __ pop_frame();
   583     __ restore_LR_CR(R11_scratch1);
   585     __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
   586     __ mtctr(R11_scratch1);
   587     __ bctr();
   589     // Create runtime stub with OopMap.
   590     RuntimeStub* stub =
   591       RuntimeStub::new_runtime_stub(name, &code,
   592                                     /*frame_complete=*/ (int)(frame_complete_pc - start),
   593                                     frame_size_in_bytes/wordSize,
   594                                     oop_maps,
   595                                     false);
   596     return stub->entry_point();
   597   }
   598 #undef __
   599 #define __ _masm->
   601   //  Generate G1 pre-write barrier for array.
   602   //
   603   //  Input:
   604   //     from     - register containing src address (only needed for spilling)
   605   //     to       - register containing starting address
   606   //     count    - register containing element count
   607   //     tmp      - scratch register
   608   //
   609   //  Kills:
   610   //     nothing
   611   //
   612   void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
   613     BarrierSet* const bs = Universe::heap()->barrier_set();
   614     switch (bs->kind()) {
   615       case BarrierSet::G1SATBCT:
   616       case BarrierSet::G1SATBCTLogging:
    617         // With G1, don't generate the call if we statically know that the target is uninitialized
   618         if (!dest_uninitialized) {
   619           const int spill_slots = 4 * wordSize;
   620           const int frame_size  = frame::abi_reg_args_size + spill_slots;
   621           Label filtered;
   623           // Is marking active?
   624           if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
   625             __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
   626           } else {
   627             guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
   628             __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
   629           }
   630           __ cmpdi(CCR0, Rtmp1, 0);
   631           __ beq(CCR0, filtered);
   633           __ save_LR_CR(R0);
   634           __ push_frame_reg_args(spill_slots, R0);
   635           __ std(from,  frame_size - 1 * wordSize, R1_SP);
   636           __ std(to,    frame_size - 2 * wordSize, R1_SP);
   637           __ std(count, frame_size - 3 * wordSize, R1_SP);
   639           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
   641           __ ld(from,  frame_size - 1 * wordSize, R1_SP);
   642           __ ld(to,    frame_size - 2 * wordSize, R1_SP);
   643           __ ld(count, frame_size - 3 * wordSize, R1_SP);
   644           __ pop_frame();
   645           __ restore_LR_CR(R0);
   647           __ bind(filtered);
   648         }
   649         break;
   650       case BarrierSet::CardTableModRef:
   651       case BarrierSet::CardTableExtension:
   652       case BarrierSet::ModRef:
   653         break;
   654       default:
   655         ShouldNotReachHere();
   656     }
   657   }
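  // Editor's sketch (not part of this changeset): at run time the G1 branch above
  // reduces to a single filter in front of the shared pre-barrier call.
  static bool g1_array_pre_barrier_needed_sketch(unsigned char satb_active_flag,
                                                 bool dest_uninitialized) {
    // The call is emitted only for !dest_uninitialized and is skipped (label
    // 'filtered') whenever SATB marking is inactive for the current thread.
    return !dest_uninitialized && satb_active_flag != 0;
  }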
   659   //  Generate CMS/G1 post-write barrier for array.
   660   //
   661   //  Input:
   662   //     addr     - register containing starting address
   663   //     count    - register containing element count
   664   //     tmp      - scratch register
   665   //
   666   //  The input registers and R0 are overwritten.
   667   //
   668   void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) {
   669     BarrierSet* const bs = Universe::heap()->barrier_set();
   671     switch (bs->kind()) {
   672       case BarrierSet::G1SATBCT:
   673       case BarrierSet::G1SATBCTLogging:
   674         {
   675           if (branchToEnd) {
   676             __ save_LR_CR(R0);
   677             // We need this frame only to spill LR.
   678             __ push_frame_reg_args(0, R0);
   679             __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
   680             __ pop_frame();
   681             __ restore_LR_CR(R0);
   682           } else {
   683             // Tail call: fake call from stub caller by branching without linking.
   684             address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
   685             __ mr_if_needed(R3_ARG1, addr);
   686             __ mr_if_needed(R4_ARG2, count);
   687             __ load_const(R11, entry_point, R0);
   688             __ call_c_and_return_to_caller(R11);
   689           }
   690         }
   691         break;
   692       case BarrierSet::CardTableModRef:
   693       case BarrierSet::CardTableExtension:
   694         {
   695           Label Lskip_loop, Lstore_loop;
   696           if (UseConcMarkSweepGC) {
   697             // TODO PPC port: contribute optimization / requires shared changes
   698             __ release();
   699           }
   701           CardTableModRefBS* const ct = (CardTableModRefBS*)bs;
   702           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
   703           assert_different_registers(addr, count, tmp);
   705           __ sldi(count, count, LogBytesPerHeapOop);
   706           __ addi(count, count, -BytesPerHeapOop);
   707           __ add(count, addr, count);
   708           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
   709           __ srdi(addr, addr, CardTableModRefBS::card_shift);
   710           __ srdi(count, count, CardTableModRefBS::card_shift);
   711           __ subf(count, addr, count);
   712           assert_different_registers(R0, addr, count, tmp);
   713           __ load_const(tmp, (address)ct->byte_map_base);
   714           __ addic_(count, count, 1);
   715           __ beq(CCR0, Lskip_loop);
   716           __ li(R0, 0);
   717           __ mtctr(count);
   718           // Byte store loop
   719           __ bind(Lstore_loop);
   720           __ stbx(R0, tmp, addr);
   721           __ addi(addr, addr, 1);
   722           __ bdnz(Lstore_loop);
   723           __ bind(Lskip_loop);
   725           if (!branchToEnd) __ blr();
   726         }
   727       break;
   728       case BarrierSet::ModRef:
   729         if (!branchToEnd) __ blr();
   730         break;
   731       default:
   732         ShouldNotReachHere();
   733     }
   734   }
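  // Editor's sketch (not part of this changeset): the card-table branch above
  // dirties one card byte per 2^card_shift bytes covered by the stored oop range
  // [addr, addr + count*heapOopSize). Equivalent C logic, with byte_map_base,
  // card_shift and the zero "dirty" value taken from the code above:
  static void dirty_cards_sketch(unsigned char* byte_map_base,
                                 uintptr_t first_byte, uintptr_t last_byte,
                                 int card_shift) {
    uintptr_t first_card = first_byte >> card_shift;
    uintptr_t last_card  = last_byte  >> card_shift;
    for (uintptr_t card = first_card; card <= last_card; card++) {
      byte_map_base[card] = 0;  // 0 == dirty, as stored by the stbx(R0, ...) above
    }
  }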
   736   // Support for void zero_words_aligned8(HeapWord* to, size_t count)
   737   //
   738   // Arguments:
   739   //   to:
   740   //   count:
   741   //
   742   // Destroys:
   743   //
   744   address generate_zero_words_aligned8() {
   745     StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
   747     // Implemented as in ClearArray.
   748     address start = __ function_entry();
   750     Register base_ptr_reg   = R3_ARG1; // tohw (needs to be 8b aligned)
   751     Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
   752     Register tmp1_reg       = R5_ARG3;
   753     Register tmp2_reg       = R6_ARG4;
   754     Register zero_reg       = R7_ARG5;
   756     // Procedure for large arrays (uses data cache block zero instruction).
   757     Label dwloop, fast, fastloop, restloop, lastdword, done;
   758     int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
   759     int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
   761     // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
   762     __ dcbtst(base_ptr_reg);                    // Indicate write access to first cache line ...
   763     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if number of dwords is even.
   764     __ srdi_(tmp1_reg, cnt_dwords_reg, 1);      // number of double dwords
   765     __ load_const_optimized(zero_reg, 0L);      // Use as zero register.
   767     __ cmpdi(CCR1, tmp2_reg, 0);                // cnt_dwords even?
   768     __ beq(CCR0, lastdword);                    // size <= 1
   769     __ mtctr(tmp1_reg);                         // Speculatively preload counter for rest loop (>0).
   770     __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
   771     __ neg(tmp1_reg, base_ptr_reg);             // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
   773     __ blt(CCR0, restloop);                     // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
   774     __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
   776     __ beq(CCR0, fast);                         // already 128byte aligned
   777     __ mtctr(tmp1_reg);                         // Set ctr to hit 128byte boundary (0<ctr<cnt).
   778     __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
   780     // Clear in first cache line dword-by-dword if not already 128byte aligned.
   781     __ bind(dwloop);
   782       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
   783       __ addi(base_ptr_reg, base_ptr_reg, 8);
   784     __ bdnz(dwloop);
   786     // clear 128byte blocks
   787     __ bind(fast);
   788     __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
   789     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if rest even
   791     __ mtctr(tmp1_reg);                         // load counter
   792     __ cmpdi(CCR1, tmp2_reg, 0);                // rest even?
   793     __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
   795     __ bind(fastloop);
   796       __ dcbz(base_ptr_reg);                    // Clear 128byte aligned block.
   797       __ addi(base_ptr_reg, base_ptr_reg, cl_size);
   798     __ bdnz(fastloop);
   800     //__ dcbtst(base_ptr_reg);                  // Indicate write access to last cache line.
   801     __ beq(CCR0, lastdword);                    // rest<=1
   802     __ mtctr(tmp1_reg);                         // load counter
   804     // Clear rest.
   805     __ bind(restloop);
   806       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
   807       __ std(zero_reg, 8, base_ptr_reg);        // Clear 8byte aligned block.
   808       __ addi(base_ptr_reg, base_ptr_reg, 16);
   809     __ bdnz(restloop);
   811     __ bind(lastdword);
   812     __ beq(CCR1, done);
   813     __ std(zero_reg, 0, base_ptr_reg);
   814     __ bind(done);
   815     __ blr();                                   // return
   817     return start;
   818   }
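  // Editor's sketch (not part of this changeset): ignoring the small-array and
  // min_dcbz fast-path checks, the stub above is equivalent to clearing dwords up
  // to the next cache-line boundary, zeroing whole lines (dcbz), and then clearing
  // the remainder:
  static void zero_words_aligned8_sketch(unsigned long* to, size_t cnt_dwords,
                                         size_t cl_dwords /* cache line size / 8 */) {
    while (cnt_dwords > 0 && ((uintptr_t)to & (cl_dwords * 8 - 1)) != 0) {
      *to++ = 0; cnt_dwords--;                    // dword loop up to the line boundary
    }
    while (cnt_dwords >= cl_dwords) {             // dcbz equivalent: clear a full line
      for (size_t i = 0; i < cl_dwords; i++) to[i] = 0;
      to += cl_dwords; cnt_dwords -= cl_dwords;
    }
    while (cnt_dwords-- > 0) *to++ = 0;           // rest loop
  }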
   820   // The following routine generates a subroutine to throw an asynchronous
   821   // UnknownError when an unsafe access gets a fault that could not be
   822   // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
   823   //
   824   address generate_handler_for_unsafe_access() {
   825     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
   826     address start = __ function_entry();
   827     __ unimplemented("StubRoutines::handler_for_unsafe_access", 93);
   828     return start;
   829   }
   831 #if !defined(PRODUCT)
   832   // Wrapper which calls oopDesc::is_oop_or_null()
   833   // Only called by MacroAssembler::verify_oop
   834   static void verify_oop_helper(const char* message, oop o) {
   835     if (!o->is_oop_or_null()) {
   836       fatal(message);
   837     }
   838     ++ StubRoutines::_verify_oop_count;
   839   }
   840 #endif
   842   // Return address of code to be called from code generated by
   843   // MacroAssembler::verify_oop.
   844   //
   845   // Don't generate, rather use C++ code.
   846   address generate_verify_oop() {
   847     StubCodeMark mark(this, "StubRoutines", "verify_oop");
   849     // this is actually a `FunctionDescriptor*'.
   850     address start = 0;
   852 #if !defined(PRODUCT)
   853     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
   854 #endif
   856     return start;
   857   }
   859   // Fairer handling of safepoints for native methods.
   860   //
   861   // Generate code which reads from the polling page. This special handling is needed as the
   862   // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
   863   // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
   864   // to read from the safepoint polling page.
   865   address generate_load_from_poll() {
   866     StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
   867     address start = __ function_entry();
    868     __ unimplemented("StubRoutines::load_from_poll", 95);  // TODO PPC port
   869     return start;
   870   }
   872   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
   873   //
    874   // The code is implemented (ported from sparc) as we believe it benefits JVM98; however,
    875   // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
   876   //
   877   // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
   878   // for turning on loop predication optimization, and hence the behavior of "array range check"
   879   // and "loop invariant check" could be influenced, which potentially boosted JVM98.
   880   //
   881   // Generate stub for disjoint short fill. If "aligned" is true, the
   882   // "to" address is assumed to be heapword aligned.
   883   //
   884   // Arguments for generated stub:
   885   //   to:    R3_ARG1
   886   //   value: R4_ARG2
   887   //   count: R5_ARG3 treated as signed
   888   //
   889   address generate_fill(BasicType t, bool aligned, const char* name) {
   890     StubCodeMark mark(this, "StubRoutines", name);
   891     address start = __ function_entry();
   893     const Register to    = R3_ARG1;   // source array address
   894     const Register value = R4_ARG2;   // fill value
   895     const Register count = R5_ARG3;   // elements count
   896     const Register temp  = R6_ARG4;   // temp register
   898     //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
   900     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
   901     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
   903     int shift = -1;
   904     switch (t) {
   905        case T_BYTE:
   906         shift = 2;
   907         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
   908         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
   909         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
   910         __ blt(CCR0, L_fill_elements);
   911         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
   912         break;
   913        case T_SHORT:
   914         shift = 1;
   915         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
   916         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
   917         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
   918         __ blt(CCR0, L_fill_elements);
   919         break;
   920       case T_INT:
   921         shift = 0;
   922         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
   923         __ blt(CCR0, L_fill_4_bytes);
   924         break;
   925       default: ShouldNotReachHere();
   926     }
   928     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
   929       // Align source address at 4 bytes address boundary.
   930       if (t == T_BYTE) {
   931         // One byte misalignment happens only for byte arrays.
   932         __ andi_(temp, to, 1);
   933         __ beq(CCR0, L_skip_align1);
   934         __ stb(value, 0, to);
   935         __ addi(to, to, 1);
   936         __ addi(count, count, -1);
   937         __ bind(L_skip_align1);
   938       }
   939       // Two bytes misalignment happens only for byte and short (char) arrays.
   940       __ andi_(temp, to, 2);
   941       __ beq(CCR0, L_skip_align2);
   942       __ sth(value, 0, to);
   943       __ addi(to, to, 2);
   944       __ addi(count, count, -(1 << (shift - 1)));
   945       __ bind(L_skip_align2);
   946     }
   948     if (!aligned) {
   949       // Align to 8 bytes, we know we are 4 byte aligned to start.
   950       __ andi_(temp, to, 7);
   951       __ beq(CCR0, L_fill_32_bytes);
   952       __ stw(value, 0, to);
   953       __ addi(to, to, 4);
   954       __ addi(count, count, -(1 << shift));
   955       __ bind(L_fill_32_bytes);
   956     }
   958     __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
   959     // Clone bytes int->long as above.
   960     __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
   962     Label L_check_fill_8_bytes;
   963     // Fill 32-byte chunks.
   964     __ subf_(count, temp, count);
   965     __ blt(CCR0, L_check_fill_8_bytes);
   967     Label L_fill_32_bytes_loop;
   968     __ align(32);
   969     __ bind(L_fill_32_bytes_loop);
   971     __ std(value, 0, to);
   972     __ std(value, 8, to);
   973     __ subf_(count, temp, count);           // Update count.
   974     __ std(value, 16, to);
   975     __ std(value, 24, to);
   977     __ addi(to, to, 32);
   978     __ bge(CCR0, L_fill_32_bytes_loop);
   980     __ bind(L_check_fill_8_bytes);
   981     __ add_(count, temp, count);
   982     __ beq(CCR0, L_exit);
   983     __ addic_(count, count, -(2 << shift));
   984     __ blt(CCR0, L_fill_4_bytes);
   986     //
   987     // Length is too short, just fill 8 bytes at a time.
   988     //
   989     Label L_fill_8_bytes_loop;
   990     __ bind(L_fill_8_bytes_loop);
   991     __ std(value, 0, to);
   992     __ addic_(count, count, -(2 << shift));
   993     __ addi(to, to, 8);
   994     __ bge(CCR0, L_fill_8_bytes_loop);
   996     // Fill trailing 4 bytes.
   997     __ bind(L_fill_4_bytes);
   998     __ andi_(temp, count, 1<<shift);
   999     __ beq(CCR0, L_fill_2_bytes);
  1001     __ stw(value, 0, to);
  1002     if (t == T_BYTE || t == T_SHORT) {
  1003       __ addi(to, to, 4);
  1004       // Fill trailing 2 bytes.
  1005       __ bind(L_fill_2_bytes);
  1006       __ andi_(temp, count, 1<<(shift-1));
  1007       __ beq(CCR0, L_fill_byte);
  1008       __ sth(value, 0, to);
  1009       if (t == T_BYTE) {
  1010         __ addi(to, to, 2);
  1011         // Fill trailing byte.
  1012         __ bind(L_fill_byte);
  1013         __ andi_(count, count, 1);
  1014         __ beq(CCR0, L_exit);
  1015         __ stb(value, 0, to);
  1016       } else {
   1017         __ bind(L_fill_byte);
   1018       }
   1019     } else {
   1020       __ bind(L_fill_2_bytes);
   1021     }
  1022     __ bind(L_exit);
  1023     __ blr();
  1025     // Handle copies less than 8 bytes. Int is handled elsewhere.
  1026     if (t == T_BYTE) {
  1027       __ bind(L_fill_elements);
  1028       Label L_fill_2, L_fill_4;
  1029       __ andi_(temp, count, 1);
  1030       __ beq(CCR0, L_fill_2);
  1031       __ stb(value, 0, to);
  1032       __ addi(to, to, 1);
  1033       __ bind(L_fill_2);
  1034       __ andi_(temp, count, 2);
  1035       __ beq(CCR0, L_fill_4);
  1036       __ stb(value, 0, to);
  1037       __ stb(value, 0, to);
  1038       __ addi(to, to, 2);
  1039       __ bind(L_fill_4);
  1040       __ andi_(temp, count, 4);
  1041       __ beq(CCR0, L_exit);
  1042       __ stb(value, 0, to);
  1043       __ stb(value, 1, to);
  1044       __ stb(value, 2, to);
  1045       __ stb(value, 3, to);
   1046       __ blr();
   1047     }
  1049     if (t == T_SHORT) {
  1050       Label L_fill_2;
  1051       __ bind(L_fill_elements);
  1052       __ andi_(temp, count, 1);
  1053       __ beq(CCR0, L_fill_2);
  1054       __ sth(value, 0, to);
  1055       __ addi(to, to, 2);
  1056       __ bind(L_fill_2);
  1057       __ andi_(temp, count, 2);
  1058       __ beq(CCR0, L_exit);
  1059       __ sth(value, 0, to);
  1060       __ sth(value, 2, to);
   1061       __ blr();
   1062     }
   1063     return start;
   1064   }
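  // Editor's sketch (not part of this changeset): the rldimi sequences in
  // generate_fill replicate the fill value across a 64-bit register before the
  // wide store loops. Assuming 'v' holds the element value zero-extended:
  static unsigned long long replicate_fill_value_sketch(unsigned long long v,
                                                        int element_bits) {
    if (element_bits == 8)  v |= v << 8;   //  8 bit -> 16 bit (rldimi v,v,8,48)
    if (element_bits <= 16) v |= v << 16;  // 16 bit -> 32 bit (rldimi v,v,16,32)
    v |= v << 32;                          // 32 bit -> 64 bit (rldimi v,v,32,0)
    return v;
  }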
  1067   // Generate overlap test for array copy stubs.
  1068   //
  1069   // Input:
  1070   //   R3_ARG1    -  from
  1071   //   R4_ARG2    -  to
  1072   //   R5_ARG3    -  element count
  1073   //
  1074   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
  1075     Register tmp1 = R6_ARG4;
  1076     Register tmp2 = R7_ARG5;
  1078     Label l_overlap;
  1079 #ifdef ASSERT
  1080     __ srdi_(tmp2, R5_ARG3, 31);
  1081     __ asm_assert_eq("missing zero extend", 0xAFFE);
  1082 #endif
  1084     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
  1085     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
  1086     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
  1087     __ cmpld(CCR1, tmp1, tmp2);
  1088     __ crand(/*CCR0 lt*/0, /*CCR1 lt*/4+0, /*CCR0 lt*/0);
  1089     __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
  1091     // need to copy forwards
  1092     if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
  1093       __ b(no_overlap_target);
  1094     } else {
  1095       __ load_const(tmp1, no_overlap_target, tmp2);
  1096       __ mtctr(tmp1);
   1097       __ bctr();
   1098     }
   1100     __ bind(l_overlap);
   1101     // need to copy backwards
   1102   }
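  // Editor's sketch (not part of this changeset): the generated test above is
  // equivalent to the following C condition; a backward copy is needed only when
  // the destination lies inside the source range:
  static bool arraycopy_needs_backward_copy_sketch(const void* from, const void* to,
                                                   size_t count, int log2_elem_size) {
    size_t size_in_bytes = count << log2_elem_size;
    // Unsigned comparisons, as with cmpld above: from < to && (to - from) < size.
    return (const char*)from < (const char*)to &&
           (size_t)((const char*)to - (const char*)from) < size_in_bytes;
  }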
  1104   // The guideline in the implementations of generate_disjoint_xxx_copy
  1105   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
  1106   // single instructions, but to avoid alignment interrupts (see subsequent
  1107   // comment). Furthermore, we try to minimize misaligned access, even
  1108   // though they cause no alignment interrupt.
  1109   //
  1110   // In Big-Endian mode, the PowerPC architecture requires implementations to
  1111   // handle automatically misaligned integer halfword and word accesses,
  1112   // word-aligned integer doubleword accesses, and word-aligned floating-point
  1113   // accesses. Other accesses may or may not generate an Alignment interrupt
  1114   // depending on the implementation.
  1115   // Alignment interrupt handling may require on the order of hundreds of cycles,
  1116   // so every effort should be made to avoid misaligned memory values.
  1117   //
  1118   //
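  // Editor's note (illustrative, not part of this changeset): the copy stubs below
  // use an xor trick to decide whether 'from' and 'to' can be brought to the same
  // 8-byte alignment by copying a few leading elements:
  static bool same_alignment_mod8_sketch(const void* from, const void* to) {
    // Zero iff both addresses share the same alignment mod 8.
    return (((uintptr_t)from ^ (uintptr_t)to) & 7) == 0;
  }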
  1119   // Generate stub for disjoint byte copy.  If "aligned" is true, the
  1120   // "from" and "to" addresses are assumed to be heapword aligned.
  1121   //
  1122   // Arguments for generated stub:
  1123   //      from:  R3_ARG1
  1124   //      to:    R4_ARG2
  1125   //      count: R5_ARG3 treated as signed
  1126   //
  1127   address generate_disjoint_byte_copy(bool aligned, const char * name) {
  1128     StubCodeMark mark(this, "StubRoutines", name);
  1129     address start = __ function_entry();
  1131     Register tmp1 = R6_ARG4;
  1132     Register tmp2 = R7_ARG5;
  1133     Register tmp3 = R8_ARG6;
  1134     Register tmp4 = R9_ARG7;
  1137     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
  1138     // Don't try anything fancy if arrays don't have many elements.
  1139     __ li(tmp3, 0);
  1140     __ cmpwi(CCR0, R5_ARG3, 17);
  1141     __ ble(CCR0, l_6); // copy 4 at a time
  1143     if (!aligned) {
  1144       __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1145       __ andi_(tmp1, tmp1, 3);
  1146       __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
  1148       // Copy elements if necessary to align to 4 bytes.
  1149       __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
  1150       __ andi_(tmp1, tmp1, 3);
  1151       __ beq(CCR0, l_2);
  1153       __ subf(R5_ARG3, tmp1, R5_ARG3);
  1154       __ bind(l_9);
  1155       __ lbz(tmp2, 0, R3_ARG1);
  1156       __ addic_(tmp1, tmp1, -1);
  1157       __ stb(tmp2, 0, R4_ARG2);
  1158       __ addi(R3_ARG1, R3_ARG1, 1);
  1159       __ addi(R4_ARG2, R4_ARG2, 1);
  1160       __ bne(CCR0, l_9);
   1162       __ bind(l_2);
   1163     }
  1165     // copy 8 elements at a time
  1166     __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
  1167     __ andi_(tmp1, tmp2, 7);
  1168     __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
  1170     // copy a 2-element word if necessary to align to 8 bytes
  1171     __ andi_(R0, R3_ARG1, 7);
  1172     __ beq(CCR0, l_7);
  1174     __ lwzx(tmp2, R3_ARG1, tmp3);
  1175     __ addi(R5_ARG3, R5_ARG3, -4);
  1176     __ stwx(tmp2, R4_ARG2, tmp3);
  1177     { // FasterArrayCopy
  1178       __ addi(R3_ARG1, R3_ARG1, 4);
   1179       __ addi(R4_ARG2, R4_ARG2, 4);
   1180     }
  1181     __ bind(l_7);
  1183     { // FasterArrayCopy
  1184       __ cmpwi(CCR0, R5_ARG3, 31);
  1185       __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
  1187       __ srdi(tmp1, R5_ARG3, 5);
  1188       __ andi_(R5_ARG3, R5_ARG3, 31);
  1189       __ mtctr(tmp1);
  1191       __ bind(l_8);
   1192       // Use unrolled version for mass copying (copy 32 elements at a time)
  1193       // Load feeding store gets zero latency on Power6, however not on Power5.
  1194       // Therefore, the following sequence is made for the good of both.
  1195       __ ld(tmp1, 0, R3_ARG1);
  1196       __ ld(tmp2, 8, R3_ARG1);
  1197       __ ld(tmp3, 16, R3_ARG1);
  1198       __ ld(tmp4, 24, R3_ARG1);
  1199       __ std(tmp1, 0, R4_ARG2);
  1200       __ std(tmp2, 8, R4_ARG2);
  1201       __ std(tmp3, 16, R4_ARG2);
  1202       __ std(tmp4, 24, R4_ARG2);
  1203       __ addi(R3_ARG1, R3_ARG1, 32);
  1204       __ addi(R4_ARG2, R4_ARG2, 32);
   1205       __ bdnz(l_8);
   1206     }
  1208     __ bind(l_6);
  1210     // copy 4 elements at a time
  1211     __ cmpwi(CCR0, R5_ARG3, 4);
  1212     __ blt(CCR0, l_1);
  1213     __ srdi(tmp1, R5_ARG3, 2);
  1214     __ mtctr(tmp1); // is > 0
  1215     __ andi_(R5_ARG3, R5_ARG3, 3);
  1217     { // FasterArrayCopy
  1218       __ addi(R3_ARG1, R3_ARG1, -4);
  1219       __ addi(R4_ARG2, R4_ARG2, -4);
  1220       __ bind(l_3);
  1221       __ lwzu(tmp2, 4, R3_ARG1);
  1222       __ stwu(tmp2, 4, R4_ARG2);
  1223       __ bdnz(l_3);
  1224       __ addi(R3_ARG1, R3_ARG1, 4);
   1225       __ addi(R4_ARG2, R4_ARG2, 4);
   1226     }
  1228     // do single element copy
  1229     __ bind(l_1);
  1230     __ cmpwi(CCR0, R5_ARG3, 0);
  1231     __ beq(CCR0, l_4);
  1233     { // FasterArrayCopy
  1234       __ mtctr(R5_ARG3);
  1235       __ addi(R3_ARG1, R3_ARG1, -1);
  1236       __ addi(R4_ARG2, R4_ARG2, -1);
  1238       __ bind(l_5);
  1239       __ lbzu(tmp2, 1, R3_ARG1);
  1240       __ stbu(tmp2, 1, R4_ARG2);
   1241       __ bdnz(l_5);
   1242     }
  1244     __ bind(l_4);
  1245     __ blr();
   1247     return start;
   1248   }
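  // Editor's sketch (not part of this changeset): the unrolled l_8 loop above
  // corresponds to the following C shape; all four loads are issued before the
  // stores, which suits both Power5 and Power6 as noted in the comments:
  static void copy_32_byte_chunks_sketch(const unsigned long* from,
                                         unsigned long* to, size_t chunks) {
    while (chunks-- > 0) {
      unsigned long a = from[0], b = from[1], c = from[2], d = from[3];
      to[0] = a; to[1] = b; to[2] = c; to[3] = d;
      from += 4; to += 4;
    }
  }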
  1250   // Generate stub for conjoint byte copy.  If "aligned" is true, the
  1251   // "from" and "to" addresses are assumed to be heapword aligned.
  1252   //
  1253   // Arguments for generated stub:
  1254   //      from:  R3_ARG1
  1255   //      to:    R4_ARG2
  1256   //      count: R5_ARG3 treated as signed
  1257   //
  1258   address generate_conjoint_byte_copy(bool aligned, const char * name) {
  1259     StubCodeMark mark(this, "StubRoutines", name);
  1260     address start = __ function_entry();
  1262     Register tmp1 = R6_ARG4;
  1263     Register tmp2 = R7_ARG5;
  1264     Register tmp3 = R8_ARG6;
  1266 #if defined(ABI_ELFv2)
  1267      address nooverlap_target = aligned ?
  1268        StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
  1269        StubRoutines::jbyte_disjoint_arraycopy();
  1270 #else
  1271     address nooverlap_target = aligned ?
  1272       ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
  1273       ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
  1274 #endif
  1276     array_overlap_test(nooverlap_target, 0);
  1277     // Do reverse copy. We assume the case of actual overlap is rare enough
  1278     // that we don't have to optimize it.
  1279     Label l_1, l_2;
  1281     __ b(l_2);
  1282     __ bind(l_1);
  1283     __ stbx(tmp1, R4_ARG2, R5_ARG3);
  1284     __ bind(l_2);
  1285     __ addic_(R5_ARG3, R5_ARG3, -1);
  1286     __ lbzx(tmp1, R3_ARG1, R5_ARG3);
  1287     __ bge(CCR0, l_1);
  1289     __ blr();
   1291     return start;
   1292   }
  1294   // Generate stub for disjoint short copy.  If "aligned" is true, the
  1295   // "from" and "to" addresses are assumed to be heapword aligned.
  1296   //
  1297   // Arguments for generated stub:
  1298   //      from:  R3_ARG1
  1299   //      to:    R4_ARG2
  1300   //  elm.count: R5_ARG3 treated as signed
  1301   //
  1302   // Strategy for aligned==true:
  1303   //
  1304   //  If length <= 9:
  1305   //     1. copy 2 elements at a time (l_6)
  1306   //     2. copy last element if original element count was odd (l_1)
  1307   //
  1308   //  If length > 9:
  1309   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  1310   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  1311   //     3. copy last element if one was left in step 2. (l_1)
  1312   //
  1313   //
  1314   // Strategy for aligned==false:
  1315   //
  1316   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
  1317   //                  can be unaligned (see comment below)
  1318   //
  1319   //  If length > 9:
  1320   //     1. continue with step 6. if the alignment of from and to mod 4
  1321   //        is different.
  1322   //     2. align from and to to 4 bytes by copying 1 element if necessary
  1323   //     3. at l_2 from and to are 4 byte aligned; continue with
  1324   //        5. if they cannot be aligned to 8 bytes because they have
  1325   //        got different alignment mod 8.
  1326   //     4. at this point we know that both, from and to, have the same
  1327   //        alignment mod 8, now copy one element if necessary to get
  1328   //        8 byte alignment of from and to.
  1329   //     5. copy 4 elements at a time until less than 4 elements are
  1330   //        left; depending on step 3. all load/stores are aligned or
  1331   //        either all loads or all stores are unaligned.
  1332   //     6. copy 2 elements at a time until less than 2 elements are
  1333   //        left (l_6); arriving here from step 1., there is a chance
  1334   //        that all accesses are unaligned.
  1335   //     7. copy last element if one was left in step 6. (l_1)
  1336   //
  1337   //  There are unaligned data accesses using integer load/store
  1338   //  instructions in this stub. POWER allows such accesses.
  1339   //
  1340   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
  1341   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
  1342   //  integer load/stores have good performance. Only unaligned
  1343   //  floating point load/stores can have poor performance.
  1344   //
  1345   //  TODO:
  1346   //
  1347   //  1. check if aligning the backbranch target of loops is beneficial
  1348   //
  1349   address generate_disjoint_short_copy(bool aligned, const char * name) {
  1350     StubCodeMark mark(this, "StubRoutines", name);
  1352     Register tmp1 = R6_ARG4;
  1353     Register tmp2 = R7_ARG5;
  1354     Register tmp3 = R8_ARG6;
  1355     Register tmp4 = R9_ARG7;
  1357     address start = __ function_entry();
   1359     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
  1360     // don't try anything fancy if arrays don't have many elements
  1361     __ li(tmp3, 0);
  1362     __ cmpwi(CCR0, R5_ARG3, 9);
  1363     __ ble(CCR0, l_6); // copy 2 at a time
  1365     if (!aligned) {
  1366       __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1367       __ andi_(tmp1, tmp1, 3);
  1368       __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
  1370       // At this point it is guaranteed that both, from and to have the same alignment mod 4.
  1372       // Copy 1 element if necessary to align to 4 bytes.
  1373       __ andi_(tmp1, R3_ARG1, 3);
  1374       __ beq(CCR0, l_2);
  1376       __ lhz(tmp2, 0, R3_ARG1);
  1377       __ addi(R3_ARG1, R3_ARG1, 2);
  1378       __ sth(tmp2, 0, R4_ARG2);
  1379       __ addi(R4_ARG2, R4_ARG2, 2);
  1380       __ addi(R5_ARG3, R5_ARG3, -1);
  1381       __ bind(l_2);
  1383       // At this point the positions of both, from and to, are at least 4 byte aligned.
  1385       // Copy 4 elements at a time.
  1386       // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
  1387       __ xorr(tmp2, R3_ARG1, R4_ARG2);
  1388       __ andi_(tmp1, tmp2, 7);
  1389       __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
  1391       // Copy a 2-element word if necessary to align to 8 bytes.
  1392       __ andi_(R0, R3_ARG1, 7);
  1393       __ beq(CCR0, l_7);
  1395       __ lwzx(tmp2, R3_ARG1, tmp3);
  1396       __ addi(R5_ARG3, R5_ARG3, -2);
  1397       __ stwx(tmp2, R4_ARG2, tmp3);
  1398       { // FasterArrayCopy
  1399         __ addi(R3_ARG1, R3_ARG1, 4);
  1400         __ addi(R4_ARG2, R4_ARG2, 4);
  1401       }
  1402     }
  1404     __ bind(l_7);
  1406     // Copy 4 elements at a time; either the loads or the stores can
  1407     // be unaligned if aligned == false.
  1409     { // FasterArrayCopy
  1410       __ cmpwi(CCR0, R5_ARG3, 15);
  1411       __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
  1413       __ srdi(tmp1, R5_ARG3, 4);
  1414       __ andi_(R5_ARG3, R5_ARG3, 15);
  1415       __ mtctr(tmp1);
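             // ctr = count / 16 iterations of the 32-byte loop below;
             // R5_ARG3 keeps count % 16 for the 2-element and 1-element tails.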
  1417       __ bind(l_8);
  1418       // Use unrolled version for mass copying (copy 16 elements at a time).
  1419       // A load feeding a store has zero latency on Power6, but not on Power5.
  1420       // The following sequence therefore performs well on both.
  1421       __ ld(tmp1, 0, R3_ARG1);
  1422       __ ld(tmp2, 8, R3_ARG1);
  1423       __ ld(tmp3, 16, R3_ARG1);
  1424       __ ld(tmp4, 24, R3_ARG1);
  1425       __ std(tmp1, 0, R4_ARG2);
  1426       __ std(tmp2, 8, R4_ARG2);
  1427       __ std(tmp3, 16, R4_ARG2);
  1428       __ std(tmp4, 24, R4_ARG2);
  1429       __ addi(R3_ARG1, R3_ARG1, 32);
  1430       __ addi(R4_ARG2, R4_ARG2, 32);
  1431       __ bdnz(l_8);
  1432     }
  1433     __ bind(l_6);
  1435     // copy 2 elements at a time
  1436     { // FasterArrayCopy
  1437       __ cmpwi(CCR0, R5_ARG3, 2);
  1438       __ blt(CCR0, l_1);
  1439       __ srdi(tmp1, R5_ARG3, 1);
  1440       __ andi_(R5_ARG3, R5_ARG3, 1);
  1442       __ addi(R3_ARG1, R3_ARG1, -4);
  1443       __ addi(R4_ARG2, R4_ARG2, -4);
  1444       __ mtctr(tmp1);
  1446       __ bind(l_3);
  1447       __ lwzu(tmp2, 4, R3_ARG1);
  1448       __ stwu(tmp2, 4, R4_ARG2);
  1449       __ bdnz(l_3);
  1451       __ addi(R3_ARG1, R3_ARG1, 4);
  1452       __ addi(R4_ARG2, R4_ARG2, 4);
  1453     }
  1455     // do single element copy
  1456     __ bind(l_1);
  1457     __ cmpwi(CCR0, R5_ARG3, 0);
  1458     __ beq(CCR0, l_4);
  1460     { // FasterArrayCopy
  1461       __ mtctr(R5_ARG3);
  1462       __ addi(R3_ARG1, R3_ARG1, -2);
  1463       __ addi(R4_ARG2, R4_ARG2, -2);
  1465       __ bind(l_5);
  1466       __ lhzu(tmp2, 2, R3_ARG1);
  1467       __ sthu(tmp2, 2, R4_ARG2);
  1468       __ bdnz(l_5);
  1469     }
  1470     __ bind(l_4);
  1471     __ blr();
  1473     return start;
  1474   }
  1476   // Generate stub for conjoint short copy.  If "aligned" is true, the
  1477   // "from" and "to" addresses are assumed to be heapword aligned.
  1478   //
  1479   // Arguments for generated stub:
  1480   //      from:  R3_ARG1
  1481   //      to:    R4_ARG2
  1482   //      count: R5_ARG3 treated as signed
  1483   //
  1484   address generate_conjoint_short_copy(bool aligned, const char * name) {
  1485     StubCodeMark mark(this, "StubRoutines", name);
  1486     address start = __ function_entry();
  1488     Register tmp1 = R6_ARG4;
  1489     Register tmp2 = R7_ARG5;
  1490     Register tmp3 = R8_ARG6;
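           // With the ELFv2 ABI a stub address is the code entry point itself; with
           // the ELFv1 ABI a StubRoutines entry is a function descriptor, so the
           // actual code entry must be read from the descriptor.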
  1492 #if defined(ABI_ELFv2)
  1493     address nooverlap_target = aligned ?
  1494         StubRoutines::arrayof_jshort_disjoint_arraycopy() :
  1495         StubRoutines::jshort_disjoint_arraycopy();
  1496 #else
  1497     address nooverlap_target = aligned ?
  1498         ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
  1499         ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
  1500 #endif
  1502     array_overlap_test(nooverlap_target, 1);
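           // If the regions do not overlap, array_overlap_test branches to the
           // disjoint stub (second argument = log2 of the element size). Otherwise
           // fall through to a simple backward copy: tmp1 starts at the byte offset
           // of the last element (2 * count - 2 after the first decrement) and
           // walks down to 0.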
  1504     Label l_1, l_2;
  1505     __ sldi(tmp1, R5_ARG3, 1);
  1506     __ b(l_2);
  1507     __ bind(l_1);
  1508     __ sthx(tmp2, R4_ARG2, tmp1);
  1509     __ bind(l_2);
  1510     __ addic_(tmp1, tmp1, -2);
  1511     __ lhzx(tmp2, R3_ARG1, tmp1);
  1512     __ bge(CCR0, l_1);
  1514     __ blr();
  1516     return start;
  1517   }
  1519   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
  1520   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
  1521   //
  1522   // Arguments:
  1523   //      from:  R3_ARG1
  1524   //      to:    R4_ARG2
  1525   //      count: R5_ARG3 treated as signed
  1526   //
  1527   void generate_disjoint_int_copy_core(bool aligned) {
  1528     Register tmp1 = R6_ARG4;
  1529     Register tmp2 = R7_ARG5;
  1530     Register tmp3 = R8_ARG6;
  1531     Register tmp4 = R0;
  1533     Label l_1, l_2, l_3, l_4, l_5, l_6;
  1534     // for short arrays, just do single element copy
  1535     __ li(tmp3, 0);
  1536     __ cmpwi(CCR0, R5_ARG3, 5);
  1537     __ ble(CCR0, l_2);
  1539     if (!aligned) {
  1540         // check if arrays have same alignment mod 8.
  1541         __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1542         __ andi_(R0, tmp1, 7);
  1543         // Not the same alignment, but ld and std only need 4 byte alignment.
  1544         __ bne(CCR0, l_4); // different alignment mod 8 -> skip the 8 byte alignment step
  1546         // copy 1 element to align to and from on an 8 byte boundary
  1547         __ andi_(R0, R3_ARG1, 7);
  1548         __ beq(CCR0, l_4);
  1550         __ lwzx(tmp2, R3_ARG1, tmp3);
  1551         __ addi(R5_ARG3, R5_ARG3, -1);
  1552         __ stwx(tmp2, R4_ARG2, tmp3);
  1553         { // FasterArrayCopy
  1554           __ addi(R3_ARG1, R3_ARG1, 4);
  1555           __ addi(R4_ARG2, R4_ARG2, 4);
  1556         }
  1557         __ bind(l_4);
  1558     }
  1560     { // FasterArrayCopy
  1561       __ cmpwi(CCR0, R5_ARG3, 7);
  1562       __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
  1564       __ srdi(tmp1, R5_ARG3, 3);
  1565       __ andi_(R5_ARG3, R5_ARG3, 7);
  1566       __ mtctr(tmp1);
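             // ctr = count / 8 iterations of the 32-byte loop below;
             // R5_ARG3 keeps count % 8 for the single-element tail.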
  1568       __ bind(l_6);
  1569       // Use unrolled version for mass copying (copy 8 elements at a time).
  1570       // A load feeding a store has zero latency on Power6, but not on Power5.
  1571       // The following sequence therefore performs well on both.
  1572       __ ld(tmp1, 0, R3_ARG1);
  1573       __ ld(tmp2, 8, R3_ARG1);
  1574       __ ld(tmp3, 16, R3_ARG1);
  1575       __ ld(tmp4, 24, R3_ARG1);
  1576       __ std(tmp1, 0, R4_ARG2);
  1577       __ std(tmp2, 8, R4_ARG2);
  1578       __ std(tmp3, 16, R4_ARG2);
  1579       __ std(tmp4, 24, R4_ARG2);
  1580       __ addi(R3_ARG1, R3_ARG1, 32);
  1581       __ addi(R4_ARG2, R4_ARG2, 32);
  1582       __ bdnz(l_6);
  1583     }
  1585     // copy 1 element at a time
  1586     __ bind(l_2);
  1587     __ cmpwi(CCR0, R5_ARG3, 0);
  1588     __ beq(CCR0, l_1);
  1590     { // FasterArrayCopy
  1591       __ mtctr(R5_ARG3);
  1592       __ addi(R3_ARG1, R3_ARG1, -4);
  1593       __ addi(R4_ARG2, R4_ARG2, -4);
  1595       __ bind(l_3);
  1596       __ lwzu(tmp2, 4, R3_ARG1);
  1597       __ stwu(tmp2, 4, R4_ARG2);
  1598       __ bdnz(l_3);
  1599     }
  1601     __ bind(l_1);
  1602     return;
  1603   }
  1605   // Generate stub for disjoint int copy.  If "aligned" is true, the
  1606   // "from" and "to" addresses are assumed to be heapword aligned.
  1607   //
  1608   // Arguments for generated stub:
  1609   //      from:  R3_ARG1
  1610   //      to:    R4_ARG2
  1611   //      count: R5_ARG3 treated as signed
  1612   //
  1613   address generate_disjoint_int_copy(bool aligned, const char * name) {
  1614     StubCodeMark mark(this, "StubRoutines", name);
  1615     address start = __ function_entry();
  1616     generate_disjoint_int_copy_core(aligned);
  1617     __ blr();
  1618     return start;
  1619   }
  1621   // Generate core code for conjoint int copy (and oop copy on
  1622   // 32-bit).  If "aligned" is true, the "from" and "to" addresses
  1623   // are assumed to be heapword aligned.
  1624   //
  1625   // Arguments:
  1626   //      from:  R3_ARG1
  1627   //      to:    R4_ARG2
  1628   //      count: R5_ARG3 treated as signed
  1629   //
  1630   void generate_conjoint_int_copy_core(bool aligned) {
  1631     // Do reverse copy.  We assume the case of actual overlap is rare enough
  1632     // that we don't have to optimize it.
  1634     Label l_1, l_2, l_3, l_4, l_5, l_6;
  1636     Register tmp1 = R6_ARG4;
  1637     Register tmp2 = R7_ARG5;
  1638     Register tmp3 = R8_ARG6;
  1639     Register tmp4 = R0;
  1641     { // FasterArrayCopy
  1642       __ cmpwi(CCR0, R5_ARG3, 0);
  1643       __ beq(CCR0, l_6);
  1645       __ sldi(R5_ARG3, R5_ARG3, 2);
  1646       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
  1647       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
  1648       __ srdi(R5_ARG3, R5_ARG3, 2);
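             // from and to now point just past the last element (count * 4 bytes
             // were added and count restored); the copy below runs backwards.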
  1650       __ cmpwi(CCR0, R5_ARG3, 7);
  1651       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
  1653       __ srdi(tmp1, R5_ARG3, 3);
  1654       __ andi(R5_ARG3, R5_ARG3, 7);
  1655       __ mtctr(tmp1);
  1657       __ bind(l_4);
  1658       // Use unrolled version for mass copying (copy 8 elements at a time).
  1659       // A load feeding a store has zero latency on Power6, but not on Power5.
  1660       // The following sequence therefore performs well on both.
  1661       __ addi(R3_ARG1, R3_ARG1, -32);
  1662       __ addi(R4_ARG2, R4_ARG2, -32);
  1663       __ ld(tmp4, 24, R3_ARG1);
  1664       __ ld(tmp3, 16, R3_ARG1);
  1665       __ ld(tmp2, 8, R3_ARG1);
  1666       __ ld(tmp1, 0, R3_ARG1);
  1667       __ std(tmp4, 24, R4_ARG2);
  1668       __ std(tmp3, 16, R4_ARG2);
  1669       __ std(tmp2, 8, R4_ARG2);
  1670       __ std(tmp1, 0, R4_ARG2);
  1671       __ bdnz(l_4);
  1673       __ cmpwi(CCR0, R5_ARG3, 0);
  1674       __ beq(CCR0, l_6);
  1676       __ bind(l_5);
  1677       __ mtctr(R5_ARG3);
  1678       __ bind(l_3);
  1679       __ lwz(R0, -4, R3_ARG1);
  1680       __ stw(R0, -4, R4_ARG2);
  1681       __ addi(R3_ARG1, R3_ARG1, -4);
  1682       __ addi(R4_ARG2, R4_ARG2, -4);
  1683       __ bdnz(l_3);
  1685       __ bind(l_6);
  1686     }
  1687   }
  1689   // Generate stub for conjoint int copy.  If "aligned" is true, the
  1690   // "from" and "to" addresses are assumed to be heapword aligned.
  1691   //
  1692   // Arguments for generated stub:
  1693   //      from:  R3_ARG1
  1694   //      to:    R4_ARG2
  1695   //      count: R5_ARG3 treated as signed
  1696   //
  1697   address generate_conjoint_int_copy(bool aligned, const char * name) {
  1698     StubCodeMark mark(this, "StubRoutines", name);
  1699     address start = __ function_entry();
  1701 #if defined(ABI_ELFv2)
  1702     address nooverlap_target = aligned ?
  1703       StubRoutines::arrayof_jint_disjoint_arraycopy() :
  1704       StubRoutines::jint_disjoint_arraycopy();
  1705 #else
  1706     address nooverlap_target = aligned ?
  1707       ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
  1708       ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
  1709 #endif
  1711     array_overlap_test(nooverlap_target, 2);
  1713     generate_conjoint_int_copy_core(aligned);
  1715     __ blr();
  1717     return start;
  1718   }
  1720   // Generate core code for disjoint long copy (and oop copy on
  1721   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
  1722   // are assumed to be heapword aligned.
  1723   //
  1724   // Arguments:
  1725   //      from:  R3_ARG1
  1726   //      to:    R4_ARG2
  1727   //      count: R5_ARG3 treated as signed
  1728   //
  1729   void generate_disjoint_long_copy_core(bool aligned) {
  1730     Register tmp1 = R6_ARG4;
  1731     Register tmp2 = R7_ARG5;
  1732     Register tmp3 = R8_ARG6;
  1733     Register tmp4 = R0;
  1735     Label l_1, l_2, l_3, l_4;
  1737     { // FasterArrayCopy
  1738       __ cmpwi(CCR0, R5_ARG3, 3);
  1739       __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
  1741       __ srdi(tmp1, R5_ARG3, 2);
  1742       __ andi_(R5_ARG3, R5_ARG3, 3);
  1743       __ mtctr(tmp1);
  1745       __ bind(l_4);
  1746       // Use unrolled version for mass copying (copy 4 elements at a time).
  1747       // A load feeding a store has zero latency on Power6, but not on Power5.
  1748       // The following sequence therefore performs well on both.
  1749       __ ld(tmp1, 0, R3_ARG1);
  1750       __ ld(tmp2, 8, R3_ARG1);
  1751       __ ld(tmp3, 16, R3_ARG1);
  1752       __ ld(tmp4, 24, R3_ARG1);
  1753       __ std(tmp1, 0, R4_ARG2);
  1754       __ std(tmp2, 8, R4_ARG2);
  1755       __ std(tmp3, 16, R4_ARG2);
  1756       __ std(tmp4, 24, R4_ARG2);
  1757       __ addi(R3_ARG1, R3_ARG1, 32);
  1758       __ addi(R4_ARG2, R4_ARG2, 32);
  1759       __ bdnz(l_4);
  1760     }
  1762     // copy 1 element at a time
  1763     __ bind(l_3);
  1764     __ cmpwi(CCR0, R5_ARG3, 0);
  1765     __ beq(CCR0, l_1);
  1767     { // FasterArrayCopy
  1768       __ mtctr(R5_ARG3);
  1769       __ addi(R3_ARG1, R3_ARG1, -8);
  1770       __ addi(R4_ARG2, R4_ARG2, -8);
  1772       __ bind(l_2);
  1773       __ ldu(R0, 8, R3_ARG1);
  1774       __ stdu(R0, 8, R4_ARG2);
  1775       __ bdnz(l_2);
  1776     }
  1778     __ bind(l_1);
  1779   }
  1781   // Generate stub for disjoint long copy.  If "aligned" is true, the
  1782   // "from" and "to" addresses are assumed to be heapword aligned.
  1783   //
  1784   // Arguments for generated stub:
  1785   //      from:  R3_ARG1
  1786   //      to:    R4_ARG2
  1787   //      count: R5_ARG3 treated as signed
  1788   //
  1789   address generate_disjoint_long_copy(bool aligned, const char * name) {
  1790     StubCodeMark mark(this, "StubRoutines", name);
  1791     address start = __ function_entry();
  1792     generate_disjoint_long_copy_core(aligned);
  1793     __ blr();
  1795     return start;
  1796   }
  1798   // Generate core code for conjoint long copy (and oop copy on
  1799   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
  1800   // are assumed to be heapword aligned.
  1801   //
  1802   // Arguments:
  1803   //      from:  R3_ARG1
  1804   //      to:    R4_ARG2
  1805   //      count: R5_ARG3 treated as signed
  1806   //
  1807   void generate_conjoint_long_copy_core(bool aligned) {
  1808     Register tmp1 = R6_ARG4;
  1809     Register tmp2 = R7_ARG5;
  1810     Register tmp3 = R8_ARG6;
  1811     Register tmp4 = R0;
  1813     Label l_1, l_2, l_3, l_4, l_5;
  1815     __ cmpwi(CCR0, R5_ARG3, 0);
  1816     __ beq(CCR0, l_1);
  1818     { // FasterArrayCopy
  1819       __ sldi(R5_ARG3, R5_ARG3, 3);
  1820       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
  1821       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
  1822       __ srdi(R5_ARG3, R5_ARG3, 3);
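             // As in the int variant: from and to now point just past the last
             // element (count * 8 bytes), and the copy below runs backwards.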
  1824       __ cmpwi(CCR0, R5_ARG3, 3);
  1825       __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
  1827       __ srdi(tmp1, R5_ARG3, 2);
  1828       __ andi(R5_ARG3, R5_ARG3, 3);
  1829       __ mtctr(tmp1);
  1831       __ bind(l_4);
  1832       // Use unrolled version for mass copying (copy 4 elements at a time).
  1833       // A load feeding a store has zero latency on Power6, but not on Power5.
  1834       // The following sequence therefore performs well on both.
  1835       __ addi(R3_ARG1, R3_ARG1, -32);
  1836       __ addi(R4_ARG2, R4_ARG2, -32);
  1837       __ ld(tmp4, 24, R3_ARG1);
  1838       __ ld(tmp3, 16, R3_ARG1);
  1839       __ ld(tmp2, 8, R3_ARG1);
  1840       __ ld(tmp1, 0, R3_ARG1);
  1841       __ std(tmp4, 24, R4_ARG2);
  1842       __ std(tmp3, 16, R4_ARG2);
  1843       __ std(tmp2, 8, R4_ARG2);
  1844       __ std(tmp1, 0, R4_ARG2);
  1845       __ bdnz(l_4);
  1847       __ cmpwi(CCR0, R5_ARG3, 0);
  1848       __ beq(CCR0, l_1);
  1850       __ bind(l_5);
  1851       __ mtctr(R5_ARG3);
  1852       __ bind(l_3);
  1853       __ ld(R0, -8, R3_ARG1);
  1854       __ std(R0, -8, R4_ARG2);
  1855       __ addi(R3_ARG1, R3_ARG1, -8);
  1856       __ addi(R4_ARG2, R4_ARG2, -8);
  1857       __ bdnz(l_3);
  1858     }
  1860     __ bind(l_1);
  1861   }
  1863   // Generate stub for conjoint long copy.  If "aligned" is true, the
  1864   // "from" and "to" addresses are assumed to be heapword aligned.
  1865   //
  1866   // Arguments for generated stub:
  1867   //      from:  R3_ARG1
  1868   //      to:    R4_ARG2
  1869   //      count: R5_ARG3 treated as signed
  1870   //
  1871   address generate_conjoint_long_copy(bool aligned, const char * name) {
  1872     StubCodeMark mark(this, "StubRoutines", name);
  1873     address start = __ function_entry();
  1875 #if defined(ABI_ELFv2)
  1876     address nooverlap_target = aligned ?
  1877       StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  1878       StubRoutines::jlong_disjoint_arraycopy();
  1879 #else
  1880     address nooverlap_target = aligned ?
  1881       ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
  1882       ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
  1883 #endif
  1885     array_overlap_test(nooverlap_target, 3);
  1886     generate_conjoint_long_copy_core(aligned);
  1888     __ blr();
  1890     return start;
  1891   }
  1893   // Generate stub for conjoint oop copy.  If "aligned" is true, the
  1894   // "from" and "to" addresses are assumed to be heapword aligned.
  1895   //
  1896   // Arguments for generated stub:
  1897   //      from:  R3_ARG1
  1898   //      to:    R4_ARG2
  1899   //      count: R5_ARG3 treated as signed
  1900   //      dest_uninitialized: G1 support
  1901   //
  1902   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
  1903     StubCodeMark mark(this, "StubRoutines", name);
  1905     address start = __ function_entry();
  1907 #if defined(ABI_ELFv2)
  1908     address nooverlap_target = aligned ?
  1909       StubRoutines::arrayof_oop_disjoint_arraycopy() :
  1910       StubRoutines::oop_disjoint_arraycopy();
  1911 #else
  1912     address nooverlap_target = aligned ?
  1913       ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
  1914       ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
  1915 #endif
  1917     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
  1919     // Save arguments.
  1920     __ mr(R9_ARG7, R4_ARG2);
  1921     __ mr(R10_ARG8, R5_ARG3);
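           // With compressed oops each element is a 32-bit narrowOop, so the int
           // copy core is reused; otherwise elements are 64-bit oops and the long
           // copy core is used. The overlap test gets the matching log2 element
           // size (2 or 3).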
  1923     if (UseCompressedOops) {
  1924       array_overlap_test(nooverlap_target, 2);
  1925       generate_conjoint_int_copy_core(aligned);
  1926     } else {
  1927       array_overlap_test(nooverlap_target, 3);
  1928       generate_conjoint_long_copy_core(aligned);
  1929     }
  1931     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
  1932     return start;
  1933   }
  1935   // Generate stub for disjoint oop copy.  If "aligned" is true, the
  1936   // "from" and "to" addresses are assumed to be heapword aligned.
  1937   //
  1938   // Arguments for generated stub:
  1939   //      from:  R3_ARG1
  1940   //      to:    R4_ARG2
  1941   //      count: R5_ARG3 treated as signed
  1942   //      dest_uninitialized: G1 support
  1943   //
  1944   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
  1945     StubCodeMark mark(this, "StubRoutines", name);
  1946     address start = __ function_entry();
  1948     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
  1950     // Save some arguments; the copy core destroys them.
  1951     // They are needed for the post barrier.
  1952     __ mr(R9_ARG7, R4_ARG2);
  1953     __ mr(R10_ARG8, R5_ARG3);
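           // As above: narrowOops copy as 32-bit ints, uncompressed oops as 64-bit longs.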
  1955     if (UseCompressedOops) {
  1956       generate_disjoint_int_copy_core(aligned);
  1957     } else {
  1958       generate_disjoint_long_copy_core(aligned);
  1959     }
  1961     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
  1963     return start;
  1964   }
  1966   void generate_arraycopy_stubs() {
  1967     // Note: the disjoint stubs must be generated first, some of
  1968     // the conjoint stubs use them.
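           // (A conjoint stub performs the overlap test and branches to the
           // corresponding disjoint stub when the source and destination ranges
           // do not overlap.)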
  1970     // non-aligned disjoint versions
  1971     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  1972     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  1973     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
  1974     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  1975     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
  1976     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
  1978     // aligned disjoint versions
  1979     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
  1980     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
  1981     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
  1982     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
  1983     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
  1984     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
  1986     // non-aligned conjoint versions
  1987     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  1988     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(false, "jshort_arraycopy");
  1989     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(false, "jint_arraycopy");
  1990     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(false, "jlong_arraycopy");
  1991     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
  1992     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
  1994     // aligned conjoint versions
  1995     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
  1996     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
  1997     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
  1998     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
  1999     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
  2000     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
  2002     // fill routines
  2003     StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
  2004     StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
  2005     StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
  2006     StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
  2007     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  2008     StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");
  2009   }
  2011   // Safefetch stubs.
  2012   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
  2013     // safefetch signatures:
  2014     //   int      SafeFetch32(int*      adr, int      errValue);
  2015     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  2016     //
  2017     // arguments:
  2018     //   R3_ARG1 = adr
  2019     //   R4_ARG2 = errValue
  2020     //
  2021     // result:
  2022     //   R3_RET  = *adr or errValue
  2024     StubCodeMark mark(this, "StubRoutines", name);
  2026     // Entry point, pc or function descriptor.
  2027     *entry = __ function_entry();
  2029     // Load *adr into R4_ARG2, may fault.
  2030     *fault_pc = __ pc();
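           // If this load faults, the VM's signal handler is expected to resume
           // execution at *continuation_pc, where errValue (still in R4_ARG2) is
           // returned instead.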
  2031     switch (size) {
  2032       case 4:
  2033         // int32_t, sign extended
  2034         __ lwa(R4_ARG2, 0, R3_ARG1);
  2035         break;
  2036       case 8:
  2037         // int64_t
  2038         __ ld(R4_ARG2, 0, R3_ARG1);
  2039         break;
  2040       default:
  2041         ShouldNotReachHere();
  2042     }
  2044     // return errValue or *adr
  2045     *continuation_pc = __ pc();
  2046     __ mr(R3_RET, R4_ARG2);
  2047     __ blr();
  2048   }
  2050   // Initialization
  2051   void generate_initial() {
  2052     // Generates all stubs and initializes the entry points
  2054     // Entry points that exist in all platforms.
  2055     // Note: This is code that could be shared among different platforms - however the
  2056     // benefit seems to be smaller than the disadvantage of having a
  2057     // much more complicated generator structure. See also comment in
  2058     // stubRoutines.hpp.
  2060     StubRoutines::_forward_exception_entry          = generate_forward_exception();
  2061     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
  2062     StubRoutines::_catch_exception_entry            = generate_catch_exception();
  2064     // Build this early so it's available for the interpreter.
  2065     StubRoutines::_throw_StackOverflowError_entry   =
  2066       generate_throw_exception("StackOverflowError throw_exception",
  2067                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
  2068   }
  2070   void generate_all() {
  2071     // Generates all stubs and initializes the entry points
  2073     // These entry points require SharedInfo::stack0 to be set up in
  2074     // non-core builds
  2075     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
  2076     // Handle IncompatibleClassChangeError in itable stubs.
  2077     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
  2078     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
  2080     StubRoutines::_handler_for_unsafe_access_entry         = generate_handler_for_unsafe_access();
  2082     // support for verify_oop (must happen after universe_init)
  2083     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
  2085     // arraycopy stubs used by compilers
  2086     generate_arraycopy_stubs();
  2088     if (UseAESIntrinsics) {
  2089       guarantee(!UseAESIntrinsics, "not yet implemented.");
  2090     }
  2092     // PPC uses stubs for safefetch.
  2093     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
  2094                                                        &StubRoutines::_safefetch32_fault_pc,
  2095                                                        &StubRoutines::_safefetch32_continuation_pc);
  2096     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
  2097                                                        &StubRoutines::_safefetchN_fault_pc,
  2098                                                        &StubRoutines::_safefetchN_continuation_pc);
  2099   }
  2101  public:
  2102   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  2103     // replace the standard masm with a special one:
  2104     _masm = new MacroAssembler(code);
  2105     if (all) {
  2106       generate_all();
  2107     } else {
  2108       generate_initial();
  2109     }
  2110   }
  2111 };
  2113 void StubGenerator_generate(CodeBuffer* code, bool all) {
  2114   StubGenerator g(code, all);
  2115 }
