src/cpu/sparc/vm/stubGenerator_sparc.cpp

author:      johnc
date:        Thu, 07 Apr 2011 09:53:20 -0700
changeset:   2781:e1162778c1c8
parent:      2606:0ac769a57c64
child:       2978:d83ac25d0304
permissions: -rw-r--r--

7009266: G1: assert(obj->is_oop_or_null(true )) failed: Error
Summary: A referent object that is only weakly reachable at the start of concurrent marking but is re-attached to the strongly reachable object graph during marking may not be marked as live. This can cause the reference object to be processed prematurely and leave dangling pointers to the referent object. Implement a read barrier for the java.lang.ref.Reference::referent field by intrinsifying the Reference.get() method, and by intercepting accesses through JNI, reflection, and Unsafe, so that when a non-null referent object is read it is also logged in an SATB buffer.
Reviewed-by: kvn, iveresov, never, tonyp, dholmes
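For readers unfamiliar with the G1 SATB (snapshot-at-the-beginning) machinery, the following is a minimal, self-contained C++ sketch of the idea described in the summary; it is not HotSpot code, and the names oop_t, SATBBuffer, marking_active and reference_get are hypothetical. The point it illustrates: whenever a non-null referent is loaded while concurrent marking is active, the referent is also logged in the reading thread's SATB buffer, so the concurrent marker treats it (and everything reachable from it) as live.

    // Illustrative sketch only -- a simplified model of an SATB read barrier on
    // java.lang.ref.Reference.referent. All types and names are hypothetical.
    #include <cstddef>
    #include <vector>

    struct oop_t { void* obj; };                  // stand-in for an oop (the referent field)

    struct SATBBuffer {
      std::vector<void*> entries;                 // per-thread log of oops the marker must visit
      void enqueue(void* obj) { entries.push_back(obj); }
    };

    static bool marking_active = false;           // true while concurrent marking is running
    static thread_local SATBBuffer satb_buffer;   // per-thread SATB queue

    // Model of the intrinsified Reference.get(): perform the load, and if marking
    // is active and the result is non-null, log it in the SATB buffer.
    void* reference_get(oop_t* referent_field) {
      void* referent = referent_field->obj;       // the actual read of Reference.referent
      if (marking_active && referent != NULL) {
        satb_buffer.enqueue(referent);            // G1 pre-barrier style logging
      }
      return referent;
    }

The same logging has to be applied on the JNI, reflection, and Unsafe access paths mentioned in the summary, since those read the referent field without going through the Reference.get() intrinsic.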

     1 /*
     2  * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.
     8  *
     9  * This code is distributed in the hope that it will be useful, but WITHOUT
    10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    12  * version 2 for more details (a copy is included in the LICENSE file that
    13  * accompanied this code).
    14  *
    15  * You should have received a copy of the GNU General Public License version
    16  * 2 along with this work; if not, write to the Free Software Foundation,
    17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    18  *
    19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    20  * or visit www.oracle.com if you need additional information or have any
    21  * questions.
    22  *
    23  */
    25 #include "precompiled.hpp"
    26 #include "asm/assembler.hpp"
    27 #include "assembler_sparc.inline.hpp"
    28 #include "interpreter/interpreter.hpp"
    29 #include "nativeInst_sparc.hpp"
    30 #include "oops/instanceOop.hpp"
    31 #include "oops/methodOop.hpp"
    32 #include "oops/objArrayKlass.hpp"
    33 #include "oops/oop.inline.hpp"
    34 #include "prims/methodHandles.hpp"
    35 #include "runtime/frame.inline.hpp"
    36 #include "runtime/handles.inline.hpp"
    37 #include "runtime/sharedRuntime.hpp"
    38 #include "runtime/stubCodeGenerator.hpp"
    39 #include "runtime/stubRoutines.hpp"
    40 #include "utilities/top.hpp"
    41 #ifdef TARGET_OS_FAMILY_linux
    42 # include "thread_linux.inline.hpp"
    43 #endif
    44 #ifdef TARGET_OS_FAMILY_solaris
    45 # include "thread_solaris.inline.hpp"
    46 #endif
    47 #ifdef COMPILER2
    48 #include "opto/runtime.hpp"
    49 #endif
    51 // Declaration and definition of StubGenerator (no .hpp file).
    52 // For a more detailed description of the stub routine structure
    53 // see the comment in stubRoutines.hpp.
    55 #define __ _masm->
    57 #ifdef PRODUCT
    58 #define BLOCK_COMMENT(str) /* nothing */
    59 #else
    60 #define BLOCK_COMMENT(str) __ block_comment(str)
    61 #endif
    63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
    65 // Note:  The register L7 is used as L7_thread_cache, and may not be used
    66 //        any other way within this module.
    69 static const Register& Lstub_temp = L2;
    71 // -------------------------------------------------------------------------------------------------------------------------
    72 // Stub Code definitions
    74 static address handle_unsafe_access() {
    75   JavaThread* thread = JavaThread::current();
    76   address pc  = thread->saved_exception_pc();
    77   address npc = thread->saved_exception_npc();
    78   // pc is the instruction which we must emulate
    79   // doing a no-op is fine:  return garbage from the load
    81   // request an async exception
    82   thread->set_pending_unsafe_access_error();
    84   // return address of next instruction to execute
    85   return npc;
    86 }
    88 class StubGenerator: public StubCodeGenerator {
    89  private:
    91 #ifdef PRODUCT
    92 #define inc_counter_np(a,b,c) (0)
    93 #else
    94 #define inc_counter_np(counter, t1, t2) \
    95   BLOCK_COMMENT("inc_counter " #counter); \
    96   __ inc_counter(&counter, t1, t2);
    97 #endif
    99   //----------------------------------------------------------------------------------------------------
   100   // Call stubs are used to call Java from C
   102   address generate_call_stub(address& return_pc) {
   103     StubCodeMark mark(this, "StubRoutines", "call_stub");
   104     address start = __ pc();
   106     // Incoming arguments:
   107     //
   108     // o0         : call wrapper address
   109     // o1         : result (address)
   110     // o2         : result type
   111     // o3         : method
   112     // o4         : (interpreter) entry point
   113     // o5         : parameters (address)
   114     // [sp + 0x5c]: parameter size (in words)
   115     // [sp + 0x60]: thread
   116     //
   117     // +---------------+ <--- sp + 0
   118     // |               |
   119     // . reg save area .
   120     // |               |
   121     // +---------------+ <--- sp + 0x40
   122     // |               |
   123     // . extra 7 slots .
   124     // |               |
   125     // +---------------+ <--- sp + 0x5c
   126     // |  param. size  |
   127     // +---------------+ <--- sp + 0x60
   128     // |    thread     |
   129     // +---------------+
   130     // |               |
   132     // note: if the link argument position changes, adjust
   133     //       the code in frame::entry_frame_call_wrapper()
   135     const Argument link           = Argument(0, false); // used only for GC
   136     const Argument result         = Argument(1, false);
   137     const Argument result_type    = Argument(2, false);
   138     const Argument method         = Argument(3, false);
   139     const Argument entry_point    = Argument(4, false);
   140     const Argument parameters     = Argument(5, false);
   141     const Argument parameter_size = Argument(6, false);
   142     const Argument thread         = Argument(7, false);
   144     // setup thread register
   145     __ ld_ptr(thread.as_address(), G2_thread);
   146     __ reinit_heapbase();
   148 #ifdef ASSERT
   149     // make sure we have no pending exceptions
   150     { const Register t = G3_scratch;
   151       Label L;
   152       __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
   153       __ br_null(t, false, Assembler::pt, L);
   154       __ delayed()->nop();
   155       __ stop("StubRoutines::call_stub: entered with pending exception");
   156       __ bind(L);
   157     }
   158 #endif
   160     // create activation frame & allocate space for parameters
   161     { const Register t = G3_scratch;
   162       __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
   163       __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
   164       __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
   165       __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
   166       __ neg(t);                                                // negate so it can be used with save
   167       __ save(SP, t, SP);                                       // setup new frame
   168     }
   170     // +---------------+ <--- sp + 0
   171     // |               |
   172     // . reg save area .
   173     // |               |
   174     // +---------------+ <--- sp + 0x40
   175     // |               |
   176     // . extra 7 slots .
   177     // |               |
   178     // +---------------+ <--- sp + 0x5c
   179     // |  empty slot   |      (only if parameter size is even)
   180     // +---------------+
   181     // |               |
   182     // .  parameters   .
   183     // |               |
   184     // +---------------+ <--- fp + 0
   185     // |               |
   186     // . reg save area .
   187     // |               |
   188     // +---------------+ <--- fp + 0x40
   189     // |               |
   190     // . extra 7 slots .
   191     // |               |
   192     // +---------------+ <--- fp + 0x5c
   193     // |  param. size  |
   194     // +---------------+ <--- fp + 0x60
   195     // |    thread     |
   196     // +---------------+
   197     // |               |
   199     // pass parameters if any
   200     BLOCK_COMMENT("pass parameters if any");
   201     { const Register src = parameters.as_in().as_register();
   202       const Register dst = Lentry_args;
   203       const Register tmp = G3_scratch;
   204       const Register cnt = G4_scratch;
   206       // test if any parameters & setup of Lentry_args
   207       Label exit;
   208       __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
   209       __ add( FP, STACK_BIAS, dst );
   210       __ tst(cnt);
   211       __ br(Assembler::zero, false, Assembler::pn, exit);
   212       __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
   214       // copy parameters if any
   215       Label loop;
   216       __ BIND(loop);
   217       // Store parameter value
   218       __ ld_ptr(src, 0, tmp);
   219       __ add(src, BytesPerWord, src);
   220       __ st_ptr(tmp, dst, 0);
   221       __ deccc(cnt);
   222       __ br(Assembler::greater, false, Assembler::pt, loop);
   223       __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
   225       // done
   226       __ BIND(exit);
   227     }
   229     // setup parameters, method & call Java function
   230 #ifdef ASSERT
    231     // layout_activation_impl checks its notion of saved SP against
   232     // this register, so if this changes update it as well.
   233     const Register saved_SP = Lscratch;
   234     __ mov(SP, saved_SP);                               // keep track of SP before call
   235 #endif
   237     // setup parameters
   238     const Register t = G3_scratch;
   239     __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
   240     __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
   241     __ sub(FP, t, Gargs);                              // setup parameter pointer
   242 #ifdef _LP64
   243     __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
   244 #endif
   245     __ mov(SP, O5_savedSP);
   248     // do the call
   249     //
    250     // the following registers must be set up:
   251     //
   252     // G2_thread
   253     // G5_method
   254     // Gargs
   255     BLOCK_COMMENT("call Java function");
   256     __ jmpl(entry_point.as_in().as_register(), G0, O7);
   257     __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
   259     BLOCK_COMMENT("call_stub_return_address:");
   260     return_pc = __ pc();
    262     // The callee, if it wasn't interpreted, can return with SP changed so
    263     // we can no longer assert that SP is unchanged.
   265     // store result depending on type
   266     // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
   267     //  is treated as T_INT)
   268     { const Register addr = result     .as_in().as_register();
   269       const Register type = result_type.as_in().as_register();
   270       Label is_long, is_float, is_double, is_object, exit;
   271       __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
   272       __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
   273       __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
   274       __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
   275       __ delayed()->nop();
   277       // store int result
   278       __ st(O0, addr, G0);
   280       __ BIND(exit);
   281       __ ret();
   282       __ delayed()->restore();
   284       __ BIND(is_object);
   285       __ ba(false, exit);
   286       __ delayed()->st_ptr(O0, addr, G0);
   288       __ BIND(is_float);
   289       __ ba(false, exit);
   290       __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
   292       __ BIND(is_double);
   293       __ ba(false, exit);
   294       __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
   296       __ BIND(is_long);
   297 #ifdef _LP64
   298       __ ba(false, exit);
   299       __ delayed()->st_long(O0, addr, G0);      // store entire long
   300 #else
   301 #if defined(COMPILER2)
   302   // All return values are where we want them, except for Longs.  C2 returns
   303   // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
    304   // Since the interpreter will return longs in G1 and O0/O1 in the 32-bit
    305   // build we simply always use G1.
    306   // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
    307   // do this here. Unfortunately if we did a rethrow we'd see a MachEpilog node
   308   // first which would move g1 -> O0/O1 and destroy the exception we were throwing.
   310       __ ba(false, exit);
   311       __ delayed()->stx(G1, addr, G0);  // store entire long
   312 #else
   313       __ st(O1, addr, BytesPerInt);
   314       __ ba(false, exit);
   315       __ delayed()->st(O0, addr, G0);
   316 #endif /* COMPILER2 */
   317 #endif /* _LP64 */
   318      }
   319      return start;
   320   }
   323   //----------------------------------------------------------------------------------------------------
   324   // Return point for a Java call if there's an exception thrown in Java code.
   325   // The exception is caught and transformed into a pending exception stored in
   326   // JavaThread that can be tested from within the VM.
   327   //
   328   // Oexception: exception oop
   330   address generate_catch_exception() {
   331     StubCodeMark mark(this, "StubRoutines", "catch_exception");
   333     address start = __ pc();
   334     // verify that thread corresponds
   335     __ verify_thread();
   337     const Register& temp_reg = Gtemp;
   338     Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
   339     Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
   340     Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());
   342     // set pending exception
   343     __ verify_oop(Oexception);
   344     __ st_ptr(Oexception, pending_exception_addr);
   345     __ set((intptr_t)__FILE__, temp_reg);
   346     __ st_ptr(temp_reg, exception_file_offset_addr);
   347     __ set((intptr_t)__LINE__, temp_reg);
   348     __ st(temp_reg, exception_line_offset_addr);
   350     // complete return to VM
   351     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
   353     AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
   354     __ jump_to(stub_ret, temp_reg);
   355     __ delayed()->nop();
   357     return start;
   358   }
   361   //----------------------------------------------------------------------------------------------------
   362   // Continuation point for runtime calls returning with a pending exception
   363   // The pending exception check happened in the runtime or native call stub
   364   // The pending exception in Thread is converted into a Java-level exception
   365   //
   366   // Contract with Java-level exception handler: O0 = exception
   367   //                                             O1 = throwing pc
   369   address generate_forward_exception() {
   370     StubCodeMark mark(this, "StubRoutines", "forward_exception");
   371     address start = __ pc();
   373     // Upon entry, O7 has the return address returning into Java
   374     // (interpreted or compiled) code; i.e. the return address
   375     // becomes the throwing pc.
   377     const Register& handler_reg = Gtemp;
   379     Address exception_addr(G2_thread, Thread::pending_exception_offset());
   381 #ifdef ASSERT
   382     // make sure that this code is only executed if there is a pending exception
   383     { Label L;
   384       __ ld_ptr(exception_addr, Gtemp);
   385       __ br_notnull(Gtemp, false, Assembler::pt, L);
   386       __ delayed()->nop();
   387       __ stop("StubRoutines::forward exception: no pending exception (1)");
   388       __ bind(L);
   389     }
   390 #endif
   392     // compute exception handler into handler_reg
   393     __ get_thread();
   394     __ ld_ptr(exception_addr, Oexception);
   395     __ verify_oop(Oexception);
   396     __ save_frame(0);             // compensates for compiler weakness
   397     __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
   398     BLOCK_COMMENT("call exception_handler_for_return_address");
   399     __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
   400     __ mov(O0, handler_reg);
   401     __ restore();                 // compensates for compiler weakness
   403     __ ld_ptr(exception_addr, Oexception);
   404     __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
   406 #ifdef ASSERT
   407     // make sure exception is set
   408     { Label L;
   409       __ br_notnull(Oexception, false, Assembler::pt, L);
   410       __ delayed()->nop();
   411       __ stop("StubRoutines::forward exception: no pending exception (2)");
   412       __ bind(L);
   413     }
   414 #endif
   415     // jump to exception handler
   416     __ jmp(handler_reg, 0);
   417     // clear pending exception
   418     __ delayed()->st_ptr(G0, exception_addr);
   420     return start;
   421   }
   424   //------------------------------------------------------------------------------------------------------------------------
   425   // Continuation point for throwing of implicit exceptions that are not handled in
   426   // the current activation. Fabricates an exception oop and initiates normal
   427   // exception dispatching in this frame. Only callee-saved registers are preserved
   428   // (through the normal register window / RegisterMap handling).
   429   // If the compiler needs all registers to be preserved between the fault
   430   // point and the exception handler then it must assume responsibility for that in
   431   // AbstractCompiler::continuation_for_implicit_null_exception or
   432   // continuation_for_implicit_division_by_zero_exception. All other implicit
   433   // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
   434   // either at call sites or otherwise assume that stack unwinding will be initiated,
   435   // so caller saved registers were assumed volatile in the compiler.
   437   // Note that we generate only this stub into a RuntimeStub, because it needs to be
   438   // properly traversed and ignored during GC, so we change the meaning of the "__"
   439   // macro within this method.
   440 #undef __
   441 #define __ masm->
   443   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc) {
   444 #ifdef ASSERT
   445     int insts_size = VerifyThread ? 1 * K : 600;
   446 #else
   447     int insts_size = VerifyThread ? 1 * K : 256;
   448 #endif /* ASSERT */
   449     int locs_size  = 32;
   451     CodeBuffer      code(name, insts_size, locs_size);
   452     MacroAssembler* masm = new MacroAssembler(&code);
   454     __ verify_thread();
   456     // This is an inlined and slightly modified version of call_VM
   457     // which has the ability to fetch the return PC out of thread-local storage
   458     __ assert_not_delayed();
   460     // Note that we always push a frame because on the SPARC
   461     // architecture, for all of our implicit exception kinds at call
   462     // sites, the implicit exception is taken before the callee frame
   463     // is pushed.
   464     __ save_frame(0);
   466     int frame_complete = __ offset();
   468     if (restore_saved_exception_pc) {
   469       __ ld_ptr(G2_thread, JavaThread::saved_exception_pc_offset(), I7);
   470       __ sub(I7, frame::pc_return_offset, I7);
   471     }
   473     // Note that we always have a runtime stub frame on the top of stack by this point
   474     Register last_java_sp = SP;
   475     // 64-bit last_java_sp is biased!
   476     __ set_last_Java_frame(last_java_sp, G0);
   477     if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
   478     __ save_thread(noreg);
   479     // do the call
   480     BLOCK_COMMENT("call runtime_entry");
   481     __ call(runtime_entry, relocInfo::runtime_call_type);
   482     if (!VerifyThread)
   483       __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
   484     else
   485       __ delayed()->nop();             // (thread already passed)
   486     __ restore_thread(noreg);
   487     __ reset_last_Java_frame();
   489     // check for pending exceptions. use Gtemp as scratch register.
   490 #ifdef ASSERT
   491     Label L;
   493     Address exception_addr(G2_thread, Thread::pending_exception_offset());
   494     Register scratch_reg = Gtemp;
   495     __ ld_ptr(exception_addr, scratch_reg);
   496     __ br_notnull(scratch_reg, false, Assembler::pt, L);
   497     __ delayed()->nop();
   498     __ should_not_reach_here();
   499     __ bind(L);
   500 #endif // ASSERT
   501     BLOCK_COMMENT("call forward_exception_entry");
   502     __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
   503     // we use O7 linkage so that forward_exception_entry has the issuing PC
   504     __ delayed()->restore();
   506     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
   507     return stub->entry_point();
   508   }
   510 #undef __
   511 #define __ _masm->
   514   // Generate a routine that sets all the registers so we
   515   // can tell if the stop routine prints them correctly.
   516   address generate_test_stop() {
   517     StubCodeMark mark(this, "StubRoutines", "test_stop");
   518     address start = __ pc();
   520     int i;
   522     __ save_frame(0);
   524     static jfloat zero = 0.0, one = 1.0;
   526     // put addr in L0, then load through L0 to F0
   527     __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
   528     __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
   530     // use add to put 2..18 in F2..F18
   531     for ( i = 2;  i <= 18;  ++i ) {
   532       __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
   533     }
   535     // Now put double 2 in F16, double 18 in F18
   536     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
   537     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
   539     // use add to put 20..32 in F20..F32
   540     for (i = 20; i < 32; i += 2) {
   541       __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
   542     }
   544     // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
   545     for ( i = 0; i < 8; ++i ) {
   546       if (i < 6) {
   547         __ set(     i, as_iRegister(i));
   548         __ set(16 + i, as_oRegister(i));
   549         __ set(24 + i, as_gRegister(i));
   550       }
   551       __ set( 8 + i, as_lRegister(i));
   552     }
   554     __ stop("testing stop");
   557     __ ret();
   558     __ delayed()->restore();
   560     return start;
   561   }
   564   address generate_stop_subroutine() {
   565     StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
   566     address start = __ pc();
   568     __ stop_subroutine();
   570     return start;
   571   }
   573   address generate_flush_callers_register_windows() {
   574     StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
   575     address start = __ pc();
   577     __ flush_windows();
   578     __ retl(false);
   579     __ delayed()->add( FP, STACK_BIAS, O0 );
   580     // The returned value must be a stack pointer whose register save area
   581     // is flushed, and will stay flushed while the caller executes.
   583     return start;
   584   }
   586   // Helper functions for v8 atomic operations.
   587   //
   588   void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
   589     if (mark_oop_reg == noreg) {
   590       address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
   591       __ set((intptr_t)lock_ptr, lock_ptr_reg);
   592     } else {
   593       assert(scratch_reg != noreg, "just checking");
   594       address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
   595       __ set((intptr_t)lock_ptr, lock_ptr_reg);
   596       __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
   597       __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
   598     }
   599   }
   601   void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
   603     get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
   604     __ set(StubRoutines::Sparc::locked, lock_reg);
   605     // Initialize yield counter
   606     __ mov(G0,yield_reg);
   608     __ BIND(retry);
   609     __ cmp(yield_reg, V8AtomicOperationUnderLockSpinCount);
   610     __ br(Assembler::less, false, Assembler::pt, dontyield);
   611     __ delayed()->nop();
    613     // This code can only be called from inside the VM; this
    614     // stub is only invoked from Atomic::add().  We do not
   615     // want to use call_VM, because _last_java_sp and such
   616     // must already be set.
   617     //
   618     // Save the regs and make space for a C call
   619     __ save(SP, -96, SP);
   620     __ save_all_globals_into_locals();
   621     BLOCK_COMMENT("call os::naked_sleep");
   622     __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
   623     __ delayed()->nop();
   624     __ restore_globals_from_locals();
   625     __ restore();
   626     // reset the counter
   627     __ mov(G0,yield_reg);
   629     __ BIND(dontyield);
   631     // try to get lock
   632     __ swap(lock_ptr_reg, 0, lock_reg);
   634     // did we get the lock?
   635     __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
   636     __ br(Assembler::notEqual, true, Assembler::pn, retry);
   637     __ delayed()->add(yield_reg,1,yield_reg);
   639     // yes, got lock. do the operation here.
   640   }
   642   void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
   643     __ st(lock_reg, lock_ptr_reg, 0); // unlock
   644   }
   646   // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
   647   //
   648   // Arguments :
   649   //
   650   //      exchange_value: O0
   651   //      dest:           O1
   652   //
   653   // Results:
   654   //
   655   //     O0: the value previously stored in dest
   656   //
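          // Illustrative C-level semantics of the stub below (not emitted code):
          //   jint old = *dest; *dest = exchange_value; return old;   // performed atomically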
   657   address generate_atomic_xchg() {
   658     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
   659     address start = __ pc();
   661     if (UseCASForSwap) {
   662       // Use CAS instead of swap, just in case the MP hardware
   663       // prefers to work with just one kind of synch. instruction.
   664       Label retry;
   665       __ BIND(retry);
   666       __ mov(O0, O3);       // scratch copy of exchange value
   667       __ ld(O1, 0, O2);     // observe the previous value
   668       // try to replace O2 with O3
   669       __ cas_under_lock(O1, O2, O3,
   670       (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
   671       __ cmp(O2, O3);
   672       __ br(Assembler::notEqual, false, Assembler::pn, retry);
   673       __ delayed()->nop();
   675       __ retl(false);
   676       __ delayed()->mov(O2, O0);  // report previous value to caller
   678     } else {
   679       if (VM_Version::v9_instructions_work()) {
   680         __ retl(false);
   681         __ delayed()->swap(O1, 0, O0);
   682       } else {
   683         const Register& lock_reg = O2;
   684         const Register& lock_ptr_reg = O3;
   685         const Register& yield_reg = O4;
   687         Label retry;
   688         Label dontyield;
   690         generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
   691         // got the lock, do the swap
   692         __ swap(O1, 0, O0);
   694         generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
   695         __ retl(false);
   696         __ delayed()->nop();
   697       }
   698     }
   700     return start;
   701   }
   704   // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
   705   //
   706   // Arguments :
   707   //
   708   //      exchange_value: O0
   709   //      dest:           O1
   710   //      compare_value:  O2
   711   //
   712   // Results:
   713   //
   714   //     O0: the value previously stored in dest
   715   //
   716   // Overwrites (v8): O3,O4,O5
   717   //
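          // Illustrative C-level semantics of the stub below (not emitted code):
          //   jint old = *dest; if (old == compare_value) *dest = exchange_value; return old;   // atomic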
   718   address generate_atomic_cmpxchg() {
   719     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
   720     address start = __ pc();
   722     // cmpxchg(dest, compare_value, exchange_value)
   723     __ cas_under_lock(O1, O2, O0,
   724       (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
   725     __ retl(false);
   726     __ delayed()->nop();
   728     return start;
   729   }
   731   // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
   732   //
   733   // Arguments :
   734   //
   735   //      exchange_value: O1:O0
   736   //      dest:           O2
   737   //      compare_value:  O4:O3
   738   //
   739   // Results:
   740   //
   741   //     O1:O0: the value previously stored in dest
   742   //
   743   // This only works on V9, on V8 we don't generate any
   744   // code and just return NULL.
   745   //
   746   // Overwrites: G1,G2,G3
   747   //
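          // Illustrative C-level semantics of the stub below (not emitted code):
          //   jlong old = *dest; if (old == compare_value) *dest = exchange_value; return old;   // atomic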
   748   address generate_atomic_cmpxchg_long() {
   749     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
   750     address start = __ pc();
   752     if (!VM_Version::supports_cx8())
    753         return NULL;
   754     __ sllx(O0, 32, O0);
   755     __ srl(O1, 0, O1);
    756     __ sllx(O0, 32, O0);
    756     __ or3(O0,O1,O0);      // O0 holds 64-bit value from exchange_value
    757     __ sllx(O3, 32, O3);
    758     __ srl(O4, 0, O4);
    759     __ or3(O3,O4,O3);     // O3 holds 64-bit value from compare_value
   760     __ casx(O2, O3, O0);
   761     __ srl(O0, 0, O1);    // unpacked return value in O1:O0
   762     __ retl(false);
   763     __ delayed()->srlx(O0, 32, O0);
   765     return start;
   766   }
   769   // Support for jint Atomic::add(jint add_value, volatile jint* dest).
   770   //
   771   // Arguments :
   772   //
   773   //      add_value: O0   (e.g., +1 or -1)
   774   //      dest:      O1
   775   //
   776   // Results:
   777   //
   778   //     O0: the new value stored in dest
   779   //
   780   // Overwrites (v9): O3
   781   // Overwrites (v8): O3,O4,O5
   782   //
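          // Illustrative C-level semantics of the stub below (not emitted code):
          //   *dest += add_value; return *dest;   // performed atomically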
   783   address generate_atomic_add() {
   784     StubCodeMark mark(this, "StubRoutines", "atomic_add");
   785     address start = __ pc();
   786     __ BIND(_atomic_add_stub);
   788     if (VM_Version::v9_instructions_work()) {
   789       Label(retry);
   790       __ BIND(retry);
   792       __ lduw(O1, 0, O2);
   793       __ add(O0,   O2, O3);
   794       __ cas(O1,   O2, O3);
   795       __ cmp(      O2, O3);
   796       __ br(Assembler::notEqual, false, Assembler::pn, retry);
   797       __ delayed()->nop();
   798       __ retl(false);
   799       __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
   800     } else {
   801       const Register& lock_reg = O2;
   802       const Register& lock_ptr_reg = O3;
   803       const Register& value_reg = O4;
   804       const Register& yield_reg = O5;
   806       Label(retry);
   807       Label(dontyield);
   809       generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
   810       // got lock, do the increment
   811       __ ld(O1, 0, value_reg);
   812       __ add(O0, value_reg, value_reg);
   813       __ st(value_reg, O1, 0);
   815       // %%% only for RMO and PSO
   816       __ membar(Assembler::StoreStore);
   818       generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
   820       __ retl(false);
   821       __ delayed()->mov(value_reg, O0);
   822     }
   824     return start;
   825   }
   826   Label _atomic_add_stub;  // called from other stubs
   829   //------------------------------------------------------------------------------------------------------------------------
   830   // The following routine generates a subroutine to throw an asynchronous
   831   // UnknownError when an unsafe access gets a fault that could not be
   832   // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
   833   //
   834   // Arguments :
   835   //
   836   //      trapping PC:    O7
   837   //
   838   // Results:
   839   //     posts an asynchronous exception, skips the trapping instruction
   840   //
   842   address generate_handler_for_unsafe_access() {
   843     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
   844     address start = __ pc();
   846     const int preserve_register_words = (64 * 2);
   847     Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);
   849     Register Lthread = L7_thread_cache;
   850     int i;
   852     __ save_frame(0);
   853     __ mov(G1, L1);
   854     __ mov(G2, L2);
   855     __ mov(G3, L3);
   856     __ mov(G4, L4);
   857     __ mov(G5, L5);
   858     for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
   859       __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
   860     }
   862     address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
   863     BLOCK_COMMENT("call handle_unsafe_access");
   864     __ call(entry_point, relocInfo::runtime_call_type);
   865     __ delayed()->nop();
   867     __ mov(L1, G1);
   868     __ mov(L2, G2);
   869     __ mov(L3, G3);
   870     __ mov(L4, G4);
   871     __ mov(L5, G5);
   872     for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
   873       __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
   874     }
   876     __ verify_thread();
   878     __ jmp(O0, 0);
   879     __ delayed()->restore();
   881     return start;
   882   }
    885   // Support for uint StubRoutines::Sparc::partial_subtype_check( Klass sub, Klass super );
   886   // Arguments :
   887   //
   888   //      ret  : O0, returned
   889   //      icc/xcc: set as O0 (depending on wordSize)
   890   //      sub  : O1, argument, not changed
   891   //      super: O2, argument, not changed
   892   //      raddr: O7, blown by call
   893   address generate_partial_subtype_check() {
   894     __ align(CodeEntryAlignment);
   895     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
   896     address start = __ pc();
   897     Label miss;
   899 #if defined(COMPILER2) && !defined(_LP64)
   900     // Do not use a 'save' because it blows the 64-bit O registers.
   901     __ add(SP,-4*wordSize,SP);  // Make space for 4 temps (stack must be 2 words aligned)
   902     __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
   903     __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
   904     __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
   905     __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
   906     Register Rret   = O0;
   907     Register Rsub   = O1;
   908     Register Rsuper = O2;
   909 #else
   910     __ save_frame(0);
   911     Register Rret   = I0;
   912     Register Rsub   = I1;
   913     Register Rsuper = I2;
   914 #endif
   916     Register L0_ary_len = L0;
   917     Register L1_ary_ptr = L1;
   918     Register L2_super   = L2;
   919     Register L3_index   = L3;
   921     __ check_klass_subtype_slow_path(Rsub, Rsuper,
   922                                      L0, L1, L2, L3,
   923                                      NULL, &miss);
   925     // Match falls through here.
   926     __ addcc(G0,0,Rret);        // set Z flags, Z result
   928 #if defined(COMPILER2) && !defined(_LP64)
   929     __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
   930     __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
   931     __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
   932     __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
   933     __ retl();                  // Result in Rret is zero; flags set to Z
   934     __ delayed()->add(SP,4*wordSize,SP);
   935 #else
   936     __ ret();                   // Result in Rret is zero; flags set to Z
   937     __ delayed()->restore();
   938 #endif
   940     __ BIND(miss);
   941     __ addcc(G0,1,Rret);        // set NZ flags, NZ result
   943 #if defined(COMPILER2) && !defined(_LP64)
   944     __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
   945     __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
   946     __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
   947     __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
   948     __ retl();                  // Result in Rret is != 0; flags set to NZ
   949     __ delayed()->add(SP,4*wordSize,SP);
   950 #else
   951     __ ret();                   // Result in Rret is != 0; flags set to NZ
   952     __ delayed()->restore();
   953 #endif
   955     return start;
   956   }
   959   // Called from MacroAssembler::verify_oop
   960   //
   961   address generate_verify_oop_subroutine() {
   962     StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
   964     address start = __ pc();
   966     __ verify_oop_subroutine();
   968     return start;
   969   }
   972   //
    973   // Verify that a register contains a clean 32-bit positive value
    974   // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
   975   //
   976   //  Input:
   977   //    Rint  -  32-bits value
   978   //    Rtmp  -  scratch
   979   //
   980   void assert_clean_int(Register Rint, Register Rtmp) {
   981 #if defined(ASSERT) && defined(_LP64)
   982     __ signx(Rint, Rtmp);
   983     __ cmp(Rint, Rtmp);
   984     __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
   985 #endif
   986   }
   988   //
   989   //  Generate overlap test for array copy stubs
   990   //
   991   //  Input:
   992   //    O0    -  array1
   993   //    O1    -  array2
   994   //    O2    -  element count
   995   //
   996   //  Kills temps:  O3, O4
   997   //
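          // Illustrative form of the test below (not emitted code): branch to the
          // no-overlap target when a forward (disjoint-style) copy is safe, i.e. when
          //   to <= from  ||  (to - from) >= (count << log2_elem_size)   (unsigned compares)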
   998   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
   999     assert(no_overlap_target != NULL, "must be generated");
   1000     array_overlap_test(no_overlap_target, NULL, log2_elem_size);
   1001   }
   1002   void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
   1003     array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
   1004   }
  1005   void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
  1006     const Register from       = O0;
  1007     const Register to         = O1;
  1008     const Register count      = O2;
  1009     const Register to_from    = O3; // to - from
  1010     const Register byte_count = O4; // count << log2_elem_size
  1012       __ subcc(to, from, to_from);
  1013       __ sll_ptr(count, log2_elem_size, byte_count);
  1014       if (NOLp == NULL)
  1015         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
  1016       else
  1017         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
  1018       __ delayed()->cmp(to_from, byte_count);
  1019       if (NOLp == NULL)
  1020         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
  1021       else
  1022         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
   1023       __ delayed()->nop();
   1024   }
  1026   //
  1027   //  Generate pre-write barrier for array.
  1028   //
  1029   //  Input:
  1030   //     addr     - register containing starting address
  1031   //     count    - register containing element count
  1032   //     tmp      - scratch register
  1033   //
  1034   //  The input registers are overwritten.
  1035   //
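          // Illustrative intent (not emitted code): for G1 this amounts to a runtime
          // call, roughly BarrierSet::static_write_ref_array_pre(addr, count), which
          // SATB-logs the oops currently stored in the destination range before they
          // are overwritten.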
  1036   void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
  1037     BarrierSet* bs = Universe::heap()->barrier_set();
  1038     switch (bs->kind()) {
  1039       case BarrierSet::G1SATBCT:
  1040       case BarrierSet::G1SATBCTLogging:
   1041         // With G1, don't generate the call if we statically know that the target is uninitialized
   1042         if (!dest_uninitialized) {
   1043           __ save_frame(0);
   1044           // Save the necessary global regs... will be used after.
   1045           if (addr->is_global()) {
   1046             __ mov(addr, L0);
   1047           }
   1048           if (count->is_global()) {
   1049             __ mov(count, L1);
   1050           }
   1051           __ mov(addr->after_save(), O0);
   1052           // Get the count into O1
   1053           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
   1054           __ delayed()->mov(count->after_save(), O1);
   1055           if (addr->is_global()) {
   1056             __ mov(L0, addr);
   1057           }
   1058           if (count->is_global()) {
   1059             __ mov(L1, count);
   1060           }
   1061           __ restore();
   1062         }
   1063         break;
   1064       case BarrierSet::CardTableModRef:
   1065       case BarrierSet::CardTableExtension:
   1066       case BarrierSet::ModRef:
   1067         break;
   1068       default:
   1069         ShouldNotReachHere();
   1070     }
   1071   }
  1072   //
  1073   //  Generate post-write barrier for array.
  1074   //
  1075   //  Input:
  1076   //     addr     - register containing starting address
  1077   //     count    - register containing element count
  1078   //     tmp      - scratch register
  1079   //
  1080   //  The input registers are overwritten.
  1081   //
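          // Illustrative intent (not emitted code): for G1 this calls, roughly,
          // BarrierSet::static_write_ref_array_post(addr, count); for card-table
          // collectors it instead dirties the cards covering the copied range.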
  1082   void gen_write_ref_array_post_barrier(Register addr, Register count,
  1083                                         Register tmp) {
  1084     BarrierSet* bs = Universe::heap()->barrier_set();
  1086     switch (bs->kind()) {
  1087       case BarrierSet::G1SATBCT:
   1088       case BarrierSet::G1SATBCTLogging:
   1089         {
   1090           // Get some new fresh output registers.
   1091           __ save_frame(0);
   1092           __ mov(addr->after_save(), O0);
   1093           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
   1094           __ delayed()->mov(count->after_save(), O1);
   1095           __ restore();
   1096         }
   1097         break;
   1098       case BarrierSet::CardTableModRef:
   1099       case BarrierSet::CardTableExtension:
   1100         {
   1101           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  1102           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
  1103           assert_different_registers(addr, count, tmp);
  1105           Label L_loop;
  1107           __ sll_ptr(count, LogBytesPerHeapOop, count);
  1108           __ sub(count, BytesPerHeapOop, count);
  1109           __ add(count, addr, count);
  1110           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
  1111           __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
  1112           __ srl_ptr(count, CardTableModRefBS::card_shift, count);
  1113           __ sub(count, addr, count);
  1114           AddressLiteral rs(ct->byte_map_base);
  1115           __ set(rs, tmp);
  1116         __ BIND(L_loop);
  1117           __ stb(G0, tmp, addr);
  1118           __ subcc(count, 1, count);
  1119           __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
   1120           __ delayed()->add(addr, 1, addr);
   1121         }
   1122         break;
   1123       case BarrierSet::ModRef:
   1124         break;
   1125       default:
   1126         ShouldNotReachHere();
   1127     }
   1128   }
  1131   // Copy big chunks forward with shift
  1132   //
  1133   // Inputs:
  1134   //   from      - source arrays
  1135   //   to        - destination array aligned to 8-bytes
  1136   //   count     - elements count to copy >= the count equivalent to 16 bytes
  1137   //   count_dec - elements count's decrement equivalent to 16 bytes
  1138   //   L_copy_bytes - copy exit label
  1139   //
  1140   void copy_16_bytes_forward_with_shift(Register from, Register to,
  1141                      Register count, int count_dec, Label& L_copy_bytes) {
  1142     Label L_loop, L_aligned_copy, L_copy_last_bytes;
  1144     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
  1145       __ andcc(from, 7, G1); // misaligned bytes
  1146       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  1147       __ delayed()->nop();
  1149     const Register left_shift  = G1; // left  shift bit counter
  1150     const Register right_shift = G5; // right shift bit counter
  1152       __ sll(G1, LogBitsPerByte, left_shift);
  1153       __ mov(64, right_shift);
  1154       __ sub(right_shift, left_shift, right_shift);
  1156     //
  1157     // Load 2 aligned 8-bytes chunks and use one from previous iteration
  1158     // to form 2 aligned 8-bytes chunks to store.
  1159     //
  1160       __ deccc(count, count_dec); // Pre-decrement 'count'
  1161       __ andn(from, 7, from);     // Align address
  1162       __ ldx(from, 0, O3);
  1163       __ inc(from, 8);
  1164       __ align(OptoLoopAlignment);
  1165     __ BIND(L_loop);
  1166       __ ldx(from, 0, O4);
  1167       __ deccc(count, count_dec); // Can we do next iteration after this one?
  1168       __ ldx(from, 8, G4);
  1169       __ inc(to, 16);
  1170       __ inc(from, 16);
  1171       __ sllx(O3, left_shift,  O3);
  1172       __ srlx(O4, right_shift, G3);
  1173       __ bset(G3, O3);
  1174       __ stx(O3, to, -16);
  1175       __ sllx(O4, left_shift,  O4);
  1176       __ srlx(G4, right_shift, G3);
  1177       __ bset(G3, O4);
  1178       __ stx(O4, to, -8);
  1179       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  1180       __ delayed()->mov(G4, O3);
  1182       __ inccc(count, count_dec>>1 ); // + 8 bytes
  1183       __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
  1184       __ delayed()->inc(count, count_dec>>1); // restore 'count'
  1186       // copy 8 bytes, part of them already loaded in O3
  1187       __ ldx(from, 0, O4);
  1188       __ inc(to, 8);
  1189       __ inc(from, 8);
  1190       __ sllx(O3, left_shift,  O3);
  1191       __ srlx(O4, right_shift, G3);
  1192       __ bset(O3, G3);
  1193       __ stx(G3, to, -8);
  1195     __ BIND(L_copy_last_bytes);
  1196       __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
  1197       __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
  1198       __ delayed()->sub(from, right_shift, from);       // restore address
   1200     __ BIND(L_aligned_copy);
   1201   }
  1203   // Copy big chunks backward with shift
  1204   //
  1205   // Inputs:
  1206   //   end_from  - source arrays end address
  1207   //   end_to    - destination array end address aligned to 8-bytes
  1208   //   count     - elements count to copy >= the count equivalent to 16 bytes
  1209   //   count_dec - elements count's decrement equivalent to 16 bytes
  1210   //   L_aligned_copy - aligned copy exit label
  1211   //   L_copy_bytes   - copy exit label
  1212   //
  1213   void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
  1214                      Register count, int count_dec,
  1215                      Label& L_aligned_copy, Label& L_copy_bytes) {
  1216     Label L_loop, L_copy_last_bytes;
  1218     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
  1219       __ andcc(end_from, 7, G1); // misaligned bytes
  1220       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  1221       __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
  1223     const Register left_shift  = G1; // left  shift bit counter
  1224     const Register right_shift = G5; // right shift bit counter
  1226       __ sll(G1, LogBitsPerByte, left_shift);
  1227       __ mov(64, right_shift);
  1228       __ sub(right_shift, left_shift, right_shift);
  1230     //
  1231     // Load 2 aligned 8-bytes chunks and use one from previous iteration
  1232     // to form 2 aligned 8-bytes chunks to store.
  1233     //
  1234       __ andn(end_from, 7, end_from);     // Align address
  1235       __ ldx(end_from, 0, O3);
  1236       __ align(OptoLoopAlignment);
  1237     __ BIND(L_loop);
  1238       __ ldx(end_from, -8, O4);
  1239       __ deccc(count, count_dec); // Can we do next iteration after this one?
  1240       __ ldx(end_from, -16, G4);
  1241       __ dec(end_to, 16);
  1242       __ dec(end_from, 16);
  1243       __ srlx(O3, right_shift, O3);
  1244       __ sllx(O4, left_shift,  G3);
  1245       __ bset(G3, O3);
  1246       __ stx(O3, end_to, 8);
  1247       __ srlx(O4, right_shift, O4);
  1248       __ sllx(G4, left_shift,  G3);
  1249       __ bset(G3, O4);
  1250       __ stx(O4, end_to, 0);
  1251       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  1252       __ delayed()->mov(G4, O3);
  1254       __ inccc(count, count_dec>>1 ); // + 8 bytes
  1255       __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
  1256       __ delayed()->inc(count, count_dec>>1); // restore 'count'
  1258       // copy 8 bytes, part of them already loaded in O3
  1259       __ ldx(end_from, -8, O4);
  1260       __ dec(end_to, 8);
  1261       __ dec(end_from, 8);
  1262       __ srlx(O3, right_shift, O3);
  1263       __ sllx(O4, left_shift,  G3);
  1264       __ bset(O3, G3);
  1265       __ stx(G3, end_to, 0);
  1267     __ BIND(L_copy_last_bytes);
  1268       __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
  1269       __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
   1270       __ delayed()->add(end_from, left_shift, end_from); // restore address
   1271   }
  1273   //
  1274   //  Generate stub for disjoint byte copy.  If "aligned" is true, the
  1275   //  "from" and "to" addresses are assumed to be heapword aligned.
  1276   //
  1277   // Arguments for generated stub:
  1278   //      from:  O0
  1279   //      to:    O1
  1280   //      count: O2 treated as signed
  1281   //
  1282   address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
  1283     __ align(CodeEntryAlignment);
  1284     StubCodeMark mark(this, "StubRoutines", name);
  1285     address start = __ pc();
  1287     Label L_skip_alignment, L_align;
  1288     Label L_copy_byte, L_copy_byte_loop, L_exit;
  1290     const Register from      = O0;   // source array address
  1291     const Register to        = O1;   // destination array address
  1292     const Register count     = O2;   // elements count
  1293     const Register offset    = O5;   // offset from start of arrays
  1294     // O3, O4, G3, G4 are used as temp registers
  1296     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1298     if (entry != NULL) {
  1299       *entry = __ pc();
  1300       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
   1301       BLOCK_COMMENT("Entry:");
   1302     }
  1304     // for short arrays, just do single element copy
  1305     __ cmp(count, 23); // 16 + 7
  1306     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
  1307     __ delayed()->mov(G0, offset);
  1309     if (aligned) {
  1310       // 'aligned' == true when it is known statically during compilation
  1311       // of this arraycopy call site that both 'from' and 'to' addresses
  1312       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
  1313       //
   1314       // Aligned arrays have 4-byte alignment in the 32-bit VM
   1315       // and 8-byte alignment in the 64-bit VM, so this is only needed in the 32-bit VM.
  1316       //
  1317 #ifndef _LP64
  1318       // copy a 4-bytes word if necessary to align 'to' to 8 bytes
  1319       __ andcc(to, 7, G0);
  1320       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
  1321       __ delayed()->ld(from, 0, O3);
  1322       __ inc(from, 4);
  1323       __ inc(to, 4);
  1324       __ dec(count, 4);
  1325       __ st(O3, to, -4);
  1326     __ BIND(L_skip_alignment);
  1327 #endif
  1328     } else {
  1329       // copy bytes to align 'to' on 8 byte boundary
  1330       __ andcc(to, 7, G1); // misaligned bytes
  1331       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1332       __ delayed()->neg(G1);
   1333       __ inc(G1, 8);       // bytes needed to reach the next 8-byte alignment
  1334       __ sub(count, G1, count);
  1335     __ BIND(L_align);
  1336       __ ldub(from, 0, O3);
  1337       __ deccc(G1);
  1338       __ inc(from);
  1339       __ stb(O3, to, 0);
  1340       __ br(Assembler::notZero, false, Assembler::pt, L_align);
  1341       __ delayed()->inc(to);
   1342     __ BIND(L_skip_alignment);
   1343     }
   1344 #ifdef _LP64
   1345     if (!aligned)
   1346 #endif
   1347     {
   1348       // Copy with shift 16 bytes per iteration if arrays do not have
   1349       // the same alignment mod 8, otherwise fall through to the next
   1350       // code for aligned copy.
   1351       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
   1352       // Also jump over aligned copy after the copy with shift completes.
   1354       copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
   1355     }
   1357     // Both arrays are 8-byte aligned; copy 16 bytes at a time
  1358       __ and3(count, 7, G4); // Save count
  1359       __ srl(count, 3, count);
  1360      generate_disjoint_long_copy_core(aligned);
  1361       __ mov(G4, count);     // Restore count
   1363     // copy trailing bytes
  1364     __ BIND(L_copy_byte);
  1365       __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
  1366       __ delayed()->nop();
  1367       __ align(OptoLoopAlignment);
  1368     __ BIND(L_copy_byte_loop);
  1369       __ ldub(from, offset, O3);
  1370       __ deccc(count);
  1371       __ stb(O3, to, offset);
  1372       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
  1373       __ delayed()->inc(offset);
  1375     __ BIND(L_exit);
  1376       // O3, O4 are used as temp registers
  1377       inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
  1378       __ retl();
  1379       __ delayed()->mov(G0, O0); // return 0
   1380     return start;
   1381   }
  1383   //
  1384   //  Generate stub for conjoint byte copy.  If "aligned" is true, the
  1385   //  "from" and "to" addresses are assumed to be heapword aligned.
  1386   //
  1387   // Arguments for generated stub:
  1388   //      from:  O0
  1389   //      to:    O1
  1390   //      count: O2 treated as signed
  1391   //
  1392   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
  1393                                       address *entry, const char *name) {
  1394     // Do reverse copy.
  1396     __ align(CodeEntryAlignment);
  1397     StubCodeMark mark(this, "StubRoutines", name);
  1398     address start = __ pc();
  1400     Label L_skip_alignment, L_align, L_aligned_copy;
  1401     Label L_copy_byte, L_copy_byte_loop, L_exit;
  1403     const Register from      = O0;   // source array address
  1404     const Register to        = O1;   // destination array address
  1405     const Register count     = O2;   // elements count
  1406     const Register end_from  = from; // source array end address
  1407     const Register end_to    = to;   // destination array end address
  1409     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1411     if (entry != NULL) {
  1412       *entry = __ pc();
  1413       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1414       BLOCK_COMMENT("Entry:");
  1417     array_overlap_test(nooverlap_target, 0);
  1419     __ add(to, count, end_to);       // offset after last copied element
  1421     // for short arrays, just do single element copy
  1422     __ cmp(count, 23); // 16 + 7
  1423     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
  1424     __ delayed()->add(from, count, end_from);
  1427       // Align the end of the arrays since it may be unaligned even
  1428       // when the arrays themselves are aligned.
  1430       // copy bytes to align 'end_to' on 8 byte boundary
  1431       __ andcc(end_to, 7, G1); // misaligned bytes
  1432       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1433       __ delayed()->nop();
  1434       __ sub(count, G1, count);
  1435     __ BIND(L_align);
  1436       __ dec(end_from);
  1437       __ dec(end_to);
  1438       __ ldub(end_from, 0, O3);
  1439       __ deccc(G1);
  1440       __ brx(Assembler::notZero, false, Assembler::pt, L_align);
  1441       __ delayed()->stb(O3, end_to, 0);
  1442     __ BIND(L_skip_alignment);
  1444 #ifdef _LP64
  1445     if (aligned) {
  1446       // Both arrays are aligned to 8-bytes in 64-bits VM.
  1447       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
  1448       // in unaligned case.
  1449       __ dec(count, 16);
  1450     } else
  1451 #endif
  1453       // Copy with shift 16 bytes per iteration if arrays do not have
  1454       // the same alignment mod 8, otherwise jump to the next
  1455       // code for aligned copy (and subtracting 16 from 'count' before the jump).
  1456       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
  1457       // Also jump over aligned copy after the copy with shift completed.
  1459       copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
  1460                                         L_aligned_copy, L_copy_byte);
  1462     // copy 4 elements (16 bytes) at a time
  1463       __ align(OptoLoopAlignment);
  1464     __ BIND(L_aligned_copy);
  1465       __ dec(end_from, 16);
  1466       __ ldx(end_from, 8, O3);
  1467       __ ldx(end_from, 0, O4);
  1468       __ dec(end_to, 16);
  1469       __ deccc(count, 16);
  1470       __ stx(O3, end_to, 8);
  1471       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
  1472       __ delayed()->stx(O4, end_to, 0);
  1473       __ inc(count, 16);
  1475     // copy 1 element (1 byte) at a time
  1476     __ BIND(L_copy_byte);
  1477       __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
  1478       __ delayed()->nop();
  1479       __ align(OptoLoopAlignment);
  1480     __ BIND(L_copy_byte_loop);
  1481       __ dec(end_from);
  1482       __ dec(end_to);
  1483       __ ldub(end_from, 0, O4);
  1484       __ deccc(count);
  1485       __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
  1486       __ delayed()->stb(O4, end_to, 0);
  1488     __ BIND(L_exit);
  1489     // O3, O4 are used as temp registers
  1490     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
  1491     __ retl();
  1492     __ delayed()->mov(G0, O0); // return 0
  1493     return start;
  1496   //
  1497   //  Generate stub for disjoint short copy.  If "aligned" is true, the
  1498   //  "from" and "to" addresses are assumed to be heapword aligned.
  1499   //
  1500   // Arguments for generated stub:
  1501   //      from:  O0
  1502   //      to:    O1
  1503   //      count: O2 treated as signed
  1504   //
  1505   address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
  1506     __ align(CodeEntryAlignment);
  1507     StubCodeMark mark(this, "StubRoutines", name);
  1508     address start = __ pc();
  1510     Label L_skip_alignment, L_skip_alignment2;
  1511     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
  1513     const Register from      = O0;   // source array address
  1514     const Register to        = O1;   // destination array address
  1515     const Register count     = O2;   // elements count
  1516     const Register offset    = O5;   // offset from start of arrays
  1517     // O3, O4, G3, G4 are used as temp registers
  1519     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1521     if (entry != NULL) {
  1522       *entry = __ pc();
  1523       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1524       BLOCK_COMMENT("Entry:");
  1527     // for short arrays, just do single element copy
  1528     __ cmp(count, 11); // 8 + 3  (22 bytes)
  1529     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
  1530     __ delayed()->mov(G0, offset);
  1532     if (aligned) {
  1533       // 'aligned' == true when it is known statically during compilation
  1534       // of this arraycopy call site that both 'from' and 'to' addresses
  1535       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
  1536       //
  1537       // Aligned arrays have 4-byte alignment in the 32-bit VM
  1538       // and 8-byte alignment in the 64-bit VM.
  1539       //
  1540 #ifndef _LP64
  1541       // copy one 4-byte word (2 elements) if necessary to align 'to' to 8 bytes
  1542       __ andcc(to, 7, G0);
  1543       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1544       __ delayed()->ld(from, 0, O3);
  1545       __ inc(from, 4);
  1546       __ inc(to, 4);
  1547       __ dec(count, 2);
  1548       __ st(O3, to, -4);
  1549     __ BIND(L_skip_alignment);
  1550 #endif
  1551     } else {
  1552       // copy 1 element if necessary to align 'to' on a 4-byte boundary
  1553       __ andcc(to, 3, G0);
  1554       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1555       __ delayed()->lduh(from, 0, O3);
  1556       __ inc(from, 2);
  1557       __ inc(to, 2);
  1558       __ dec(count);
  1559       __ sth(O3, to, -2);
  1560     __ BIND(L_skip_alignment);
  1562       // copy 2 elements to align 'to' on an 8 byte boundary
  1563       __ andcc(to, 7, G0);
  1564       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
  1565       __ delayed()->lduh(from, 0, O3);
  1566       __ dec(count, 2);
  1567       __ lduh(from, 2, O4);
  1568       __ inc(from, 4);
  1569       __ inc(to, 4);
  1570       __ sth(O3, to, -4);
  1571       __ sth(O4, to, -2);
  1572     __ BIND(L_skip_alignment2);
  1574 #ifdef _LP64
  1575     if (!aligned)
  1576 #endif
  1578       // Copy with shift 16 bytes per iteration if arrays do not have
  1579       // the same alignment mod 8, otherwise fall through to the next
  1580       // code for aligned copy.
  1581       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
  1582       // Also jump over aligned copy after the copy with shift completed.
  1584       copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
  1587     // Both arrays are 8-byte aligned, copy 16 bytes at a time
  1588       __ and3(count, 3, G4); // Save
  1589       __ srl(count, 2, count);
  1590      generate_disjoint_long_copy_core(aligned);
  1591       __ mov(G4, count); // restore
  1593     // copy 1 element at a time
  1594     __ BIND(L_copy_2_bytes);
  1595       __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
  1596       __ delayed()->nop();
  1597       __ align(OptoLoopAlignment);
  1598     __ BIND(L_copy_2_bytes_loop);
  1599       __ lduh(from, offset, O3);
  1600       __ deccc(count);
  1601       __ sth(O3, to, offset);
  1602       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
  1603       __ delayed()->inc(offset, 2);
  1605     __ BIND(L_exit);
  1606       // O3, O4 are used as temp registers
  1607       inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
  1608       __ retl();
  1609       __ delayed()->mov(G0, O0); // return 0
  1610     return start;
  1613   //
  1614   //  Generate stub for array fill (byte, short, or int).  If "aligned" is true,
  1615   //  the "to" address is assumed to be heapword aligned.
  1616   //
  1617   // Arguments for generated stub:
  1618   //      to:    O0
  1619   //      value: O1
  1620   //      count: O2 treated as signed
  1621   //
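         //  The generated stub is roughly equivalent to the loop below (sketch
         //  only, shown for T_SHORT; the real code widens 'value' to 64 bits and
         //  stores up to 32 bytes per iteration):
         //
         //    void fill(jshort* to, jshort value, int count) {
         //      for (int i = 0; i < count; i++) to[i] = value;
         //    }
         //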
  1622   address generate_fill(BasicType t, bool aligned, const char* name) {
  1623     __ align(CodeEntryAlignment);
  1624     StubCodeMark mark(this, "StubRoutines", name);
  1625     address start = __ pc();
  1627     const Register to        = O0;   // destination array address
  1628     const Register value     = O1;   // fill value
  1629     const Register count     = O2;   // elements count
  1630     // O3 is used as a temp register
  1632     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1634     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
  1635     Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
  1637     int shift = -1;
  1638     switch (t) {
  1639       case T_BYTE:
  1640         shift = 2;
  1641         break;
  1642       case T_SHORT:
  1643         shift = 1;
  1644         break;
  1645       case T_INT:
  1646         shift = 0;
  1647         break;
  1648       default: ShouldNotReachHere();
  1651     BLOCK_COMMENT("Entry:");
  1653     if (t == T_BYTE) {
  1654       // Zero extend value
  1655       __ and3(value, 0xff, value);
  1656       __ sllx(value, 8, O3);
  1657       __ or3(value, O3, value);
  1659     if (t == T_SHORT) {
  1660       // Zero extend value
  1661       __ sllx(value, 48, value);
  1662       __ srlx(value, 48, value);
  1664     if (t == T_BYTE || t == T_SHORT) {
  1665       __ sllx(value, 16, O3);
  1666       __ or3(value, O3, value);
  1669     __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  1670     __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
  1671     __ delayed()->andcc(count, 1, G0);
  1673     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
  1674       // align destination address to a 4-byte boundary
  1675       if (t == T_BYTE) {
  1676         // One byte misalignment happens only for byte arrays
  1677         __ andcc(to, 1, G0);
  1678         __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
  1679         __ delayed()->nop();
  1680         __ stb(value, to, 0);
  1681         __ inc(to, 1);
  1682         __ dec(count, 1);
  1683         __ BIND(L_skip_align1);
  1685       // Two bytes misalignment happens only for byte and short (char) arrays
  1686       __ andcc(to, 2, G0);
  1687       __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
  1688       __ delayed()->nop();
  1689       __ sth(value, to, 0);
  1690       __ inc(to, 2);
  1691       __ dec(count, 1 << (shift - 1));
  1692       __ BIND(L_skip_align2);
  1694 #ifdef _LP64
  1695     if (!aligned) {
  1696 #endif
  1697     // align to 8 bytes, we know we are 4 byte aligned to start
  1698     __ andcc(to, 7, G0);
  1699     __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
  1700     __ delayed()->nop();
  1701     __ stw(value, to, 0);
  1702     __ inc(to, 4);
  1703     __ dec(count, 1 << shift);
  1704     __ BIND(L_fill_32_bytes);
  1705 #ifdef _LP64
  1707 #endif
  1709     if (t == T_INT) {
  1710       // Zero extend value
  1711       __ srl(value, 0, value);
  1713     if (t == T_BYTE || t == T_SHORT || t == T_INT) {
  1714       __ sllx(value, 32, O3);
  1715       __ or3(value, O3, value);
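             // At this point 'value' has been replicated across all 64 bits, e.g.
             // for a T_BYTE fill value of 0xAB (worked example):
             //   0x00000000000000AB -> 0x000000000000ABAB
             //   -> 0x00000000ABABABAB -> 0xABABABABABABABAB
             // so each stx below stores 8 bytes, 4 shorts, or 2 ints at once.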
  1718     Label L_check_fill_8_bytes;
  1719     // Fill 32-byte chunks
  1720     __ subcc(count, 8 << shift, count);
  1721     __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
  1722     __ delayed()->nop();
  1724     Label L_fill_32_bytes_loop, L_fill_4_bytes;
  1725     __ align(16);
  1726     __ BIND(L_fill_32_bytes_loop);
  1728     __ stx(value, to, 0);
  1729     __ stx(value, to, 8);
  1730     __ stx(value, to, 16);
  1731     __ stx(value, to, 24);
  1733     __ subcc(count, 8 << shift, count);
  1734     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
  1735     __ delayed()->add(to, 32, to);
  1737     __ BIND(L_check_fill_8_bytes);
  1738     __ addcc(count, 8 << shift, count);
  1739     __ brx(Assembler::zero, false, Assembler::pn, L_exit);
  1740     __ delayed()->subcc(count, 1 << (shift + 1), count);
  1741     __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
  1742     __ delayed()->andcc(count, 1<<shift, G0);
  1744     //
  1745     // length is too short, just fill 8 bytes at a time
  1746     //
  1747     Label L_fill_8_bytes_loop;
  1748     __ BIND(L_fill_8_bytes_loop);
  1749     __ stx(value, to, 0);
  1750     __ subcc(count, 1 << (shift + 1), count);
  1751     __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
  1752     __ delayed()->add(to, 8, to);
  1754     // fill trailing 4 bytes
  1755     __ andcc(count, 1<<shift, G0);  // in delay slot of branches
  1756     if (t == T_INT) {
  1757       __ BIND(L_fill_elements);
  1759     __ BIND(L_fill_4_bytes);
  1760     __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
  1761     if (t == T_BYTE || t == T_SHORT) {
  1762       __ delayed()->andcc(count, 1<<(shift-1), G0);
  1763     } else {
  1764       __ delayed()->nop();
  1766     __ stw(value, to, 0);
  1767     if (t == T_BYTE || t == T_SHORT) {
  1768       __ inc(to, 4);
  1769       // fill trailing 2 bytes
  1770       __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
  1771       __ BIND(L_fill_2_bytes);
  1772       __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
  1773       __ delayed()->andcc(count, 1, count);
  1774       __ sth(value, to, 0);
  1775       if (t == T_BYTE) {
  1776         __ inc(to, 2);
  1777         // fill trailing byte
  1778         __ andcc(count, 1, count);  // in delay slot of branches
  1779         __ BIND(L_fill_byte);
  1780         __ brx(Assembler::zero, false, Assembler::pt, L_exit);
  1781         __ delayed()->nop();
  1782         __ stb(value, to, 0);
  1783       } else {
  1784         __ BIND(L_fill_byte);
  1786     } else {
  1787       __ BIND(L_fill_2_bytes);
  1789     __ BIND(L_exit);
  1790     __ retl();
  1791     __ delayed()->nop();
  1793     // Handle fills of less than 8 bytes.  Int is handled elsewhere.
  1794     if (t == T_BYTE) {
  1795       __ BIND(L_fill_elements);
  1796       Label L_fill_2, L_fill_4;
  1797       // in delay slot __ andcc(count, 1, G0);
  1798       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
  1799       __ delayed()->andcc(count, 2, G0);
  1800       __ stb(value, to, 0);
  1801       __ inc(to, 1);
  1802       __ BIND(L_fill_2);
  1803       __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
  1804       __ delayed()->andcc(count, 4, G0);
  1805       __ stb(value, to, 0);
  1806       __ stb(value, to, 1);
  1807       __ inc(to, 2);
  1808       __ BIND(L_fill_4);
  1809       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
  1810       __ delayed()->nop();
  1811       __ stb(value, to, 0);
  1812       __ stb(value, to, 1);
  1813       __ stb(value, to, 2);
  1814       __ retl();
  1815       __ delayed()->stb(value, to, 3);
  1818     if (t == T_SHORT) {
  1819       Label L_fill_2;
  1820       __ BIND(L_fill_elements);
  1821       // in delay slot __ andcc(count, 1, G0);
  1822       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
  1823       __ delayed()->andcc(count, 2, G0);
  1824       __ sth(value, to, 0);
  1825       __ inc(to, 2);
  1826       __ BIND(L_fill_2);
  1827       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
  1828       __ delayed()->nop();
  1829       __ sth(value, to, 0);
  1830       __ retl();
  1831       __ delayed()->sth(value, to, 2);
  1833     return start;
  1836   //
  1837   //  Generate stub for conjoint short copy.  If "aligned" is true, the
  1838   //  "from" and "to" addresses are assumed to be heapword aligned.
  1839   //
  1840   // Arguments for generated stub:
  1841   //      from:  O0
  1842   //      to:    O1
  1843   //      count: O2 treated as signed
  1844   //
  1845   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
  1846                                        address *entry, const char *name) {
  1847     // Do reverse copy.
  1849     __ align(CodeEntryAlignment);
  1850     StubCodeMark mark(this, "StubRoutines", name);
  1851     address start = __ pc();
  1853     Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
  1854     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
  1856     const Register from      = O0;   // source array address
  1857     const Register to        = O1;   // destination array address
  1858     const Register count     = O2;   // elements count
  1859     const Register end_from  = from; // source array end address
  1860     const Register end_to    = to;   // destination array end address
  1862     const Register byte_count = O3;  // bytes count to copy
  1864     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1866     if (entry != NULL) {
  1867       *entry = __ pc();
  1868       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1869       BLOCK_COMMENT("Entry:");
  1872     array_overlap_test(nooverlap_target, 1);
  1874     __ sllx(count, LogBytesPerShort, byte_count);
  1875     __ add(to, byte_count, end_to);  // offset after last copied element
  1877     // for short arrays, just do single element copy
  1878     __ cmp(count, 11); // 8 + 3  (22 bytes)
  1879     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
  1880     __ delayed()->add(from, byte_count, end_from);
  1883       // Align the end of the arrays since it may be unaligned even
  1884       // when the arrays themselves are aligned.
  1886       // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
  1887       __ andcc(end_to, 3, G0);
  1888       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1889       __ delayed()->lduh(end_from, -2, O3);
  1890       __ dec(end_from, 2);
  1891       __ dec(end_to, 2);
  1892       __ dec(count);
  1893       __ sth(O3, end_to, 0);
  1894     __ BIND(L_skip_alignment);
  1896       // copy 2 elements to align 'end_to' on an 8 byte boundary
  1897       __ andcc(end_to, 7, G0);
  1898       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
  1899       __ delayed()->lduh(end_from, -2, O3);
  1900       __ dec(count, 2);
  1901       __ lduh(end_from, -4, O4);
  1902       __ dec(end_from, 4);
  1903       __ dec(end_to, 4);
  1904       __ sth(O3, end_to, 2);
  1905       __ sth(O4, end_to, 0);
  1906     __ BIND(L_skip_alignment2);
  1908 #ifdef _LP64
  1909     if (aligned) {
  1910       // Both arrays are aligned to 8-bytes in 64-bits VM.
  1911       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
  1912       // in unaligned case.
  1913       __ dec(count, 8);
  1914     } else
  1915 #endif
  1917       // Copy with shift 16 bytes per iteration if arrays do not have
  1918       // the same alignment mod 8, otherwise jump to the next
  1919       // code for aligned copy (and subtracting 8 from 'count' before the jump).
  1920       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
  1921       // Also jump over aligned copy after the copy with shift completed.
  1923       copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
  1924                                         L_aligned_copy, L_copy_2_bytes);
  1926     // copy 4 elements (16 bytes) at a time
  1927       __ align(OptoLoopAlignment);
  1928     __ BIND(L_aligned_copy);
  1929       __ dec(end_from, 16);
  1930       __ ldx(end_from, 8, O3);
  1931       __ ldx(end_from, 0, O4);
  1932       __ dec(end_to, 16);
  1933       __ deccc(count, 8);
  1934       __ stx(O3, end_to, 8);
  1935       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
  1936       __ delayed()->stx(O4, end_to, 0);
  1937       __ inc(count, 8);
  1939     // copy 1 element (2 bytes) at a time
  1940     __ BIND(L_copy_2_bytes);
  1941       __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
  1942       __ delayed()->nop();
  1943     __ BIND(L_copy_2_bytes_loop);
  1944       __ dec(end_from, 2);
  1945       __ dec(end_to, 2);
  1946       __ lduh(end_from, 0, O4);
  1947       __ deccc(count);
  1948       __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
  1949       __ delayed()->sth(O4, end_to, 0);
  1951     __ BIND(L_exit);
  1952     // O3, O4 are used as temp registers
  1953     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
  1954     __ retl();
  1955     __ delayed()->mov(G0, O0); // return 0
  1956     return start;
  1959   //
  1960   //  Generate core code for disjoint int copy (and oop copy on 32-bit).
  1961   //  If "aligned" is true, the "from" and "to" addresses are assumed
  1962   //  to be heapword aligned.
  1963   //
  1964   // Arguments:
  1965   //      from:  O0
  1966   //      to:    O1
  1967   //      count: O2 treated as signed
  1968   //
  1969   void generate_disjoint_int_copy_core(bool aligned) {
  1971     Label L_skip_alignment, L_aligned_copy;
  1972     Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
  1974     const Register from      = O0;   // source array address
  1975     const Register to        = O1;   // destination array address
  1976     const Register count     = O2;   // elements count
  1977     const Register offset    = O5;   // offset from start of arrays
  1978     // O3, O4, G3, G4 are used as temp registers
  1980     // 'aligned' == true when it is known statically during compilation
  1981     // of this arraycopy call site that both 'from' and 'to' addresses
  1982     // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
  1983     //
  1984     // Aligned arrays have 4-byte alignment in the 32-bit VM
  1985     // and 8-byte alignment in the 64-bit VM.
  1986     //
  1987 #ifdef _LP64
  1988     if (!aligned)
  1989 #endif
  1991       // The next check could be put under 'ifndef' since the code in
  1992     // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
  1994       // for short arrays, just do single element copy
  1995       __ cmp(count, 5); // 4 + 1 (20 bytes)
  1996       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
  1997       __ delayed()->mov(G0, offset);
  1999       // copy 1 element to align 'to' on an 8 byte boundary
  2000       __ andcc(to, 7, G0);
  2001       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  2002       __ delayed()->ld(from, 0, O3);
  2003       __ inc(from, 4);
  2004       __ inc(to, 4);
  2005       __ dec(count);
  2006       __ st(O3, to, -4);
  2007     __ BIND(L_skip_alignment);
  2009     // if arrays have the same alignment mod 8, do a 4-element copy
  2010       __ andcc(from, 7, G0);
  2011       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  2012       __ delayed()->ld(from, 0, O3);
  2014     //
  2015     // Load 2 aligned 8-bytes chunks and use one from previous iteration
  2016     // to form 2 aligned 8-bytes chunks to store.
  2017     //
  2018     // copy_16_bytes_forward_with_shift() is not used here since this
  2019     // code is more efficient.
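           // Sketch of one iteration of the shifted loop below (assumed;
           // load8/store8/prev are illustrative names).  SPARC is big-endian and
           // the delayed ld above leaves the first 4-byte word of 'from' in O3
           // ('prev'):
           //
           //   uint64_t w1 = load8(from + 4), w2 = load8(from + 12);
           //   store8(to,     (prev << 32) | (w1 >> 32));
           //   store8(to + 8, (w1   << 32) | (w2 >> 32));
           //   prev = w2;  from += 16;  to += 16;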
  2021     // copy with shift 4 elements (16 bytes) at a time
  2022       __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
  2024       __ align(OptoLoopAlignment);
  2025     __ BIND(L_copy_16_bytes);
  2026       __ ldx(from, 4, O4);
  2027       __ deccc(count, 4); // Can we do next iteration after this one?
  2028       __ ldx(from, 12, G4);
  2029       __ inc(to, 16);
  2030       __ inc(from, 16);
  2031       __ sllx(O3, 32, O3);
  2032       __ srlx(O4, 32, G3);
  2033       __ bset(G3, O3);
  2034       __ stx(O3, to, -16);
  2035       __ sllx(O4, 32, O4);
  2036       __ srlx(G4, 32, G3);
  2037       __ bset(G3, O4);
  2038       __ stx(O4, to, -8);
  2039       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
  2040       __ delayed()->mov(G4, O3);
  2042       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
  2043       __ delayed()->inc(count, 4); // restore 'count'
  2045     __ BIND(L_aligned_copy);
  2047     // copy 4 elements (16 bytes) at a time
  2048       __ and3(count, 1, G4); // Save
  2049       __ srl(count, 1, count);
  2050      generate_disjoint_long_copy_core(aligned);
  2051       __ mov(G4, count);     // Restore
  2053     // copy 1 element at a time
  2054     __ BIND(L_copy_4_bytes);
  2055       __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
  2056       __ delayed()->nop();
  2057     __ BIND(L_copy_4_bytes_loop);
  2058       __ ld(from, offset, O3);
  2059       __ deccc(count);
  2060       __ st(O3, to, offset);
  2061       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
  2062       __ delayed()->inc(offset, 4);
  2063     __ BIND(L_exit);
  2066   //
  2067   //  Generate stub for disjoint int copy.  If "aligned" is true, the
  2068   //  "from" and "to" addresses are assumed to be heapword aligned.
  2069   //
  2070   // Arguments for generated stub:
  2071   //      from:  O0
  2072   //      to:    O1
  2073   //      count: O2 treated as signed
  2074   //
  2075   address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
  2076     __ align(CodeEntryAlignment);
  2077     StubCodeMark mark(this, "StubRoutines", name);
  2078     address start = __ pc();
  2080     const Register count = O2;
  2081     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  2083     if (entry != NULL) {
  2084       *entry = __ pc();
  2085       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  2086       BLOCK_COMMENT("Entry:");
  2089     generate_disjoint_int_copy_core(aligned);
  2091     // O3, O4 are used as temp registers
  2092     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
  2093     __ retl();
  2094     __ delayed()->mov(G0, O0); // return 0
  2095     return start;
  2098   //
  2099   //  Generate core code for conjoint int copy (and oop copy on 32-bit).
  2100   //  If "aligned" is true, the "from" and "to" addresses are assumed
  2101   //  to be heapword aligned.
  2102   //
  2103   // Arguments:
  2104   //      from:  O0
  2105   //      to:    O1
  2106   //      count: O2 treated as signed
  2107   //
  2108   void generate_conjoint_int_copy_core(bool aligned) {
  2109     // Do reverse copy.
  2111     Label L_skip_alignment, L_aligned_copy;
  2112     Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
  2114     const Register from      = O0;   // source array address
  2115     const Register to        = O1;   // destination array address
  2116     const Register count     = O2;   // elements count
  2117     const Register end_from  = from; // source array end address
  2118     const Register end_to    = to;   // destination array end address
  2119     // O3, O4, O5, G3 are used as temp registers
  2121     const Register byte_count = O3;  // bytes count to copy
  2123       __ sllx(count, LogBytesPerInt, byte_count);
  2124       __ add(to, byte_count, end_to); // offset after last copied element
  2126       __ cmp(count, 5); // for short arrays, just do single element copy
  2127       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
  2128       __ delayed()->add(from, byte_count, end_from);
  2130     // copy 1 element to align 'to' on an 8 byte boundary
  2131       __ andcc(end_to, 7, G0);
  2132       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  2133       __ delayed()->nop();
  2134       __ dec(count);
  2135       __ dec(end_from, 4);
  2136       __ dec(end_to,   4);
  2137       __ ld(end_from, 0, O4);
  2138       __ st(O4, end_to, 0);
  2139     __ BIND(L_skip_alignment);
  2141     // Check if 'end_from' and 'end_to' have the same alignment.
  2142       __ andcc(end_from, 7, G0);
  2143       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  2144       __ delayed()->dec(count, 4); // The cmp at the start guarantees cnt >= 4
  2146     // copy with shift 4 elements (16 bytes) at a time
  2147     //
  2148     // Load 2 aligned 8-bytes chunks and use one from previous iteration
  2149     // to form 2 aligned 8-bytes chunks to store.
  2150     //
  2151       __ ldx(end_from, -4, O3);
  2152       __ align(OptoLoopAlignment);
  2153     __ BIND(L_copy_16_bytes);
  2154       __ ldx(end_from, -12, O4);
  2155       __ deccc(count, 4);
  2156       __ ldx(end_from, -20, O5);
  2157       __ dec(end_to, 16);
  2158       __ dec(end_from, 16);
  2159       __ srlx(O3, 32, O3);
  2160       __ sllx(O4, 32, G3);
  2161       __ bset(G3, O3);
  2162       __ stx(O3, end_to, 8);
  2163       __ srlx(O4, 32, O4);
  2164       __ sllx(O5, 32, G3);
  2165       __ bset(O4, G3);
  2166       __ stx(G3, end_to, 0);
  2167       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
  2168       __ delayed()->mov(O5, O3);
  2170       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
  2171       __ delayed()->inc(count, 4);
  2173     // copy 4 elements (16 bytes) at a time
  2174       __ align(OptoLoopAlignment);
  2175     __ BIND(L_aligned_copy);
  2176       __ dec(end_from, 16);
  2177       __ ldx(end_from, 8, O3);
  2178       __ ldx(end_from, 0, O4);
  2179       __ dec(end_to, 16);
  2180       __ deccc(count, 4);
  2181       __ stx(O3, end_to, 8);
  2182       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
  2183       __ delayed()->stx(O4, end_to, 0);
  2184       __ inc(count, 4);
  2186     // copy 1 element (4 bytes) at a time
  2187     __ BIND(L_copy_4_bytes);
  2188       __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
  2189       __ delayed()->nop();
  2190     __ BIND(L_copy_4_bytes_loop);
  2191       __ dec(end_from, 4);
  2192       __ dec(end_to, 4);
  2193       __ ld(end_from, 0, O4);
  2194       __ deccc(count);
  2195       __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
  2196       __ delayed()->st(O4, end_to, 0);
  2197     __ BIND(L_exit);
  2200   //
  2201   //  Generate stub for conjoint int copy.  If "aligned" is true, the
  2202   //  "from" and "to" addresses are assumed to be heapword aligned.
  2203   //
  2204   // Arguments for generated stub:
  2205   //      from:  O0
  2206   //      to:    O1
  2207   //      count: O2 treated as signed
  2208   //
  2209   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
  2210                                      address *entry, const char *name) {
  2211     __ align(CodeEntryAlignment);
  2212     StubCodeMark mark(this, "StubRoutines", name);
  2213     address start = __ pc();
  2215     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
  2217     if (entry != NULL) {
  2218       *entry = __ pc();
  2219       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  2220       BLOCK_COMMENT("Entry:");
  2223     array_overlap_test(nooverlap_target, 2);
  2225     generate_conjoint_int_copy_core(aligned);
  2227     // O3, O4 are used as temp registers
  2228     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
  2229     __ retl();
  2230     __ delayed()->mov(G0, O0); // return 0
  2231     return start;
  2234   //
  2235   //  Generate core code for disjoint long copy (and oop copy on 64-bit).
  2236   //  "aligned" is ignored, because we must make the stronger
  2237   //  assumption that both addresses are always 64-bit aligned.
  2238   //
  2239   // Arguments:
  2240   //      from:  O0
  2241   //      to:    O1
  2242   //      count: O2 treated as signed
  2243   //
  2244   // count -= 2;
  2245   // if ( count >= 0 ) { // >= 2 elements
  2246   //   if ( count > 6) { // >= 8 elements
  2247   //     count -= 6; // original count - 8
  2248   //     do {
  2249   //       copy_8_elements;
  2250   //       count -= 8;
  2251   //     } while ( count >= 0 );
  2252   //     count += 6;
  2253   //   }
  2254   //   if ( count >= 0 ) { // >= 2 elements
  2255   //     do {
  2256   //       copy_2_elements;
  2257   //     } while ( (count=count-2) >= 0 );
  2258   //   }
  2259   // }
  2260   // count += 2;
  2261   // if ( count != 0 ) { // 1 element left
  2262   //   copy_1_element;
  2263   // }
  2264   //
  2265   void generate_disjoint_long_copy_core(bool aligned) {
  2266     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
  2267     const Register from    = O0;  // source array address
  2268     const Register to      = O1;  // destination array address
  2269     const Register count   = O2;  // elements count
  2270     const Register offset0 = O4;  // element offset
  2271     const Register offset8 = O5;  // next element offset
  2273       __ deccc(count, 2);
  2274       __ mov(G0, offset0);   // offset from start of arrays (0)
  2275       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
  2276       __ delayed()->add(offset0, 8, offset8);
  2278     // Copy in 64-byte chunks
  2279     Label L_copy_64_bytes;
  2280     const Register from64 = O3;  // source address
  2281     const Register to64   = G3;  // destination address
  2282       __ subcc(count, 6, O3);
  2283       __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
  2284       __ delayed()->mov(to,   to64);
  2285       // Now we can use O4(offset0), O5(offset8) as temps
  2286       __ mov(O3, count);
  2287       __ mov(from, from64);
  2289       __ align(OptoLoopAlignment);
  2290     __ BIND(L_copy_64_bytes);
  2291       for( int off = 0; off < 64; off += 16 ) {
  2292         __ ldx(from64,  off+0, O4);
  2293         __ ldx(from64,  off+8, O5);
  2294         __ stx(O4, to64,  off+0);
  2295         __ stx(O5, to64,  off+8);
  2297       __ deccc(count, 8);
  2298       __ inc(from64, 64);
  2299       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
  2300       __ delayed()->inc(to64, 64);
  2302       // Restore O4(offset0), O5(offset8)
  2303       __ sub(from64, from, offset0);
  2304       __ inccc(count, 6);
  2305       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
  2306       __ delayed()->add(offset0, 8, offset8);
  2308       // Copy in 16-byte chunks
  2309       __ align(OptoLoopAlignment);
  2310     __ BIND(L_copy_16_bytes);
  2311       __ ldx(from, offset0, O3);
  2312       __ ldx(from, offset8, G3);
  2313       __ deccc(count, 2);
  2314       __ stx(O3, to, offset0);
  2315       __ inc(offset0, 16);
  2316       __ stx(G3, to, offset8);
  2317       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
  2318       __ delayed()->inc(offset8, 16);
  2320       // Copy last 8 bytes
  2321     __ BIND(L_copy_8_bytes);
  2322       __ inccc(count, 2);
  2323       __ brx(Assembler::zero, true, Assembler::pn, L_exit );
  2324       __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
  2325       __ ldx(from, offset0, O3);
  2326       __ stx(O3, to, offset0);
  2327     __ BIND(L_exit);
  2330   //
  2331   //  Generate stub for disjoint long copy.
  2332   //  "aligned" is ignored, because we must make the stronger
  2333   //  assumption that both addresses are always 64-bit aligned.
  2334   //
  2335   // Arguments for generated stub:
  2336   //      from:  O0
  2337   //      to:    O1
  2338   //      count: O2 treated as signed
  2339   //
  2340   address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
  2341     __ align(CodeEntryAlignment);
  2342     StubCodeMark mark(this, "StubRoutines", name);
  2343     address start = __ pc();
  2345     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
  2347     if (entry != NULL) {
  2348       *entry = __ pc();
  2349       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  2350       BLOCK_COMMENT("Entry:");
  2353     generate_disjoint_long_copy_core(aligned);
  2355     // O3, O4 are used as temp registers
  2356     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
  2357     __ retl();
  2358     __ delayed()->mov(G0, O0); // return 0
  2359     return start;
  2362   //
  2363   //  Generate core code for conjoint long copy (and oop copy on 64-bit).
  2364   //  "aligned" is ignored, because we must make the stronger
  2365   //  assumption that both addresses are always 64-bit aligned.
  2366   //
  2367   // Arguments:
  2368   //      from:  O0
  2369   //      to:    O1
  2370   //      count: O2 treated as signed
  2371   //
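         // By analogy with the pseudo-code above generate_disjoint_long_copy_core(),
         // the reverse copy behaves roughly as follows (sketch):
         //
         // count -= 1;
         // if ( count > 0 ) { // >= 2 elements
         //   do {
         //     copy_2_elements_from_the_end;
         //   } while ( (count=count-2) > 0 );
         // }
         // if ( count == 0 ) { // 1 element left
         //   copy_1_element;
         // }
         //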
  2372   void generate_conjoint_long_copy_core(bool aligned) {
  2373     // Do reverse copy.
  2374     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
  2375     const Register from    = O0;  // source array address
  2376     const Register to      = O1;  // destination array address
  2377     const Register count   = O2;  // elements count
  2378     const Register offset8 = O4;  // element offset
  2379     const Register offset0 = O5;  // previous element offset
  2381       __ subcc(count, 1, count);
  2382       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
  2383       __ delayed()->sllx(count, LogBytesPerLong, offset8);
  2384       __ sub(offset8, 8, offset0);
  2385       __ align(OptoLoopAlignment);
  2386     __ BIND(L_copy_16_bytes);
  2387       __ ldx(from, offset8, O2);
  2388       __ ldx(from, offset0, O3);
  2389       __ stx(O2, to, offset8);
  2390       __ deccc(offset8, 16);      // use offset8 as counter
  2391       __ stx(O3, to, offset0);
  2392       __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
  2393       __ delayed()->dec(offset0, 16);
  2395     __ BIND(L_copy_8_bytes);
  2396       __ brx(Assembler::negative, false, Assembler::pn, L_exit );
  2397       __ delayed()->nop();
  2398       __ ldx(from, 0, O3);
  2399       __ stx(O3, to, 0);
  2400     __ BIND(L_exit);
  2403   //  Generate stub for conjoint long copy.
  2404   //  "aligned" is ignored, because we must make the stronger
  2405   //  assumption that both addresses are always 64-bit aligned.
  2406   //
  2407   // Arguments for generated stub:
  2408   //      from:  O0
  2409   //      to:    O1
  2410   //      count: O2 treated as signed
  2411   //
  2412   address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
  2413                                       address *entry, const char *name) {
  2414     __ align(CodeEntryAlignment);
  2415     StubCodeMark mark(this, "StubRoutines", name);
  2416     address start = __ pc();
  2418     assert(aligned, "Should always be aligned");
  2420     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
  2422     if (entry != NULL) {
  2423       *entry = __ pc();
  2424       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  2425       BLOCK_COMMENT("Entry:");
  2428     array_overlap_test(nooverlap_target, 3);
  2430     generate_conjoint_long_copy_core(aligned);
  2432     // O3, O4 are used as temp registers
  2433     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
  2434     __ retl();
  2435     __ delayed()->mov(G0, O0); // return 0
  2436     return start;
  2439   //  Generate stub for disjoint oop copy.  If "aligned" is true, the
  2440   //  "from" and "to" addresses are assumed to be heapword aligned.
  2441   //
  2442   // Arguments for generated stub:
  2443   //      from:  O0
  2444   //      to:    O1
  2445   //      count: O2 treated as signed
  2446   //
  2447   address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
  2448                                      bool dest_uninitialized = false) {
  2450     const Register from  = O0;  // source array address
  2451     const Register to    = O1;  // destination array address
  2452     const Register count = O2;  // elements count
  2454     __ align(CodeEntryAlignment);
  2455     StubCodeMark mark(this, "StubRoutines", name);
  2456     address start = __ pc();
  2458     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  2460     if (entry != NULL) {
  2461       *entry = __ pc();
  2462       // caller can pass a 64-bit byte count here
  2463       BLOCK_COMMENT("Entry:");
  2466     // save arguments for barrier generation
  2467     __ mov(to, G1);
  2468     __ mov(count, G5);
  2469     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
  2470   #ifdef _LP64
  2471     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  2472     if (UseCompressedOops) {
  2473       generate_disjoint_int_copy_core(aligned);
  2474     } else {
  2475       generate_disjoint_long_copy_core(aligned);
  2477   #else
  2478     generate_disjoint_int_copy_core(aligned);
  2479   #endif
  2480     // O0 is used as temp register
  2481     gen_write_ref_array_post_barrier(G1, G5, O0);
  2483     // O3, O4 are used as temp registers
  2484     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
  2485     __ retl();
  2486     __ delayed()->mov(G0, O0); // return 0
  2487     return start;
  2490   //  Generate stub for conjoint oop copy.  If "aligned" is true, the
  2491   //  "from" and "to" addresses are assumed to be heapword aligned.
  2492   //
  2493   // Arguments for generated stub:
  2494   //      from:  O0
  2495   //      to:    O1
  2496   //      count: O2 treated as signed
  2497   //
  2498   address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
  2499                                      address *entry, const char *name,
  2500                                      bool dest_uninitialized = false) {
  2502     const Register from  = O0;  // source array address
  2503     const Register to    = O1;  // destination array address
  2504     const Register count = O2;  // elements count
  2506     __ align(CodeEntryAlignment);
  2507     StubCodeMark mark(this, "StubRoutines", name);
  2508     address start = __ pc();
  2510     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  2512     if (entry != NULL) {
  2513       *entry = __ pc();
  2514       // caller can pass a 64-bit byte count here
  2515       BLOCK_COMMENT("Entry:");
  2518     array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
  2520     // save arguments for barrier generation
  2521     __ mov(to, G1);
  2522     __ mov(count, G5);
  2523     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
  2525   #ifdef _LP64
  2526     if (UseCompressedOops) {
  2527       generate_conjoint_int_copy_core(aligned);
  2528     } else {
  2529       generate_conjoint_long_copy_core(aligned);
  2531   #else
  2532     generate_conjoint_int_copy_core(aligned);
  2533   #endif
  2535     // O0 is used as temp register
  2536     gen_write_ref_array_post_barrier(G1, G5, O0);
  2538     // O3, O4 are used as temp registers
  2539     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
  2540     __ retl();
  2541     __ delayed()->mov(G0, O0); // return 0
  2542     return start;
  2546   // Helper for generating a dynamic type check.
  2547   // Smashes only the given temp registers.
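         // Conceptually (sketch only) the fast path tests
         //
         //   *(sub_klass + super_check_offset) == super_klass
         //
         // and the slow path scans the secondary supers list; see
         // MacroAssembler::check_klass_subtype_fast_path() / _slow_path().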
  2548   void generate_type_check(Register sub_klass,
  2549                            Register super_check_offset,
  2550                            Register super_klass,
  2551                            Register temp,
  2552                            Label& L_success) {
  2553     assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
  2555     BLOCK_COMMENT("type_check:");
  2557     Label L_miss, L_pop_to_miss;
  2559     assert_clean_int(super_check_offset, temp);
  2561     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
  2562                                      &L_success, &L_miss, NULL,
  2563                                      super_check_offset);
  2565     BLOCK_COMMENT("type_check_slow_path:");
  2566     __ save_frame(0);
  2567     __ check_klass_subtype_slow_path(sub_klass->after_save(),
  2568                                      super_klass->after_save(),
  2569                                      L0, L1, L2, L4,
  2570                                      NULL, &L_pop_to_miss);
  2571     __ ba(false, L_success);
  2572     __ delayed()->restore();
  2574     __ bind(L_pop_to_miss);
  2575     __ restore();
  2577     // Fall through on failure!
  2578     __ BIND(L_miss);
  2582   //  Generate stub for checked oop copy.
  2583   //
  2584   // Arguments for generated stub:
  2585   //      from:  O0
  2586   //      to:    O1
  2587   //      count: O2 treated as signed
  2588   //      ckoff: O3 (super_check_offset)
  2589   //      ckval: O4 (super_klass)
  2590   //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
  2591   //
  2592   address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
  2594     const Register O0_from   = O0;      // source array address
  2595     const Register O1_to     = O1;      // destination array address
  2596     const Register O2_count  = O2;      // elements count
  2597     const Register O3_ckoff  = O3;      // super_check_offset
  2598     const Register O4_ckval  = O4;      // super_klass
  2600     const Register O5_offset = O5;      // loop var, with stride wordSize
  2601     const Register G1_remain = G1;      // loop var, with stride -1
  2602     const Register G3_oop    = G3;      // actual oop copied
  2603     const Register G4_klass  = G4;      // oop._klass
  2604     const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
  2606     __ align(CodeEntryAlignment);
  2607     StubCodeMark mark(this, "StubRoutines", name);
  2608     address start = __ pc();
  2610 #ifdef ASSERT
  2611     // We sometimes save a frame (see generate_type_check below).
  2612     // If this will cause trouble, let's fail now instead of later.
  2613     __ save_frame(0);
  2614     __ restore();
  2615 #endif
  2617     assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
  2619 #ifdef ASSERT
  2620     // caller guarantees that the arrays really are different
  2621     // otherwise, we would have to make conjoint checks
  2622     { Label L;
  2623       __ mov(O3, G1);           // spill: overlap test smashes O3
  2624       __ mov(O4, G4);           // spill: overlap test smashes O4
  2625       array_overlap_test(L, LogBytesPerHeapOop);
  2626       __ stop("checkcast_copy within a single array");
  2627       __ bind(L);
  2628       __ mov(G1, O3);
  2629       __ mov(G4, O4);
  2631 #endif //ASSERT
  2633     if (entry != NULL) {
  2634       *entry = __ pc();
  2635       // caller can pass a 64-bit byte count here (from generic stub)
  2636       BLOCK_COMMENT("Entry:");
  2638     gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
  2640     Label load_element, store_element, do_card_marks, fail, done;
  2641     __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
  2642     __ brx(Assembler::notZero, false, Assembler::pt, load_element);
  2643     __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
  2645     // Empty array:  Nothing to do.
  2646     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
  2647     __ retl();
  2648     __ delayed()->set(0, O0);           // return 0 on (trivial) success
  2650     // ======== begin loop ========
  2651     // (Loop is rotated; its entry is load_element.)
  2652     // Loop variables:
  2653     //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
  2654     //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
  2655     //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
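           // In C-like pseudo-code the rotated loop is approximately (sketch):
           //
           //   for (i = 0, r = count; r != 0; r--, i += heapOopSize) {
           //     oop el = from[i];
           //     if (el != NULL && el->klass() is not a subtype of ckval) break;
           //     to[i] = el;
           //   }
           //   // a break falls into 'fail' and reports (-1 ^ oops_copied)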
  2656     __ align(OptoLoopAlignment);
  2658     __ BIND(store_element);
  2659     __ deccc(G1_remain);                // decrement the count
  2660     __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
  2661     __ inc(O5_offset, heapOopSize);     // step to next offset
  2662     __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
  2663     __ delayed()->set(0, O0);           // return 0 on success
  2665     // ======== loop entry is here ========
  2666     __ BIND(load_element);
  2667     __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
  2668     __ br_null(G3_oop, true, Assembler::pt, store_element);
  2669     __ delayed()->nop();
  2671     __ load_klass(G3_oop, G4_klass); // query the object klass
  2673     generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
  2674                         // branch to this on success:
  2675                         store_element);
  2676     // ======== end loop ========
  2678     // It was a real error; we must depend on the caller to finish the job.
  2679     // Register G1 has number of *remaining* oops, O2 number of *total* oops.
  2680     // Emit GC store barriers for the oops we have copied (O2 minus G1),
  2681     // and report their number to the caller.
  2682     __ BIND(fail);
  2683     __ subcc(O2_count, G1_remain, O2_count);
  2684     __ brx(Assembler::zero, false, Assembler::pt, done);
  2685     __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
  2687     __ BIND(do_card_marks);
  2688     gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
  2690     __ BIND(done);
  2691     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
  2692     __ retl();
  2693     __ delayed()->nop();             // return value in O0
  2695     return start;
  2699   //  Generate 'unsafe' array copy stub
  2700   //  Though just as safe as the other stubs, it takes an unscaled
  2701   //  size_t argument instead of an element count.
  2702   //
  2703   // Arguments for generated stub:
  2704   //      from:  O0
  2705   //      to:    O1
  2706   //      count: O2 byte count, treated as ssize_t, can be zero
  2707   //
  2708   // Examines the alignment of the operands and dispatches
  2709   // to a long, int, short, or byte copy loop.
  2710   //
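         //  Roughly equivalent dispatch in C (sketch of the code below; the byte
         //  count is scaled to an element count in the branch delay slots):
         //
         //    size_t bits = (uintptr_t)from | (uintptr_t)to | size;
         //    if      ((bits & (BytesPerLong  - 1)) == 0) jlong_copy (from, to, size >> LogBytesPerLong);
         //    else if ((bits & (BytesPerInt   - 1)) == 0) jint_copy  (from, to, size >> LogBytesPerInt);
         //    else if ((bits & (BytesPerShort - 1)) == 0) jshort_copy(from, to, size >> LogBytesPerShort);
         //    else                                        jbyte_copy (from, to, size);
         //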
  2711   address generate_unsafe_copy(const char* name,
  2712                                address byte_copy_entry,
  2713                                address short_copy_entry,
  2714                                address int_copy_entry,
  2715                                address long_copy_entry) {
  2717     const Register O0_from   = O0;      // source array address
  2718     const Register O1_to     = O1;      // destination array address
  2719     const Register O2_count  = O2;      // elements count
  2721     const Register G1_bits   = G1;      // test copy of low bits
  2723     __ align(CodeEntryAlignment);
  2724     StubCodeMark mark(this, "StubRoutines", name);
  2725     address start = __ pc();
  2727     // bump this on entry, not on exit:
  2728     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
  2730     __ or3(O0_from, O1_to, G1_bits);
  2731     __ or3(O2_count,       G1_bits, G1_bits);
  2733     __ btst(BytesPerLong-1, G1_bits);
  2734     __ br(Assembler::zero, true, Assembler::pt,
  2735           long_copy_entry, relocInfo::runtime_call_type);
  2736     // scale the count on the way out:
  2737     __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
  2739     __ btst(BytesPerInt-1, G1_bits);
  2740     __ br(Assembler::zero, true, Assembler::pt,
  2741           int_copy_entry, relocInfo::runtime_call_type);
  2742     // scale the count on the way out:
  2743     __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
  2745     __ btst(BytesPerShort-1, G1_bits);
  2746     __ br(Assembler::zero, true, Assembler::pt,
  2747           short_copy_entry, relocInfo::runtime_call_type);
  2748     // scale the count on the way out:
  2749     __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
  2751     __ br(Assembler::always, false, Assembler::pt,
  2752           byte_copy_entry, relocInfo::runtime_call_type);
  2753     __ delayed()->nop();
  2755     return start;
  2759   // Perform range checks on the proposed arraycopy.
  2760   // Kills the two temps, but nothing else.
  2761   // Also, clean the sign bits of src_pos and dst_pos.
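         // Equivalent checks in pseudo-code (sketch):
         //
         //   if (src_pos + length > src->length()) goto L_failed;
         //   if (dst_pos + length > dst->length()) goto L_failed;
         //   src_pos = (jlong)(jint)src_pos;   // clear the high 32 bits
         //   dst_pos = (jlong)(jint)dst_pos;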
  2762   void arraycopy_range_checks(Register src,     // source array oop (O0)
  2763                               Register src_pos, // source position (O1)
  2764                               Register dst,     // destination array oop (O2)
  2765                               Register dst_pos, // destination position (O3)
  2766                               Register length,  // length of copy (O4)
  2767                               Register temp1, Register temp2,
  2768                               Label& L_failed) {
  2769     BLOCK_COMMENT("arraycopy_range_checks:");
  2771     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
  2773     const Register array_length = temp1;  // scratch
  2774     const Register end_pos      = temp2;  // scratch
  2776     // Note:  This next instruction may be in the delay slot of a branch:
  2777     __ add(length, src_pos, end_pos);  // src_pos + length
  2778     __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
  2779     __ cmp(end_pos, array_length);
  2780     __ br(Assembler::greater, false, Assembler::pn, L_failed);
  2782     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
  2783     __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
  2784     __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
  2785     __ cmp(end_pos, array_length);
  2786     __ br(Assembler::greater, false, Assembler::pn, L_failed);
  2788     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  2789     // Move with sign extension can be used since they are positive.
  2790     __ delayed()->signx(src_pos, src_pos);
  2791     __ signx(dst_pos, dst_pos);
  2793     BLOCK_COMMENT("arraycopy_range_checks done");
  2797   //
  2798   //  Generate generic array copy stubs
  2799   //
  2800   //  Input:
  2801   //    O0    -  src oop
  2802   //    O1    -  src_pos
  2803   //    O2    -  dst oop
  2804   //    O3    -  dst_pos
  2805   //    O4    -  element count
  2806   //
  2807   //  Output:
  2808   //    O0 ==  0  -  success
  2809   //    O0 == -1  -  need to call System.arraycopy
  2810   //
  2811   address generate_generic_copy(const char *name,
  2812                                 address entry_jbyte_arraycopy,
  2813                                 address entry_jshort_arraycopy,
  2814                                 address entry_jint_arraycopy,
  2815                                 address entry_oop_arraycopy,
  2816                                 address entry_jlong_arraycopy,
  2817                                 address entry_checkcast_arraycopy) {
  2818     Label L_failed, L_objArray;
  2820     // Input registers
  2821     const Register src      = O0;  // source array oop
  2822     const Register src_pos  = O1;  // source position
  2823     const Register dst      = O2;  // destination array oop
  2824     const Register dst_pos  = O3;  // destination position
  2825     const Register length   = O4;  // elements count
  2827     // registers used as temp
  2828     const Register G3_src_klass = G3; // source array klass
  2829     const Register G4_dst_klass = G4; // destination array klass
  2830     const Register G5_lh        = G5; // layout helper
  2831     const Register O5_temp      = O5;
  2833     __ align(CodeEntryAlignment);
  2834     StubCodeMark mark(this, "StubRoutines", name);
  2835     address start = __ pc();
  2837     // bump this on entry, not on exit:
  2838     inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
  2840     // In principle, the int arguments could be dirty.
  2841     //assert_clean_int(src_pos, G1);
  2842     //assert_clean_int(dst_pos, G1);
  2843     //assert_clean_int(length, G1);
  2845     //-----------------------------------------------------------------------
  2846     // Assembler stubs will be used for this call to arraycopy
  2847     // if the following conditions are met:
  2848     //
  2849     // (1) src and dst must not be null.
  2850     // (2) src_pos must not be negative.
  2851     // (3) dst_pos must not be negative.
  2852     // (4) length  must not be negative.
  2853     // (5) src klass and dst klass should be the same and not NULL.
  2854     // (6) src and dst should be arrays.
  2855     // (7) src_pos + length must not exceed length of src.
  2856     // (8) dst_pos + length must not exceed length of dst.
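           // Illustrative sketch (not emitted code): conditions (1)-(8) above, written as
           // C-like pseudocode over the incoming arguments; klass_of(), is_array_klass()
           // and array_length_of() are hypothetical helpers standing for the loads below.
           //
           //   if (src == NULL || dst == NULL)                 return -1;  // (1)
           //   if (src_pos < 0 || dst_pos < 0 || length < 0)   return -1;  // (2)(3)(4)
           //   if (klass_of(src) == NULL ||
           //       klass_of(src) != klass_of(dst))             return -1;  // (5), relaxed for objArrays below
           //   if (!is_array_klass(klass_of(src)))             return -1;  // (6)
           //   if (src_pos + length > array_length_of(src))    return -1;  // (7)
           //   if (dst_pos + length > array_length_of(dst))    return -1;  // (8)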
  2857     BLOCK_COMMENT("arraycopy initial argument checks");
  2859     //  if (src == NULL) return -1;
  2860     __ br_null(src, false, Assembler::pn, L_failed);
  2862     //  if (src_pos < 0) return -1;
  2863     __ delayed()->tst(src_pos);
  2864     __ br(Assembler::negative, false, Assembler::pn, L_failed);
  2865     __ delayed()->nop();
  2867     //  if (dst == NULL) return -1;
  2868     __ br_null(dst, false, Assembler::pn, L_failed);
  2870     //  if (dst_pos < 0) return -1;
  2871     __ delayed()->tst(dst_pos);
  2872     __ br(Assembler::negative, false, Assembler::pn, L_failed);
  2874     //  if (length < 0) return -1;
  2875     __ delayed()->tst(length);
  2876     __ br(Assembler::negative, false, Assembler::pn, L_failed);
  2878     BLOCK_COMMENT("arraycopy argument klass checks");
  2879     //  get src->klass()
  2880     if (UseCompressedOops) {
  2881       __ delayed()->nop(); // delay slot wasted: load_klass expands to more than one instruction with compressed oops
  2882       __ load_klass(src, G3_src_klass);
  2883     } else {
  2884       __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
  2885     }
  2887 #ifdef ASSERT
  2888     //  assert(src->klass() != NULL);
  2889     BLOCK_COMMENT("assert klasses not null");
  2890     { Label L_a, L_b;
  2891       __ br_notnull(G3_src_klass, false, Assembler::pt, L_b); // it is broken if klass is NULL
  2892       __ delayed()->nop();
  2893       __ bind(L_a);
  2894       __ stop("broken null klass");
  2895       __ bind(L_b);
  2896       __ load_klass(dst, G4_dst_klass);
  2897       __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
  2898       __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
  2899       BLOCK_COMMENT("assert done");
  2900     }
  2901 #endif
  2903     // Load layout helper
  2904     //
  2905     //  |array_tag|     | header_size | element_type |     |log2_element_size|
  2906     // 32        30    24            16              8     2                 0
  2907     //
  2908     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  2909     //
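           // Illustrative sketch (not emitted code): decoding a layout-helper word with
           // the Klass constants used below (fields correspond to the diagram above):
           //
           //   int array_tag   = (juint)lh >> Klass::_lh_array_tag_shift;        // 0x3 typeArray, 0x2 objArray
           //   int header_size = (lh >> Klass::_lh_header_size_shift)
           //                      & Klass::_lh_header_size_mask;                 // first element offset, in bytes
           //   int log2_esize  = lh & Klass::_lh_log2_element_size_mask;         // log2(element size in bytes)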
  2911     int lh_offset = klassOopDesc::header_size() * HeapWordSize +
  2912                     Klass::layout_helper_offset_in_bytes();
  2914     // Load the 32-bit signed value; use the br() instruction with it to check icc.
  2915     __ lduw(G3_src_klass, lh_offset, G5_lh);
  2917     if (UseCompressedOops) {
  2918       __ load_klass(dst, G4_dst_klass);
  2919     }
  2920     // Handle objArrays completely differently...
  2921     juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  2922     __ set(objArray_lh, O5_temp);
  2923     __ cmp(G5_lh,       O5_temp);
  2924     __ br(Assembler::equal, false, Assembler::pt, L_objArray);
  2925     if (UseCompressedOops) {
  2926       __ delayed()->nop();
  2927     } else {
  2928       __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
  2929     }
  2931     //  if (src->klass() != dst->klass()) return -1;
  2932     __ cmp(G3_src_klass, G4_dst_klass);
  2933     __ brx(Assembler::notEqual, false, Assembler::pn, L_failed);
  2934     __ delayed()->nop();
  2936     //  if (!src->is_Array()) return -1;
  2937     __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
  2938     __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
  2940     // At this point, it is known to be a typeArray (array_tag 0x3).
  2941 #ifdef ASSERT
  2942     __ delayed()->nop();
  2943     { Label L;
  2944       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
  2945       __ set(lh_prim_tag_in_place, O5_temp);
  2946       __ cmp(G5_lh,                O5_temp);
  2947       __ br(Assembler::greaterEqual, false, Assembler::pt, L);
  2948       __ delayed()->nop();
  2949       __ stop("must be a primitive array");
  2950       __ bind(L);
  2951     }
  2952 #else
  2953     __ delayed();                               // match next insn to prev branch
  2954 #endif
  2956     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
  2957                            O5_temp, G4_dst_klass, L_failed);
  2959     // typeArrayKlass
  2960     //
  2961     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  2962     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  2963     //
  2965     const Register G4_offset = G4_dst_klass;    // array offset
  2966     const Register G3_elsize = G3_src_klass;    // log2 element size
  2968     __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
  2969     __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
  2970     __ add(src, G4_offset, src);       // src array offset
  2971     __ add(dst, G4_offset, dst);       // dst array offset
  2972     __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
  2974     // next registers should be set before the jump to corresponding stub
  2975     const Register from     = O0;  // source array address
  2976     const Register to       = O1;  // destination array address
  2977     const Register count    = O2;  // elements count
  2979     // 'from', 'to', 'count' registers should be set in this order
  2980     // since they are the same as 'src', 'src_pos', 'dst'.
  2982     BLOCK_COMMENT("scale indexes to element size");
  2983     __ sll_ptr(src_pos, G3_elsize, src_pos);
  2984     __ sll_ptr(dst_pos, G3_elsize, dst_pos);
  2985     __ add(src, src_pos, from);       // src_addr
  2986     __ add(dst, dst_pos, to);         // dst_addr
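           // Illustrative sketch (not emitted code): the scaling and adds above compute
           //
           //   from = (address)src + header_size + ((intptr_t)src_pos << log2_element_size);
           //   to   = (address)dst + header_size + ((intptr_t)dst_pos << log2_element_size);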
  2988     BLOCK_COMMENT("choose copy loop based on element size");
  2989     __ cmp(G3_elsize, 0);
  2990     __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
  2991     __ delayed()->signx(length, count); // length
  2993     __ cmp(G3_elsize, LogBytesPerShort);
  2994     __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
  2995     __ delayed()->signx(length, count); // length
  2997     __ cmp(G3_elsize, LogBytesPerInt);
  2998     __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
  2999     __ delayed()->signx(length, count); // length
  3000 #ifdef ASSERT
  3001     { Label L;
  3002       __ cmp(G3_elsize, LogBytesPerLong);
  3003       __ br(Assembler::equal, false, Assembler::pt, L);
  3004       __ delayed()->nop();
  3005       __ stop("must be long copy, but elsize is wrong");
  3006       __ bind(L);
  3007     }
  3008 #endif
  3009     __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
  3010     __ delayed()->signx(length, count); // length
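           // Illustrative sketch (not emitted code): the dispatch above behaves like
           //
           //   switch (log2_element_size) {
           //     case 0:                goto entry_jbyte_arraycopy;    // 1-byte elements
           //     case LogBytesPerShort: goto entry_jshort_arraycopy;   // 2-byte elements
           //     case LogBytesPerInt:   goto entry_jint_arraycopy;     // 4-byte elements
           //     default:               goto entry_jlong_arraycopy;    // 8-byte elements (asserted above)
           //   }
           //
           // with (from, to, count) already set up in O0..O2 for every target.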
  3012     // objArrayKlass
  3013   __ BIND(L_objArray);
  3014     // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
  3016     Label L_plain_copy, L_checkcast_copy;
  3017     //  test array classes for subtyping
  3018     __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
  3019     __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
  3020     __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
  3022     // Identically typed arrays can be copied without element-wise checks.
  3023     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
  3024                            O5_temp, G5_lh, L_failed);
  3026     __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
  3027     __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
  3028     __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
  3029     __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
  3030     __ add(src, src_pos, from);       // src_addr
  3031     __ add(dst, dst_pos, to);         // dst_addr
  3032   __ BIND(L_plain_copy);
  3033     __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
  3034     __ delayed()->signx(length, count); // length
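           // Illustrative sketch (not emitted code): for identically typed oop arrays the
           // addresses reduce to
           //
           //   from = (address)src + arrayOopDesc::base_offset_in_bytes(T_OBJECT) + ((intptr_t)src_pos << LogBytesPerHeapOop);
           //   to   = (address)dst + arrayOopDesc::base_offset_in_bytes(T_OBJECT) + ((intptr_t)dst_pos << LogBytesPerHeapOop);
           //
           // and the oop copy stub is responsible for the required GC barriers.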
  3036   __ BIND(L_checkcast_copy);
  3037     // live at this point:  G3_src_klass, G4_dst_klass
  3038     {
  3039       // Before looking at dst.length, make sure dst is also an objArray.
  3040       // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
  3041       __ cmp(G5_lh,                    O5_temp);
  3042       __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
  3044       // It is safe to examine both src.length and dst.length.
  3045       __ delayed();                             // match next insn to prev branch
  3046       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
  3047                              O5_temp, G5_lh, L_failed);
  3049       // Marshal the base address arguments now, freeing registers.
  3050       __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
  3051       __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
  3052       __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
  3053       __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
  3054       __ add(src, src_pos, from);               // src_addr
  3055       __ add(dst, dst_pos, to);                 // dst_addr
  3056       __ signx(length, count);                  // length (reloaded)
  3058       Register sco_temp = O3;                   // this register is free now
  3059       assert_different_registers(from, to, count, sco_temp,
  3060                                  G4_dst_klass, G3_src_klass);
  3062       // Generate the type check.
  3063       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
  3064                         Klass::super_check_offset_offset_in_bytes());
  3065       __ lduw(G4_dst_klass, sco_offset, sco_temp);
  3066       generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
  3067                           O5_temp, L_plain_copy);
  3069       // Fetch destination element klass from the objArrayKlass header.
  3070       int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
  3071                        objArrayKlass::element_klass_offset_in_bytes());
  3073       // the checkcast_copy loop needs two extra arguments:
  3074       __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
  3075       // lduw(O4, sco_offset, O3);              // sco of elem klass
  3077       __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
  3078       __ delayed()->lduw(O4, sco_offset, O3);
  3079     }
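           // Illustrative sketch (not emitted code): the checkcast entry receives, besides
           // (from, to, count) in O0..O2, the destination element klass in O4 and that
           // klass's super_check_offset in O3, and conceptually performs
           //
           //   for (i = 0; i < count; i++) {
           //     oop e = ((oop*)from)[i];
           //     if (e != NULL && !is_subtype_of(klass_of(e), dst_elem_klass))   // sco enables the fast check
           //       break;                                                        // report failure to the caller
           //     ((oop*)to)[i] = e;
           //   }
           //
           // is_subtype_of() and klass_of() are hypothetical helpers for this sketch.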
  3081   __ BIND(L_failed);
  3082     __ retl();
  3083     __ delayed()->sub(G0, 1, O0); // return -1
  3084     return start;
  3085   }
  3087   void generate_arraycopy_stubs() {
  3088     address entry;
  3089     address entry_jbyte_arraycopy;
  3090     address entry_jshort_arraycopy;
  3091     address entry_jint_arraycopy;
  3092     address entry_oop_arraycopy;
  3093     address entry_jlong_arraycopy;
  3094     address entry_checkcast_arraycopy;
  3096     //*** jbyte
  3097     // Always need aligned and unaligned versions
  3098     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
  3099                                                                                   "jbyte_disjoint_arraycopy");
  3100     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
  3101                                                                                   &entry_jbyte_arraycopy,
  3102                                                                                   "jbyte_arraycopy");
  3103     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
  3104                                                                                   "arrayof_jbyte_disjoint_arraycopy");
  3105     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
  3106                                                                                   "arrayof_jbyte_arraycopy");
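           // Illustrative note: this pattern repeats for each element type. The disjoint
           // generator publishes its no-overlap entry through '&entry'; the conjoint
           // generator branches there when the ranges cannot overlap, and in turn exports
           // its own entry (here entry_jbyte_arraycopy) for reuse by the unsafe and
           // generic copy stubs generated at the end of this method.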
  3108     //*** jshort
  3109     // Always need aligned and unaligned versions
  3110     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
  3111                                                                                     "jshort_disjoint_arraycopy");
  3112     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
  3113                                                                                     &entry_jshort_arraycopy,
  3114                                                                                     "jshort_arraycopy");
  3115     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
  3116                                                                                     "arrayof_jshort_disjoint_arraycopy");
  3117     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
  3118                                                                                     "arrayof_jshort_arraycopy");
  3120     //*** jint
  3121     // Aligned versions
  3122     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
  3123                                                                                 "arrayof_jint_disjoint_arraycopy");
  3124     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
  3125                                                                                 "arrayof_jint_arraycopy");
  3126 #ifdef _LP64
  3127     // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
  3128     // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
  3129     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
  3130                                                                                 "jint_disjoint_arraycopy");
  3131     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
  3132                                                                                 &entry_jint_arraycopy,
  3133                                                                                 "jint_arraycopy");
  3134 #else
  3135     // On 32-bit, jints are always HeapWordSize aligned, so always use the aligned version
  3136     // (in fact, on 32-bit there is always a pre-loop part even in the aligned version,
  3137     //  because it uses 64-bit loads/stores, so the aligned flag is effectively ignored).
  3138     StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy;
  3139     StubRoutines::_jint_arraycopy          = StubRoutines::_arrayof_jint_arraycopy;
  3140 #endif
  3143     //*** jlong
  3144     // It is always aligned
  3145     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
  3146                                                                                   "arrayof_jlong_disjoint_arraycopy");
  3147     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
  3148                                                                                   "arrayof_jlong_arraycopy");
  3149     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
  3150     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
  3153     //*** oops
  3154     // Aligned versions
  3155     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
  3156                                                                                       "arrayof_oop_disjoint_arraycopy");
  3157     StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
  3158                                                                                       "arrayof_oop_arraycopy");
  3159     // Aligned versions without pre-barriers
  3160     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
  3161                                                                                       "arrayof_oop_disjoint_arraycopy_uninit",
  3162                                                                                       /*dest_uninitialized*/true);
  3163     StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
  3164                                                                                       "arrayof_oop_arraycopy_uninit",
  3165                                                                                       /*dest_uninitialized*/true);
  3166 #ifdef _LP64
  3167     if (UseCompressedOops) {
  3168       // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy.
  3169       StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
  3170                                                                                     "oop_disjoint_arraycopy");
  3171       StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
  3172                                                                                     "oop_arraycopy");
  3173       // Unaligned versions without pre-barriers
  3174       StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
  3175                                                                                     "oop_disjoint_arraycopy_uninit",
  3176                                                                                     /*dest_uninitialized*/true);
  3177       StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
  3178                                                                                     "oop_arraycopy_uninit",
  3179                                                                                     /*dest_uninitialized*/true);
  3180     } else
  3181 #endif
  3182     {
  3183       // oop arraycopy is always aligned, both on 32-bit and on 64-bit without compressed oops
  3184       StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
  3185       StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
  3186       StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
  3187       StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
  3188     }
  3190     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
  3191     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
  3192                                                                         /*dest_uninitialized*/true);
  3194     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
  3195                                                               entry_jbyte_arraycopy,
  3196                                                               entry_jshort_arraycopy,
  3197                                                               entry_jint_arraycopy,
  3198                                                               entry_jlong_arraycopy);
  3199     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
  3200                                                                entry_jbyte_arraycopy,
  3201                                                                entry_jshort_arraycopy,
  3202                                                                entry_jint_arraycopy,
  3203                                                                entry_oop_arraycopy,
  3204                                                                entry_jlong_arraycopy,
  3205                                                                entry_checkcast_arraycopy);
  3207     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
  3208     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
  3209     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
  3210     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
  3211     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  3212     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
  3213   }
  3215   void generate_initial() {
  3216     // Generates all stubs and initializes the entry points
  3218     //------------------------------------------------------------------------------------------------------------------------
  3219     // entry points that exist in all platforms
  3220     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
  3221     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
  3222     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
  3224     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
  3225     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
  3227     //------------------------------------------------------------------------------------------------------------------------
  3228     // entry points that are platform specific
  3229     StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
  3231     StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
  3232     StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
  3234 #if !defined(COMPILER2) && !defined(_LP64)
  3235     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
  3236     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
  3237     StubRoutines::_atomic_add_entry          = generate_atomic_add();
  3238     StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
  3239     StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
  3240     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  3241     StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
  3242 #endif  // COMPILER2 !=> _LP64
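           // Illustrative note (hedged): on these configurations the entries above back
           // Atomic::xchg/cmpxchg/add through function pointers shaped roughly like
           //
           //   jint  (*xchg_func)        (jint  exchange_value, volatile jint*  dest);
           //   jint  (*cmpxchg_func)     (jint  exchange_value, volatile jint*  dest, jint  compare_value);
           //   jlong (*cmpxchg_long_func)(jlong exchange_value, volatile jlong* dest, jlong compare_value);
           //   jint  (*add_func)         (jint  add_value,      volatile jint*  dest);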
  3243   }
  3246   void generate_all() {
  3247     // Generates all stubs and initializes the entry points
  3249     // Generate partial_subtype_check first here since its code depends on
  3250     // UseZeroBaseCompressedOops which is defined after heap initialization.
  3251     StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
  3252     // These entry points require SharedInfo::stack0 to be set up in non-core builds
  3253     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
  3254     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
  3255     StubRoutines::_throw_ArithmeticException_entry         = generate_throw_exception("ArithmeticException throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException),  true);
  3256     StubRoutines::_throw_NullPointerException_entry        = generate_throw_exception("NullPointerException throw_exception",         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
  3257     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
  3258     StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
  3260     StubRoutines::_handler_for_unsafe_access_entry =
  3261       generate_handler_for_unsafe_access();
  3263     // support for verify_oop (must happen after universe_init)
  3264     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
  3266     // arraycopy stubs used by compilers
  3267     generate_arraycopy_stubs();
  3269     // Don't initialize the platform math functions since sparc
  3270     // doesn't have intrinsics for these operations.
  3271   }
  3274  public:
  3275   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  3276     // replace the standard masm with a special one:
  3277     _masm = new MacroAssembler(code);
  3279     _stub_count = !all ? 0x100 : 0x200;
  3280     if (all) {
  3281       generate_all();
  3282     } else {
  3283       generate_initial();
  3284     }
  3286     // make sure this stub is available for all local calls
  3287     if (_atomic_add_stub.is_unbound()) {
  3288       // generate a second time, if necessary
  3289       (void) generate_atomic_add();
  3290     }
  3291   }
  3294  private:
  3295   int _stub_count;
  3296   void stub_prolog(StubCodeDesc* cdesc) {
  3297     # ifdef ASSERT
  3298       // put extra information in the stub code, to make it more readable
  3299 #ifdef _LP64
  3300 // Write the high part of the address
  3301 // [RGV] Check if there is a dependency on the size of this prolog
  3302       __ emit_data((intptr_t)cdesc >> 32,    relocInfo::none);
  3303 #endif
  3304       __ emit_data((intptr_t)cdesc,    relocInfo::none);
  3305       __ emit_data(++_stub_count, relocInfo::none);
  3306     # endif
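           // Illustrative note (hedged): in debug builds the words emitted above let one
           // recover the stub's StubCodeDesc when reading a code dump, roughly
           //
           //   intptr_t p = ((intptr_t)high_word << 32) | (juint)low_word;   // high word on LP64 only
           //   StubCodeDesc* desc = (StubCodeDesc*)p;    // followed by the running stub count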
  3307     align(true);
  3308   }
  3310   void align(bool at_header = false) {
  3311     // %%%%% move this constant somewhere else
  3312     // UltraSPARC cache line size is 8 instructions:
  3313     const unsigned int icache_line_size = 32;
  3314     const unsigned int icache_half_line_size = 16;
  3316     if (at_header) {
  3317       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
  3318         __ emit_data(0, relocInfo::none);
  3319       }
  3320     } else {
  3321       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
  3322         __ nop();
  3323       }
  3324     }
  3325   }
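         // Illustrative note: header alignment pads with zero data words out to a full
         // 32-byte icache line (that padding is never executed), while mid-stream
         // alignment pads with nops out to a 16-byte half line. For example, a pc ending
         // in 0x28 needs two 4-byte pads to reach the next half-line boundary at 0x30.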
  3327 }; // end class declaration
  3329 void StubGenerator_generate(CodeBuffer* code, bool all) {
  3330   StubGenerator g(code, all);
  3331 }
