src/cpu/sparc/vm/stubGenerator_sparc.cpp

author:      stefank
date:        Tue, 27 Nov 2012 14:20:21 +0100
changeset:   4299:f34d701e952e
parent:      4142:d8ce2825b193
child:       4325:d2f8c38e543d
permissions: -rw-r--r--

8003935: Simplify the needed includes for using Thread::current()
Reviewed-by: dholmes, rbackman, coleenp

     1 /*
     2  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.
     8  *
     9  * This code is distributed in the hope that it will be useful, but WITHOUT
    10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    12  * version 2 for more details (a copy is included in the LICENSE file that
    13  * accompanied this code).
    14  *
    15  * You should have received a copy of the GNU General Public License version
    16  * 2 along with this work; if not, write to the Free Software Foundation,
    17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    18  *
    19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    20  * or visit www.oracle.com if you need additional information or have any
    21  * questions.
    22  *
    23  */
    25 #include "precompiled.hpp"
    26 #include "asm/assembler.hpp"
    27 #include "assembler_sparc.inline.hpp"
    28 #include "interpreter/interpreter.hpp"
    29 #include "nativeInst_sparc.hpp"
    30 #include "oops/instanceOop.hpp"
    31 #include "oops/method.hpp"
    32 #include "oops/objArrayKlass.hpp"
    33 #include "oops/oop.inline.hpp"
    34 #include "prims/methodHandles.hpp"
    35 #include "runtime/frame.inline.hpp"
    36 #include "runtime/handles.inline.hpp"
    37 #include "runtime/sharedRuntime.hpp"
    38 #include "runtime/stubCodeGenerator.hpp"
    39 #include "runtime/stubRoutines.hpp"
    40 #include "runtime/thread.inline.hpp"
    41 #include "utilities/top.hpp"
    42 #ifdef COMPILER2
    43 #include "opto/runtime.hpp"
    44 #endif
    46 // Declaration and definition of StubGenerator (no .hpp file).
    47 // For a more detailed description of the stub routine structure
    48 // see the comment in stubRoutines.hpp.
    50 #define __ _masm->
    52 #ifdef PRODUCT
    53 #define BLOCK_COMMENT(str) /* nothing */
    54 #else
    55 #define BLOCK_COMMENT(str) __ block_comment(str)
    56 #endif
    58 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
    60 // Note:  The register L7 is used as L7_thread_cache, and may not be used
    61 //        any other way within this module.
    64 static const Register& Lstub_temp = L2;
    66 // -------------------------------------------------------------------------------------------------------------------------
    67 // Stub Code definitions
    69 static address handle_unsafe_access() {
    70   JavaThread* thread = JavaThread::current();
    71   address pc  = thread->saved_exception_pc();
    72   address npc = thread->saved_exception_npc();
    73   // pc is the instruction which we must emulate
    74   // doing a no-op is fine:  return garbage from the load
    76   // request an async exception
    77   thread->set_pending_unsafe_access_error();
    79   // return address of next instruction to execute
    80   return npc;
    81 }
    83 class StubGenerator: public StubCodeGenerator {
    84  private:
    86 #ifdef PRODUCT
    87 #define inc_counter_np(a,b,c) (0)
    88 #else
    89 #define inc_counter_np(counter, t1, t2) \
    90   BLOCK_COMMENT("inc_counter " #counter); \
    91   __ inc_counter(&counter, t1, t2);
    92 #endif
    94   //----------------------------------------------------------------------------------------------------
    95   // Call stubs are used to call Java from C
    97   address generate_call_stub(address& return_pc) {
    98     StubCodeMark mark(this, "StubRoutines", "call_stub");
    99     address start = __ pc();
   101     // Incoming arguments:
   102     //
   103     // o0         : call wrapper address
   104     // o1         : result (address)
   105     // o2         : result type
   106     // o3         : method
   107     // o4         : (interpreter) entry point
   108     // o5         : parameters (address)
   109     // [sp + 0x5c]: parameter size (in words)
   110     // [sp + 0x60]: thread
   111     //
   112     // +---------------+ <--- sp + 0
   113     // |               |
   114     // . reg save area .
   115     // |               |
   116     // +---------------+ <--- sp + 0x40
   117     // |               |
   118     // . extra 7 slots .
   119     // |               |
   120     // +---------------+ <--- sp + 0x5c
   121     // |  param. size  |
   122     // +---------------+ <--- sp + 0x60
   123     // |    thread     |
   124     // +---------------+
   125     // |               |
   127     // note: if the link argument position changes, adjust
   128     //       the code in frame::entry_frame_call_wrapper()
   130     const Argument link           = Argument(0, false); // used only for GC
   131     const Argument result         = Argument(1, false);
   132     const Argument result_type    = Argument(2, false);
   133     const Argument method         = Argument(3, false);
   134     const Argument entry_point    = Argument(4, false);
   135     const Argument parameters     = Argument(5, false);
   136     const Argument parameter_size = Argument(6, false);
   137     const Argument thread         = Argument(7, false);
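    // Descriptive note (not in the generated code): with the 32-bit SPARC calling
    // convention shown in the picture above, the first six outgoing arguments travel
    // in %o0..%o5 and any further arguments live in the caller's frame, which is why
    // parameter_size and thread (arguments 6 and 7) are found at [sp + 0x5c] and
    // [sp + 0x60].  The Argument(n, false) abstraction hides the register-vs-stack
    // distinction for us.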
   139     // setup thread register
   140     __ ld_ptr(thread.as_address(), G2_thread);
   141     __ reinit_heapbase();
   143 #ifdef ASSERT
   144     // make sure we have no pending exceptions
   145     { const Register t = G3_scratch;
   146       Label L;
   147       __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
   148       __ br_null_short(t, Assembler::pt, L);
   149       __ stop("StubRoutines::call_stub: entered with pending exception");
   150       __ bind(L);
   151     }
   152 #endif
   154     // create activation frame & allocate space for parameters
   155     { const Register t = G3_scratch;
   156       __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
   157       __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
   158       __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
   159       __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
   160       __ neg(t);                                                // negate so it can be used with save
   161       __ save(SP, t, SP);                                       // setup new frame
   162     }
   164     // +---------------+ <--- sp + 0
   165     // |               |
   166     // . reg save area .
   167     // |               |
   168     // +---------------+ <--- sp + 0x40
   169     // |               |
   170     // . extra 7 slots .
   171     // |               |
   172     // +---------------+ <--- sp + 0x5c
   173     // |  empty slot   |      (only if parameter size is even)
   174     // +---------------+
   175     // |               |
   176     // .  parameters   .
   177     // |               |
   178     // +---------------+ <--- fp + 0
   179     // |               |
   180     // . reg save area .
   181     // |               |
   182     // +---------------+ <--- fp + 0x40
   183     // |               |
   184     // . extra 7 slots .
   185     // |               |
   186     // +---------------+ <--- fp + 0x5c
   187     // |  param. size  |
   188     // +---------------+ <--- fp + 0x60
   189     // |    thread     |
   190     // +---------------+
   191     // |               |
   193     // pass parameters if any
   194     BLOCK_COMMENT("pass parameters if any");
   195     { const Register src = parameters.as_in().as_register();
   196       const Register dst = Lentry_args;
   197       const Register tmp = G3_scratch;
   198       const Register cnt = G4_scratch;
   200       // test if any parameters & setup of Lentry_args
   201       Label exit;
   202       __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
   203       __ add( FP, STACK_BIAS, dst );
   204       __ cmp_zero_and_br(Assembler::zero, cnt, exit);
   205       __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
   207       // copy parameters if any
   208       Label loop;
   209       __ BIND(loop);
   210       // Store parameter value
   211       __ ld_ptr(src, 0, tmp);
   212       __ add(src, BytesPerWord, src);
   213       __ st_ptr(tmp, dst, 0);
   214       __ deccc(cnt);
   215       __ br(Assembler::greater, false, Assembler::pt, loop);
   216       __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
   218       // done
   219       __ BIND(exit);
   220     }
   222     // setup parameters, method & call Java function
   223 #ifdef ASSERT
    224     // layout_activation_impl checks its notion of saved SP against
   225     // this register, so if this changes update it as well.
   226     const Register saved_SP = Lscratch;
   227     __ mov(SP, saved_SP);                               // keep track of SP before call
   228 #endif
   230     // setup parameters
   231     const Register t = G3_scratch;
   232     __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
   233     __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
   234     __ sub(FP, t, Gargs);                              // setup parameter pointer
   235 #ifdef _LP64
   236     __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
   237 #endif
   238     __ mov(SP, O5_savedSP);
   241     // do the call
   242     //
    243     // the following registers must be set up:
   244     //
   245     // G2_thread
   246     // G5_method
   247     // Gargs
   248     BLOCK_COMMENT("call Java function");
   249     __ jmpl(entry_point.as_in().as_register(), G0, O7);
   250     __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
   252     BLOCK_COMMENT("call_stub_return_address:");
   253     return_pc = __ pc();
   255     // The callee, if it wasn't interpreted, can return with SP changed so
    256     // we can no longer assert that SP is unchanged.
   258     // store result depending on type
   259     // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
   260     //  is treated as T_INT)
   261     { const Register addr = result     .as_in().as_register();
   262       const Register type = result_type.as_in().as_register();
   263       Label is_long, is_float, is_double, is_object, exit;
   264       __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
   265       __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
   266       __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
   267       __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
   268       __ delayed()->nop();
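    269       // Note: each cmp above sits in the delay slot of the preceding branch, so
    //            the chain tests T_OBJECT, T_FLOAT, T_DOUBLE and T_LONG in turn;
    //            every other result type falls through to the plain int store below.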
   270       // store int result
   271       __ st(O0, addr, G0);
   273       __ BIND(exit);
   274       __ ret();
   275       __ delayed()->restore();
   277       __ BIND(is_object);
   278       __ ba(exit);
   279       __ delayed()->st_ptr(O0, addr, G0);
   281       __ BIND(is_float);
   282       __ ba(exit);
   283       __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
   285       __ BIND(is_double);
   286       __ ba(exit);
   287       __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
   289       __ BIND(is_long);
   290 #ifdef _LP64
   291       __ ba(exit);
   292       __ delayed()->st_long(O0, addr, G0);      // store entire long
   293 #else
   294 #if defined(COMPILER2)
   295   // All return values are where we want them, except for Longs.  C2 returns
   296   // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
   297   // Since the interpreter will return longs in G1 and O0/O1 in the 32bit
   298   // build we simply always use G1.
   299   // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
    300     // do this here. Unfortunately if we did a rethrow we'd see a MachEpilog node
   301   // first which would move g1 -> O0/O1 and destroy the exception we were throwing.
   303       __ ba(exit);
   304       __ delayed()->stx(G1, addr, G0);  // store entire long
   305 #else
   306       __ st(O1, addr, BytesPerInt);
   307       __ ba(exit);
   308       __ delayed()->st(O0, addr, G0);
   309 #endif /* COMPILER2 */
   310 #endif /* _LP64 */
   311      }
   312      return start;
   313   }
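  // Descriptive note: the VM invokes the stub generated above through a plain C
  // function pointer; the matching typedef lives with StubRoutines::call_stub() in
  // stubRoutines.hpp and looks roughly like this (reproduced here only for
  // orientation, see that file for the authoritative declaration):
  //
  //   typedef void (*CallStub)(address   link,
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);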
   316   //----------------------------------------------------------------------------------------------------
   317   // Return point for a Java call if there's an exception thrown in Java code.
   318   // The exception is caught and transformed into a pending exception stored in
   319   // JavaThread that can be tested from within the VM.
   320   //
   321   // Oexception: exception oop
   323   address generate_catch_exception() {
   324     StubCodeMark mark(this, "StubRoutines", "catch_exception");
   326     address start = __ pc();
   327     // verify that thread corresponds
   328     __ verify_thread();
   330     const Register& temp_reg = Gtemp;
   331     Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
   332     Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
   333     Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());
   335     // set pending exception
   336     __ verify_oop(Oexception);
   337     __ st_ptr(Oexception, pending_exception_addr);
   338     __ set((intptr_t)__FILE__, temp_reg);
   339     __ st_ptr(temp_reg, exception_file_offset_addr);
   340     __ set((intptr_t)__LINE__, temp_reg);
   341     __ st(temp_reg, exception_line_offset_addr);
   343     // complete return to VM
   344     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
   346     AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
   347     __ jump_to(stub_ret, temp_reg);
   348     __ delayed()->nop();
   350     return start;
   351   }
   354   //----------------------------------------------------------------------------------------------------
   355   // Continuation point for runtime calls returning with a pending exception
   356   // The pending exception check happened in the runtime or native call stub
   357   // The pending exception in Thread is converted into a Java-level exception
   358   //
   359   // Contract with Java-level exception handler: O0 = exception
   360   //                                             O1 = throwing pc
   362   address generate_forward_exception() {
   363     StubCodeMark mark(this, "StubRoutines", "forward_exception");
   364     address start = __ pc();
   366     // Upon entry, O7 has the return address returning into Java
   367     // (interpreted or compiled) code; i.e. the return address
   368     // becomes the throwing pc.
   370     const Register& handler_reg = Gtemp;
   372     Address exception_addr(G2_thread, Thread::pending_exception_offset());
   374 #ifdef ASSERT
   375     // make sure that this code is only executed if there is a pending exception
   376     { Label L;
   377       __ ld_ptr(exception_addr, Gtemp);
   378       __ br_notnull_short(Gtemp, Assembler::pt, L);
   379       __ stop("StubRoutines::forward exception: no pending exception (1)");
   380       __ bind(L);
   381     }
   382 #endif
   384     // compute exception handler into handler_reg
   385     __ get_thread();
   386     __ ld_ptr(exception_addr, Oexception);
   387     __ verify_oop(Oexception);
   388     __ save_frame(0);             // compensates for compiler weakness
   389     __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
   390     BLOCK_COMMENT("call exception_handler_for_return_address");
   391     __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
   392     __ mov(O0, handler_reg);
   393     __ restore();                 // compensates for compiler weakness
   395     __ ld_ptr(exception_addr, Oexception);
   396     __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
   398 #ifdef ASSERT
   399     // make sure exception is set
   400     { Label L;
   401       __ br_notnull_short(Oexception, Assembler::pt, L);
   402       __ stop("StubRoutines::forward exception: no pending exception (2)");
   403       __ bind(L);
   404     }
   405 #endif
   406     // jump to exception handler
   407     __ jmp(handler_reg, 0);
   408     // clear pending exception
   409     __ delayed()->st_ptr(G0, exception_addr);
   411     return start;
   412   }
   415   //------------------------------------------------------------------------------------------------------------------------
   416   // Continuation point for throwing of implicit exceptions that are not handled in
   417   // the current activation. Fabricates an exception oop and initiates normal
   418   // exception dispatching in this frame. Only callee-saved registers are preserved
   419   // (through the normal register window / RegisterMap handling).
   420   // If the compiler needs all registers to be preserved between the fault
   421   // point and the exception handler then it must assume responsibility for that in
   422   // AbstractCompiler::continuation_for_implicit_null_exception or
   423   // continuation_for_implicit_division_by_zero_exception. All other implicit
   424   // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
   425   // either at call sites or otherwise assume that stack unwinding will be initiated,
   426   // so caller saved registers were assumed volatile in the compiler.
   428   // Note that we generate only this stub into a RuntimeStub, because it needs to be
   429   // properly traversed and ignored during GC, so we change the meaning of the "__"
   430   // macro within this method.
   431 #undef __
   432 #define __ masm->
   434   address generate_throw_exception(const char* name, address runtime_entry,
   435                                    Register arg1 = noreg, Register arg2 = noreg) {
   436 #ifdef ASSERT
   437     int insts_size = VerifyThread ? 1 * K : 600;
   438 #else
   439     int insts_size = VerifyThread ? 1 * K : 256;
   440 #endif /* ASSERT */
   441     int locs_size  = 32;
   443     CodeBuffer      code(name, insts_size, locs_size);
   444     MacroAssembler* masm = new MacroAssembler(&code);
   446     __ verify_thread();
   448     // This is an inlined and slightly modified version of call_VM
   449     // which has the ability to fetch the return PC out of thread-local storage
   450     __ assert_not_delayed();
   452     // Note that we always push a frame because on the SPARC
   453     // architecture, for all of our implicit exception kinds at call
   454     // sites, the implicit exception is taken before the callee frame
   455     // is pushed.
   456     __ save_frame(0);
   458     int frame_complete = __ offset();
   460     // Note that we always have a runtime stub frame on the top of stack by this point
   461     Register last_java_sp = SP;
   462     // 64-bit last_java_sp is biased!
   463     __ set_last_Java_frame(last_java_sp, G0);
   464     if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
   465     __ save_thread(noreg);
   466     if (arg1 != noreg) {
   467       assert(arg2 != O1, "clobbered");
   468       __ mov(arg1, O1);
   469     }
   470     if (arg2 != noreg) {
   471       __ mov(arg2, O2);
   472     }
   473     // do the call
   474     BLOCK_COMMENT("call runtime_entry");
   475     __ call(runtime_entry, relocInfo::runtime_call_type);
   476     if (!VerifyThread)
   477       __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
   478     else
   479       __ delayed()->nop();             // (thread already passed)
   480     __ restore_thread(noreg);
   481     __ reset_last_Java_frame();
   483     // check for pending exceptions. use Gtemp as scratch register.
   484 #ifdef ASSERT
   485     Label L;
   487     Address exception_addr(G2_thread, Thread::pending_exception_offset());
   488     Register scratch_reg = Gtemp;
   489     __ ld_ptr(exception_addr, scratch_reg);
   490     __ br_notnull_short(scratch_reg, Assembler::pt, L);
   491     __ should_not_reach_here();
   492     __ bind(L);
   493 #endif // ASSERT
   494     BLOCK_COMMENT("call forward_exception_entry");
   495     __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
   496     // we use O7 linkage so that forward_exception_entry has the issuing PC
   497     __ delayed()->restore();
   499     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
   500     return stub->entry_point();
   501   }
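  // Descriptive note: the concrete throw stubs (StackOverflowError,
  // AbstractMethodError, etc.) are typically wired up further down in this file
  // along the lines of
  //
  //   StubRoutines::_throw_StackOverflowError_entry =
  //       generate_throw_exception("StackOverflowError throw_exception",
  //           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
  //
  // so the frame built above is shared by every implicit-exception entry point.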
   503 #undef __
   504 #define __ _masm->
   507   // Generate a routine that sets all the registers so we
   508   // can tell if the stop routine prints them correctly.
   509   address generate_test_stop() {
   510     StubCodeMark mark(this, "StubRoutines", "test_stop");
   511     address start = __ pc();
   513     int i;
   515     __ save_frame(0);
   517     static jfloat zero = 0.0, one = 1.0;
   519     // put addr in L0, then load through L0 to F0
   520     __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
   521     __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
   523     // use add to put 2..18 in F2..F18
   524     for ( i = 2;  i <= 18;  ++i ) {
   525       __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
   526     }
   528     // Now put double 2 in F16, double 18 in F18
   529     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
   530     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
   532     // use add to put 20..32 in F20..F32
   533     for (i = 20; i < 32; i += 2) {
   534       __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
   535     }
   537     // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
   538     for ( i = 0; i < 8; ++i ) {
   539       if (i < 6) {
   540         __ set(     i, as_iRegister(i));
   541         __ set(16 + i, as_oRegister(i));
   542         __ set(24 + i, as_gRegister(i));
   543       }
   544       __ set( 8 + i, as_lRegister(i));
   545     }
   547     __ stop("testing stop");
   550     __ ret();
   551     __ delayed()->restore();
   553     return start;
   554   }
   557   address generate_stop_subroutine() {
   558     StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
   559     address start = __ pc();
   561     __ stop_subroutine();
   563     return start;
   564   }
   566   address generate_flush_callers_register_windows() {
   567     StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
   568     address start = __ pc();
   570     __ flush_windows();
   571     __ retl(false);
   572     __ delayed()->add( FP, STACK_BIAS, O0 );
   573     // The returned value must be a stack pointer whose register save area
   574     // is flushed, and will stay flushed while the caller executes.
   576     return start;
   577   }
   579   // Helper functions for v8 atomic operations.
   580   //
   581   void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
   582     if (mark_oop_reg == noreg) {
   583       address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
   584       __ set((intptr_t)lock_ptr, lock_ptr_reg);
   585     } else {
   586       assert(scratch_reg != noreg, "just checking");
   587       address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
   588       __ set((intptr_t)lock_ptr, lock_ptr_reg);
   589       __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
   590       __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
   591     }
   592   }
   594   void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
   596     get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
   597     __ set(StubRoutines::Sparc::locked, lock_reg);
   598     // Initialize yield counter
   599     __ mov(G0,yield_reg);
   601     __ BIND(retry);
   602     __ cmp_and_br_short(yield_reg, V8AtomicOperationUnderLockSpinCount, Assembler::less, Assembler::pt, dontyield);
    604     // This code can only be called from inside the VM; this
   605     // stub is only invoked from Atomic::add().  We do not
   606     // want to use call_VM, because _last_java_sp and such
   607     // must already be set.
   608     //
   609     // Save the regs and make space for a C call
   610     __ save(SP, -96, SP);
   611     __ save_all_globals_into_locals();
   612     BLOCK_COMMENT("call os::naked_sleep");
   613     __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
   614     __ delayed()->nop();
   615     __ restore_globals_from_locals();
   616     __ restore();
   617     // reset the counter
   618     __ mov(G0,yield_reg);
   620     __ BIND(dontyield);
   622     // try to get lock
   623     __ swap(lock_ptr_reg, 0, lock_reg);
   625     // did we get the lock?
   626     __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
   627     __ br(Assembler::notEqual, true, Assembler::pn, retry);
   628     __ delayed()->add(yield_reg,1,yield_reg);
   630     // yes, got lock. do the operation here.
   631   }
   633   void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
   634     __ st(lock_reg, lock_ptr_reg, 0); // unlock
   635   }
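  // Descriptive note: pre-V9 SPARC has no compare-and-swap, so the prologue/epilogue
  // pair above funnels the "atomic" operation through a spin lock acquired with the
  // atomic 'swap' instruction.  Conceptually (illustrative C only, 'lock' being the
  // word located by get_v8_oop_lock_ptr()):
  //
  //   while (swap(&lock, locked) != unlocked) {              // prologue: grab the lock
  //     if (++spins >= V8AtomicOperationUnderLockSpinCount) {
  //       os::naked_sleep();                                 // back off briefly
  //       spins = 0;
  //     }
  //   }
  //   // ... caller-emitted operation runs here ...
  //   lock = unlocked;                                       // epilogue: plain store releases it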
   637   // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
   638   //
   639   // Arguments :
   640   //
   641   //      exchange_value: O0
   642   //      dest:           O1
   643   //
   644   // Results:
   645   //
   646   //     O0: the value previously stored in dest
   647   //
   648   address generate_atomic_xchg() {
   649     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
   650     address start = __ pc();
   652     if (UseCASForSwap) {
   653       // Use CAS instead of swap, just in case the MP hardware
   654       // prefers to work with just one kind of synch. instruction.
   655       Label retry;
   656       __ BIND(retry);
   657       __ mov(O0, O3);       // scratch copy of exchange value
   658       __ ld(O1, 0, O2);     // observe the previous value
   659       // try to replace O2 with O3
   660       __ cas_under_lock(O1, O2, O3,
   661       (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
   662       __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
   664       __ retl(false);
   665       __ delayed()->mov(O2, O0);  // report previous value to caller
   667     } else {
   668       if (VM_Version::v9_instructions_work()) {
   669         __ retl(false);
   670         __ delayed()->swap(O1, 0, O0);
   671       } else {
   672         const Register& lock_reg = O2;
   673         const Register& lock_ptr_reg = O3;
   674         const Register& yield_reg = O4;
   676         Label retry;
   677         Label dontyield;
   679         generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
   680         // got the lock, do the swap
   681         __ swap(O1, 0, O0);
   683         generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
   684         __ retl(false);
   685         __ delayed()->nop();
   686       }
   687     }
   689     return start;
   690   }
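  // Descriptive note: the CAS-based path above is morally equivalent to the
  // following C loop, assuming a hypothetical 32-bit primitive cas32(ptr, cmp, nv)
  // that returns the value previously held in *ptr:
  //
  //   jint atomic_xchg(jint exchange_value, volatile jint* dest) {
  //     jint old;
  //     do {
  //       old = *dest;                                     // observe the previous value
  //     } while (cas32(dest, old, exchange_value) != old); // retry if *dest moved
  //     return old;                                        // report the previous value
  //   }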
   693   // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
   694   //
   695   // Arguments :
   696   //
   697   //      exchange_value: O0
   698   //      dest:           O1
   699   //      compare_value:  O2
   700   //
   701   // Results:
   702   //
   703   //     O0: the value previously stored in dest
   704   //
   705   // Overwrites (v8): O3,O4,O5
   706   //
   707   address generate_atomic_cmpxchg() {
   708     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
   709     address start = __ pc();
   711     // cmpxchg(dest, compare_value, exchange_value)
   712     __ cas_under_lock(O1, O2, O0,
   713       (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
   714     __ retl(false);
   715     __ delayed()->nop();
   717     return start;
   718   }
   720   // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
   721   //
   722   // Arguments :
   723   //
   724   //      exchange_value: O1:O0
   725   //      dest:           O2
   726   //      compare_value:  O4:O3
   727   //
   728   // Results:
   729   //
   730   //     O1:O0: the value previously stored in dest
   731   //
   732   // This only works on V9, on V8 we don't generate any
   733   // code and just return NULL.
   734   //
   735   // Overwrites: G1,G2,G3
   736   //
   737   address generate_atomic_cmpxchg_long() {
   738     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
   739     address start = __ pc();
    741     if (!VM_Version::supports_cx8())
    742         return NULL;
   743     __ sllx(O0, 32, O0);
   744     __ srl(O1, 0, O1);
    745     __ or3(O0,O1,O0);      // O0 holds 64-bit value from exchange_value
    746     __ sllx(O3, 32, O3);
    747     __ srl(O4, 0, O4);
    748     __ or3(O3,O4,O3);     // O3 holds 64-bit value from compare_value
   749     __ casx(O2, O3, O0);
   750     __ srl(O0, 0, O1);    // unpacked return value in O1:O0
   751     __ retl(false);
   752     __ delayed()->srlx(O0, 32, O0);
   754     return start;
   755   }
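  // Descriptive note: in the 32-bit build a jlong argument arrives as a register
  // pair with the high word in the lower-numbered register, so exchange_value
  // occupies O0 (hi) / O1 (lo) and compare_value occupies O3 (hi) / O4 (lo).  The
  // sllx/srl/or3 sequences above merge each pair into one 64-bit register so that
  // 'casx' can compare-and-swap the full jlong, and the old value is split back
  // into the O1:O0 pair before returning.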
   758   // Support for jint Atomic::add(jint add_value, volatile jint* dest).
   759   //
   760   // Arguments :
   761   //
   762   //      add_value: O0   (e.g., +1 or -1)
   763   //      dest:      O1
   764   //
   765   // Results:
   766   //
   767   //     O0: the new value stored in dest
   768   //
   769   // Overwrites (v9): O3
   770   // Overwrites (v8): O3,O4,O5
   771   //
   772   address generate_atomic_add() {
   773     StubCodeMark mark(this, "StubRoutines", "atomic_add");
   774     address start = __ pc();
   775     __ BIND(_atomic_add_stub);
   777     if (VM_Version::v9_instructions_work()) {
   778       Label(retry);
   779       __ BIND(retry);
   781       __ lduw(O1, 0, O2);
   782       __ add(O0, O2, O3);
   783       __ cas(O1, O2, O3);
   784       __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
   785       __ retl(false);
   786       __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
   787     } else {
   788       const Register& lock_reg = O2;
   789       const Register& lock_ptr_reg = O3;
   790       const Register& value_reg = O4;
   791       const Register& yield_reg = O5;
   793       Label(retry);
   794       Label(dontyield);
   796       generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
   797       // got lock, do the increment
   798       __ ld(O1, 0, value_reg);
   799       __ add(O0, value_reg, value_reg);
   800       __ st(value_reg, O1, 0);
   802       // %%% only for RMO and PSO
   803       __ membar(Assembler::StoreStore);
   805       generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
   807       __ retl(false);
   808       __ delayed()->mov(value_reg, O0);
   809     }
   811     return start;
   812   }
   813   Label _atomic_add_stub;  // called from other stubs
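  // Descriptive note: on V9 the stub above amounts to the following C loop,
  // assuming a hypothetical cas32(ptr, cmp, nv) primitive that returns the value
  // previously held in *ptr:
  //
  //   jint atomic_add(jint add_value, volatile jint* dest) {
  //     jint old, nv;
  //     do {
  //       old = *dest;                          // lduw
  //       nv  = old + add_value;                // add
  //     } while (cas32(dest, old, nv) != old);  // cas + retry
  //     return nv;                              // the stub returns the new value
  //   }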
   816   //------------------------------------------------------------------------------------------------------------------------
   817   // The following routine generates a subroutine to throw an asynchronous
   818   // UnknownError when an unsafe access gets a fault that could not be
   819   // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
   820   //
   821   // Arguments :
   822   //
   823   //      trapping PC:    O7
   824   //
   825   // Results:
   826   //     posts an asynchronous exception, skips the trapping instruction
   827   //
   829   address generate_handler_for_unsafe_access() {
   830     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
   831     address start = __ pc();
   833     const int preserve_register_words = (64 * 2);
   834     Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);
   836     Register Lthread = L7_thread_cache;
   837     int i;
   839     __ save_frame(0);
   840     __ mov(G1, L1);
   841     __ mov(G2, L2);
   842     __ mov(G3, L3);
   843     __ mov(G4, L4);
   844     __ mov(G5, L5);
   845     for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
   846       __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
   847     }
   849     address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
   850     BLOCK_COMMENT("call handle_unsafe_access");
   851     __ call(entry_point, relocInfo::runtime_call_type);
   852     __ delayed()->nop();
   854     __ mov(L1, G1);
   855     __ mov(L2, G2);
   856     __ mov(L3, G3);
   857     __ mov(L4, G4);
   858     __ mov(L5, G5);
   859     for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
   860       __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
   861     }
   863     __ verify_thread();
   865     __ jmp(O0, 0);
   866     __ delayed()->restore();
   868     return start;
   869   }
   872   // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
   873   // Arguments :
   874   //
   875   //      ret  : O0, returned
   876   //      icc/xcc: set as O0 (depending on wordSize)
   877   //      sub  : O1, argument, not changed
   878   //      super: O2, argument, not changed
   879   //      raddr: O7, blown by call
   880   address generate_partial_subtype_check() {
   881     __ align(CodeEntryAlignment);
   882     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
   883     address start = __ pc();
   884     Label miss;
   886 #if defined(COMPILER2) && !defined(_LP64)
   887     // Do not use a 'save' because it blows the 64-bit O registers.
   888     __ add(SP,-4*wordSize,SP);  // Make space for 4 temps (stack must be 2 words aligned)
   889     __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
   890     __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
   891     __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
   892     __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
   893     Register Rret   = O0;
   894     Register Rsub   = O1;
   895     Register Rsuper = O2;
   896 #else
   897     __ save_frame(0);
   898     Register Rret   = I0;
   899     Register Rsub   = I1;
   900     Register Rsuper = I2;
   901 #endif
   903     Register L0_ary_len = L0;
   904     Register L1_ary_ptr = L1;
   905     Register L2_super   = L2;
   906     Register L3_index   = L3;
   908     __ check_klass_subtype_slow_path(Rsub, Rsuper,
   909                                      L0, L1, L2, L3,
   910                                      NULL, &miss);
   912     // Match falls through here.
   913     __ addcc(G0,0,Rret);        // set Z flags, Z result
   915 #if defined(COMPILER2) && !defined(_LP64)
   916     __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
   917     __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
   918     __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
   919     __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
   920     __ retl();                  // Result in Rret is zero; flags set to Z
   921     __ delayed()->add(SP,4*wordSize,SP);
   922 #else
   923     __ ret();                   // Result in Rret is zero; flags set to Z
   924     __ delayed()->restore();
   925 #endif
   927     __ BIND(miss);
   928     __ addcc(G0,1,Rret);        // set NZ flags, NZ result
   930 #if defined(COMPILER2) && !defined(_LP64)
   931     __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
   932     __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
   933     __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
   934     __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
   935     __ retl();                  // Result in Rret is != 0; flags set to NZ
   936     __ delayed()->add(SP,4*wordSize,SP);
   937 #else
   938     __ ret();                   // Result in Rret is != 0; flags set to NZ
   939     __ delayed()->restore();
   940 #endif
   942     return start;
   943   }
   946   // Called from MacroAssembler::verify_oop
   947   //
   948   address generate_verify_oop_subroutine() {
   949     StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
   951     address start = __ pc();
   953     __ verify_oop_subroutine();
   955     return start;
   956   }
   959   //
   960   // Verify that a register contains clean 32-bits positive value
   961   // (high 32-bits are 0) so it could be used in 64-bits shifts (sllx, srax).
   962   //
   963   //  Input:
   964   //    Rint  -  32-bits value
   965   //    Rtmp  -  scratch
   966   //
   967   void assert_clean_int(Register Rint, Register Rtmp) {
   968 #if defined(ASSERT) && defined(_LP64)
   969     __ signx(Rint, Rtmp);
   970     __ cmp(Rint, Rtmp);
   971     __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
   972 #endif
   973   }
   975   //
   976   //  Generate overlap test for array copy stubs
   977   //
   978   //  Input:
   979   //    O0    -  array1
   980   //    O1    -  array2
   981   //    O2    -  element count
   982   //
   983   //  Kills temps:  O3, O4
   984   //
   985   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
   986     assert(no_overlap_target != NULL, "must be generated");
   987     array_overlap_test(no_overlap_target, NULL, log2_elem_size);
   988   }
   989   void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
   990     array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
   991   }
   992   void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
   993     const Register from       = O0;
   994     const Register to         = O1;
   995     const Register count      = O2;
   996     const Register to_from    = O3; // to - from
   997     const Register byte_count = O4; // count << log2_elem_size
   999       __ subcc(to, from, to_from);
  1000       __ sll_ptr(count, log2_elem_size, byte_count);
  1001       if (NOLp == NULL)
  1002         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
  1003       else
  1004         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
  1005       __ delayed()->cmp(to_from, byte_count);
  1006       if (NOLp == NULL)
  1007         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
  1008       else
  1009         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
   1009       __ delayed()->nop();
   1011   }
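  //  Note on the test above: it relies on unsigned compares.  The copy may proceed
  //  front-to-back (the no_overlap target) when either to <= from (first branch,
  //  using the condition codes set by subcc) or (to - from) >= byte_count (second
  //  branch), i.e. whenever 'to' does not land strictly inside the source range.
  //  Only if both branches fall through does the caller emit the overlap-safe
  //  backward copy.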
  1013   //
  1014   //  Generate pre-write barrier for array.
  1015   //
  1016   //  Input:
  1017   //     addr     - register containing starting address
  1018   //     count    - register containing element count
  1019   //     tmp      - scratch register
  1020   //
  1021   //  The input registers are overwritten.
  1022   //
  1023   void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
  1024     BarrierSet* bs = Universe::heap()->barrier_set();
  1025     switch (bs->kind()) {
  1026       case BarrierSet::G1SATBCT:
  1027       case BarrierSet::G1SATBCTLogging:
   1028         // With G1, don't generate the call if we statically know that the target is uninitialized
  1029         if (!dest_uninitialized) {
  1030           __ save_frame(0);
  1031           // Save the necessary global regs... will be used after.
  1032           if (addr->is_global()) {
   1033             __ mov(addr, L0);
   1034           }
  1035           if (count->is_global()) {
   1036             __ mov(count, L1);
   1037           }
  1038           __ mov(addr->after_save(), O0);
  1039           // Get the count into O1
  1040           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
  1041           __ delayed()->mov(count->after_save(), O1);
  1042           if (addr->is_global()) {
   1043             __ mov(L0, addr);
   1044           }
  1045           if (count->is_global()) {
   1046             __ mov(L1, count);
   1047           }
   1048           __ restore();
   1049         }
  1050         break;
  1051       case BarrierSet::CardTableModRef:
  1052       case BarrierSet::CardTableExtension:
  1053       case BarrierSet::ModRef:
  1054         break;
  1055       default:
   1056         ShouldNotReachHere();
   1057     }
   1058   }
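  //  Descriptive note: for G1 the pre-barrier records the oops that are about to be
  //  overwritten (SATB marking) by calling BarrierSet::static_write_ref_array_pre
  //  before the copy runs; when the destination is known to be freshly allocated
  //  (dest_uninitialized) there is nothing to record, so the call is skipped.
  //  Card-table and plain ModRef collectors need no pre-barrier at all.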
  1059   //
  1060   //  Generate post-write barrier for array.
  1061   //
  1062   //  Input:
  1063   //     addr     - register containing starting address
  1064   //     count    - register containing element count
  1065   //     tmp      - scratch register
  1066   //
  1067   //  The input registers are overwritten.
  1068   //
  1069   void gen_write_ref_array_post_barrier(Register addr, Register count,
  1070                                         Register tmp) {
  1071     BarrierSet* bs = Universe::heap()->barrier_set();
  1073     switch (bs->kind()) {
  1074       case BarrierSet::G1SATBCT:
   1075       case BarrierSet::G1SATBCTLogging:
   1076         {
  1077           // Get some new fresh output registers.
  1078           __ save_frame(0);
  1079           __ mov(addr->after_save(), O0);
  1080           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
  1081           __ delayed()->mov(count->after_save(), O1);
   1082           __ restore();
   1083         }
  1084         break;
  1085       case BarrierSet::CardTableModRef:
   1086       case BarrierSet::CardTableExtension:
   1087         {
  1088           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  1089           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
  1090           assert_different_registers(addr, count, tmp);
  1092           Label L_loop;
  1094           __ sll_ptr(count, LogBytesPerHeapOop, count);
  1095           __ sub(count, BytesPerHeapOop, count);
  1096           __ add(count, addr, count);
  1097           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
  1098           __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
  1099           __ srl_ptr(count, CardTableModRefBS::card_shift, count);
  1100           __ sub(count, addr, count);
  1101           AddressLiteral rs(ct->byte_map_base);
  1102           __ set(rs, tmp);
  1103         __ BIND(L_loop);
  1104           __ stb(G0, tmp, addr);
  1105           __ subcc(count, 1, count);
  1106           __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
   1107           __ delayed()->add(addr, 1, addr);
   1108         }
  1109         break;
  1110       case BarrierSet::ModRef:
  1111         break;
  1112       default:
   1113         ShouldNotReachHere();
   1114     }
   1115   }
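  //  Descriptive note: the card-table branch above dirties one byte per card spanned
  //  by the copied oop range.  With the usual 512-byte cards and dirty == 0 this is
  //  roughly (illustrative C, dirty_cards is a hypothetical helper):
  //
  //    void dirty_cards(uintptr_t start, uintptr_t last, jbyte* byte_map_base) {
  //      for (uintptr_t c = start >> 9; c <= last >> 9; c++) {
  //        byte_map_base[c] = 0;                 // 0 == dirty card
  //      }
  //    }
  //
  //  which is what the addr/count shifting plus the stb(G0, ...) loop implements.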
  1117   //
  1118   // Generate main code for disjoint arraycopy
  1119   //
  1120   typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
  1121                                               Label& L_loop, bool use_prefetch, bool use_bis);
  1123   void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
  1124                           int iter_size, CopyLoopFunc copy_loop_func) {
  1125     Label L_copy;
  1127     assert(log2_elem_size <= 3, "the following code should be changed");
  1128     int count_dec = 16>>log2_elem_size;
  1130     int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
  1131     assert(prefetch_dist < 4096, "invalid value");
  1132     prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
  1133     int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
  1135     if (UseBlockCopy) {
  1136       Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
  1138       // 64 bytes tail + bytes copied in one loop iteration
  1139       int tail_size = 64 + iter_size;
  1140       int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
  1141       // Use BIS copy only for big arrays since it requires membar.
  1142       __ set(block_copy_count, O4);
  1143       __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
  1144       // This code is for disjoint source and destination:
  1145       //   to <= from || to >= from+count
  1146       // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
  1147       __ sub(from, to, O4);
   1148       __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate.
  1149       __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
  1151       __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
  1152       // BIS should not be used to copy tail (64 bytes+iter_size)
  1153       // to avoid zeroing of following values.
  1154       __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
  1156       if (prefetch_count > 0) { // rounded up to one iteration count
  1157         // Do prefetching only if copy size is bigger
  1158         // than prefetch distance.
  1159         __ set(prefetch_count, O4);
  1160         __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
  1161         __ sub(count, prefetch_count, count);
  1163         (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
  1164         __ add(count, prefetch_count, count); // restore count
  1166       } // prefetch_count > 0
  1168       (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
  1169       __ add(count, (tail_size>>log2_elem_size), count); // restore count
  1171       __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
  1172       // BIS needs membar.
  1173       __ membar(Assembler::StoreLoad);
  1174       // Copy tail
  1175       __ ba_short(L_copy);
  1177       __ BIND(L_skip_block_copy);
  1178     } // UseBlockCopy
  1180     if (prefetch_count > 0) { // rounded up to one iteration count
  1181       // Do prefetching only if copy size is bigger
  1182       // than prefetch distance.
  1183       __ set(prefetch_count, O4);
  1184       __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
  1185       __ sub(count, prefetch_count, count);
  1187       Label L_copy_prefetch;
  1188       (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
  1189       __ add(count, prefetch_count, count); // restore count
  1191     } // prefetch_count > 0
   1193     (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
   1194   }
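  //  Descriptive note: disjoint_copy_core emits up to four variants of the caller's
  //  copy loop and selects between them at runtime from the element count: a BIS
  //  (block-initializing store) loop for large arrays, optionally preceded by a
  //  prefetching BIS loop, and a plain loop (again with an optional prefetching
  //  version) for everything else.  The BIS path deliberately leaves a
  //  64-byte + iter_size tail for the ordinary loop so the initializing stores never
  //  zero bytes beyond the copied region, and it ends with a StoreLoad membar
  //  because BIS stores are weakly ordered.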
  1198   //
  1199   // Helper methods for copy_16_bytes_forward_with_shift()
  1200   //
  1201   void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
  1202                                 Label& L_loop, bool use_prefetch, bool use_bis) {
  1204     const Register left_shift  = G1; // left  shift bit counter
  1205     const Register right_shift = G5; // right shift bit counter
  1207     __ align(OptoLoopAlignment);
  1208     __ BIND(L_loop);
  1209     if (use_prefetch) {
  1210       if (ArraycopySrcPrefetchDistance > 0) {
   1211         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
   1212       }
   1213       if (ArraycopyDstPrefetchDistance > 0) {
   1214         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
   1215       }
   1216     }
  1217     __ ldx(from, 0, O4);
  1218     __ ldx(from, 8, G4);
  1219     __ inc(to, 16);
  1220     __ inc(from, 16);
  1221     __ deccc(count, count_dec); // Can we do next iteration after this one?
  1222     __ srlx(O4, right_shift, G3);
  1223     __ bset(G3, O3);
  1224     __ sllx(O4, left_shift,  O4);
  1225     __ srlx(G4, right_shift, G3);
  1226     __ bset(G3, O4);
  1227     if (use_bis) {
  1228       __ stxa(O3, to, -16);
  1229       __ stxa(O4, to, -8);
  1230     } else {
  1231       __ stx(O3, to, -16);
   1232       __ stx(O4, to, -8);
   1233     }
   1234     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
   1235     __ delayed()->sllx(G4, left_shift,  O3);
   1236   }
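  //  Descriptive note: with 'from' misaligned by s bytes (s = from & 7, 'to' already
  //  8-byte aligned), each iteration above merges two freshly loaded 8-byte chunks
  //  with the chunk kept from the previous iteration.  In illustrative C
  //  (L = 8*s, R = 64 - L, 'prev' already shifted left by L as set up by the caller):
  //
  //    uint64_t a = src[0], b = src[1];  src += 2;
  //    dst[0] = prev | (a >> R);
  //    dst[1] = (a << L) | (b >> R);
  //    prev   = b << L;                  // computed in the branch delay slot
  //    dst   += 2;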
  1238   // Copy big chunks forward with shift
  1239   //
  1240   // Inputs:
   1241   //   from      - source array address
  1242   //   to        - destination array aligned to 8-bytes
  1243   //   count     - elements count to copy >= the count equivalent to 16 bytes
  1244   //   count_dec - elements count's decrement equivalent to 16 bytes
  1245   //   L_copy_bytes - copy exit label
  1246   //
  1247   void copy_16_bytes_forward_with_shift(Register from, Register to,
  1248                      Register count, int log2_elem_size, Label& L_copy_bytes) {
  1249     Label L_aligned_copy, L_copy_last_bytes;
  1250     assert(log2_elem_size <= 3, "the following code should be changed");
  1251     int count_dec = 16>>log2_elem_size;
  1253     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
  1254     __ andcc(from, 7, G1); // misaligned bytes
  1255     __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  1256     __ delayed()->nop();
  1258     const Register left_shift  = G1; // left  shift bit counter
  1259     const Register right_shift = G5; // right shift bit counter
  1261     __ sll(G1, LogBitsPerByte, left_shift);
  1262     __ mov(64, right_shift);
  1263     __ sub(right_shift, left_shift, right_shift);
  1265     //
  1266     // Load 2 aligned 8-bytes chunks and use one from previous iteration
  1267     // to form 2 aligned 8-bytes chunks to store.
  1268     //
  1269     __ dec(count, count_dec);   // Pre-decrement 'count'
  1270     __ andn(from, 7, from);     // Align address
  1271     __ ldx(from, 0, O3);
  1272     __ inc(from, 8);
  1273     __ sllx(O3, left_shift,  O3);
  1275     disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop);
  1277     __ inccc(count, count_dec>>1 ); // + 8 bytes
  1278     __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
  1279     __ delayed()->inc(count, count_dec>>1); // restore 'count'
  1281     // copy 8 bytes, part of them already loaded in O3
  1282     __ ldx(from, 0, O4);
  1283     __ inc(to, 8);
  1284     __ inc(from, 8);
  1285     __ srlx(O4, right_shift, G3);
  1286     __ bset(O3, G3);
  1287     __ stx(G3, to, -8);
  1289     __ BIND(L_copy_last_bytes);
  1290     __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
  1291     __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
  1292     __ delayed()->sub(from, right_shift, from);       // restore address
   1294     __ BIND(L_aligned_copy);
   1295   }
  1297   // Copy big chunks backward with shift
  1298   //
  1299   // Inputs:
   1300   //   end_from  - source array end address
  1301   //   end_to    - destination array end address aligned to 8-bytes
  1302   //   count     - elements count to copy >= the count equivalent to 16 bytes
  1303   //   count_dec - elements count's decrement equivalent to 16 bytes
  1304   //   L_aligned_copy - aligned copy exit label
  1305   //   L_copy_bytes   - copy exit label
  1306   //
  1307   void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
  1308                      Register count, int count_dec,
  1309                      Label& L_aligned_copy, Label& L_copy_bytes) {
  1310     Label L_loop, L_copy_last_bytes;
  1312     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
  1313       __ andcc(end_from, 7, G1); // misaligned bytes
  1314       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  1315       __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
  1317     const Register left_shift  = G1; // left  shift bit counter
  1318     const Register right_shift = G5; // right shift bit counter
  1320       __ sll(G1, LogBitsPerByte, left_shift);
  1321       __ mov(64, right_shift);
  1322       __ sub(right_shift, left_shift, right_shift);
  1324     //
  1325     // Load 2 aligned 8-bytes chunks and use one from previous iteration
  1326     // to form 2 aligned 8-bytes chunks to store.
  1327     //
  1328       __ andn(end_from, 7, end_from);     // Align address
  1329       __ ldx(end_from, 0, O3);
  1330       __ align(OptoLoopAlignment);
  1331     __ BIND(L_loop);
  1332       __ ldx(end_from, -8, O4);
  1333       __ deccc(count, count_dec); // Can we do next iteration after this one?
  1334       __ ldx(end_from, -16, G4);
  1335       __ dec(end_to, 16);
  1336       __ dec(end_from, 16);
  1337       __ srlx(O3, right_shift, O3);
  1338       __ sllx(O4, left_shift,  G3);
  1339       __ bset(G3, O3);
  1340       __ stx(O3, end_to, 8);
  1341       __ srlx(O4, right_shift, O4);
  1342       __ sllx(G4, left_shift,  G3);
  1343       __ bset(G3, O4);
  1344       __ stx(O4, end_to, 0);
  1345       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  1346       __ delayed()->mov(G4, O3);
  1348       __ inccc(count, count_dec>>1 ); // + 8 bytes
  1349       __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
  1350       __ delayed()->inc(count, count_dec>>1); // restore 'count'
  1352       // copy 8 bytes, part of them already loaded in O3
  1353       __ ldx(end_from, -8, O4);
  1354       __ dec(end_to, 8);
  1355       __ dec(end_from, 8);
  1356       __ srlx(O3, right_shift, O3);
  1357       __ sllx(O4, left_shift,  G3);
  1358       __ bset(O3, G3);
  1359       __ stx(G3, end_to, 0);
  1361     __ BIND(L_copy_last_bytes);
  1362       __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
  1363       __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
   1364       __ delayed()->add(end_from, left_shift, end_from); // restore address
   1365   }
  1367   //
  1368   //  Generate stub for disjoint byte copy.  If "aligned" is true, the
  1369   //  "from" and "to" addresses are assumed to be heapword aligned.
  1370   //
  1371   // Arguments for generated stub:
  1372   //      from:  O0
  1373   //      to:    O1
  1374   //      count: O2 treated as signed
  1375   //
  1376   address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
  1377     __ align(CodeEntryAlignment);
  1378     StubCodeMark mark(this, "StubRoutines", name);
  1379     address start = __ pc();
  1381     Label L_skip_alignment, L_align;
  1382     Label L_copy_byte, L_copy_byte_loop, L_exit;
  1384     const Register from      = O0;   // source array address
  1385     const Register to        = O1;   // destination array address
  1386     const Register count     = O2;   // elements count
  1387     const Register offset    = O5;   // offset from start of arrays
  1388     // O3, O4, G3, G4 are used as temp registers
  1390     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1392     if (entry != NULL) {
  1393       *entry = __ pc();
  1394       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
   1395       BLOCK_COMMENT("Entry:");
   1396     }
  1398     // for short arrays, just do single element copy
  1399     __ cmp(count, 23); // 16 + 7
  1400     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
  1401     __ delayed()->mov(G0, offset);
  1403     if (aligned) {
  1404       // 'aligned' == true when it is known statically during compilation
  1405       // of this arraycopy call site that both 'from' and 'to' addresses
  1406       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
  1407       //
  1408       // Aligned arrays have 4-byte alignment in the 32-bit VM
  1409       // and 8-byte alignment in the 64-bit VM, so we only do this for the 32-bit VM.
  1410       //
  1411 #ifndef _LP64
  1412       // copy a 4-bytes word if necessary to align 'to' to 8 bytes
  1413       __ andcc(to, 7, G0);
  1414       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
  1415       __ delayed()->ld(from, 0, O3);
  1416       __ inc(from, 4);
  1417       __ inc(to, 4);
  1418       __ dec(count, 4);
  1419       __ st(O3, to, -4);
  1420     __ BIND(L_skip_alignment);
  1421 #endif
  1422     } else {
  1423       // copy bytes to align 'to' on 8 byte boundary
  1424       __ andcc(to, 7, G1); // misaligned bytes
  1425       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1426       __ delayed()->neg(G1);
  1427       __ inc(G1, 8);       // bytes needed to reach the next 8-byte boundary
  1428       __ sub(count, G1, count);
  1429     __ BIND(L_align);
  1430       __ ldub(from, 0, O3);
  1431       __ deccc(G1);
  1432       __ inc(from);
  1433       __ stb(O3, to, 0);
  1434       __ br(Assembler::notZero, false, Assembler::pt, L_align);
  1435       __ delayed()->inc(to);
  1436     __ BIND(L_skip_alignment);
  1438 #ifdef _LP64
  1439     if (!aligned)
  1440 #endif
  1442       // Copy with shift, 16 bytes per iteration, if the arrays do not have
  1443       // the same alignment mod 8; otherwise fall through to the
  1444       // aligned copy code below.
  1445       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
  1446       // Also jump over the aligned copy once the copy with shift has completed.
  1448       copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
  1451     // Both arrays are 8-byte aligned; copy 16 bytes at a time
  1452       __ and3(count, 7, G4); // Save count
  1453       __ srl(count, 3, count);
  1454      generate_disjoint_long_copy_core(aligned);
  1455       __ mov(G4, count);     // Restore count
  1457     // copy trailing bytes
  1458     __ BIND(L_copy_byte);
  1459       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  1460       __ align(OptoLoopAlignment);
  1461     __ BIND(L_copy_byte_loop);
  1462       __ ldub(from, offset, O3);
  1463       __ deccc(count);
  1464       __ stb(O3, to, offset);
  1465       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
  1466       __ delayed()->inc(offset);
  1468     __ BIND(L_exit);
  1469       // O3, O4 are used as temp registers
  1470       inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
  1471       __ retl();
  1472       __ delayed()->mov(G0, O0); // return 0
  1473     return start;
  1476   //
  1477   //  Generate stub for conjoint byte copy.  If "aligned" is true, the
  1478   //  "from" and "to" addresses are assumed to be heapword aligned.
  1479   //
  1480   // Arguments for generated stub:
  1481   //      from:  O0
  1482   //      to:    O1
  1483   //      count: O2 treated as signed
  1484   //
  1485   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
  1486                                       address *entry, const char *name) {
  1487     // Do reverse copy.
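           // Since 'from' and 'to' may overlap with 'to' at the higher address, the
           // copy runs backwards; in C terms the stub behaves roughly like this
           // memmove-style sketch:
           //
           //   for (intptr_t i = count - 1; i >= 0; i--) to[i] = from[i];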
  1489     __ align(CodeEntryAlignment);
  1490     StubCodeMark mark(this, "StubRoutines", name);
  1491     address start = __ pc();
  1493     Label L_skip_alignment, L_align, L_aligned_copy;
  1494     Label L_copy_byte, L_copy_byte_loop, L_exit;
  1496     const Register from      = O0;   // source array address
  1497     const Register to        = O1;   // destination array address
  1498     const Register count     = O2;   // elements count
  1499     const Register end_from  = from; // source array end address
  1500     const Register end_to    = to;   // destination array end address
  1502     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1504     if (entry != NULL) {
  1505       *entry = __ pc();
  1506       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1507       BLOCK_COMMENT("Entry:");
  1510     array_overlap_test(nooverlap_target, 0);
  1512     __ add(to, count, end_to);       // offset after last copied element
  1514     // for short arrays, just do single element copy
  1515     __ cmp(count, 23); // 16 + 7
  1516     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
  1517     __ delayed()->add(from, count, end_from);
  1520       // Align the ends of the arrays since they may not be aligned even
  1521       // when the arrays themselves are aligned.
  1523       // copy bytes to align 'end_to' on an 8-byte boundary
  1524       __ andcc(end_to, 7, G1); // misaligned bytes
  1525       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1526       __ delayed()->nop();
  1527       __ sub(count, G1, count);
  1528     __ BIND(L_align);
  1529       __ dec(end_from);
  1530       __ dec(end_to);
  1531       __ ldub(end_from, 0, O3);
  1532       __ deccc(G1);
  1533       __ brx(Assembler::notZero, false, Assembler::pt, L_align);
  1534       __ delayed()->stb(O3, end_to, 0);
  1535     __ BIND(L_skip_alignment);
  1537 #ifdef _LP64
  1538     if (aligned) {
  1539       // Both arrays are 8-byte aligned in the 64-bit VM.
  1540       // 'count' is decremented in copy_16_bytes_backward_with_shift()
  1541       // only in the unaligned case.
  1542       __ dec(count, 16);
  1543     } else
  1544 #endif
  1546       // Copy with shift, 16 bytes per iteration, if the arrays do not have
  1547       // the same alignment mod 8; otherwise jump to the aligned copy code
  1548       // below (subtracting 16 from 'count' before the jump).
  1549       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
  1550       // Also jump over the aligned copy once the copy with shift has completed.
  1552       copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
  1553                                         L_aligned_copy, L_copy_byte);
  1555     // copy 16 elements (16 bytes) at a time
  1556       __ align(OptoLoopAlignment);
  1557     __ BIND(L_aligned_copy);
  1558       __ dec(end_from, 16);
  1559       __ ldx(end_from, 8, O3);
  1560       __ ldx(end_from, 0, O4);
  1561       __ dec(end_to, 16);
  1562       __ deccc(count, 16);
  1563       __ stx(O3, end_to, 8);
  1564       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
  1565       __ delayed()->stx(O4, end_to, 0);
  1566       __ inc(count, 16);
  1568     // copy 1 element (1 byte) at a time
  1569     __ BIND(L_copy_byte);
  1570       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  1571       __ align(OptoLoopAlignment);
  1572     __ BIND(L_copy_byte_loop);
  1573       __ dec(end_from);
  1574       __ dec(end_to);
  1575       __ ldub(end_from, 0, O4);
  1576       __ deccc(count);
  1577       __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
  1578       __ delayed()->stb(O4, end_to, 0);
  1580     __ BIND(L_exit);
  1581     // O3, O4 are used as temp registers
  1582     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
  1583     __ retl();
  1584     __ delayed()->mov(G0, O0); // return 0
  1585     return start;
  1588   //
  1589   //  Generate stub for disjoint short copy.  If "aligned" is true, the
  1590   //  "from" and "to" addresses are assumed to be heapword aligned.
  1591   //
  1592   // Arguments for generated stub:
  1593   //      from:  O0
  1594   //      to:    O1
  1595   //      count: O2 treated as signed
  1596   //
  1597   address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
  1598     __ align(CodeEntryAlignment);
  1599     StubCodeMark mark(this, "StubRoutines", name);
  1600     address start = __ pc();
  1602     Label L_skip_alignment, L_skip_alignment2;
  1603     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
  1605     const Register from      = O0;   // source array address
  1606     const Register to        = O1;   // destination array address
  1607     const Register count     = O2;   // elements count
  1608     const Register offset    = O5;   // offset from start of arrays
  1609     // O3, O4, G3, G4 are used as temp registers
  1611     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1613     if (entry != NULL) {
  1614       *entry = __ pc();
  1615       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1616       BLOCK_COMMENT("Entry:");
  1619     // for short arrays, just do single element copy
  1620     __ cmp(count, 11); // 8 + 3  (22 bytes)
  1621     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
  1622     __ delayed()->mov(G0, offset);
  1624     if (aligned) {
  1625       // 'aligned' == true when it is known statically during compilation
  1626       // of this arraycopy call site that both 'from' and 'to' addresses
  1627       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
  1628       //
  1629       // Aligned arrays have 4-byte alignment in the 32-bit VM
  1630       // and 8-byte alignment in the 64-bit VM.
  1631       //
  1632 #ifndef _LP64
  1633       // copy one 2-element word if necessary to align 'to' to 8 bytes
  1634       __ andcc(to, 7, G0);
  1635       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1636       __ delayed()->ld(from, 0, O3);
  1637       __ inc(from, 4);
  1638       __ inc(to, 4);
  1639       __ dec(count, 2);
  1640       __ st(O3, to, -4);
  1641     __ BIND(L_skip_alignment);
  1642 #endif
  1643     } else {
  1644       // copy 1 element if necessary to align 'to' on a 4-byte boundary
  1645       __ andcc(to, 3, G0);
  1646       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1647       __ delayed()->lduh(from, 0, O3);
  1648       __ inc(from, 2);
  1649       __ inc(to, 2);
  1650       __ dec(count);
  1651       __ sth(O3, to, -2);
  1652     __ BIND(L_skip_alignment);
  1654       // copy 2 elements to align 'to' on an 8-byte boundary
  1655       __ andcc(to, 7, G0);
  1656       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
  1657       __ delayed()->lduh(from, 0, O3);
  1658       __ dec(count, 2);
  1659       __ lduh(from, 2, O4);
  1660       __ inc(from, 4);
  1661       __ inc(to, 4);
  1662       __ sth(O3, to, -4);
  1663       __ sth(O4, to, -2);
  1664     __ BIND(L_skip_alignment2);
  1666 #ifdef _LP64
  1667     if (!aligned)
  1668 #endif
  1670       // Copy with shift, 16 bytes per iteration, if the arrays do not have
  1671       // the same alignment mod 8; otherwise fall through to the
  1672       // aligned copy code below.
  1673       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
  1674       // Also jump over the aligned copy once the copy with shift has completed.
  1676       copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
  1679     // Both arrays are 8-byte aligned; copy 16 bytes at a time
  1680       __ and3(count, 3, G4); // Save
  1681       __ srl(count, 2, count);
  1682      generate_disjoint_long_copy_core(aligned);
  1683       __ mov(G4, count); // restore
  1685     // copy 1 element at a time
  1686     __ BIND(L_copy_2_bytes);
  1687       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  1688       __ align(OptoLoopAlignment);
  1689     __ BIND(L_copy_2_bytes_loop);
  1690       __ lduh(from, offset, O3);
  1691       __ deccc(count);
  1692       __ sth(O3, to, offset);
  1693       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
  1694       __ delayed()->inc(offset, 2);
  1696     __ BIND(L_exit);
  1697       // O3, O4 are used as temp registers
  1698       inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
  1699       __ retl();
  1700       __ delayed()->mov(G0, O0); // return 0
  1701     return start;
  1704   //
  1705   //  Generate stub for array fill (byte, short, or int).  If "aligned" is true, the
  1706   //  "to" address is assumed to be heapword aligned.
  1707   //
  1708   // Arguments for generated stub:
  1709   //      to:    O0
  1710   //      value: O1
  1711   //      count: O2 treated as signed
  1712   //
  1713   address generate_fill(BasicType t, bool aligned, const char* name) {
  1714     __ align(CodeEntryAlignment);
  1715     StubCodeMark mark(this, "StubRoutines", name);
  1716     address start = __ pc();
  1718     const Register to        = O0;   // destination array address
  1719     const Register value     = O1;   // fill value
  1720     const Register count     = O2;   // elements count
  1721     // O3 is used as a temp register
  1723     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1725     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
  1726     Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
  1728     int shift = -1;
  1729     switch (t) {
  1730       case T_BYTE:
  1731         shift = 2;
  1732         break;
  1733       case T_SHORT:
  1734         shift = 1;
  1735         break;
  1736       case T_INT:
  1737         shift = 0;
  1738         break;
  1739       default: ShouldNotReachHere();
  1742     BLOCK_COMMENT("Entry:");
  1744     if (t == T_BYTE) {
  1745       // Zero extend value
  1746       __ and3(value, 0xff, value);
  1747       __ sllx(value, 8, O3);
  1748       __ or3(value, O3, value);
  1750     if (t == T_SHORT) {
  1751       // Zero extend value
  1752       __ sllx(value, 48, value);
  1753       __ srlx(value, 48, value);
  1755     if (t == T_BYTE || t == T_SHORT) {
  1756       __ sllx(value, 16, O3);
  1757       __ or3(value, O3, value);
  1760     __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  1761     __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
  1762     __ delayed()->andcc(count, 1, G0);
  1764     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
  1765       // align the destination address on a 4-byte boundary
  1766       if (t == T_BYTE) {
  1767         // One byte misalignment happens only for byte arrays
  1768         __ andcc(to, 1, G0);
  1769         __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
  1770         __ delayed()->nop();
  1771         __ stb(value, to, 0);
  1772         __ inc(to, 1);
  1773         __ dec(count, 1);
  1774         __ BIND(L_skip_align1);
  1776       // Two-byte misalignment happens only for byte and short (char) arrays
  1777       __ andcc(to, 2, G0);
  1778       __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
  1779       __ delayed()->nop();
  1780       __ sth(value, to, 0);
  1781       __ inc(to, 2);
  1782       __ dec(count, 1 << (shift - 1));
  1783       __ BIND(L_skip_align2);
  1785 #ifdef _LP64
  1786     if (!aligned) {
  1787 #endif
  1788     // align to 8 bytes; we know we are at least 4-byte aligned at this point
  1789     __ andcc(to, 7, G0);
  1790     __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
  1791     __ delayed()->nop();
  1792     __ stw(value, to, 0);
  1793     __ inc(to, 4);
  1794     __ dec(count, 1 << shift);
  1795     __ BIND(L_fill_32_bytes);
  1796 #ifdef _LP64
  1798 #endif
  1800     if (t == T_INT) {
  1801       // Zero extend value
  1802       __ srl(value, 0, value);
  1804     if (t == T_BYTE || t == T_SHORT || t == T_INT) {
  1805       __ sllx(value, 32, O3);
  1806       __ or3(value, O3, value);
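             // At this point 'value' holds the fill pattern replicated across all
             // 64 bits.  For a byte fill the broadcast is roughly (C sketch; 'v'
             // stands for the original 8-bit value):
             //
             //   uint64_t pat = v & 0xff;
             //   pat |= pat << 8;    // 16 bits
             //   pat |= pat << 16;   // 32 bits
             //   pat |= pat << 32;   // 64 bits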
  1809     Label L_check_fill_8_bytes;
  1810     // Fill 32-byte chunks
  1811     __ subcc(count, 8 << shift, count);
  1812     __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
  1813     __ delayed()->nop();
  1815     Label L_fill_32_bytes_loop, L_fill_4_bytes;
  1816     __ align(16);
  1817     __ BIND(L_fill_32_bytes_loop);
  1819     __ stx(value, to, 0);
  1820     __ stx(value, to, 8);
  1821     __ stx(value, to, 16);
  1822     __ stx(value, to, 24);
  1824     __ subcc(count, 8 << shift, count);
  1825     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
  1826     __ delayed()->add(to, 32, to);
  1828     __ BIND(L_check_fill_8_bytes);
  1829     __ addcc(count, 8 << shift, count);
  1830     __ brx(Assembler::zero, false, Assembler::pn, L_exit);
  1831     __ delayed()->subcc(count, 1 << (shift + 1), count);
  1832     __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
  1833     __ delayed()->andcc(count, 1<<shift, G0);
  1835     //
  1836     // length is too short, just fill 8 bytes at a time
  1837     //
  1838     Label L_fill_8_bytes_loop;
  1839     __ BIND(L_fill_8_bytes_loop);
  1840     __ stx(value, to, 0);
  1841     __ subcc(count, 1 << (shift + 1), count);
  1842     __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
  1843     __ delayed()->add(to, 8, to);
  1845     // fill trailing 4 bytes
  1846     __ andcc(count, 1<<shift, G0);  // in delay slot of branches
  1847     if (t == T_INT) {
  1848       __ BIND(L_fill_elements);
  1850     __ BIND(L_fill_4_bytes);
  1851     __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
  1852     if (t == T_BYTE || t == T_SHORT) {
  1853       __ delayed()->andcc(count, 1<<(shift-1), G0);
  1854     } else {
  1855       __ delayed()->nop();
  1857     __ stw(value, to, 0);
  1858     if (t == T_BYTE || t == T_SHORT) {
  1859       __ inc(to, 4);
  1860       // fill trailing 2 bytes
  1861       __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
  1862       __ BIND(L_fill_2_bytes);
  1863       __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
  1864       __ delayed()->andcc(count, 1, count);
  1865       __ sth(value, to, 0);
  1866       if (t == T_BYTE) {
  1867         __ inc(to, 2);
  1868         // fill trailing byte
  1869         __ andcc(count, 1, count);  // in delay slot of branches
  1870         __ BIND(L_fill_byte);
  1871         __ brx(Assembler::zero, false, Assembler::pt, L_exit);
  1872         __ delayed()->nop();
  1873         __ stb(value, to, 0);
  1874       } else {
  1875         __ BIND(L_fill_byte);
  1877     } else {
  1878       __ BIND(L_fill_2_bytes);
  1880     __ BIND(L_exit);
  1881     __ retl();
  1882     __ delayed()->nop();
  1884     // Handle fills of less than 8 bytes.  Int is handled elsewhere.
  1885     if (t == T_BYTE) {
  1886       __ BIND(L_fill_elements);
  1887       Label L_fill_2, L_fill_4;
  1888       // in delay slot __ andcc(count, 1, G0);
  1889       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
  1890       __ delayed()->andcc(count, 2, G0);
  1891       __ stb(value, to, 0);
  1892       __ inc(to, 1);
  1893       __ BIND(L_fill_2);
  1894       __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
  1895       __ delayed()->andcc(count, 4, G0);
  1896       __ stb(value, to, 0);
  1897       __ stb(value, to, 1);
  1898       __ inc(to, 2);
  1899       __ BIND(L_fill_4);
  1900       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
  1901       __ delayed()->nop();
  1902       __ stb(value, to, 0);
  1903       __ stb(value, to, 1);
  1904       __ stb(value, to, 2);
  1905       __ retl();
  1906       __ delayed()->stb(value, to, 3);
  1909     if (t == T_SHORT) {
  1910       Label L_fill_2;
  1911       __ BIND(L_fill_elements);
  1912       // in delay slot __ andcc(count, 1, G0);
  1913       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
  1914       __ delayed()->andcc(count, 2, G0);
  1915       __ sth(value, to, 0);
  1916       __ inc(to, 2);
  1917       __ BIND(L_fill_2);
  1918       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
  1919       __ delayed()->nop();
  1920       __ sth(value, to, 0);
  1921       __ retl();
  1922       __ delayed()->sth(value, to, 2);
  1924     return start;
  1927   //
  1928   //  Generate stub for conjoint short copy.  If "aligned" is true, the
  1929   //  "from" and "to" addresses are assumed to be heapword aligned.
  1930   //
  1931   // Arguments for generated stub:
  1932   //      from:  O0
  1933   //      to:    O1
  1934   //      count: O2 treated as signed
  1935   //
  1936   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
  1937                                        address *entry, const char *name) {
  1938     // Do reverse copy.
  1940     __ align(CodeEntryAlignment);
  1941     StubCodeMark mark(this, "StubRoutines", name);
  1942     address start = __ pc();
  1944     Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
  1945     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
  1947     const Register from      = O0;   // source array address
  1948     const Register to        = O1;   // destination array address
  1949     const Register count     = O2;   // elements count
  1950     const Register end_from  = from; // source array end address
  1951     const Register end_to    = to;   // destination array end address
  1953     const Register byte_count = O3;  // bytes count to copy
  1955     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1957     if (entry != NULL) {
  1958       *entry = __ pc();
  1959       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1960       BLOCK_COMMENT("Entry:");
  1963     array_overlap_test(nooverlap_target, 1);
  1965     __ sllx(count, LogBytesPerShort, byte_count);
  1966     __ add(to, byte_count, end_to);  // offset after last copied element
  1968     // for short arrays, just do single element copy
  1969     __ cmp(count, 11); // 8 + 3  (22 bytes)
  1970     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
  1971     __ delayed()->add(from, byte_count, end_from);
  1974       // Align the ends of the arrays since they may not be aligned even
  1975       // when the arrays themselves are aligned.
  1977       // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
  1978       __ andcc(end_to, 3, G0);
  1979       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1980       __ delayed()->lduh(end_from, -2, O3);
  1981       __ dec(end_from, 2);
  1982       __ dec(end_to, 2);
  1983       __ dec(count);
  1984       __ sth(O3, end_to, 0);
  1985     __ BIND(L_skip_alignment);
  1987       // copy 2 elements to align 'end_to' on an 8-byte boundary
  1988       __ andcc(end_to, 7, G0);
  1989       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
  1990       __ delayed()->lduh(end_from, -2, O3);
  1991       __ dec(count, 2);
  1992       __ lduh(end_from, -4, O4);
  1993       __ dec(end_from, 4);
  1994       __ dec(end_to, 4);
  1995       __ sth(O3, end_to, 2);
  1996       __ sth(O4, end_to, 0);
  1997     __ BIND(L_skip_alignment2);
  1999 #ifdef _LP64
  2000     if (aligned) {
  2001       // Both arrays are 8-byte aligned in the 64-bit VM.
  2002       // 'count' is decremented in copy_16_bytes_backward_with_shift()
  2003       // only in the unaligned case.
  2004       __ dec(count, 8);
  2005     } else
  2006 #endif
  2008       // Copy with shift, 16 bytes per iteration, if the arrays do not have
  2009       // the same alignment mod 8; otherwise jump to the aligned copy code
  2010       // below (subtracting 8 from 'count' before the jump).
  2011       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
  2012       // Also jump over the aligned copy once the copy with shift has completed.
  2014       copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
  2015                                         L_aligned_copy, L_copy_2_bytes);
  2017     // copy 8 elements (16 bytes) at a time
  2018       __ align(OptoLoopAlignment);
  2019     __ BIND(L_aligned_copy);
  2020       __ dec(end_from, 16);
  2021       __ ldx(end_from, 8, O3);
  2022       __ ldx(end_from, 0, O4);
  2023       __ dec(end_to, 16);
  2024       __ deccc(count, 8);
  2025       __ stx(O3, end_to, 8);
  2026       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
  2027       __ delayed()->stx(O4, end_to, 0);
  2028       __ inc(count, 8);
  2030     // copy 1 element (2 bytes) at a time
  2031     __ BIND(L_copy_2_bytes);
  2032       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  2033     __ BIND(L_copy_2_bytes_loop);
  2034       __ dec(end_from, 2);
  2035       __ dec(end_to, 2);
  2036       __ lduh(end_from, 0, O4);
  2037       __ deccc(count);
  2038       __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
  2039       __ delayed()->sth(O4, end_to, 0);
  2041     __ BIND(L_exit);
  2042     // O3, O4 are used as temp registers
  2043     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
  2044     __ retl();
  2045     __ delayed()->mov(G0, O0); // return 0
  2046     return start;
  2049   //
  2050   // Helper methods for generate_disjoint_int_copy_core()
  2051   //
  2052   void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
  2053                           Label& L_loop, bool use_prefetch, bool use_bis) {
  2055     __ align(OptoLoopAlignment);
  2056     __ BIND(L_loop);
  2057     if (use_prefetch) {
  2058       if (ArraycopySrcPrefetchDistance > 0) {
  2059         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
  2061       if (ArraycopyDstPrefetchDistance > 0) {
  2062         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
  2065     __ ldx(from, 4, O4);
  2066     __ ldx(from, 12, G4);
  2067     __ inc(to, 16);
  2068     __ inc(from, 16);
  2069     __ deccc(count, 4); // Can we do next iteration after this one?
  2071     __ srlx(O4, 32, G3);
  2072     __ bset(G3, O3);
  2073     __ sllx(O4, 32, O4);
  2074     __ srlx(G4, 32, G3);
  2075     __ bset(G3, O4);
  2076     if (use_bis) {
  2077       __ stxa(O3, to, -16);
  2078       __ stxa(O4, to, -8);
  2079     } else {
  2080       __ stx(O3, to, -16);
  2081       __ stx(O4, to, -8);
  2083     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  2084     __ delayed()->sllx(G4, 32,  O3);
  2088   //
  2089   //  Generate core code for disjoint int copy (and oop copy on 32-bit).
  2090   //  If "aligned" is true, the "from" and "to" addresses are assumed
  2091   //  to be heapword aligned.
  2092   //
  2093   // Arguments:
  2094   //      from:  O0
  2095   //      to:    O1
  2096   //      count: O2 treated as signed
  2097   //
  2098   void generate_disjoint_int_copy_core(bool aligned) {
  2100     Label L_skip_alignment, L_aligned_copy;
  2101     Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
  2103     const Register from      = O0;   // source array address
  2104     const Register to        = O1;   // destination array address
  2105     const Register count     = O2;   // elements count
  2106     const Register offset    = O5;   // offset from start of arrays
  2107     // O3, O4, G3, G4 are used as temp registers
  2109     // 'aligned' == true when it is known statically during compilation
  2110     // of this arraycopy call site that both 'from' and 'to' addresses
  2111     // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
  2112     //
  2113     // Aligned arrays have 4-byte alignment in the 32-bit VM
  2114     // and 8-byte alignment in the 64-bit VM.
  2115     //
  2116 #ifdef _LP64
  2117     if (!aligned)
  2118 #endif
  2120       // The next check could be put under 'ifndef' since the code in
  2121       // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
  2123       // for short arrays, just do single element copy
  2124       __ cmp(count, 5); // 4 + 1 (20 bytes)
  2125       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
  2126       __ delayed()->mov(G0, offset);
  2128       // copy 1 element to align 'to' on an 8 byte boundary
  2129       __ andcc(to, 7, G0);
  2130       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  2131       __ delayed()->ld(from, 0, O3);
  2132       __ inc(from, 4);
  2133       __ inc(to, 4);
  2134       __ dec(count);
  2135       __ st(O3, to, -4);
  2136     __ BIND(L_skip_alignment);
  2138     // if arrays have the same alignment mod 8, do a 4-element copy
  2139       __ andcc(from, 7, G0);
  2140       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  2141       __ delayed()->ld(from, 0, O3);
  2143     //
  2144     // Load 2 aligned 8-byte chunks and use one from the previous iteration
  2145     // to form 2 aligned 8-byte chunks to store.
  2146     //
  2147     // copy_16_bytes_forward_with_shift() is not used here since this
  2148     // code is more efficient.
  2150     // copy with shift 4 elements (16 bytes) at a time
  2151       __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
  2152       __ sllx(O3, 32,  O3);
  2154       disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop);
  2156       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
  2157       __ delayed()->inc(count, 4); // restore 'count'
  2159     __ BIND(L_aligned_copy);
  2160     } // !aligned
  2162     // copy 4 elements (16 bytes) at a time
  2163       __ and3(count, 1, G4); // Save
  2164       __ srl(count, 1, count);
  2165      generate_disjoint_long_copy_core(aligned);
  2166       __ mov(G4, count);     // Restore
  2168     // copy 1 element at a time
  2169     __ BIND(L_copy_4_bytes);
  2170       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  2171     __ BIND(L_copy_4_bytes_loop);
  2172       __ ld(from, offset, O3);
  2173       __ deccc(count);
  2174       __ st(O3, to, offset);
  2175       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
  2176       __ delayed()->inc(offset, 4);
  2177     __ BIND(L_exit);
  2180   //
  2181   //  Generate stub for disjoint int copy.  If "aligned" is true, the
  2182   //  "from" and "to" addresses are assumed to be heapword aligned.
  2183   //
  2184   // Arguments for generated stub:
  2185   //      from:  O0
  2186   //      to:    O1
  2187   //      count: O2 treated as signed
  2188   //
  2189   address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
  2190     __ align(CodeEntryAlignment);
  2191     StubCodeMark mark(this, "StubRoutines", name);
  2192     address start = __ pc();
  2194     const Register count = O2;
  2195     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  2197     if (entry != NULL) {
  2198       *entry = __ pc();
  2199       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  2200       BLOCK_COMMENT("Entry:");
  2203     generate_disjoint_int_copy_core(aligned);
  2205     // O3, O4 are used as temp registers
  2206     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
  2207     __ retl();
  2208     __ delayed()->mov(G0, O0); // return 0
  2209     return start;
  2212   //
  2213   //  Generate core code for conjoint int copy (and oop copy on 32-bit).
  2214   //  If "aligned" is true, the "from" and "to" addresses are assumed
  2215   //  to be heapword aligned.
  2216   //
  2217   // Arguments:
  2218   //      from:  O0
  2219   //      to:    O1
  2220   //      count: O2 treated as signed
  2221   //
  2222   void generate_conjoint_int_copy_core(bool aligned) {
  2223     // Do reverse copy.
  2225     Label L_skip_alignment, L_aligned_copy;
  2226     Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
  2228     const Register from      = O0;   // source array address
  2229     const Register to        = O1;   // destination array address
  2230     const Register count     = O2;   // elements count
  2231     const Register end_from  = from; // source array end address
  2232     const Register end_to    = to;   // destination array end address
  2233     // O3, O4, O5, G3 are used as temp registers
  2235     const Register byte_count = O3;  // bytes count to copy
  2237       __ sllx(count, LogBytesPerInt, byte_count);
  2238       __ add(to, byte_count, end_to); // offset after last copied element
  2240       __ cmp(count, 5); // for short arrays, just do single element copy
  2241       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
  2242       __ delayed()->add(from, byte_count, end_from);
  2244     // copy 1 element to align 'to' on an 8 byte boundary
  2245       __ andcc(end_to, 7, G0);
  2246       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  2247       __ delayed()->nop();
  2248       __ dec(count);
  2249       __ dec(end_from, 4);
  2250       __ dec(end_to,   4);
  2251       __ ld(end_from, 0, O4);
  2252       __ st(O4, end_to, 0);
  2253     __ BIND(L_skip_alignment);
  2255     // Check whether 'end_from' and 'end_to' have the same alignment.
  2256       __ andcc(end_from, 7, G0);
  2257       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  2258       __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
  2260     // copy with shift 4 elements (16 bytes) at a time
  2261     //
  2262     // Load 2 aligned 8-byte chunks and use one from the previous iteration
  2263     // to form 2 aligned 8-byte chunks to store.
  2264     //
  2265       __ ldx(end_from, -4, O3);
  2266       __ align(OptoLoopAlignment);
  2267     __ BIND(L_copy_16_bytes);
  2268       __ ldx(end_from, -12, O4);
  2269       __ deccc(count, 4);
  2270       __ ldx(end_from, -20, O5);
  2271       __ dec(end_to, 16);
  2272       __ dec(end_from, 16);
  2273       __ srlx(O3, 32, O3);
  2274       __ sllx(O4, 32, G3);
  2275       __ bset(G3, O3);
  2276       __ stx(O3, end_to, 8);
  2277       __ srlx(O4, 32, O4);
  2278       __ sllx(O5, 32, G3);
  2279       __ bset(O4, G3);
  2280       __ stx(G3, end_to, 0);
  2281       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
  2282       __ delayed()->mov(O5, O3);
  2284       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
  2285       __ delayed()->inc(count, 4);
  2287     // copy 4 elements (16 bytes) at a time
  2288       __ align(OptoLoopAlignment);
  2289     __ BIND(L_aligned_copy);
  2290       __ dec(end_from, 16);
  2291       __ ldx(end_from, 8, O3);
  2292       __ ldx(end_from, 0, O4);
  2293       __ dec(end_to, 16);
  2294       __ deccc(count, 4);
  2295       __ stx(O3, end_to, 8);
  2296       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
  2297       __ delayed()->stx(O4, end_to, 0);
  2298       __ inc(count, 4);
  2300     // copy 1 element (4 bytes) at a time
  2301     __ BIND(L_copy_4_bytes);
  2302       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  2303     __ BIND(L_copy_4_bytes_loop);
  2304       __ dec(end_from, 4);
  2305       __ dec(end_to, 4);
  2306       __ ld(end_from, 0, O4);
  2307       __ deccc(count);
  2308       __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
  2309       __ delayed()->st(O4, end_to, 0);
  2310     __ BIND(L_exit);
  2313   //
  2314   //  Generate stub for conjoint int copy.  If "aligned" is true, the
  2315   //  "from" and "to" addresses are assumed to be heapword aligned.
  2316   //
  2317   // Arguments for generated stub:
  2318   //      from:  O0
  2319   //      to:    O1
  2320   //      count: O2 treated as signed
  2321   //
  2322   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
  2323                                      address *entry, const char *name) {
  2324     __ align(CodeEntryAlignment);
  2325     StubCodeMark mark(this, "StubRoutines", name);
  2326     address start = __ pc();
  2328     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
  2330     if (entry != NULL) {
  2331       *entry = __ pc();
  2332       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  2333       BLOCK_COMMENT("Entry:");
  2336     array_overlap_test(nooverlap_target, 2);
  2338     generate_conjoint_int_copy_core(aligned);
  2340     // O3, O4 are used as temp registers
  2341     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
  2342     __ retl();
  2343     __ delayed()->mov(G0, O0); // return 0
  2344     return start;
  2347   //
  2348   // Helper methods for generate_disjoint_long_copy_core()
  2349   //
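         // copy_64_bytes_loop() emits the unrolled inner loop: each iteration
         // optionally prefetches ArraycopySrcPrefetchDistance/ArraycopyDstPrefetchDistance
         // bytes ahead and then moves 64 bytes as eight 8-byte words.  Roughly, as a
         // C sketch (prefetch_read/prefetch_write and the distance names are illustrative):
         //
         //   for (; count >= 8; count -= 8, src += 8, dst += 8) {
         //     prefetch_read(src + src_dist);  prefetch_write(dst + dst_dist);
         //     for (int k = 0; k < 8; k++) dst[k] = src[k];
         //   }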
  2350   void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
  2351                           Label& L_loop, bool use_prefetch, bool use_bis) {
  2352     __ align(OptoLoopAlignment);
  2353     __ BIND(L_loop);
  2354     for (int off = 0; off < 64; off += 16) {
  2355       if (use_prefetch && (off & 31) == 0) {
  2356         if (ArraycopySrcPrefetchDistance > 0) {
  2357           __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
  2359         if (ArraycopyDstPrefetchDistance > 0) {
  2360           __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
  2363       __ ldx(from,  off+0, O4);
  2364       __ ldx(from,  off+8, O5);
  2365       if (use_bis) {
  2366         __ stxa(O4, to,  off+0);
  2367         __ stxa(O5, to,  off+8);
  2368       } else {
  2369         __ stx(O4, to,  off+0);
  2370         __ stx(O5, to,  off+8);
  2373     __ deccc(count, 8);
  2374     __ inc(from, 64);
  2375     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  2376     __ delayed()->inc(to, 64);
  2379   //
  2380   //  Generate core code for disjoint long copy (and oop copy on 64-bit).
  2381   //  "aligned" is ignored, because we must make the stronger
  2382   //  assumption that both addresses are always 64-bit aligned.
  2383   //
  2384   // Arguments:
  2385   //      from:  O0
  2386   //      to:    O1
  2387   //      count: O2 treated as signed
  2388   //
  2389   // count -= 2;
  2390   // if ( count >= 0 ) { // >= 2 elements
  2391   //   if ( count >= 6) { // >= 8 elements
  2392   //     count -= 6; // original count - 8
  2393   //     do {
  2394   //       copy_8_elements;
  2395   //       count -= 8;
  2396   //     } while ( count >= 0 );
  2397   //     count += 6;
  2398   //   }
  2399   //   if ( count >= 0 ) { // >= 2 elements
  2400   //     do {
  2401   //       copy_2_elements;
  2402   //     } while ( (count=count-2) >= 0 );
  2403   //   }
  2404   // }
  2405   // count += 2;
  2406   // if ( count != 0 ) { // 1 element left
  2407   //   copy_1_element;
  2408   // }
  2409   //
  2410   void generate_disjoint_long_copy_core(bool aligned) {
  2411     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
  2412     const Register from    = O0;  // source array address
  2413     const Register to      = O1;  // destination array address
  2414     const Register count   = O2;  // elements count
  2415     const Register offset0 = O4;  // element offset
  2416     const Register offset8 = O5;  // next element offset
  2418     __ deccc(count, 2);
  2419     __ mov(G0, offset0);   // offset from start of arrays (0)
  2420     __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
  2421     __ delayed()->add(offset0, 8, offset8);
  2423     // Copy in 64-byte chunks
  2425     const Register from64 = O3;  // source address
  2426     const Register to64   = G3;  // destination address
  2427     __ subcc(count, 6, O3);
  2428     __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
  2429     __ delayed()->mov(to,   to64);
  2430     // Now we can use O4(offset0), O5(offset8) as temps
  2431     __ mov(O3, count);
  2432     // count >= 0 (original count - 8)
  2433     __ mov(from, from64);
  2435     disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop);
  2437       // Restore O4(offset0), O5(offset8)
  2438       __ sub(from64, from, offset0);
  2439       __ inccc(count, 6); // restore count
  2440       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
  2441       __ delayed()->add(offset0, 8, offset8);
  2443       // Copy in 16-byte chunks
  2444       __ align(OptoLoopAlignment);
  2445     __ BIND(L_copy_16_bytes);
  2446       __ ldx(from, offset0, O3);
  2447       __ ldx(from, offset8, G3);
  2448       __ deccc(count, 2);
  2449       __ stx(O3, to, offset0);
  2450       __ inc(offset0, 16);
  2451       __ stx(G3, to, offset8);
  2452       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
  2453       __ delayed()->inc(offset8, 16);
  2455       // Copy last 8 bytes
  2456     __ BIND(L_copy_8_bytes);
  2457       __ inccc(count, 2);
  2458       __ brx(Assembler::zero, true, Assembler::pn, L_exit );
  2459       __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
  2460       __ ldx(from, offset0, O3);
  2461       __ stx(O3, to, offset0);
  2462     __ BIND(L_exit);
  2465   //
  2466   //  Generate stub for disjoint long copy.
  2467   //  "aligned" is ignored, because we must make the stronger
  2468   //  assumption that both addresses are always 64-bit aligned.
  2469   //
  2470   // Arguments for generated stub:
  2471   //      from:  O0
  2472   //      to:    O1
  2473   //      count: O2 treated as signed
  2474   //
  2475   address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
  2476     __ align(CodeEntryAlignment);
  2477     StubCodeMark mark(this, "StubRoutines", name);
  2478     address start = __ pc();
  2480     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
  2482     if (entry != NULL) {
  2483       *entry = __ pc();
  2484       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  2485       BLOCK_COMMENT("Entry:");
  2488     generate_disjoint_long_copy_core(aligned);
  2490     // O3, O4 are used as temp registers
  2491     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
  2492     __ retl();
  2493     __ delayed()->mov(G0, O0); // return 0
  2494     return start;
  2497   //
  2498   //  Generate core code for conjoint long copy (and oop copy on 64-bit).
  2499   //  "aligned" is ignored, because we must make the stronger
  2500   //  assumption that both addresses are always 64-bit aligned.
  2501   //
  2502   // Arguments:
  2503   //      from:  O0
  2504   //      to:    O1
  2505   //      count: O2 treated as signed
  2506   //
  2507   void generate_conjoint_long_copy_core(bool aligned) {
  2508     // Do reverse copy.
  2509     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
  2510     const Register from    = O0;  // source array address
  2511     const Register to      = O1;  // destination array address
  2512     const Register count   = O2;  // elements count
  2513     const Register offset8 = O4;  // element offset
  2514     const Register offset0 = O5;  // previous element offset
  2516       __ subcc(count, 1, count);
  2517       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
  2518       __ delayed()->sllx(count, LogBytesPerLong, offset8);
  2519       __ sub(offset8, 8, offset0);
  2520       __ align(OptoLoopAlignment);
  2521     __ BIND(L_copy_16_bytes);
  2522       __ ldx(from, offset8, O2);
  2523       __ ldx(from, offset0, O3);
  2524       __ stx(O2, to, offset8);
  2525       __ deccc(offset8, 16);      // use offset8 as counter
  2526       __ stx(O3, to, offset0);
  2527       __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
  2528       __ delayed()->dec(offset0, 16);
  2530     __ BIND(L_copy_8_bytes);
  2531       __ brx(Assembler::negative, false, Assembler::pn, L_exit );
  2532       __ delayed()->nop();
  2533       __ ldx(from, 0, O3);
  2534       __ stx(O3, to, 0);
  2535     __ BIND(L_exit);
  2538   //  Generate stub for conjoint long copy.
  2539   //  "aligned" is ignored, because we must make the stronger
  2540   //  assumption that both addresses are always 64-bit aligned.
  2541   //
  2542   // Arguments for generated stub:
  2543   //      from:  O0
  2544   //      to:    O1
  2545   //      count: O2 treated as signed
  2546   //
  2547   address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
  2548                                       address *entry, const char *name) {
  2549     __ align(CodeEntryAlignment);
  2550     StubCodeMark mark(this, "StubRoutines", name);
  2551     address start = __ pc();
  2553     assert(aligned, "Should always be aligned");
  2555     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
  2557     if (entry != NULL) {
  2558       *entry = __ pc();
  2559       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  2560       BLOCK_COMMENT("Entry:");
  2563     array_overlap_test(nooverlap_target, 3);
  2565     generate_conjoint_long_copy_core(aligned);
  2567     // O3, O4 are used as temp registers
  2568     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
  2569     __ retl();
  2570     __ delayed()->mov(G0, O0); // return 0
  2571     return start;
  2574   //  Generate stub for disjoint oop copy.  If "aligned" is true, the
  2575   //  "from" and "to" addresses are assumed to be heapword aligned.
  2576   //
  2577   // Arguments for generated stub:
  2578   //      from:  O0
  2579   //      to:    O1
  2580   //      count: O2 treated as signed
  2581   //
  2582   address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
  2583                                      bool dest_uninitialized = false) {
  2585     const Register from  = O0;  // source array address
  2586     const Register to    = O1;  // destination array address
  2587     const Register count = O2;  // elements count
  2589     __ align(CodeEntryAlignment);
  2590     StubCodeMark mark(this, "StubRoutines", name);
  2591     address start = __ pc();
  2593     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  2595     if (entry != NULL) {
  2596       *entry = __ pc();
  2597       // caller can pass a 64-bit byte count here
  2598       BLOCK_COMMENT("Entry:");
  2601     // save arguments for barrier generation
  2602     __ mov(to, G1);
  2603     __ mov(count, G5);
  2604     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
  2605   #ifdef _LP64
  2606     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  2607     if (UseCompressedOops) {
  2608       generate_disjoint_int_copy_core(aligned);
  2609     } else {
  2610       generate_disjoint_long_copy_core(aligned);
  2612   #else
  2613     generate_disjoint_int_copy_core(aligned);
  2614   #endif
  2615     // O0 is used as temp register
  2616     gen_write_ref_array_post_barrier(G1, G5, O0);
  2618     // O3, O4 are used as temp registers
  2619     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
  2620     __ retl();
  2621     __ delayed()->mov(G0, O0); // return 0
  2622     return start;
  2625   //  Generate stub for conjoint oop copy.  If "aligned" is true, the
  2626   //  "from" and "to" addresses are assumed to be heapword aligned.
  2627   //
  2628   // Arguments for generated stub:
  2629   //      from:  O0
  2630   //      to:    O1
  2631   //      count: O2 treated as signed
  2632   //
  2633   address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
  2634                                      address *entry, const char *name,
  2635                                      bool dest_uninitialized = false) {
  2637     const Register from  = O0;  // source array address
  2638     const Register to    = O1;  // destination array address
  2639     const Register count = O2;  // elements count
  2641     __ align(CodeEntryAlignment);
  2642     StubCodeMark mark(this, "StubRoutines", name);
  2643     address start = __ pc();
  2645     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  2647     if (entry != NULL) {
  2648       *entry = __ pc();
  2649       // caller can pass a 64-bit byte count here
  2650       BLOCK_COMMENT("Entry:");
  2653     array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
  2655     // save arguments for barrier generation
  2656     __ mov(to, G1);
  2657     __ mov(count, G5);
  2658     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
  2660   #ifdef _LP64
  2661     if (UseCompressedOops) {
  2662       generate_conjoint_int_copy_core(aligned);
  2663     } else {
  2664       generate_conjoint_long_copy_core(aligned);
  2666   #else
  2667     generate_conjoint_int_copy_core(aligned);
  2668   #endif
  2670     // O0 is used as temp register
  2671     gen_write_ref_array_post_barrier(G1, G5, O0);
  2673     // O3, O4 are used as temp registers
  2674     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
  2675     __ retl();
  2676     __ delayed()->mov(G0, O0); // return 0
  2677     return start;
  2681   // Helper for generating a dynamic type check.
  2682   // Smashes only the given temp registers.
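         // Conceptually (a sketch only; the real work is done by the
         // check_klass_subtype_fast_path()/check_klass_subtype_slow_path() calls below):
         //
         //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
         //     goto L_success;                      // fast path hit
         //   if (secondary_supers_contain(sub_klass, super_klass))
         //     goto L_success;                      // slow path hit
         //   /* otherwise fall through to L_miss */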
  2683   void generate_type_check(Register sub_klass,
  2684                            Register super_check_offset,
  2685                            Register super_klass,
  2686                            Register temp,
  2687                            Label& L_success) {
  2688     assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
  2690     BLOCK_COMMENT("type_check:");
  2692     Label L_miss, L_pop_to_miss;
  2694     assert_clean_int(super_check_offset, temp);
  2696     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
  2697                                      &L_success, &L_miss, NULL,
  2698                                      super_check_offset);
  2700     BLOCK_COMMENT("type_check_slow_path:");
  2701     __ save_frame(0);
  2702     __ check_klass_subtype_slow_path(sub_klass->after_save(),
  2703                                      super_klass->after_save(),
  2704                                      L0, L1, L2, L4,
  2705                                      NULL, &L_pop_to_miss);
  2706     __ ba(L_success);
  2707     __ delayed()->restore();
  2709     __ bind(L_pop_to_miss);
  2710     __ restore();
  2712     // Fall through on failure!
  2713     __ BIND(L_miss);
  2717   //  Generate stub for checked oop copy.
  2718   //
  2719   // Arguments for generated stub:
  2720   //      from:  O0
  2721   //      to:    O1
  2722   //      count: O2 treated as signed
  2723   //      ckoff: O3 (super_check_offset)
  2724   //      ckval: O4 (super_klass)
  2725   //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
  2726   //
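         // A caller can decode the result roughly as follows (C sketch; the call
         // name is illustrative):
         //
         //   intptr_t r = checkcast_copy(from, to, count, ckoff, ckval);
         //   if (r == 0) { /* all 'count' elements were copied */ }
         //   else        { intptr_t copied = ~r; /* only the first 'copied' elements are in place */ }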
  2727   address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
  2729     const Register O0_from   = O0;      // source array address
  2730     const Register O1_to     = O1;      // destination array address
  2731     const Register O2_count  = O2;      // elements count
  2732     const Register O3_ckoff  = O3;      // super_check_offset
  2733     const Register O4_ckval  = O4;      // super_klass
  2735     const Register O5_offset = O5;      // loop var, with stride wordSize
  2736     const Register G1_remain = G1;      // loop var, with stride -1
  2737     const Register G3_oop    = G3;      // actual oop copied
  2738     const Register G4_klass  = G4;      // oop._klass
  2739     const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
  2741     __ align(CodeEntryAlignment);
  2742     StubCodeMark mark(this, "StubRoutines", name);
  2743     address start = __ pc();
  2745 #ifdef ASSERT
  2746     // We sometimes save a frame (see generate_type_check below).
  2747     // If this will cause trouble, let's fail now instead of later.
  2748     __ save_frame(0);
  2749     __ restore();
  2750 #endif
  2752     assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
  2754 #ifdef ASSERT
  2755     // caller guarantees that the arrays really are different
  2756     // otherwise, we would have to make conjoint checks
  2757     { Label L;
  2758       __ mov(O3, G1);           // spill: overlap test smashes O3
  2759       __ mov(O4, G4);           // spill: overlap test smashes O4
  2760       array_overlap_test(L, LogBytesPerHeapOop);
  2761       __ stop("checkcast_copy within a single array");
  2762       __ bind(L);
  2763       __ mov(G1, O3);
  2764       __ mov(G4, O4);
  2766 #endif //ASSERT
  2768     if (entry != NULL) {
  2769       *entry = __ pc();
  2770       // caller can pass a 64-bit byte count here (from generic stub)
  2771       BLOCK_COMMENT("Entry:");
  2773     gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
  2775     Label load_element, store_element, do_card_marks, fail, done;
  2776     __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
  2777     __ brx(Assembler::notZero, false, Assembler::pt, load_element);
  2778     __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
  2780     // Empty array:  Nothing to do.
  2781     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
  2782     __ retl();
  2783     __ delayed()->set(0, O0);           // return 0 on (trivial) success
  2785     // ======== begin loop ========
  2786     // (Loop is rotated; its entry is load_element.)
  2787     // Loop variables:
  2788     //   (O5 = 0; ; O5 += heapOopSize) --- offset from src, dest arrays
  2789     //   (G1 = len; G1 != 0; G1--) --- number of oops *remaining*
  2790     //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
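           // In C terms the rotated loop is roughly (sketch; subtype_check stands
           // for the generate_type_check() logic below):
           //
           //   while (remain != 0) {
           //     oop o = from[offset];                  // load_element
           //     if (o != NULL && !subtype_check(o))    // nulls always store legally
           //       break;                               // fail: report partial copy
           //     to[offset++] = o;                      // store_element
           //     remain--;
           //   }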
  2791     __ align(OptoLoopAlignment);
  2793     __ BIND(store_element);
  2794     __ deccc(G1_remain);                // decrement the count
  2795     __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
  2796     __ inc(O5_offset, heapOopSize);     // step to next offset
  2797     __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
  2798     __ delayed()->set(0, O0);           // return 0 on success
  2800     // ======== loop entry is here ========
  2801     __ BIND(load_element);
  2802     __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
  2803     __ br_null_short(G3_oop, Assembler::pt, store_element);
  2805     __ load_klass(G3_oop, G4_klass); // query the object klass
  2807     generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
  2808                         // branch to this on success:
  2809                         store_element);
  2810     // ======== end loop ========
  2812     // It was a real error; we must depend on the caller to finish the job.
  2813     // Register G1 has number of *remaining* oops, O2 number of *total* oops.
  2814     // Emit GC store barriers for the oops we have copied (O2 minus G1),
  2815     // and report their number to the caller.
  2816     __ BIND(fail);
  2817     __ subcc(O2_count, G1_remain, O2_count);
  2818     __ brx(Assembler::zero, false, Assembler::pt, done);
  2819     __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
  2821     __ BIND(do_card_marks);
  2822     gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
  2824     __ BIND(done);
  2825     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
  2826     __ retl();
  2827     __ delayed()->nop();             // return value in O0
  2829     return start;
  2833   //  Generate 'unsafe' array copy stub
  2834   //  Though just as safe as the other stubs, it takes an unscaled
  2835   //  size_t argument instead of an element count.
  2836   //
  2837   // Arguments for generated stub:
  2838   //      from:  O0
  2839   //      to:    O1
  2840   //      count: O2 byte count, treated as ssize_t, can be zero
  2841   //
  2842   // Examines the alignment of the operands and dispatches
  2843   // to a long, int, short, or byte copy loop.
  2844   //
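         // Dispatch sketch (C-like; the *_copy names are illustrative, and the real
         // code scales the count in the branch delay slots and tail-calls the chosen stub):
         //
         //   intptr_t bits = (intptr_t)from | (intptr_t)to | (intptr_t)count;
         //   if      ((bits & (BytesPerLong  - 1)) == 0) long_copy (from, to, count >> LogBytesPerLong);
         //   else if ((bits & (BytesPerInt   - 1)) == 0) int_copy  (from, to, count >> LogBytesPerInt);
         //   else if ((bits & (BytesPerShort - 1)) == 0) short_copy(from, to, count >> LogBytesPerShort);
         //   else                                        byte_copy (from, to, count);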
  2845   address generate_unsafe_copy(const char* name,
  2846                                address byte_copy_entry,
  2847                                address short_copy_entry,
  2848                                address int_copy_entry,
  2849                                address long_copy_entry) {
  2851     const Register O0_from   = O0;      // source array address
  2852     const Register O1_to     = O1;      // destination array address
  2853     const Register O2_count  = O2;      // elements count
  2855     const Register G1_bits   = G1;      // test copy of low bits
  2857     __ align(CodeEntryAlignment);
  2858     StubCodeMark mark(this, "StubRoutines", name);
  2859     address start = __ pc();
  2861     // bump this on entry, not on exit:
  2862     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
  2864     __ or3(O0_from, O1_to, G1_bits);
  2865     __ or3(O2_count,       G1_bits, G1_bits);
  2867     __ btst(BytesPerLong-1, G1_bits);
  2868     __ br(Assembler::zero, true, Assembler::pt,
  2869           long_copy_entry, relocInfo::runtime_call_type);
  2870     // scale the count on the way out:
  2871     __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
  2873     __ btst(BytesPerInt-1, G1_bits);
  2874     __ br(Assembler::zero, true, Assembler::pt,
  2875           int_copy_entry, relocInfo::runtime_call_type);
  2876     // scale the count on the way out:
  2877     __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
  2879     __ btst(BytesPerShort-1, G1_bits);
  2880     __ br(Assembler::zero, true, Assembler::pt,
  2881           short_copy_entry, relocInfo::runtime_call_type);
  2882     // scale the count on the way out:
  2883     __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
  2885     __ br(Assembler::always, false, Assembler::pt,
  2886           byte_copy_entry, relocInfo::runtime_call_type);
  2887     __ delayed()->nop();
  2889     return start;
  2893   // Perform range checks on the proposed arraycopy.
  2894   // Kills the two temps, but nothing else.
  2895   // Also, clean the sign bits of src_pos and dst_pos.
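         //
         // Equivalent C-level checks (illustrative only):
         //
         //   if (src_pos + length > arrayOop(src)->length())  goto L_failed;
         //   if (dst_pos + length > arrayOop(dst)->length())  goto L_failed;
         //   src_pos = (jint)src_pos;   // re-sign-extend, clearing the high 32 bits
         //   dst_pos = (jint)dst_pos;
         //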
  2896   void arraycopy_range_checks(Register src,     // source array oop (O0)
  2897                               Register src_pos, // source position (O1)
  2898                               Register dst,     // destination array oop (O2)
  2899                               Register dst_pos, // destination position (O3)
  2900                               Register length,  // length of copy (O4)
  2901                               Register temp1, Register temp2,
  2902                               Label& L_failed) {
  2903     BLOCK_COMMENT("arraycopy_range_checks:");
  2905     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
  2907     const Register array_length = temp1;  // scratch
  2908     const Register end_pos      = temp2;  // scratch
  2910     // Note:  This next instruction may be in the delay slot of a branch:
  2911     __ add(length, src_pos, end_pos);  // src_pos + length
  2912     __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
  2913     __ cmp(end_pos, array_length);
  2914     __ br(Assembler::greater, false, Assembler::pn, L_failed);
  2916     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
  2917     __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
  2918     __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
  2919     __ cmp(end_pos, array_length);
  2920     __ br(Assembler::greater, false, Assembler::pn, L_failed);
  2922     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  2923     // A move with sign extension can be used since they are known to be positive.
  2924     __ delayed()->signx(src_pos, src_pos);
  2925     __ signx(dst_pos, dst_pos);
  2927     BLOCK_COMMENT("arraycopy_range_checks done");
  2931   //
  2932   //  Generate generic array copy stubs
  2933   //
  2934   //  Input:
  2935   //    O0    -  src oop
  2936   //    O1    -  src_pos
  2937   //    O2    -  dst oop
  2938   //    O3    -  dst_pos
  2939   //    O4    -  element count
  2940   //
  2941   //  Output:
  2942   //    O0 ==  0  -  success
  2943   //    O0 == -1  -  need to call System.arraycopy
  2944   //
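         // Caller-side contract, sketched in C (illustrative; the actual call site
         // is emitted by the compilers rather than written like this):
         //
         //   if (generic_arraycopy(src, src_pos, dst, dst_pos, length) != 0) {
         //     // stub refused (-1): fall back to the System.arraycopy slow path,
         //     // which performs the copy and throws any required exception
         //   }
         //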
  2945   address generate_generic_copy(const char *name,
  2946                                 address entry_jbyte_arraycopy,
  2947                                 address entry_jshort_arraycopy,
  2948                                 address entry_jint_arraycopy,
  2949                                 address entry_oop_arraycopy,
  2950                                 address entry_jlong_arraycopy,
  2951                                 address entry_checkcast_arraycopy) {
  2952     Label L_failed, L_objArray;
  2954     // Input registers
  2955     const Register src      = O0;  // source array oop
  2956     const Register src_pos  = O1;  // source position
  2957     const Register dst      = O2;  // destination array oop
  2958     const Register dst_pos  = O3;  // destination position
  2959     const Register length   = O4;  // elements count
  2961     // registers used as temp
  2962     const Register G3_src_klass = G3; // source array klass
  2963     const Register G4_dst_klass = G4; // destination array klass
  2964     const Register G5_lh        = G5; // layout helper
  2965     const Register O5_temp      = O5;
  2967     __ align(CodeEntryAlignment);
  2968     StubCodeMark mark(this, "StubRoutines", name);
  2969     address start = __ pc();
  2971     // bump this on entry, not on exit:
  2972     inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
  2974     // In principle, the int arguments could be dirty.
  2975     //assert_clean_int(src_pos, G1);
  2976     //assert_clean_int(dst_pos, G1);
  2977     //assert_clean_int(length, G1);
  2979     //-----------------------------------------------------------------------
  2980     // Assembler stubs will be used for this call to arraycopy
  2981     // if the following conditions are met:
  2982     //
  2983     // (1) src and dst must not be null.
  2984     // (2) src_pos must not be negative.
  2985     // (3) dst_pos must not be negative.
  2986     // (4) length  must not be negative.
  2987     // (5) src klass and dst klass should be the same and not NULL.
  2988     // (6) src and dst should be arrays.
  2989     // (7) src_pos + length must not exceed length of src.
  2990     // (8) dst_pos + length must not exceed length of dst.
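           //
           // Restated as a C-style predicate (illustrative pseudocode; method
           // names are not the VM's exact API):
           //
           //   if (src == NULL || dst == NULL ||
           //       src_pos < 0 || dst_pos < 0 || length < 0 ||
           //       src->klass() == NULL || !src->klass()->is_array() ||
           //       src_pos + length > src->length() ||
           //       dst_pos + length > dst->length())
           //     return -1;          // bail out to the caller
           //
           //   // plus check (5): for non-objArrays the two klasses must be
           //   // identical; objArray copies with differing klasses go through
           //   // the checkcast stub instead.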
  2991     BLOCK_COMMENT("arraycopy initial argument checks");
  2993     //  if (src == NULL) return -1;
  2994     __ br_null(src, false, Assembler::pn, L_failed);
  2996     //  if (src_pos < 0) return -1;
  2997     __ delayed()->tst(src_pos);
  2998     __ br(Assembler::negative, false, Assembler::pn, L_failed);
  2999     __ delayed()->nop();
  3001     //  if (dst == NULL) return -1;
  3002     __ br_null(dst, false, Assembler::pn, L_failed);
  3004     //  if (dst_pos < 0) return -1;
  3005     __ delayed()->tst(dst_pos);
  3006     __ br(Assembler::negative, false, Assembler::pn, L_failed);
  3008     //  if (length < 0) return -1;
  3009     __ delayed()->tst(length);
  3010     __ br(Assembler::negative, false, Assembler::pn, L_failed);
  3012     BLOCK_COMMENT("arraycopy argument klass checks");
  3013     //  get src->klass()
  3014     if (UseCompressedKlassPointers) {
  3015       __ delayed()->nop(); // delay slot cannot be used: load_klass expands to more than one instruction
  3016       __ load_klass(src, G3_src_klass);
  3017     } else {
  3018       __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
  3021 #ifdef ASSERT
  3022     //  assert(src->klass() != NULL);
  3023     BLOCK_COMMENT("assert klasses not null");
  3024     { Label L_a, L_b;
  3025       __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
  3026       __ bind(L_a);
  3027       __ stop("broken null klass");
  3028       __ bind(L_b);
  3029       __ load_klass(dst, G4_dst_klass);
  3030       __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
  3031       __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
  3032       BLOCK_COMMENT("assert done");
  3034 #endif
  3036     // Load layout helper
  3037     //
  3038     //  |array_tag|     | header_size | element_type |     |log2_element_size|
  3039     // 32        30    24            16              8     2                 0
  3040     //
  3041     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  3042     //
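           // Decoding used below (illustrative; mirrors the Klass::_lh_* constants):
           //
           //   array_tag    =  lh >> Klass::_lh_array_tag_shift;       // 0x3 typeArray, 0x2 objArray
           //   header_size  = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
           //   log2_esize   =  lh & Klass::_lh_log2_element_size_mask;
           //   element_addr =  base + header_size + (index << log2_esize);
           //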
  3044     int lh_offset = in_bytes(Klass::layout_helper_offset());
  3046     // Load the 32-bit signed value. Use the br() instruction with it to check icc.
  3047     __ lduw(G3_src_klass, lh_offset, G5_lh);
  3049     if (UseCompressedKlassPointers) {
  3050       __ load_klass(dst, G4_dst_klass);
  3052     // Handle objArrays completely differently...
  3053     juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  3054     __ set(objArray_lh, O5_temp);
  3055     __ cmp(G5_lh,       O5_temp);
  3056     __ br(Assembler::equal, false, Assembler::pt, L_objArray);
  3057     if (UseCompressedKlassPointers) {
  3058       __ delayed()->nop();
  3059     } else {
  3060       __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
  3063     //  if (src->klass() != dst->klass()) return -1;
  3064     __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
  3066     //  if (!src->is_Array()) return -1;
  3067     __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
  3068     __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
  3070     // At this point, it is known to be a typeArray (array_tag 0x3).
  3071 #ifdef ASSERT
  3072     __ delayed()->nop();
  3073     { Label L;
  3074       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
  3075       __ set(lh_prim_tag_in_place, O5_temp);
  3076       __ cmp(G5_lh,                O5_temp);
  3077       __ br(Assembler::greaterEqual, false, Assembler::pt, L);
  3078       __ delayed()->nop();
  3079       __ stop("must be a primitive array");
  3080       __ bind(L);
  3082 #else
  3083     __ delayed();                               // match next insn to prev branch
  3084 #endif
  3086     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
  3087                            O5_temp, G4_dst_klass, L_failed);
  3089     // TypeArrayKlass
  3090     //
  3091     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  3092     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  3093     //
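           // Worked example (with hypothetical values): if header_size is 16 bytes
           // and log2elemsize is 2 (4-byte elements), the element at src_pos == 5
           // starts at  src + 16 + (5 << 2)  ==  src + 36.
           //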
  3095     const Register G4_offset = G4_dst_klass;    // array offset
  3096     const Register G3_elsize = G3_src_klass;    // log2 element size
  3098     __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
  3099     __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
  3100     __ add(src, G4_offset, src);       // src array offset
  3101     __ add(dst, G4_offset, dst);       // dst array offset
  3102     __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
  3104     // The next registers should be set before the jump to the corresponding stub.
  3105     const Register from     = O0;  // source array address
  3106     const Register to       = O1;  // destination array address
  3107     const Register count    = O2;  // elements count
  3109     // 'from', 'to', 'count' registers should be set in this order
  3110     // since they are the same as 'src', 'src_pos', 'dst'.
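           //
           // Concretely (illustrative): 'from' aliases src (O0), 'to' aliases
           // src_pos (O1) and 'count' aliases dst (O2), so each output may only be
           // written once the input sharing its register has been consumed:
           //
           //   from  = src + (src_pos << elsize);  // reads src, then overwrites O0
           //   to    = dst + (dst_pos << elsize);  // overwrites O1 (src_pos already used)
           //   count = (jint)length;               // overwrites O2 (dst already used)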
  3112     BLOCK_COMMENT("scale indexes to element size");
  3113     __ sll_ptr(src_pos, G3_elsize, src_pos);
  3114     __ sll_ptr(dst_pos, G3_elsize, dst_pos);
  3115     __ add(src, src_pos, from);       // src_addr
  3116     __ add(dst, dst_pos, to);         // dst_addr
  3118     BLOCK_COMMENT("choose copy loop based on element size");
  3119     __ cmp(G3_elsize, 0);
  3120     __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
  3121     __ delayed()->signx(length, count); // length
  3123     __ cmp(G3_elsize, LogBytesPerShort);
  3124     __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
  3125     __ delayed()->signx(length, count); // length
  3127     __ cmp(G3_elsize, LogBytesPerInt);
  3128     __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
  3129     __ delayed()->signx(length, count); // length
  3130 #ifdef ASSERT
  3131     { Label L;
  3132       __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
  3133       __ stop("must be long copy, but elsize is wrong");
  3134       __ bind(L);
  3136 #endif
  3137     __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
  3138     __ delayed()->signx(length, count); // length
  3140     // ObjArrayKlass
  3141   __ BIND(L_objArray);
  3142     // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
  3144     Label L_plain_copy, L_checkcast_copy;
  3145     //  test array classes for subtyping
  3146     __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
  3147     __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
  3148     __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
  3150     // Identically typed arrays can be copied without element-wise checks.
  3151     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
  3152                            O5_temp, G5_lh, L_failed);
  3154     __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
  3155     __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
  3156     __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
  3157     __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
  3158     __ add(src, src_pos, from);       // src_addr
  3159     __ add(dst, dst_pos, to);         // dst_addr
  3160   __ BIND(L_plain_copy);
  3161     __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
  3162     __ delayed()->signx(length, count); // length
  3164   __ BIND(L_checkcast_copy);
  3165     // live at this point:  G3_src_klass, G4_dst_klass
  3167       // Before looking at dst.length, make sure dst is also an objArray.
  3168       // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
  3169       __ cmp(G5_lh,                    O5_temp);
  3170       __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
  3172       // It is safe to examine both src.length and dst.length.
  3173       __ delayed();                             // match next insn to prev branch
  3174       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
  3175                              O5_temp, G5_lh, L_failed);
  3177       // Marshal the base address arguments now, freeing registers.
  3178       __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
  3179       __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
  3180       __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
  3181       __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
  3182       __ add(src, src_pos, from);               // src_addr
  3183       __ add(dst, dst_pos, to);                 // dst_addr
  3184       __ signx(length, count);                  // length (reloaded)
  3186       Register sco_temp = O3;                   // this register is free now
  3187       assert_different_registers(from, to, count, sco_temp,
  3188                                  G4_dst_klass, G3_src_klass);
  3190       // Generate the type check.
  3191       int sco_offset = in_bytes(Klass::super_check_offset_offset());
  3192       __ lduw(G4_dst_klass, sco_offset, sco_temp);
  3193       generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
  3194                           O5_temp, L_plain_copy);
  3196       // Fetch destination element klass from the ObjArrayKlass header.
  3197       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
  3199       // the checkcast_copy loop needs two extra arguments:
  3200       __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
  3201       // lduw(O4, sco_offset, O3);              // sco of elem klass
  3203       __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
  3204       __ delayed()->lduw(O4, sco_offset, O3);
  3207   __ BIND(L_failed);
  3208     __ retl();
  3209     __ delayed()->sub(G0, 1, O0); // return -1
  3210     return start;
  3213   //
  3214   //  Generate stub for heap zeroing.
  3215   //  "to" address is aligned to jlong (8 bytes).
  3216   //
  3217   // Arguments for generated stub:
  3218   //      to:    O0
  3219   //      count: O1 treated as signed (count of HeapWords)
  3220   //             count could be 0
  3221   //
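         // Semantically the stub is equivalent to (illustrative only):
         //
         //   memset(to, 0, count * HeapWordSize);   // 'to' is 8-byte aligned, count may be 0
         //
         // except that it zeroes through MacroAssembler::bis_zeroing, i.e. with SPARC
         // block-initializing stores (BIS), which avoid first reading the zeroed
         // cache lines from memory.
         //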
  3222   address generate_zero_aligned_words(const char* name) {
  3223     __ align(CodeEntryAlignment);
  3224     StubCodeMark mark(this, "StubRoutines", name);
  3225     address start = __ pc();
  3227     const Register to    = O0;   // address of the memory to zero (destination)
  3228     const Register count = O1;   // HeapWords count
  3229     const Register temp  = O2;   // scratch
  3231     Label Ldone;
  3232     __ sllx(count, LogHeapWordSize, count); // to bytes count
  3233     // Use BIS for zeroing
  3234     __ bis_zeroing(to, count, temp, Ldone);
  3235     __ bind(Ldone);
  3236     __ retl();
  3237     __ delayed()->nop();
  3238     return start;
  3241   void generate_arraycopy_stubs() {
  3242     address entry;
  3243     address entry_jbyte_arraycopy;
  3244     address entry_jshort_arraycopy;
  3245     address entry_jint_arraycopy;
  3246     address entry_oop_arraycopy;
  3247     address entry_jlong_arraycopy;
  3248     address entry_checkcast_arraycopy;
  3250     //*** jbyte
  3251     // Always need aligned and unaligned versions
  3252     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
  3253                                                                                   "jbyte_disjoint_arraycopy");
  3254     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
  3255                                                                                   &entry_jbyte_arraycopy,
  3256                                                                                   "jbyte_arraycopy");
  3257     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
  3258                                                                                   "arrayof_jbyte_disjoint_arraycopy");
  3259     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
  3260                                                                                   "arrayof_jbyte_arraycopy");
  3262     //*** jshort
  3263     // Always need aligned and unaligned versions
  3264     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
  3265                                                                                     "jshort_disjoint_arraycopy");
  3266     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
  3267                                                                                     &entry_jshort_arraycopy,
  3268                                                                                     "jshort_arraycopy");
  3269     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
  3270                                                                                     "arrayof_jshort_disjoint_arraycopy");
  3271     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
  3272                                                                                     "arrayof_jshort_arraycopy");
  3274     //*** jint
  3275     // Aligned versions
  3276     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
  3277                                                                                 "arrayof_jint_disjoint_arraycopy");
  3278     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
  3279                                                                                 "arrayof_jint_arraycopy");
  3280 #ifdef _LP64
  3281     // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
  3282     // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
  3283     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
  3284                                                                                 "jint_disjoint_arraycopy");
  3285     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
  3286                                                                                 &entry_jint_arraycopy,
  3287                                                                                 "jint_arraycopy");
  3288 #else
  3289     // On 32-bit, jints are always HeapWordSize aligned, so always use the aligned version
  3290     // (in fact, on 32-bit we always have a pre-loop part even in the aligned version,
  3291     //  because it uses 64-bit loads/stores, so the aligned flag is actually ignored).
  3292     StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy;
  3293     StubRoutines::_jint_arraycopy          = StubRoutines::_arrayof_jint_arraycopy;
  3294 #endif
  3297     //*** jlong
  3298     // It is always aligned
  3299     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
  3300                                                                                   "arrayof_jlong_disjoint_arraycopy");
  3301     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
  3302                                                                                   "arrayof_jlong_arraycopy");
  3303     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
  3304     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
  3307     //*** oops
  3308     // Aligned versions
  3309     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
  3310                                                                                       "arrayof_oop_disjoint_arraycopy");
  3311     StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
  3312                                                                                       "arrayof_oop_arraycopy");
  3313     // Aligned versions without pre-barriers
  3314     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
  3315                                                                                       "arrayof_oop_disjoint_arraycopy_uninit",
  3316                                                                                       /*dest_uninitialized*/true);
  3317     StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
  3318                                                                                       "arrayof_oop_arraycopy_uninit",
  3319                                                                                       /*dest_uninitialized*/true);
  3320 #ifdef _LP64
  3321     if (UseCompressedOops) {
  3322       // With compressed oops we need unaligned versions; note that we overwrite entry_oop_arraycopy.
  3323       StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
  3324                                                                                     "oop_disjoint_arraycopy");
  3325       StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
  3326                                                                                     "oop_arraycopy");
  3327       // Unaligned versions without pre-barriers
  3328       StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
  3329                                                                                     "oop_disjoint_arraycopy_uninit",
  3330                                                                                     /*dest_uninitialized*/true);
  3331       StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
  3332                                                                                     "oop_arraycopy_uninit",
  3333                                                                                     /*dest_uninitialized*/true);
  3334     } else
  3335 #endif
  3337       // oop arraycopy is always aligned on 32-bit and 64-bit without compressed oops
  3338       StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
  3339       StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
  3340       StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
  3341       StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
  3344     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
  3345     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
  3346                                                                         /*dest_uninitialized*/true);
  3348     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
  3349                                                               entry_jbyte_arraycopy,
  3350                                                               entry_jshort_arraycopy,
  3351                                                               entry_jint_arraycopy,
  3352                                                               entry_jlong_arraycopy);
  3353     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
  3354                                                                entry_jbyte_arraycopy,
  3355                                                                entry_jshort_arraycopy,
  3356                                                                entry_jint_arraycopy,
  3357                                                                entry_oop_arraycopy,
  3358                                                                entry_jlong_arraycopy,
  3359                                                                entry_checkcast_arraycopy);
  3361     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
  3362     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
  3363     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
  3364     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
  3365     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  3366     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
  3368     if (UseBlockZeroing) {
  3369       StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
  3373   void generate_initial() {
  3374     // Generates all stubs and initializes the entry points
  3376     //------------------------------------------------------------------------------------------------------------------------
  3377     // entry points that exist in all platforms
  3378     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
  3379     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
  3380     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
  3382     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
  3383     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
  3385     //------------------------------------------------------------------------------------------------------------------------
  3386     // entry points that are platform specific
  3387     StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
  3389     StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
  3390     StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
  3392 #if !defined(COMPILER2) && !defined(_LP64)
  3393     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
  3394     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
  3395     StubRoutines::_atomic_add_entry          = generate_atomic_add();
  3396     StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
  3397     StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
  3398     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  3399     StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
  3400 #endif  // !COMPILER2 && !_LP64
  3402     // Build this early so it's available for the interpreter.
  3403     StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
  3407   void generate_all() {
  3408     // Generates all stubs and initializes the entry points
  3410     // Generate partial_subtype_check first here since its code depends on
  3411     // UseZeroBaseCompressedOops which is defined after heap initialization.
  3412     StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
  3413     // These entry points require SharedInfo::stack0 to be set up in non-core builds
  3414     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
  3415     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
  3416     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
  3418     StubRoutines::_handler_for_unsafe_access_entry =
  3419       generate_handler_for_unsafe_access();
  3421     // support for verify_oop (must happen after universe_init)
  3422     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
  3424     // arraycopy stubs used by compilers
  3425     generate_arraycopy_stubs();
  3427     // Don't initialize the platform math functions since sparc
  3428     // doesn't have intrinsics for these operations.
  3432  public:
  3433   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  3434     // replace the standard masm with a special one:
  3435     _masm = new MacroAssembler(code);
  3437     _stub_count = !all ? 0x100 : 0x200;
  3438     if (all) {
  3439       generate_all();
  3440     } else {
  3441       generate_initial();
  3444     // make sure this stub is available for all local calls
  3445     if (_atomic_add_stub.is_unbound()) {
  3446       // generate a second time, if necessary
  3447       (void) generate_atomic_add();
  3452  private:
  3453   int _stub_count;
  3454   void stub_prolog(StubCodeDesc* cdesc) {
  3455     # ifdef ASSERT
  3456       // put extra information in the stub code, to make it more readable
  3457 #ifdef _LP64
  3458 // Write the high part of the address
  3459 // [RGV] Check if there is a dependency on the size of this prolog
  3460       __ emit_data((intptr_t)cdesc >> 32,    relocInfo::none);
  3461 #endif
  3462       __ emit_data((intptr_t)cdesc,    relocInfo::none);
  3463       __ emit_data(++_stub_count, relocInfo::none);
  3464     # endif
  3465     align(true);
  3468   void align(bool at_header = false) {
  3469     // %%%%% move this constant somewhere else
  3470     // UltraSPARC cache line size is 8 instructions:
  3471     const unsigned int icache_line_size = 32;
  3472     const unsigned int icache_half_line_size = 16;
  3474     if (at_header) {
  3475       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
  3476         __ emit_data(0, relocInfo::none);
  3478     } else {
  3479       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
  3480         __ nop();
  3485 }; // end class declaration
  3487 void StubGenerator_generate(CodeBuffer* code, bool all) {
  3488   StubGenerator g(code, all);
