src/cpu/sparc/vm/stubGenerator_sparc.cpp

changeset 0:f90c822e73f8
child 6876:710a3c8b516e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Wed Apr 27 01:25:04 2016 +0800
     1.3 @@ -0,0 +1,4710 @@
     1.4 +/*
     1.5 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
     1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.7 + *
     1.8 + * This code is free software; you can redistribute it and/or modify it
     1.9 + * under the terms of the GNU General Public License version 2 only, as
    1.10 + * published by the Free Software Foundation.
    1.11 + *
    1.12 + * This code is distributed in the hope that it will be useful, but WITHOUT
    1.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    1.14 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    1.15 + * version 2 for more details (a copy is included in the LICENSE file that
    1.16 + * accompanied this code).
    1.17 + *
    1.18 + * You should have received a copy of the GNU General Public License version
    1.19 + * 2 along with this work; if not, write to the Free Software Foundation,
    1.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    1.21 + *
    1.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    1.23 + * or visit www.oracle.com if you need additional information or have any
    1.24 + * questions.
    1.25 + *
    1.26 + */
    1.27 +
    1.28 +#include "precompiled.hpp"
    1.29 +#include "asm/macroAssembler.inline.hpp"
    1.30 +#include "interpreter/interpreter.hpp"
    1.31 +#include "nativeInst_sparc.hpp"
    1.32 +#include "oops/instanceOop.hpp"
    1.33 +#include "oops/method.hpp"
    1.34 +#include "oops/objArrayKlass.hpp"
    1.35 +#include "oops/oop.inline.hpp"
    1.36 +#include "prims/methodHandles.hpp"
    1.37 +#include "runtime/frame.inline.hpp"
    1.38 +#include "runtime/handles.inline.hpp"
    1.39 +#include "runtime/sharedRuntime.hpp"
    1.40 +#include "runtime/stubCodeGenerator.hpp"
    1.41 +#include "runtime/stubRoutines.hpp"
    1.42 +#include "runtime/thread.inline.hpp"
    1.43 +#include "utilities/top.hpp"
    1.44 +#ifdef COMPILER2
    1.45 +#include "opto/runtime.hpp"
    1.46 +#endif
    1.47 +
    1.48 +// Declaration and definition of StubGenerator (no .hpp file).
    1.49 +// For a more detailed description of the stub routine structure
    1.50 +// see the comment in stubRoutines.hpp.
    1.51 +
    1.52 +#define __ _masm->
    1.53 +
    1.54 +#ifdef PRODUCT
    1.55 +#define BLOCK_COMMENT(str) /* nothing */
    1.56 +#else
    1.57 +#define BLOCK_COMMENT(str) __ block_comment(str)
    1.58 +#endif
    1.59 +
    1.60 +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
    1.61 +
    1.62 +// Note:  The register L7 is used as L7_thread_cache, and may not be used
    1.63 +//        any other way within this module.
    1.64 +
    1.65 +
    1.66 +static const Register& Lstub_temp = L2;
    1.67 +
    1.68 +// -------------------------------------------------------------------------------------------------------------------------
    1.69 +// Stub Code definitions
    1.70 +
    1.71 +static address handle_unsafe_access() {
    1.72 +  JavaThread* thread = JavaThread::current();
    1.73 +  address pc  = thread->saved_exception_pc();
    1.74 +  address npc = thread->saved_exception_npc();
    1.75 +  // pc is the instruction which we must emulate
    1.76 +  // doing a no-op is fine:  return garbage from the load
    1.77 +
    1.78 +  // request an async exception
    1.79 +  thread->set_pending_unsafe_access_error();
    1.80 +
    1.81 +  // return address of next instruction to execute
    1.82 +  return npc;
    1.83 +}
    1.84 +
    1.85 +class StubGenerator: public StubCodeGenerator {
    1.86 + private:
    1.87 +
    1.88 +#ifdef PRODUCT
    1.89 +#define inc_counter_np(a,b,c)
    1.90 +#else
    1.91 +#define inc_counter_np(counter, t1, t2) \
    1.92 +  BLOCK_COMMENT("inc_counter " #counter); \
    1.93 +  __ inc_counter(&counter, t1, t2);
    1.94 +#endif
    1.95 +
    1.96 +  //----------------------------------------------------------------------------------------------------
    1.97 +  // Call stubs are used to call Java from C
    1.98 +
    1.99 +  address generate_call_stub(address& return_pc) {
   1.100 +    StubCodeMark mark(this, "StubRoutines", "call_stub");
   1.101 +    address start = __ pc();
   1.102 +
   1.103 +    // Incoming arguments:
   1.104 +    //
   1.105 +    // o0         : call wrapper address
   1.106 +    // o1         : result (address)
   1.107 +    // o2         : result type
   1.108 +    // o3         : method
   1.109 +    // o4         : (interpreter) entry point
   1.110 +    // o5         : parameters (address)
   1.111 +    // [sp + 0x5c]: parameter size (in words)
   1.112 +    // [sp + 0x60]: thread
   1.113 +    //
   1.114 +    // +---------------+ <--- sp + 0
   1.115 +    // |               |
   1.116 +    // . reg save area .
   1.117 +    // |               |
   1.118 +    // +---------------+ <--- sp + 0x40
   1.119 +    // |               |
   1.120 +    // . extra 7 slots .
   1.121 +    // |               |
   1.122 +    // +---------------+ <--- sp + 0x5c
   1.123 +    // |  param. size  |
   1.124 +    // +---------------+ <--- sp + 0x60
   1.125 +    // |    thread     |
   1.126 +    // +---------------+
   1.127 +    // |               |
   1.128 +
   1.129 +    // note: if the link argument position changes, adjust
   1.130 +    //       the code in frame::entry_frame_call_wrapper()
   1.131 +
   1.132 +    const Argument link           = Argument(0, false); // used only for GC
   1.133 +    const Argument result         = Argument(1, false);
   1.134 +    const Argument result_type    = Argument(2, false);
   1.135 +    const Argument method         = Argument(3, false);
   1.136 +    const Argument entry_point    = Argument(4, false);
   1.137 +    const Argument parameters     = Argument(5, false);
   1.138 +    const Argument parameter_size = Argument(6, false);
   1.139 +    const Argument thread         = Argument(7, false);
   1.140 +
   1.141 +    // setup thread register
   1.142 +    __ ld_ptr(thread.as_address(), G2_thread);
   1.143 +    __ reinit_heapbase();
   1.144 +
   1.145 +#ifdef ASSERT
   1.146 +    // make sure we have no pending exceptions
   1.147 +    { const Register t = G3_scratch;
   1.148 +      Label L;
   1.149 +      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
   1.150 +      __ br_null_short(t, Assembler::pt, L);
   1.151 +      __ stop("StubRoutines::call_stub: entered with pending exception");
   1.152 +      __ bind(L);
   1.153 +    }
   1.154 +#endif
   1.155 +
   1.156 +    // create activation frame & allocate space for parameters
   1.157 +    { const Register t = G3_scratch;
   1.158 +      __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
   1.159 +      __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
   1.160 +      __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
   1.161 +      __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
   1.162 +      __ neg(t);                                                // negate so it can be used with save
   1.163 +      __ save(SP, t, SP);                                       // setup new frame
   1.164 +    }
   1.165 +
   1.166 +    // +---------------+ <--- sp + 0
   1.167 +    // |               |
   1.168 +    // . reg save area .
   1.169 +    // |               |
   1.170 +    // +---------------+ <--- sp + 0x40
   1.171 +    // |               |
   1.172 +    // . extra 7 slots .
   1.173 +    // |               |
   1.174 +    // +---------------+ <--- sp + 0x5c
   1.175 +    // |  empty slot   |      (only if parameter size is even)
   1.176 +    // +---------------+
   1.177 +    // |               |
   1.178 +    // .  parameters   .
   1.179 +    // |               |
   1.180 +    // +---------------+ <--- fp + 0
   1.181 +    // |               |
   1.182 +    // . reg save area .
   1.183 +    // |               |
   1.184 +    // +---------------+ <--- fp + 0x40
   1.185 +    // |               |
   1.186 +    // . extra 7 slots .
   1.187 +    // |               |
   1.188 +    // +---------------+ <--- fp + 0x5c
   1.189 +    // |  param. size  |
   1.190 +    // +---------------+ <--- fp + 0x60
   1.191 +    // |    thread     |
   1.192 +    // +---------------+
   1.193 +    // |               |
   1.194 +
   1.195 +    // pass parameters if any
   1.196 +    BLOCK_COMMENT("pass parameters if any");
   1.197 +    { const Register src = parameters.as_in().as_register();
   1.198 +      const Register dst = Lentry_args;
   1.199 +      const Register tmp = G3_scratch;
   1.200 +      const Register cnt = G4_scratch;
   1.201 +
   1.202 +      // test if any parameters & setup of Lentry_args
   1.203 +      Label exit;
   1.204 +      __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
   1.205 +      __ add( FP, STACK_BIAS, dst );
   1.206 +      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
   1.207 +      __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
   1.208 +
   1.209 +      // copy parameters if any
   1.210 +      Label loop;
   1.211 +      __ BIND(loop);
   1.212 +      // Store parameter value
   1.213 +      __ ld_ptr(src, 0, tmp);
   1.214 +      __ add(src, BytesPerWord, src);
   1.215 +      __ st_ptr(tmp, dst, 0);
   1.216 +      __ deccc(cnt);
   1.217 +      __ br(Assembler::greater, false, Assembler::pt, loop);
   1.218 +      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
   1.219 +
   1.220 +      // done
   1.221 +      __ BIND(exit);
   1.222 +    }
   1.223 +
   1.224 +    // setup parameters, method & call Java function
   1.225 +#ifdef ASSERT
    1.226 +    // layout_activation_impl checks its notion of saved SP against
   1.227 +    // this register, so if this changes update it as well.
   1.228 +    const Register saved_SP = Lscratch;
   1.229 +    __ mov(SP, saved_SP);                               // keep track of SP before call
   1.230 +#endif
   1.231 +
   1.232 +    // setup parameters
   1.233 +    const Register t = G3_scratch;
   1.234 +    __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
   1.235 +    __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
   1.236 +    __ sub(FP, t, Gargs);                              // setup parameter pointer
   1.237 +#ifdef _LP64
   1.238 +    __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
   1.239 +#endif
   1.240 +    __ mov(SP, O5_savedSP);
   1.241 +
   1.242 +
   1.243 +    // do the call
   1.244 +    //
    1.245 +    // the following registers must be set up:
   1.246 +    //
   1.247 +    // G2_thread
   1.248 +    // G5_method
   1.249 +    // Gargs
   1.250 +    BLOCK_COMMENT("call Java function");
   1.251 +    __ jmpl(entry_point.as_in().as_register(), G0, O7);
   1.252 +    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
   1.253 +
   1.254 +    BLOCK_COMMENT("call_stub_return_address:");
   1.255 +    return_pc = __ pc();
   1.256 +
    1.257 +    // The callee, if it wasn't interpreted, can return with SP changed, so
    1.258 +    // we can no longer assert that SP is unchanged here.
   1.259 +
   1.260 +    // store result depending on type
   1.261 +    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
   1.262 +    //  is treated as T_INT)
   1.263 +    { const Register addr = result     .as_in().as_register();
   1.264 +      const Register type = result_type.as_in().as_register();
   1.265 +      Label is_long, is_float, is_double, is_object, exit;
   1.266 +      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
   1.267 +      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
   1.268 +      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
   1.269 +      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
   1.270 +      __ delayed()->nop();
   1.271 +
   1.272 +      // store int result
   1.273 +      __ st(O0, addr, G0);
   1.274 +
   1.275 +      __ BIND(exit);
   1.276 +      __ ret();
   1.277 +      __ delayed()->restore();
   1.278 +
   1.279 +      __ BIND(is_object);
   1.280 +      __ ba(exit);
   1.281 +      __ delayed()->st_ptr(O0, addr, G0);
   1.282 +
   1.283 +      __ BIND(is_float);
   1.284 +      __ ba(exit);
   1.285 +      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
   1.286 +
   1.287 +      __ BIND(is_double);
   1.288 +      __ ba(exit);
   1.289 +      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
   1.290 +
   1.291 +      __ BIND(is_long);
   1.292 +#ifdef _LP64
   1.293 +      __ ba(exit);
   1.294 +      __ delayed()->st_long(O0, addr, G0);      // store entire long
   1.295 +#else
   1.296 +#if defined(COMPILER2)
   1.297 +  // All return values are where we want them, except for Longs.  C2 returns
   1.298 +  // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
    1.299 +  // Since the interpreter will return longs in both G1 and O0/O1 in the 32-bit
    1.300 +  // build, we simply always use G1.
    1.301 +  // Note: I tried to make C2 return longs in O0/O1 and G1 so we wouldn't have to
    1.302 +  // do this here. Unfortunately, if we did a rethrow we'd see a MachEpilog node
    1.303 +  // first, which would move G1 -> O0/O1 and destroy the exception we were throwing.
   1.304 +
   1.305 +      __ ba(exit);
   1.306 +      __ delayed()->stx(G1, addr, G0);  // store entire long
   1.307 +#else
   1.308 +      __ st(O1, addr, BytesPerInt);
   1.309 +      __ ba(exit);
   1.310 +      __ delayed()->st(O0, addr, G0);
   1.311 +#endif /* COMPILER2 */
   1.312 +#endif /* _LP64 */
   1.313 +     }
   1.314 +     return start;
   1.315 +  }
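
  // A C-level sketch of the signature the call stub is invoked with, derived from
  // the incoming-argument list documented above (illustration only; the
  // authoritative typedef lives in stubRoutines.hpp, and the name below is
  // hypothetical):
  typedef void (*CallStubSketch)(
      address   link,            // o0: call wrapper address (used only for GC)
      intptr_t* result,          // o1: where the Java result is stored
      BasicType result_type,     // o2
      Method*   method,          // o3
      address   entry_point,     // o4: (interpreter) entry point
      intptr_t* parameters,      // o5
      int       parameter_size,  // [sp + 0x5c], in words
      Thread*   thread);         // [sp + 0x60]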
   1.316 +
   1.317 +
   1.318 +  //----------------------------------------------------------------------------------------------------
   1.319 +  // Return point for a Java call if there's an exception thrown in Java code.
   1.320 +  // The exception is caught and transformed into a pending exception stored in
   1.321 +  // JavaThread that can be tested from within the VM.
   1.322 +  //
   1.323 +  // Oexception: exception oop
   1.324 +
   1.325 +  address generate_catch_exception() {
   1.326 +    StubCodeMark mark(this, "StubRoutines", "catch_exception");
   1.327 +
   1.328 +    address start = __ pc();
   1.329 +    // verify that thread corresponds
   1.330 +    __ verify_thread();
   1.331 +
   1.332 +    const Register& temp_reg = Gtemp;
   1.333 +    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
   1.334 +    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
   1.335 +    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());
   1.336 +
   1.337 +    // set pending exception
   1.338 +    __ verify_oop(Oexception);
   1.339 +    __ st_ptr(Oexception, pending_exception_addr);
   1.340 +    __ set((intptr_t)__FILE__, temp_reg);
   1.341 +    __ st_ptr(temp_reg, exception_file_offset_addr);
   1.342 +    __ set((intptr_t)__LINE__, temp_reg);
   1.343 +    __ st(temp_reg, exception_line_offset_addr);
   1.344 +
   1.345 +    // complete return to VM
   1.346 +    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
   1.347 +
   1.348 +    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
   1.349 +    __ jump_to(stub_ret, temp_reg);
   1.350 +    __ delayed()->nop();
   1.351 +
   1.352 +    return start;
   1.353 +  }
   1.354 +
   1.355 +
   1.356 +  //----------------------------------------------------------------------------------------------------
   1.357 +  // Continuation point for runtime calls returning with a pending exception
   1.358 +  // The pending exception check happened in the runtime or native call stub
   1.359 +  // The pending exception in Thread is converted into a Java-level exception
   1.360 +  //
   1.361 +  // Contract with Java-level exception handler: O0 = exception
   1.362 +  //                                             O1 = throwing pc
   1.363 +
   1.364 +  address generate_forward_exception() {
   1.365 +    StubCodeMark mark(this, "StubRoutines", "forward_exception");
   1.366 +    address start = __ pc();
   1.367 +
   1.368 +    // Upon entry, O7 has the return address returning into Java
   1.369 +    // (interpreted or compiled) code; i.e. the return address
   1.370 +    // becomes the throwing pc.
   1.371 +
   1.372 +    const Register& handler_reg = Gtemp;
   1.373 +
   1.374 +    Address exception_addr(G2_thread, Thread::pending_exception_offset());
   1.375 +
   1.376 +#ifdef ASSERT
   1.377 +    // make sure that this code is only executed if there is a pending exception
   1.378 +    { Label L;
   1.379 +      __ ld_ptr(exception_addr, Gtemp);
   1.380 +      __ br_notnull_short(Gtemp, Assembler::pt, L);
   1.381 +      __ stop("StubRoutines::forward exception: no pending exception (1)");
   1.382 +      __ bind(L);
   1.383 +    }
   1.384 +#endif
   1.385 +
   1.386 +    // compute exception handler into handler_reg
   1.387 +    __ get_thread();
   1.388 +    __ ld_ptr(exception_addr, Oexception);
   1.389 +    __ verify_oop(Oexception);
   1.390 +    __ save_frame(0);             // compensates for compiler weakness
   1.391 +    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
   1.392 +    BLOCK_COMMENT("call exception_handler_for_return_address");
   1.393 +    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
   1.394 +    __ mov(O0, handler_reg);
   1.395 +    __ restore();                 // compensates for compiler weakness
   1.396 +
   1.397 +    __ ld_ptr(exception_addr, Oexception);
   1.398 +    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
   1.399 +
   1.400 +#ifdef ASSERT
   1.401 +    // make sure exception is set
   1.402 +    { Label L;
   1.403 +      __ br_notnull_short(Oexception, Assembler::pt, L);
   1.404 +      __ stop("StubRoutines::forward exception: no pending exception (2)");
   1.405 +      __ bind(L);
   1.406 +    }
   1.407 +#endif
   1.408 +    // jump to exception handler
   1.409 +    __ jmp(handler_reg, 0);
   1.410 +    // clear pending exception
   1.411 +    __ delayed()->st_ptr(G0, exception_addr);
   1.412 +
   1.413 +    return start;
   1.414 +  }
   1.415 +
   1.416 +  // Safefetch stubs.
   1.417 +  void generate_safefetch(const char* name, int size, address* entry,
   1.418 +                          address* fault_pc, address* continuation_pc) {
   1.419 +    // safefetch signatures:
   1.420 +    //   int      SafeFetch32(int*      adr, int      errValue);
   1.421 +    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
   1.422 +    //
   1.423 +    // arguments:
   1.424 +    //   o0 = adr
   1.425 +    //   o1 = errValue
   1.426 +    //
   1.427 +    // result:
   1.428 +    //   o0  = *adr or errValue
   1.429 +
   1.430 +    StubCodeMark mark(this, "StubRoutines", name);
   1.431 +
   1.432 +    // Entry point, pc or function descriptor.
   1.433 +    __ align(CodeEntryAlignment);
   1.434 +    *entry = __ pc();
   1.435 +
   1.436 +    __ mov(O0, G1);  // g1 = o0
   1.437 +    __ mov(O1, O0);  // o0 = o1
    1.438 +    // Load *adr into O0; this load may fault.
   1.439 +    *fault_pc = __ pc();
   1.440 +    switch (size) {
   1.441 +      case 4:
   1.442 +        // int32_t
   1.443 +        __ ldsw(G1, 0, O0);  // o0 = [g1]
   1.444 +        break;
   1.445 +      case 8:
   1.446 +        // int64_t
   1.447 +        __ ldx(G1, 0, O0);   // o0 = [g1]
   1.448 +        break;
   1.449 +      default:
   1.450 +        ShouldNotReachHere();
   1.451 +    }
   1.452 +
   1.453 +    // return errValue or *adr
   1.454 +    *continuation_pc = __ pc();
   1.455 +    // By convention with the trap handler we ensure there is a non-CTI
   1.456 +    // instruction in the trap shadow.
   1.457 +    __ nop();
   1.458 +    __ retl();
   1.459 +    __ delayed()->nop();
   1.460 +  }
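
  // Usage sketch for the stubs above (illustration only; is_readable_int_sketch
  // and kSentinel are hypothetical names): SafeFetch32 lets the VM probe memory
  // that may be unmapped. If the load at *fault_pc traps, the signal handler
  // resumes at *continuation_pc and the untouched errValue is returned in O0.
  static bool is_readable_int_sketch(int* adr) {
    const int kSentinel = 0x5afef7c4;                 // arbitrary sentinel value
    return SafeFetch32(adr, kSentinel) != kSentinel;  // can still collide with real data
  }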
   1.461 +
   1.462 +  //------------------------------------------------------------------------------------------------------------------------
   1.463 +  // Continuation point for throwing of implicit exceptions that are not handled in
   1.464 +  // the current activation. Fabricates an exception oop and initiates normal
   1.465 +  // exception dispatching in this frame. Only callee-saved registers are preserved
   1.466 +  // (through the normal register window / RegisterMap handling).
   1.467 +  // If the compiler needs all registers to be preserved between the fault
   1.468 +  // point and the exception handler then it must assume responsibility for that in
   1.469 +  // AbstractCompiler::continuation_for_implicit_null_exception or
   1.470 +  // continuation_for_implicit_division_by_zero_exception. All other implicit
   1.471 +  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
   1.472 +  // either at call sites or otherwise assume that stack unwinding will be initiated,
   1.473 +  // so caller saved registers were assumed volatile in the compiler.
   1.474 +
   1.475 +  // Note that we generate only this stub into a RuntimeStub, because it needs to be
   1.476 +  // properly traversed and ignored during GC, so we change the meaning of the "__"
   1.477 +  // macro within this method.
   1.478 +#undef __
   1.479 +#define __ masm->
   1.480 +
   1.481 +  address generate_throw_exception(const char* name, address runtime_entry,
   1.482 +                                   Register arg1 = noreg, Register arg2 = noreg) {
   1.483 +#ifdef ASSERT
   1.484 +    int insts_size = VerifyThread ? 1 * K : 600;
   1.485 +#else
   1.486 +    int insts_size = VerifyThread ? 1 * K : 256;
   1.487 +#endif /* ASSERT */
   1.488 +    int locs_size  = 32;
   1.489 +
   1.490 +    CodeBuffer      code(name, insts_size, locs_size);
   1.491 +    MacroAssembler* masm = new MacroAssembler(&code);
   1.492 +
   1.493 +    __ verify_thread();
   1.494 +
   1.495 +    // This is an inlined and slightly modified version of call_VM
   1.496 +    // which has the ability to fetch the return PC out of thread-local storage
   1.497 +    __ assert_not_delayed();
   1.498 +
   1.499 +    // Note that we always push a frame because on the SPARC
   1.500 +    // architecture, for all of our implicit exception kinds at call
   1.501 +    // sites, the implicit exception is taken before the callee frame
   1.502 +    // is pushed.
   1.503 +    __ save_frame(0);
   1.504 +
   1.505 +    int frame_complete = __ offset();
   1.506 +
   1.507 +    // Note that we always have a runtime stub frame on the top of stack by this point
   1.508 +    Register last_java_sp = SP;
   1.509 +    // 64-bit last_java_sp is biased!
   1.510 +    __ set_last_Java_frame(last_java_sp, G0);
   1.511 +    if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
   1.512 +    __ save_thread(noreg);
   1.513 +    if (arg1 != noreg) {
   1.514 +      assert(arg2 != O1, "clobbered");
   1.515 +      __ mov(arg1, O1);
   1.516 +    }
   1.517 +    if (arg2 != noreg) {
   1.518 +      __ mov(arg2, O2);
   1.519 +    }
   1.520 +    // do the call
   1.521 +    BLOCK_COMMENT("call runtime_entry");
   1.522 +    __ call(runtime_entry, relocInfo::runtime_call_type);
   1.523 +    if (!VerifyThread)
   1.524 +      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
   1.525 +    else
   1.526 +      __ delayed()->nop();             // (thread already passed)
   1.527 +    __ restore_thread(noreg);
   1.528 +    __ reset_last_Java_frame();
   1.529 +
   1.530 +    // check for pending exceptions. use Gtemp as scratch register.
   1.531 +#ifdef ASSERT
   1.532 +    Label L;
   1.533 +
   1.534 +    Address exception_addr(G2_thread, Thread::pending_exception_offset());
   1.535 +    Register scratch_reg = Gtemp;
   1.536 +    __ ld_ptr(exception_addr, scratch_reg);
   1.537 +    __ br_notnull_short(scratch_reg, Assembler::pt, L);
   1.538 +    __ should_not_reach_here();
   1.539 +    __ bind(L);
   1.540 +#endif // ASSERT
   1.541 +    BLOCK_COMMENT("call forward_exception_entry");
   1.542 +    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
   1.543 +    // we use O7 linkage so that forward_exception_entry has the issuing PC
   1.544 +    __ delayed()->restore();
   1.545 +
   1.546 +    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
   1.547 +    return stub->entry_point();
   1.548 +  }
   1.549 +
   1.550 +#undef __
   1.551 +#define __ _masm->
   1.552 +
   1.553 +
   1.554 +  // Generate a routine that sets all the registers so we
   1.555 +  // can tell if the stop routine prints them correctly.
   1.556 +  address generate_test_stop() {
   1.557 +    StubCodeMark mark(this, "StubRoutines", "test_stop");
   1.558 +    address start = __ pc();
   1.559 +
   1.560 +    int i;
   1.561 +
   1.562 +    __ save_frame(0);
   1.563 +
   1.564 +    static jfloat zero = 0.0, one = 1.0;
   1.565 +
   1.566 +    // put addr in L0, then load through L0 to F0
   1.567 +    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
   1.568 +    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
   1.569 +
   1.570 +    // use add to put 2..18 in F2..F18
   1.571 +    for ( i = 2;  i <= 18;  ++i ) {
   1.572 +      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
   1.573 +    }
   1.574 +
   1.575 +    // Now put double 2 in F16, double 18 in F18
   1.576 +    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
   1.577 +    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
   1.578 +
   1.579 +    // use add to put 20..32 in F20..F32
   1.580 +    for (i = 20; i < 32; i += 2) {
   1.581 +      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
   1.582 +    }
   1.583 +
   1.584 +    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
   1.585 +    for ( i = 0; i < 8; ++i ) {
   1.586 +      if (i < 6) {
   1.587 +        __ set(     i, as_iRegister(i));
   1.588 +        __ set(16 + i, as_oRegister(i));
   1.589 +        __ set(24 + i, as_gRegister(i));
   1.590 +      }
   1.591 +      __ set( 8 + i, as_lRegister(i));
   1.592 +    }
   1.593 +
   1.594 +    __ stop("testing stop");
   1.595 +
   1.596 +
   1.597 +    __ ret();
   1.598 +    __ delayed()->restore();
   1.599 +
   1.600 +    return start;
   1.601 +  }
   1.602 +
   1.603 +
   1.604 +  address generate_stop_subroutine() {
   1.605 +    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
   1.606 +    address start = __ pc();
   1.607 +
   1.608 +    __ stop_subroutine();
   1.609 +
   1.610 +    return start;
   1.611 +  }
   1.612 +
   1.613 +  address generate_flush_callers_register_windows() {
   1.614 +    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
   1.615 +    address start = __ pc();
   1.616 +
   1.617 +    __ flushw();
   1.618 +    __ retl(false);
   1.619 +    __ delayed()->add( FP, STACK_BIAS, O0 );
   1.620 +    // The returned value must be a stack pointer whose register save area
   1.621 +    // is flushed, and will stay flushed while the caller executes.
   1.622 +
   1.623 +    return start;
   1.624 +  }
   1.625 +
   1.626 +  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
   1.627 +  //
   1.628 +  // Arguments:
   1.629 +  //
   1.630 +  //      exchange_value: O0
   1.631 +  //      dest:           O1
   1.632 +  //
   1.633 +  // Results:
   1.634 +  //
   1.635 +  //     O0: the value previously stored in dest
   1.636 +  //
   1.637 +  address generate_atomic_xchg() {
   1.638 +    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
   1.639 +    address start = __ pc();
   1.640 +
   1.641 +    if (UseCASForSwap) {
   1.642 +      // Use CAS instead of swap, just in case the MP hardware
   1.643 +      // prefers to work with just one kind of synch. instruction.
   1.644 +      Label retry;
   1.645 +      __ BIND(retry);
   1.646 +      __ mov(O0, O3);       // scratch copy of exchange value
   1.647 +      __ ld(O1, 0, O2);     // observe the previous value
   1.648 +      // try to replace O2 with O3
   1.649 +      __ cas(O1, O2, O3);
   1.650 +      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
   1.651 +
   1.652 +      __ retl(false);
   1.653 +      __ delayed()->mov(O2, O0);  // report previous value to caller
   1.654 +    } else {
   1.655 +      __ retl(false);
   1.656 +      __ delayed()->swap(O1, 0, O0);
   1.657 +    }
   1.658 +
   1.659 +    return start;
   1.660 +  }
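
  // C-level sketch of the UseCASForSwap path above (illustration only; the GCC
  // __sync builtin stands in for the SPARC cas instruction used by the stub):
  static jint xchg_via_cas_sketch(jint exchange_value, volatile jint* dest) {
    jint observed;
    do {
      observed = *dest;                              // ld   [O1], O2
    } while (!__sync_bool_compare_and_swap(dest, observed, exchange_value));  // cas
    return observed;                                 // previous value, returned in O0
  }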
   1.661 +
   1.662 +
   1.663 +  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
   1.664 +  //
   1.665 +  // Arguments:
   1.666 +  //
   1.667 +  //      exchange_value: O0
   1.668 +  //      dest:           O1
   1.669 +  //      compare_value:  O2
   1.670 +  //
   1.671 +  // Results:
   1.672 +  //
   1.673 +  //     O0: the value previously stored in dest
   1.674 +  //
   1.675 +  address generate_atomic_cmpxchg() {
   1.676 +    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
   1.677 +    address start = __ pc();
   1.678 +
   1.679 +    // cmpxchg(dest, compare_value, exchange_value)
   1.680 +    __ cas(O1, O2, O0);
   1.681 +    __ retl(false);
   1.682 +    __ delayed()->nop();
   1.683 +
   1.684 +    return start;
   1.685 +  }
   1.686 +
   1.687 +  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
   1.688 +  //
   1.689 +  // Arguments:
   1.690 +  //
   1.691 +  //      exchange_value: O1:O0
   1.692 +  //      dest:           O2
   1.693 +  //      compare_value:  O4:O3
   1.694 +  //
   1.695 +  // Results:
   1.696 +  //
   1.697 +  //     O1:O0: the value previously stored in dest
   1.698 +  //
   1.699 +  // Overwrites: G1,G2,G3
   1.700 +  //
   1.701 +  address generate_atomic_cmpxchg_long() {
   1.702 +    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
   1.703 +    address start = __ pc();
   1.704 +
   1.705 +    __ sllx(O0, 32, O0);
   1.706 +    __ srl(O1, 0, O1);
    1.707 +    __ or3(O0,O1,O0);      // O0 holds 64-bit value from exchange_value
   1.708 +    __ sllx(O3, 32, O3);
   1.709 +    __ srl(O4, 0, O4);
    1.710 +    __ or3(O3,O4,O3);     // O3 holds 64-bit value from compare_value
   1.711 +    __ casx(O2, O3, O0);
   1.712 +    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
   1.713 +    __ retl(false);
   1.714 +    __ delayed()->srlx(O0, 32, O0);
   1.715 +
   1.716 +    return start;
   1.717 +  }
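
  // Sketch of the packing done above (illustration only): each 64-bit operand
  // arrives split across two 32-bit registers (high half in O0/O3, low half in
  // O1/O4) and is reassembled into one 64-bit value before the casx.
  static inline jlong pack64_sketch(juint hi, juint lo) {
    return ((jlong)hi << 32) | (jlong)lo;            // sllx hi,32 ; srl lo,0 ; or3
  }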
   1.718 +
   1.719 +
   1.720 +  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
   1.721 +  //
   1.722 +  // Arguments:
   1.723 +  //
   1.724 +  //      add_value: O0   (e.g., +1 or -1)
   1.725 +  //      dest:      O1
   1.726 +  //
   1.727 +  // Results:
   1.728 +  //
   1.729 +  //     O0: the new value stored in dest
   1.730 +  //
   1.731 +  // Overwrites: O3
   1.732 +  //
   1.733 +  address generate_atomic_add() {
   1.734 +    StubCodeMark mark(this, "StubRoutines", "atomic_add");
   1.735 +    address start = __ pc();
   1.736 +    __ BIND(_atomic_add_stub);
   1.737 +
    1.738 +    Label retry;
   1.739 +    __ BIND(retry);
   1.740 +
   1.741 +    __ lduw(O1, 0, O2);
   1.742 +    __ add(O0, O2, O3);
   1.743 +    __ cas(O1, O2, O3);
   1.744 +    __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
   1.745 +    __ retl(false);
   1.746 +    __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
   1.747 +
   1.748 +    return start;
   1.749 +  }
   1.750 +  Label _atomic_add_stub;  // called from other stubs
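
  // C-level sketch of the lduw/add/cas retry loop above (illustration only; the
  // GCC __sync builtin stands in for the SPARC cas instruction):
  static jint add_via_cas_sketch(jint add_value, volatile jint* dest) {
    jint old_value;
    do {
      old_value = *dest;                             // lduw [O1], O2
    } while (!__sync_bool_compare_and_swap(dest, old_value, old_value + add_value));
    return old_value + add_value;                    // new value, returned in O0
  }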
   1.751 +
   1.752 +
   1.753 +  //------------------------------------------------------------------------------------------------------------------------
   1.754 +  // The following routine generates a subroutine to throw an asynchronous
   1.755 +  // UnknownError when an unsafe access gets a fault that could not be
   1.756 +  // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
   1.757 +  //
   1.758 +  // Arguments :
   1.759 +  //
   1.760 +  //      trapping PC:    O7
   1.761 +  //
   1.762 +  // Results:
   1.763 +  //     posts an asynchronous exception, skips the trapping instruction
   1.764 +  //
   1.765 +
   1.766 +  address generate_handler_for_unsafe_access() {
   1.767 +    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
   1.768 +    address start = __ pc();
   1.769 +
   1.770 +    const int preserve_register_words = (64 * 2);
   1.771 +    Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);
   1.772 +
   1.773 +    Register Lthread = L7_thread_cache;
   1.774 +    int i;
   1.775 +
   1.776 +    __ save_frame(0);
   1.777 +    __ mov(G1, L1);
   1.778 +    __ mov(G2, L2);
   1.779 +    __ mov(G3, L3);
   1.780 +    __ mov(G4, L4);
   1.781 +    __ mov(G5, L5);
   1.782 +    for (i = 0; i < 64; i += 2) {
   1.783 +      __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
   1.784 +    }
   1.785 +
   1.786 +    address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
   1.787 +    BLOCK_COMMENT("call handle_unsafe_access");
   1.788 +    __ call(entry_point, relocInfo::runtime_call_type);
   1.789 +    __ delayed()->nop();
   1.790 +
   1.791 +    __ mov(L1, G1);
   1.792 +    __ mov(L2, G2);
   1.793 +    __ mov(L3, G3);
   1.794 +    __ mov(L4, G4);
   1.795 +    __ mov(L5, G5);
   1.796 +    for (i = 0; i < 64; i += 2) {
   1.797 +      __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
   1.798 +    }
   1.799 +
   1.800 +    __ verify_thread();
   1.801 +
   1.802 +    __ jmp(O0, 0);
   1.803 +    __ delayed()->restore();
   1.804 +
   1.805 +    return start;
   1.806 +  }
   1.807 +
   1.808 +
   1.809 +  // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
   1.810 +  // Arguments :
   1.811 +  //
   1.812 +  //      ret  : O0, returned
   1.813 +  //      icc/xcc: set as O0 (depending on wordSize)
   1.814 +  //      sub  : O1, argument, not changed
   1.815 +  //      super: O2, argument, not changed
   1.816 +  //      raddr: O7, blown by call
   1.817 +  address generate_partial_subtype_check() {
   1.818 +    __ align(CodeEntryAlignment);
   1.819 +    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
   1.820 +    address start = __ pc();
   1.821 +    Label miss;
   1.822 +
   1.823 +#if defined(COMPILER2) && !defined(_LP64)
   1.824 +    // Do not use a 'save' because it blows the 64-bit O registers.
   1.825 +    __ add(SP,-4*wordSize,SP);  // Make space for 4 temps (stack must be 2 words aligned)
   1.826 +    __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
   1.827 +    __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
   1.828 +    __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
   1.829 +    __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
   1.830 +    Register Rret   = O0;
   1.831 +    Register Rsub   = O1;
   1.832 +    Register Rsuper = O2;
   1.833 +#else
   1.834 +    __ save_frame(0);
   1.835 +    Register Rret   = I0;
   1.836 +    Register Rsub   = I1;
   1.837 +    Register Rsuper = I2;
   1.838 +#endif
   1.839 +
   1.840 +    Register L0_ary_len = L0;
   1.841 +    Register L1_ary_ptr = L1;
   1.842 +    Register L2_super   = L2;
   1.843 +    Register L3_index   = L3;
   1.844 +
   1.845 +    __ check_klass_subtype_slow_path(Rsub, Rsuper,
   1.846 +                                     L0, L1, L2, L3,
   1.847 +                                     NULL, &miss);
   1.848 +
   1.849 +    // Match falls through here.
   1.850 +    __ addcc(G0,0,Rret);        // set Z flags, Z result
   1.851 +
   1.852 +#if defined(COMPILER2) && !defined(_LP64)
   1.853 +    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
   1.854 +    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
   1.855 +    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
   1.856 +    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
   1.857 +    __ retl();                  // Result in Rret is zero; flags set to Z
   1.858 +    __ delayed()->add(SP,4*wordSize,SP);
   1.859 +#else
   1.860 +    __ ret();                   // Result in Rret is zero; flags set to Z
   1.861 +    __ delayed()->restore();
   1.862 +#endif
   1.863 +
   1.864 +    __ BIND(miss);
   1.865 +    __ addcc(G0,1,Rret);        // set NZ flags, NZ result
   1.866 +
   1.867 +#if defined(COMPILER2) && !defined(_LP64)
   1.868 +    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
   1.869 +    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
   1.870 +    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
   1.871 +    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
   1.872 +    __ retl();                  // Result in Rret is != 0; flags set to NZ
   1.873 +    __ delayed()->add(SP,4*wordSize,SP);
   1.874 +#else
   1.875 +    __ ret();                   // Result in Rret is != 0; flags set to NZ
   1.876 +    __ delayed()->restore();
   1.877 +#endif
   1.878 +
   1.879 +    return start;
   1.880 +  }
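
  // C-level sketch of what the slow path above computes (illustration only; the
  // real check_klass_subtype_slow_path also records a hit in the secondary-super
  // cache): linearly scan sub's secondary supers list for super.
  static bool is_subtype_slow_sketch(Klass* sub, Klass* super) {
    Array<Klass*>* supers = sub->secondary_supers();
    for (int i = 0; i < supers->length(); i++) {
      if (supers->at(i) == super) {
        return true;       // stub falls through: Rret = 0, Z flag set
      }
    }
    return false;          // stub takes the miss path: Rret != 0, NZ
  }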
   1.881 +
   1.882 +
   1.883 +  // Called from MacroAssembler::verify_oop
   1.884 +  //
   1.885 +  address generate_verify_oop_subroutine() {
   1.886 +    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
   1.887 +
   1.888 +    address start = __ pc();
   1.889 +
   1.890 +    __ verify_oop_subroutine();
   1.891 +
   1.892 +    return start;
   1.893 +  }
   1.894 +
   1.895 +
   1.896 +  //
    1.897 +  // Verify that a register contains a clean 32-bit positive value
    1.898 +  // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
   1.899 +  //
   1.900 +  //  Input:
   1.901 +  //    Rint  -  32-bits value
   1.902 +  //    Rtmp  -  scratch
   1.903 +  //
   1.904 +  void assert_clean_int(Register Rint, Register Rtmp) {
   1.905 +#if defined(ASSERT) && defined(_LP64)
   1.906 +    __ signx(Rint, Rtmp);
   1.907 +    __ cmp(Rint, Rtmp);
   1.908 +    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
   1.909 +#endif
   1.910 +  }
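
  // The "clean" property, as a C-level predicate (sketch): the full register must
  // equal its own 32->64 bit sign extension, which for the positive counts used
  // here means the high 32 bits are zero.
  static bool is_clean_int_sketch(intptr_t r) {
    return r == (intptr_t)(jint)r;                   // what signx + cmp check above
  }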
   1.911 +
   1.912 +  //
   1.913 +  //  Generate overlap test for array copy stubs
   1.914 +  //
   1.915 +  //  Input:
   1.916 +  //    O0    -  array1
   1.917 +  //    O1    -  array2
   1.918 +  //    O2    -  element count
   1.919 +  //
   1.920 +  //  Kills temps:  O3, O4
   1.921 +  //
   1.922 +  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
   1.923 +    assert(no_overlap_target != NULL, "must be generated");
   1.924 +    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
   1.925 +  }
   1.926 +  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
   1.927 +    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
   1.928 +  }
   1.929 +  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
   1.930 +    const Register from       = O0;
   1.931 +    const Register to         = O1;
   1.932 +    const Register count      = O2;
   1.933 +    const Register to_from    = O3; // to - from
   1.934 +    const Register byte_count = O4; // count << log2_elem_size
   1.935 +
   1.936 +      __ subcc(to, from, to_from);
   1.937 +      __ sll_ptr(count, log2_elem_size, byte_count);
   1.938 +      if (NOLp == NULL)
   1.939 +        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
   1.940 +      else
   1.941 +        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
   1.942 +      __ delayed()->cmp(to_from, byte_count);
   1.943 +      if (NOLp == NULL)
   1.944 +        __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
   1.945 +      else
   1.946 +        __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
   1.947 +      __ delayed()->nop();
   1.948 +  }
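
  // The two branches above implement this predicate (sketch): a forward,
  // disjoint-style copy is safe when the destination starts at or before the
  // source, or at least byte_count bytes beyond it (both compares unsigned).
  static bool no_overlap_sketch(uintptr_t from, uintptr_t to, size_t byte_count) {
    return (to <= from) || (to - from >= byte_count);
  }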
   1.949 +
   1.950 +  //
   1.951 +  //  Generate pre-write barrier for array.
   1.952 +  //
   1.953 +  //  Input:
   1.954 +  //     addr     - register containing starting address
   1.955 +  //     count    - register containing element count
   1.956 +  //     tmp      - scratch register
   1.957 +  //
   1.958 +  //  The input registers are overwritten.
   1.959 +  //
   1.960 +  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
   1.961 +    BarrierSet* bs = Universe::heap()->barrier_set();
   1.962 +    switch (bs->kind()) {
   1.963 +      case BarrierSet::G1SATBCT:
   1.964 +      case BarrierSet::G1SATBCTLogging:
    1.965 +        // With G1, don't generate the call if we statically know that the target is uninitialized
   1.966 +        if (!dest_uninitialized) {
   1.967 +          __ save_frame(0);
   1.968 +          // Save the necessary global regs... will be used after.
   1.969 +          if (addr->is_global()) {
   1.970 +            __ mov(addr, L0);
   1.971 +          }
   1.972 +          if (count->is_global()) {
   1.973 +            __ mov(count, L1);
   1.974 +          }
   1.975 +          __ mov(addr->after_save(), O0);
   1.976 +          // Get the count into O1
   1.977 +          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
   1.978 +          __ delayed()->mov(count->after_save(), O1);
   1.979 +          if (addr->is_global()) {
   1.980 +            __ mov(L0, addr);
   1.981 +          }
   1.982 +          if (count->is_global()) {
   1.983 +            __ mov(L1, count);
   1.984 +          }
   1.985 +          __ restore();
   1.986 +        }
   1.987 +        break;
   1.988 +      case BarrierSet::CardTableModRef:
   1.989 +      case BarrierSet::CardTableExtension:
   1.990 +      case BarrierSet::ModRef:
   1.991 +        break;
   1.992 +      default:
   1.993 +        ShouldNotReachHere();
   1.994 +    }
   1.995 +  }
   1.996 +  //
   1.997 +  //  Generate post-write barrier for array.
   1.998 +  //
   1.999 +  //  Input:
  1.1000 +  //     addr     - register containing starting address
  1.1001 +  //     count    - register containing element count
  1.1002 +  //     tmp      - scratch register
  1.1003 +  //
  1.1004 +  //  The input registers are overwritten.
  1.1005 +  //
  1.1006 +  void gen_write_ref_array_post_barrier(Register addr, Register count,
  1.1007 +                                        Register tmp) {
  1.1008 +    BarrierSet* bs = Universe::heap()->barrier_set();
  1.1009 +
  1.1010 +    switch (bs->kind()) {
  1.1011 +      case BarrierSet::G1SATBCT:
  1.1012 +      case BarrierSet::G1SATBCTLogging:
  1.1013 +        {
  1.1014 +          // Get some new fresh output registers.
  1.1015 +          __ save_frame(0);
  1.1016 +          __ mov(addr->after_save(), O0);
  1.1017 +          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
  1.1018 +          __ delayed()->mov(count->after_save(), O1);
  1.1019 +          __ restore();
  1.1020 +        }
  1.1021 +        break;
  1.1022 +      case BarrierSet::CardTableModRef:
  1.1023 +      case BarrierSet::CardTableExtension:
  1.1024 +        {
  1.1025 +          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  1.1026 +          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
  1.1027 +          assert_different_registers(addr, count, tmp);
  1.1028 +
  1.1029 +          Label L_loop;
  1.1030 +
  1.1031 +          __ sll_ptr(count, LogBytesPerHeapOop, count);
  1.1032 +          __ sub(count, BytesPerHeapOop, count);
  1.1033 +          __ add(count, addr, count);
  1.1034 +          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
  1.1035 +          __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
  1.1036 +          __ srl_ptr(count, CardTableModRefBS::card_shift, count);
  1.1037 +          __ sub(count, addr, count);
  1.1038 +          AddressLiteral rs(ct->byte_map_base);
  1.1039 +          __ set(rs, tmp);
  1.1040 +        __ BIND(L_loop);
  1.1041 +          __ stb(G0, tmp, addr);
  1.1042 +          __ subcc(count, 1, count);
  1.1043 +          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  1.1044 +          __ delayed()->add(addr, 1, addr);
  1.1045 +        }
  1.1046 +        break;
  1.1047 +      case BarrierSet::ModRef:
  1.1048 +        break;
  1.1049 +      default:
  1.1050 +        ShouldNotReachHere();
  1.1051 +    }
  1.1052 +  }
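
  // C-level sketch of the CardTableModRef branch above (illustration only; 0 is
  // the dirty-card value stored by the stb of G0): dirty every card spanned by
  // the oops just written, from the first to the last, inclusive.
  static void dirty_cards_sketch(jbyte* byte_map_base, uintptr_t first_oop_addr,
                                 uintptr_t last_oop_addr, int card_shift) {
    for (uintptr_t card = (first_oop_addr >> card_shift);
         card <= (last_oop_addr >> card_shift); card++) {
      byte_map_base[card] = 0;
    }
  }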
  1.1053 +
  1.1054 +  //
  1.1055 +  // Generate main code for disjoint arraycopy
  1.1056 +  //
  1.1057 +  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
  1.1058 +                                              Label& L_loop, bool use_prefetch, bool use_bis);
  1.1059 +
  1.1060 +  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
  1.1061 +                          int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
  1.1062 +    Label L_copy;
  1.1063 +
  1.1064 +    assert(log2_elem_size <= 3, "the following code should be changed");
  1.1065 +    int count_dec = 16>>log2_elem_size;
  1.1066 +
  1.1067 +    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
  1.1068 +    assert(prefetch_dist < 4096, "invalid value");
  1.1069 +    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
  1.1070 +    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
  1.1071 +
  1.1072 +    if (UseBlockCopy) {
  1.1073 +      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
  1.1074 +
  1.1075 +      // 64 bytes tail + bytes copied in one loop iteration
  1.1076 +      int tail_size = 64 + iter_size;
  1.1077 +      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
  1.1078 +      // Use BIS copy only for big arrays since it requires membar.
  1.1079 +      __ set(block_copy_count, O4);
  1.1080 +      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
  1.1081 +      // This code is for disjoint source and destination:
  1.1082 +      //   to <= from || to >= from+count
  1.1083 +      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
  1.1084 +      __ sub(from, to, O4);
   1.1085 +      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for its immediate.
  1.1086 +      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
  1.1087 +
  1.1088 +      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
  1.1089 +      // BIS should not be used to copy tail (64 bytes+iter_size)
  1.1090 +      // to avoid zeroing of following values.
  1.1091 +      __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
  1.1092 +
  1.1093 +      if (prefetch_count > 0) { // rounded up to one iteration count
  1.1094 +        // Do prefetching only if copy size is bigger
  1.1095 +        // than prefetch distance.
  1.1096 +        __ set(prefetch_count, O4);
  1.1097 +        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
  1.1098 +        __ sub(count, prefetch_count, count);
  1.1099 +
  1.1100 +        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
  1.1101 +        __ add(count, prefetch_count, count); // restore count
  1.1102 +
  1.1103 +      } // prefetch_count > 0
  1.1104 +
  1.1105 +      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
  1.1106 +      __ add(count, (tail_size>>log2_elem_size), count); // restore count
  1.1107 +
  1.1108 +      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
  1.1109 +      // BIS needs membar.
  1.1110 +      __ membar(Assembler::StoreLoad);
  1.1111 +      // Copy tail
  1.1112 +      __ ba_short(L_copy);
  1.1113 +
  1.1114 +      __ BIND(L_skip_block_copy);
  1.1115 +    } // UseBlockCopy
  1.1116 +
  1.1117 +    if (prefetch_count > 0) { // rounded up to one iteration count
  1.1118 +      // Do prefetching only if copy size is bigger
  1.1119 +      // than prefetch distance.
  1.1120 +      __ set(prefetch_count, O4);
  1.1121 +      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
  1.1122 +      __ sub(count, prefetch_count, count);
  1.1123 +
  1.1124 +      Label L_copy_prefetch;
  1.1125 +      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
  1.1126 +      __ add(count, prefetch_count, count); // restore count
  1.1127 +
  1.1128 +    } // prefetch_count > 0
  1.1129 +
  1.1130 +    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
  1.1131 +  }
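
  // The prefetch-distance adjustment above is the usual round-up-to-a-multiple
  // trick (sketch): with iter_size 16, for example, a distance of 100 becomes 112.
  static inline int round_up_sketch(int value, int power_of_two_size) {
    return (value + (power_of_two_size - 1)) & -power_of_two_size;
  }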
  1.1132 +
  1.1133 +
  1.1134 +
  1.1135 +  //
  1.1136 +  // Helper methods for copy_16_bytes_forward_with_shift()
  1.1137 +  //
  1.1138 +  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
  1.1139 +                                Label& L_loop, bool use_prefetch, bool use_bis) {
  1.1140 +
  1.1141 +    const Register left_shift  = G1; // left  shift bit counter
  1.1142 +    const Register right_shift = G5; // right shift bit counter
  1.1143 +
  1.1144 +    __ align(OptoLoopAlignment);
  1.1145 +    __ BIND(L_loop);
  1.1146 +    if (use_prefetch) {
  1.1147 +      if (ArraycopySrcPrefetchDistance > 0) {
  1.1148 +        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
  1.1149 +      }
  1.1150 +      if (ArraycopyDstPrefetchDistance > 0) {
  1.1151 +        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
  1.1152 +      }
  1.1153 +    }
  1.1154 +    __ ldx(from, 0, O4);
  1.1155 +    __ ldx(from, 8, G4);
  1.1156 +    __ inc(to, 16);
  1.1157 +    __ inc(from, 16);
  1.1158 +    __ deccc(count, count_dec); // Can we do next iteration after this one?
  1.1159 +    __ srlx(O4, right_shift, G3);
  1.1160 +    __ bset(G3, O3);
  1.1161 +    __ sllx(O4, left_shift,  O4);
  1.1162 +    __ srlx(G4, right_shift, G3);
  1.1163 +    __ bset(G3, O4);
  1.1164 +    if (use_bis) {
  1.1165 +      __ stxa(O3, to, -16);
  1.1166 +      __ stxa(O4, to, -8);
  1.1167 +    } else {
  1.1168 +      __ stx(O3, to, -16);
  1.1169 +      __ stx(O4, to, -8);
  1.1170 +    }
  1.1171 +    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  1.1172 +    __ delayed()->sllx(G4, left_shift,  O3);
  1.1173 +  }
  1.1174 +
  1.1175 +  // Copy big chunks forward with shift
  1.1176 +  //
  1.1177 +  // Inputs:
  1.1178 +  //   from      - source arrays
  1.1179 +  //   to        - destination array aligned to 8-bytes
   1.1180 +  //   count     - element count to copy, at least the element count equivalent to 16 bytes
   1.1181 +  //   count_dec - per-iteration element count decrement, equivalent to 16 bytes
  1.1182 +  //   L_copy_bytes - copy exit label
  1.1183 +  //
  1.1184 +  void copy_16_bytes_forward_with_shift(Register from, Register to,
  1.1185 +                     Register count, int log2_elem_size, Label& L_copy_bytes) {
  1.1186 +    Label L_aligned_copy, L_copy_last_bytes;
  1.1187 +    assert(log2_elem_size <= 3, "the following code should be changed");
  1.1188 +    int count_dec = 16>>log2_elem_size;
  1.1189 +
  1.1190 +    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
  1.1191 +    __ andcc(from, 7, G1); // misaligned bytes
  1.1192 +    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  1.1193 +    __ delayed()->nop();
  1.1194 +
  1.1195 +    const Register left_shift  = G1; // left  shift bit counter
  1.1196 +    const Register right_shift = G5; // right shift bit counter
  1.1197 +
  1.1198 +    __ sll(G1, LogBitsPerByte, left_shift);
  1.1199 +    __ mov(64, right_shift);
  1.1200 +    __ sub(right_shift, left_shift, right_shift);
  1.1201 +
  1.1202 +    //
  1.1203 +    // Load 2 aligned 8-bytes chunks and use one from previous iteration
  1.1204 +    // to form 2 aligned 8-bytes chunks to store.
  1.1205 +    //
  1.1206 +    __ dec(count, count_dec);   // Pre-decrement 'count'
  1.1207 +    __ andn(from, 7, from);     // Align address
  1.1208 +    __ ldx(from, 0, O3);
  1.1209 +    __ inc(from, 8);
  1.1210 +    __ sllx(O3, left_shift,  O3);
  1.1211 +
  1.1212 +    disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);
  1.1213 +
  1.1214 +    __ inccc(count, count_dec>>1 ); // + 8 bytes
  1.1215 +    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
  1.1216 +    __ delayed()->inc(count, count_dec>>1); // restore 'count'
  1.1217 +
  1.1218 +    // copy 8 bytes, part of them already loaded in O3
  1.1219 +    __ ldx(from, 0, O4);
  1.1220 +    __ inc(to, 8);
  1.1221 +    __ inc(from, 8);
  1.1222 +    __ srlx(O4, right_shift, G3);
  1.1223 +    __ bset(O3, G3);
  1.1224 +    __ stx(G3, to, -8);
  1.1225 +
  1.1226 +    __ BIND(L_copy_last_bytes);
  1.1227 +    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
  1.1228 +    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
  1.1229 +    __ delayed()->sub(from, right_shift, from);       // restore address
  1.1230 +
  1.1231 +    __ BIND(L_aligned_copy);
  1.1232 +  }
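
  // Sketch of the per-store merge used by the shift loops above (illustration
  // only): on big-endian SPARC, two adjacent aligned 8-byte chunks are combined
  // into one aligned 8-byte chunk of the misaligned source stream, where
  // left_shift is the misalignment in bits and right_shift is 64 - left_shift.
  static inline julong merge_sketch(julong prev, julong next,
                                    int left_shift, int right_shift) {
    return (prev << left_shift) | (next >> right_shift);
  }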
  1.1233 +
  1.1234 +  // Copy big chunks backward with shift
  1.1235 +  //
  1.1236 +  // Inputs:
  1.1237 +  //   end_from  - source arrays end address
  1.1238 +  //   end_to    - destination array end address aligned to 8-bytes
   1.1239 +  //   count     - element count to copy, at least the element count equivalent to 16 bytes
   1.1240 +  //   count_dec - per-iteration element count decrement, equivalent to 16 bytes
  1.1241 +  //   L_aligned_copy - aligned copy exit label
  1.1242 +  //   L_copy_bytes   - copy exit label
  1.1243 +  //
  1.1244 +  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
  1.1245 +                     Register count, int count_dec,
  1.1246 +                     Label& L_aligned_copy, Label& L_copy_bytes) {
  1.1247 +    Label L_loop, L_copy_last_bytes;
  1.1248 +
  1.1249 +    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
  1.1250 +      __ andcc(end_from, 7, G1); // misaligned bytes
  1.1251 +      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  1.1252 +      __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
  1.1253 +
  1.1254 +    const Register left_shift  = G1; // left  shift bit counter
  1.1255 +    const Register right_shift = G5; // right shift bit counter
  1.1256 +
  1.1257 +      __ sll(G1, LogBitsPerByte, left_shift);
  1.1258 +      __ mov(64, right_shift);
  1.1259 +      __ sub(right_shift, left_shift, right_shift);
  1.1260 +
  1.1261 +    //
  1.1262 +    // Load 2 aligned 8-bytes chunks and use one from previous iteration
  1.1263 +    // to form 2 aligned 8-bytes chunks to store.
  1.1264 +    //
  1.1265 +      __ andn(end_from, 7, end_from);     // Align address
  1.1266 +      __ ldx(end_from, 0, O3);
  1.1267 +      __ align(OptoLoopAlignment);
  1.1268 +    __ BIND(L_loop);
  1.1269 +      __ ldx(end_from, -8, O4);
  1.1270 +      __ deccc(count, count_dec); // Can we do next iteration after this one?
  1.1271 +      __ ldx(end_from, -16, G4);
  1.1272 +      __ dec(end_to, 16);
  1.1273 +      __ dec(end_from, 16);
  1.1274 +      __ srlx(O3, right_shift, O3);
  1.1275 +      __ sllx(O4, left_shift,  G3);
  1.1276 +      __ bset(G3, O3);
  1.1277 +      __ stx(O3, end_to, 8);
  1.1278 +      __ srlx(O4, right_shift, O4);
  1.1279 +      __ sllx(G4, left_shift,  G3);
  1.1280 +      __ bset(G3, O4);
  1.1281 +      __ stx(O4, end_to, 0);
  1.1282 +      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  1.1283 +      __ delayed()->mov(G4, O3);
  1.1284 +
  1.1285 +      __ inccc(count, count_dec>>1 ); // + 8 bytes
  1.1286 +      __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
  1.1287 +      __ delayed()->inc(count, count_dec>>1); // restore 'count'
  1.1288 +
  1.1289 +      // copy 8 bytes, part of them already loaded in O3
  1.1290 +      __ ldx(end_from, -8, O4);
  1.1291 +      __ dec(end_to, 8);
  1.1292 +      __ dec(end_from, 8);
  1.1293 +      __ srlx(O3, right_shift, O3);
  1.1294 +      __ sllx(O4, left_shift,  G3);
  1.1295 +      __ bset(O3, G3);
  1.1296 +      __ stx(G3, end_to, 0);
  1.1297 +
  1.1298 +    __ BIND(L_copy_last_bytes);
  1.1299 +      __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
  1.1300 +      __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
  1.1301 +      __ delayed()->add(end_from, left_shift, end_from); // restore address
  1.1302 +  }
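
  // [Editor's note] Illustrative C sketch (not part of the stub) of the
  // shifted-copy idea used by copy_16_bytes_forward_with_shift() and
  // copy_16_bytes_backward_with_shift(): when source and destination differ
  // in alignment mod 8, each aligned 8-byte store is assembled from two
  // aligned 8-byte loads with complementary shifts.  Names are hypothetical;
  // big-endian byte order (as on SPARC) and 0 < misalignment < 8 are assumed,
  // and the head/tail handling done by the stubs is omitted.
  //
  //   void copy_with_shift(uint64_t* dst, const uint8_t* src, size_t words) {
  //     size_t ofs = (uintptr_t)src & 7;                  // misaligned bytes
  //     const uint64_t* s = (const uint64_t*)(src - ofs); // align down
  //     int lsh = (int)(ofs * 8);                         // bits dropped from 1st word
  //     int rsh = 64 - lsh;                               // bits taken from 2nd word
  //     uint64_t prev = s[0];
  //     for (size_t i = 0; i < words; i++) {
  //       uint64_t next = s[i + 1];
  //       dst[i] = (prev << lsh) | (next >> rsh);         // big-endian order
  //       prev = next;
  //     }
  //   }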
  1.1303 +
  1.1304 +  //
  1.1305 +  //  Generate stub for disjoint byte copy.  If "aligned" is true, the
  1.1306 +  //  "from" and "to" addresses are assumed to be heapword aligned.
  1.1307 +  //
  1.1308 +  // Arguments for generated stub:
  1.1309 +  //      from:  O0
  1.1310 +  //      to:    O1
  1.1311 +  //      count: O2 treated as signed
  1.1312 +  //
  1.1313 +  address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
  1.1314 +    __ align(CodeEntryAlignment);
  1.1315 +    StubCodeMark mark(this, "StubRoutines", name);
  1.1316 +    address start = __ pc();
  1.1317 +
  1.1318 +    Label L_skip_alignment, L_align;
  1.1319 +    Label L_copy_byte, L_copy_byte_loop, L_exit;
  1.1320 +
  1.1321 +    const Register from      = O0;   // source array address
  1.1322 +    const Register to        = O1;   // destination array address
  1.1323 +    const Register count     = O2;   // elements count
  1.1324 +    const Register offset    = O5;   // offset from start of arrays
  1.1325 +    // O3, O4, G3, G4 are used as temp registers
  1.1326 +
  1.1327 +    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1.1328 +
  1.1329 +    if (entry != NULL) {
  1.1330 +      *entry = __ pc();
  1.1331 +      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1.1332 +      BLOCK_COMMENT("Entry:");
  1.1333 +    }
  1.1334 +
  1.1335 +    // for short arrays, just do single element copy
  1.1336 +    __ cmp(count, 23); // 16 + 7
  1.1337 +    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
  1.1338 +    __ delayed()->mov(G0, offset);
  1.1339 +
  1.1340 +    if (aligned) {
  1.1341 +      // 'aligned' == true when it is known statically during compilation
  1.1342 +      // of this arraycopy call site that both 'from' and 'to' addresses
  1.1343 +      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
  1.1344 +      //
  1.1345 +      // Aligned arrays have 4-byte alignment in the 32-bit VM
  1.1346 +      // and 8-byte alignment in the 64-bit VM, so this step is needed only in the 32-bit VM.
  1.1347 +      //
  1.1348 +#ifndef _LP64
  1.1349 +      // copy a 4-byte word if necessary to align 'to' to 8 bytes
  1.1350 +      __ andcc(to, 7, G0);
  1.1351 +      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
  1.1352 +      __ delayed()->ld(from, 0, O3);
  1.1353 +      __ inc(from, 4);
  1.1354 +      __ inc(to, 4);
  1.1355 +      __ dec(count, 4);
  1.1356 +      __ st(O3, to, -4);
  1.1357 +    __ BIND(L_skip_alignment);
  1.1358 +#endif
  1.1359 +    } else {
  1.1360 +      // copy bytes to align 'to' on 8 byte boundary
  1.1361 +      __ andcc(to, 7, G1); // misaligned bytes
  1.1362 +      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1.1363 +      __ delayed()->neg(G1);
  1.1364 +      __ inc(G1, 8);       // bytes needed to reach the next 8-byte alignment
  1.1365 +      __ sub(count, G1, count);
  1.1366 +    __ BIND(L_align);
  1.1367 +      __ ldub(from, 0, O3);
  1.1368 +      __ deccc(G1);
  1.1369 +      __ inc(from);
  1.1370 +      __ stb(O3, to, 0);
  1.1371 +      __ br(Assembler::notZero, false, Assembler::pt, L_align);
  1.1372 +      __ delayed()->inc(to);
  1.1373 +    __ BIND(L_skip_alignment);
  1.1374 +    }
  1.1375 +#ifdef _LP64
  1.1376 +    if (!aligned)
  1.1377 +#endif
  1.1378 +    {
  1.1379 +      // Copy with shift 16 bytes per iteration if arrays do not have
  1.1380 +      // the same alignment mod 8, otherwise fall through to the next
  1.1381 +      // code for aligned copy.
  1.1382 +      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
  1.1383 +      // Also jump over aligned copy after the copy with shift completed.
  1.1384 +
  1.1385 +      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
  1.1386 +    }
  1.1387 +
  1.1388 +    // Both arrays are 8-byte aligned; copy 16 bytes at a time
  1.1389 +      __ and3(count, 7, G4); // Save count
  1.1390 +      __ srl(count, 3, count);
  1.1391 +     generate_disjoint_long_copy_core(aligned);
  1.1392 +      __ mov(G4, count);     // Restore count
  1.1393 +
  1.1394 +    // copy trailing bytes
  1.1395 +    __ BIND(L_copy_byte);
  1.1396 +      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  1.1397 +      __ align(OptoLoopAlignment);
  1.1398 +    __ BIND(L_copy_byte_loop);
  1.1399 +      __ ldub(from, offset, O3);
  1.1400 +      __ deccc(count);
  1.1401 +      __ stb(O3, to, offset);
  1.1402 +      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
  1.1403 +      __ delayed()->inc(offset);
  1.1404 +
  1.1405 +    __ BIND(L_exit);
  1.1406 +      // O3, O4 are used as temp registers
  1.1407 +      inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
  1.1408 +      __ retl();
  1.1409 +      __ delayed()->mov(G0, O0); // return 0
  1.1410 +    return start;
  1.1411 +  }
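
  // [Editor's note] A minimal C sketch (names hypothetical) of the structure
  // generated above: copy single bytes until 'to' is 8-byte aligned, use an
  // 8-byte bulk loop once 'from' shares that alignment (otherwise the stub
  // uses the shift trick shown earlier), and finish with a byte tail.
  //
  //   void disjoint_byte_copy(uint8_t* to, const uint8_t* from, size_t count) {
  //     while (count > 0 && ((uintptr_t)to & 7) != 0) {   // head: align 'to'
  //       *to++ = *from++; count--;
  //     }
  //     if (((uintptr_t)from & 7) == 0) {                 // bulk: 8 bytes at a time
  //       while (count >= 8) {
  //         *(uint64_t*)to = *(const uint64_t*)from;
  //         to += 8; from += 8; count -= 8;
  //       }
  //     }
  //     while (count > 0) { *to++ = *from++; count--; }   // tail
  //   }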
  1.1412 +
  1.1413 +  //
  1.1414 +  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
  1.1415 +  //  "from" and "to" addresses are assumed to be heapword aligned.
  1.1416 +  //
  1.1417 +  // Arguments for generated stub:
  1.1418 +  //      from:  O0
  1.1419 +  //      to:    O1
  1.1420 +  //      count: O2 treated as signed
  1.1421 +  //
  1.1422 +  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
  1.1423 +                                      address *entry, const char *name) {
  1.1424 +    // Do reverse copy.
  1.1425 +
  1.1426 +    __ align(CodeEntryAlignment);
  1.1427 +    StubCodeMark mark(this, "StubRoutines", name);
  1.1428 +    address start = __ pc();
  1.1429 +
  1.1430 +    Label L_skip_alignment, L_align, L_aligned_copy;
  1.1431 +    Label L_copy_byte, L_copy_byte_loop, L_exit;
  1.1432 +
  1.1433 +    const Register from      = O0;   // source array address
  1.1434 +    const Register to        = O1;   // destination array address
  1.1435 +    const Register count     = O2;   // elements count
  1.1436 +    const Register end_from  = from; // source array end address
  1.1437 +    const Register end_to    = to;   // destination array end address
  1.1438 +
  1.1439 +    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1.1440 +
  1.1441 +    if (entry != NULL) {
  1.1442 +      *entry = __ pc();
  1.1443 +      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1.1444 +      BLOCK_COMMENT("Entry:");
  1.1445 +    }
  1.1446 +
  1.1447 +    array_overlap_test(nooverlap_target, 0);
  1.1448 +
  1.1449 +    __ add(to, count, end_to);       // offset after last copied element
  1.1450 +
  1.1451 +    // for short arrays, just do single element copy
  1.1452 +    __ cmp(count, 23); // 16 + 7
  1.1453 +    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
  1.1454 +    __ delayed()->add(from, count, end_from);
  1.1455 +
  1.1456 +    {
  1.1457 +      // Align the ends of the arrays, since they may not be aligned even
  1.1458 +      // when the arrays themselves are aligned.
  1.1459 +
  1.1460 +      // copy bytes to align 'end_to' on 8 byte boundary
  1.1461 +      __ andcc(end_to, 7, G1); // misaligned bytes
  1.1462 +      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1.1463 +      __ delayed()->nop();
  1.1464 +      __ sub(count, G1, count);
  1.1465 +    __ BIND(L_align);
  1.1466 +      __ dec(end_from);
  1.1467 +      __ dec(end_to);
  1.1468 +      __ ldub(end_from, 0, O3);
  1.1469 +      __ deccc(G1);
  1.1470 +      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
  1.1471 +      __ delayed()->stb(O3, end_to, 0);
  1.1472 +    __ BIND(L_skip_alignment);
  1.1473 +    }
  1.1474 +#ifdef _LP64
  1.1475 +    if (aligned) {
  1.1476 +      // Both arrays are aligned to 8-bytes in 64-bits VM.
  1.1477 +      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
  1.1478 +      // in unaligned case.
  1.1479 +      __ dec(count, 16);
  1.1480 +    } else
  1.1481 +#endif
  1.1482 +    {
  1.1483 +      // Copy with shift 16 bytes per iteration if arrays do not have
  1.1484 +      // the same alignment mod 8, otherwise jump to the next
  1.1485 +      // code for aligned copy (and subtracting 16 from 'count' before the jump).
  1.1486 +      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
  1.1487 +      // Also jump over aligned copy after the copy with shift completed.
  1.1488 +
  1.1489 +      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
  1.1490 +                                        L_aligned_copy, L_copy_byte);
  1.1491 +    }
  1.1492 +    // copy 16 elements (16 bytes) at a time
  1.1493 +      __ align(OptoLoopAlignment);
  1.1494 +    __ BIND(L_aligned_copy);
  1.1495 +      __ dec(end_from, 16);
  1.1496 +      __ ldx(end_from, 8, O3);
  1.1497 +      __ ldx(end_from, 0, O4);
  1.1498 +      __ dec(end_to, 16);
  1.1499 +      __ deccc(count, 16);
  1.1500 +      __ stx(O3, end_to, 8);
  1.1501 +      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
  1.1502 +      __ delayed()->stx(O4, end_to, 0);
  1.1503 +      __ inc(count, 16);
  1.1504 +
  1.1505 +    // copy 1 element (1 byte) at a time
  1.1506 +    __ BIND(L_copy_byte);
  1.1507 +      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  1.1508 +      __ align(OptoLoopAlignment);
  1.1509 +    __ BIND(L_copy_byte_loop);
  1.1510 +      __ dec(end_from);
  1.1511 +      __ dec(end_to);
  1.1512 +      __ ldub(end_from, 0, O4);
  1.1513 +      __ deccc(count);
  1.1514 +      __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
  1.1515 +      __ delayed()->stb(O4, end_to, 0);
  1.1516 +
  1.1517 +    __ BIND(L_exit);
  1.1518 +    // O3, O4 are used as temp registers
  1.1519 +    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
  1.1520 +    __ retl();
  1.1521 +    __ delayed()->mov(G0, O0); // return 0
  1.1522 +    return start;
  1.1523 +  }
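
  // [Editor's note] The conjoint stub differs from the disjoint one mainly in
  // direction: array_overlap_test() dispatches to the disjoint stub when a
  // forward copy is safe, otherwise elements are copied from the end toward
  // the start so an overlapping destination never clobbers unread source
  // bytes.  A hedged sketch (names hypothetical):
  //
  //   void conjoint_byte_copy(uint8_t* to, const uint8_t* from, size_t count) {
  //     if (to <= from || to >= from + count) {   // no harmful overlap
  //       disjoint_byte_copy(to, from, count);    // forward copy is safe
  //       return;
  //     }
  //     while (count > 0) {                       // backward copy
  //       count--;
  //       to[count] = from[count];
  //     }
  //   }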
  1.1524 +
  1.1525 +  //
  1.1526 +  //  Generate stub for disjoint short copy.  If "aligned" is true, the
  1.1527 +  //  "from" and "to" addresses are assumed to be heapword aligned.
  1.1528 +  //
  1.1529 +  // Arguments for generated stub:
  1.1530 +  //      from:  O0
  1.1531 +  //      to:    O1
  1.1532 +  //      count: O2 treated as signed
  1.1533 +  //
  1.1534 +  address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
  1.1535 +    __ align(CodeEntryAlignment);
  1.1536 +    StubCodeMark mark(this, "StubRoutines", name);
  1.1537 +    address start = __ pc();
  1.1538 +
  1.1539 +    Label L_skip_alignment, L_skip_alignment2;
  1.1540 +    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
  1.1541 +
  1.1542 +    const Register from      = O0;   // source array address
  1.1543 +    const Register to        = O1;   // destination array address
  1.1544 +    const Register count     = O2;   // elements count
  1.1545 +    const Register offset    = O5;   // offset from start of arrays
  1.1546 +    // O3, O4, G3, G4 are used as temp registers
  1.1547 +
  1.1548 +    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1.1549 +
  1.1550 +    if (entry != NULL) {
  1.1551 +      *entry = __ pc();
  1.1552 +      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1.1553 +      BLOCK_COMMENT("Entry:");
  1.1554 +    }
  1.1555 +
  1.1556 +    // for short arrays, just do single element copy
  1.1557 +    __ cmp(count, 11); // 8 + 3  (22 bytes)
  1.1558 +    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
  1.1559 +    __ delayed()->mov(G0, offset);
  1.1560 +
  1.1561 +    if (aligned) {
  1.1562 +      // 'aligned' == true when it is known statically during compilation
  1.1563 +      // of this arraycopy call site that both 'from' and 'to' addresses
  1.1564 +      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
  1.1565 +      //
  1.1566 +      // Aligned arrays have 4-byte alignment in the 32-bit VM
  1.1567 +      // and 8-byte alignment in the 64-bit VM.
  1.1568 +      //
  1.1569 +#ifndef _LP64
  1.1570 +      // copy a 2-element (4-byte) word if necessary to align 'to' to 8 bytes
  1.1571 +      __ andcc(to, 7, G0);
  1.1572 +      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1.1573 +      __ delayed()->ld(from, 0, O3);
  1.1574 +      __ inc(from, 4);
  1.1575 +      __ inc(to, 4);
  1.1576 +      __ dec(count, 2);
  1.1577 +      __ st(O3, to, -4);
  1.1578 +    __ BIND(L_skip_alignment);
  1.1579 +#endif
  1.1580 +    } else {
  1.1581 +      // copy 1 element if necessary to align 'to' on a 4-byte boundary
  1.1582 +      __ andcc(to, 3, G0);
  1.1583 +      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1.1584 +      __ delayed()->lduh(from, 0, O3);
  1.1585 +      __ inc(from, 2);
  1.1586 +      __ inc(to, 2);
  1.1587 +      __ dec(count);
  1.1588 +      __ sth(O3, to, -2);
  1.1589 +    __ BIND(L_skip_alignment);
  1.1590 +
  1.1591 +      // copy 2 elements to align 'to' on an 8 byte boundary
  1.1592 +      __ andcc(to, 7, G0);
  1.1593 +      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
  1.1594 +      __ delayed()->lduh(from, 0, O3);
  1.1595 +      __ dec(count, 2);
  1.1596 +      __ lduh(from, 2, O4);
  1.1597 +      __ inc(from, 4);
  1.1598 +      __ inc(to, 4);
  1.1599 +      __ sth(O3, to, -4);
  1.1600 +      __ sth(O4, to, -2);
  1.1601 +    __ BIND(L_skip_alignment2);
  1.1602 +    }
  1.1603 +#ifdef _LP64
  1.1604 +    if (!aligned)
  1.1605 +#endif
  1.1606 +    {
  1.1607 +      // Copy with shift 16 bytes per iteration if arrays do not have
  1.1608 +      // the same alignment mod 8, otherwise fall through to the next
  1.1609 +      // code for aligned copy.
  1.1610 +      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
  1.1611 +      // Also jump over aligned copy after the copy with shift completed.
  1.1612 +
  1.1613 +      copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
  1.1614 +    }
  1.1615 +
  1.1616 +    // Both arrays are 8-byte aligned; copy 16 bytes at a time
  1.1617 +      __ and3(count, 3, G4); // Save count
  1.1618 +      __ srl(count, 2, count);
  1.1619 +     generate_disjoint_long_copy_core(aligned);
  1.1620 +      __ mov(G4, count); // restore
  1.1621 +
  1.1622 +    // copy 1 element at a time
  1.1623 +    __ BIND(L_copy_2_bytes);
  1.1624 +      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  1.1625 +      __ align(OptoLoopAlignment);
  1.1626 +    __ BIND(L_copy_2_bytes_loop);
  1.1627 +      __ lduh(from, offset, O3);
  1.1628 +      __ deccc(count);
  1.1629 +      __ sth(O3, to, offset);
  1.1630 +      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
  1.1631 +      __ delayed()->inc(offset, 2);
  1.1632 +
  1.1633 +    __ BIND(L_exit);
  1.1634 +      // O3, O4 are used as temp registers
  1.1635 +      inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
  1.1636 +      __ retl();
  1.1637 +      __ delayed()->mov(G0, O0); // return 0
  1.1638 +    return start;
  1.1639 +  }
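
  // [Editor's note] For 2-byte elements the head alignment above cascades: at
  // most one element to reach a 4-byte boundary, then at most two more to
  // reach an 8-byte boundary.  A hedged sketch of just that prologue, with
  // 'to' and 'from' as hypothetical uint16_t* locals:
  //
  //   if (count > 0 && ((uintptr_t)to & 3) != 0) {         // 4-byte boundary
  //     *to++ = *from++; count--;
  //   }
  //   if (count >= 2 && ((uintptr_t)to & 7) != 0) {        // 8-byte boundary
  //     *to++ = *from++; *to++ = *from++; count -= 2;
  //   }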
  1.1640 +
  1.1641 +  //
  1.1642 +  //  Generate stub for array fill (byte, short or int elements).  If "aligned" is true, the
  1.1643 +  //  "to" address is assumed to be heapword aligned.
  1.1644 +  //
  1.1645 +  // Arguments for generated stub:
  1.1646 +  //      to:    O0
  1.1647 +  //      value: O1
  1.1648 +  //      count: O2 treated as signed
  1.1649 +  //
  1.1650 +  address generate_fill(BasicType t, bool aligned, const char* name) {
  1.1651 +    __ align(CodeEntryAlignment);
  1.1652 +    StubCodeMark mark(this, "StubRoutines", name);
  1.1653 +    address start = __ pc();
  1.1654 +
  1.1655 +    const Register to        = O0;   // destination array address
  1.1656 +    const Register value     = O1;   // fill value
  1.1657 +    const Register count     = O2;   // elements count
  1.1658 +    // O3 is used as a temp register
  1.1659 +
  1.1660 +    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1.1661 +
  1.1662 +    Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
  1.1663 +    Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
  1.1664 +
  1.1665 +    int shift = -1; // log2 of elements per 32-bit word: byte = 2, short = 1, int = 0
  1.1666 +    switch (t) {
  1.1667 +      case T_BYTE:
  1.1668 +        shift = 2;
  1.1669 +        break;
  1.1670 +      case T_SHORT:
  1.1671 +        shift = 1;
  1.1672 +        break;
  1.1673 +      case T_INT:
  1.1674 +        shift = 0;
  1.1675 +        break;
  1.1676 +      default: ShouldNotReachHere();
  1.1677 +    }
  1.1678 +
  1.1679 +    BLOCK_COMMENT("Entry:");
  1.1680 +
  1.1681 +    if (t == T_BYTE) {
  1.1682 +      // Zero extend value and replicate the byte into the low 16 bits
  1.1683 +      __ and3(value, 0xff, value);
  1.1684 +      __ sllx(value, 8, O3);
  1.1685 +      __ or3(value, O3, value);
  1.1686 +    }
  1.1687 +    if (t == T_SHORT) {
  1.1688 +      // Zero extend value
  1.1689 +      __ sllx(value, 48, value);
  1.1690 +      __ srlx(value, 48, value);
  1.1691 +    }
  1.1692 +    if (t == T_BYTE || t == T_SHORT) {
  1.1693 +      __ sllx(value, 16, O3);  // replicate the low 16 bits into the low 32 bits
  1.1694 +      __ or3(value, O3, value);
  1.1695 +    }
  1.1696 +
  1.1697 +    __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  1.1698 +    __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
  1.1699 +    __ delayed()->andcc(count, 1, G0);
  1.1700 +
  1.1701 +    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
  1.1702 +      // align destination address to a 4-byte boundary
  1.1703 +      if (t == T_BYTE) {
  1.1704 +        // One byte misalignment happens only for byte arrays
  1.1705 +        __ andcc(to, 1, G0);
  1.1706 +        __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
  1.1707 +        __ delayed()->nop();
  1.1708 +        __ stb(value, to, 0);
  1.1709 +        __ inc(to, 1);
  1.1710 +        __ dec(count, 1);
  1.1711 +        __ BIND(L_skip_align1);
  1.1712 +      }
  1.1713 +      // Two bytes misalignment happens only for byte and short (char) arrays
  1.1714 +      __ andcc(to, 2, G0);
  1.1715 +      __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
  1.1716 +      __ delayed()->nop();
  1.1717 +      __ sth(value, to, 0);
  1.1718 +      __ inc(to, 2);
  1.1719 +      __ dec(count, 1 << (shift - 1));
  1.1720 +      __ BIND(L_skip_align2);
  1.1721 +    }
  1.1722 +#ifdef _LP64
  1.1723 +    if (!aligned) {
  1.1724 +#endif
  1.1725 +    // align to 8 bytes; we know we are 4-byte aligned to start
  1.1726 +    __ andcc(to, 7, G0);
  1.1727 +    __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
  1.1728 +    __ delayed()->nop();
  1.1729 +    __ stw(value, to, 0);
  1.1730 +    __ inc(to, 4);
  1.1731 +    __ dec(count, 1 << shift);
  1.1732 +    __ BIND(L_fill_32_bytes);
  1.1733 +#ifdef _LP64
  1.1734 +    }
  1.1735 +#endif
  1.1736 +
  1.1737 +    if (t == T_INT) {
  1.1738 +      // Zero extend value
  1.1739 +      __ srl(value, 0, value);
  1.1740 +    }
  1.1741 +    if (t == T_BYTE || t == T_SHORT || t == T_INT) {
  1.1742 +      __ sllx(value, 32, O3);  // replicate the low 32 bits into all 64 bits
  1.1743 +      __ or3(value, O3, value);
  1.1744 +    }
  1.1745 +
  1.1746 +    Label L_check_fill_8_bytes;
  1.1747 +    // Fill 32-byte chunks
  1.1748 +    __ subcc(count, 8 << shift, count);
  1.1749 +    __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
  1.1750 +    __ delayed()->nop();
  1.1751 +
  1.1752 +    Label L_fill_32_bytes_loop, L_fill_4_bytes;
  1.1753 +    __ align(16);
  1.1754 +    __ BIND(L_fill_32_bytes_loop);
  1.1755 +
  1.1756 +    __ stx(value, to, 0);
  1.1757 +    __ stx(value, to, 8);
  1.1758 +    __ stx(value, to, 16);
  1.1759 +    __ stx(value, to, 24);
  1.1760 +
  1.1761 +    __ subcc(count, 8 << shift, count);
  1.1762 +    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
  1.1763 +    __ delayed()->add(to, 32, to);
  1.1764 +
  1.1765 +    __ BIND(L_check_fill_8_bytes);
  1.1766 +    __ addcc(count, 8 << shift, count);
  1.1767 +    __ brx(Assembler::zero, false, Assembler::pn, L_exit);
  1.1768 +    __ delayed()->subcc(count, 1 << (shift + 1), count);
  1.1769 +    __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
  1.1770 +    __ delayed()->andcc(count, 1<<shift, G0);
  1.1771 +
  1.1772 +    //
  1.1773 +    // length is too short, just fill 8 bytes at a time
  1.1774 +    //
  1.1775 +    Label L_fill_8_bytes_loop;
  1.1776 +    __ BIND(L_fill_8_bytes_loop);
  1.1777 +    __ stx(value, to, 0);
  1.1778 +    __ subcc(count, 1 << (shift + 1), count);
  1.1779 +    __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
  1.1780 +    __ delayed()->add(to, 8, to);
  1.1781 +
  1.1782 +    // fill trailing 4 bytes
  1.1783 +    __ andcc(count, 1<<shift, G0);  // in delay slot of branches
  1.1784 +    if (t == T_INT) {
  1.1785 +      __ BIND(L_fill_elements);
  1.1786 +    }
  1.1787 +    __ BIND(L_fill_4_bytes);
  1.1788 +    __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
  1.1789 +    if (t == T_BYTE || t == T_SHORT) {
  1.1790 +      __ delayed()->andcc(count, 1<<(shift-1), G0);
  1.1791 +    } else {
  1.1792 +      __ delayed()->nop();
  1.1793 +    }
  1.1794 +    __ stw(value, to, 0);
  1.1795 +    if (t == T_BYTE || t == T_SHORT) {
  1.1796 +      __ inc(to, 4);
  1.1797 +      // fill trailing 2 bytes
  1.1798 +      __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
  1.1799 +      __ BIND(L_fill_2_bytes);
  1.1800 +      __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
  1.1801 +      __ delayed()->andcc(count, 1, count);
  1.1802 +      __ sth(value, to, 0);
  1.1803 +      if (t == T_BYTE) {
  1.1804 +        __ inc(to, 2);
  1.1805 +        // fill trailing byte
  1.1806 +        __ andcc(count, 1, count);  // in delay slot of branches
  1.1807 +        __ BIND(L_fill_byte);
  1.1808 +        __ brx(Assembler::zero, false, Assembler::pt, L_exit);
  1.1809 +        __ delayed()->nop();
  1.1810 +        __ stb(value, to, 0);
  1.1811 +      } else {
  1.1812 +        __ BIND(L_fill_byte);
  1.1813 +      }
  1.1814 +    } else {
  1.1815 +      __ BIND(L_fill_2_bytes);
  1.1816 +    }
  1.1817 +    __ BIND(L_exit);
  1.1818 +    __ retl();
  1.1819 +    __ delayed()->nop();
  1.1820 +
  1.1821 +    // Handle fills of less than 8 bytes.  Int is handled elsewhere.
  1.1822 +    if (t == T_BYTE) {
  1.1823 +      __ BIND(L_fill_elements);
  1.1824 +      Label L_fill_2, L_fill_4;
  1.1825 +      // in delay slot __ andcc(count, 1, G0);
  1.1826 +      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
  1.1827 +      __ delayed()->andcc(count, 2, G0);
  1.1828 +      __ stb(value, to, 0);
  1.1829 +      __ inc(to, 1);
  1.1830 +      __ BIND(L_fill_2);
  1.1831 +      __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
  1.1832 +      __ delayed()->andcc(count, 4, G0);
  1.1833 +      __ stb(value, to, 0);
  1.1834 +      __ stb(value, to, 1);
  1.1835 +      __ inc(to, 2);
  1.1836 +      __ BIND(L_fill_4);
  1.1837 +      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
  1.1838 +      __ delayed()->nop();
  1.1839 +      __ stb(value, to, 0);
  1.1840 +      __ stb(value, to, 1);
  1.1841 +      __ stb(value, to, 2);
  1.1842 +      __ retl();
  1.1843 +      __ delayed()->stb(value, to, 3);
  1.1844 +    }
  1.1845 +
  1.1846 +    if (t == T_SHORT) {
  1.1847 +      Label L_fill_2;
  1.1848 +      __ BIND(L_fill_elements);
  1.1849 +      // in delay slot __ andcc(count, 1, G0);
  1.1850 +      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
  1.1851 +      __ delayed()->andcc(count, 2, G0);
  1.1852 +      __ sth(value, to, 0);
  1.1853 +      __ inc(to, 2);
  1.1854 +      __ BIND(L_fill_2);
  1.1855 +      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
  1.1856 +      __ delayed()->nop();
  1.1857 +      __ sth(value, to, 0);
  1.1858 +      __ retl();
  1.1859 +      __ delayed()->sth(value, to, 2);
  1.1860 +    }
  1.1861 +    return start;
  1.1862 +  }
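
  // [Editor's note] A hedged C sketch of the fill strategy above (names
  // hypothetical; 'to' assumed already 8-byte aligned, as after the alignment
  // steps): the value is replicated into a 64-bit pattern and stored in
  // 32-byte chunks, with smaller tails handled afterwards.
  //
  //   void fill_words(void* to, int value, size_t count, int elem_size) {
  //     uint64_t v = (uint32_t)value;
  //     if (elem_size == 1) { v &= 0xff;   v |= v << 8; v |= v << 16; }
  //     if (elem_size == 2) { v &= 0xffff; v |= v << 16; }
  //     v |= v << 32;                              // 64-bit repeating pattern
  //     uint64_t* p = (uint64_t*)to;
  //     size_t bytes = count * elem_size;
  //     while (bytes >= 32) {                      // 32-byte chunks
  //       p[0] = v; p[1] = v; p[2] = v; p[3] = v;
  //       p += 4; bytes -= 32;
  //     }
  //     // remaining 8-, 4-, 2- and 1-byte tails are stored as in the stub
  //   }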
  1.1863 +
  1.1864 +  //
  1.1865 +  //  Generate stub for conjoint short copy.  If "aligned" is true, the
  1.1866 +  //  "from" and "to" addresses are assumed to be heapword aligned.
  1.1867 +  //
  1.1868 +  // Arguments for generated stub:
  1.1869 +  //      from:  O0
  1.1870 +  //      to:    O1
  1.1871 +  //      count: O2 treated as signed
  1.1872 +  //
  1.1873 +  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
  1.1874 +                                       address *entry, const char *name) {
  1.1875 +    // Do reverse copy.
  1.1876 +
  1.1877 +    __ align(CodeEntryAlignment);
  1.1878 +    StubCodeMark mark(this, "StubRoutines", name);
  1.1879 +    address start = __ pc();
  1.1880 +
  1.1881 +    Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
  1.1882 +    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
  1.1883 +
  1.1884 +    const Register from      = O0;   // source array address
  1.1885 +    const Register to        = O1;   // destination array address
  1.1886 +    const Register count     = O2;   // elements count
  1.1887 +    const Register end_from  = from; // source array end address
  1.1888 +    const Register end_to    = to;   // destination array end address
  1.1889 +
  1.1890 +    const Register byte_count = O3;  // bytes count to copy
  1.1891 +
  1.1892 +    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1.1893 +
  1.1894 +    if (entry != NULL) {
  1.1895 +      *entry = __ pc();
  1.1896 +      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1.1897 +      BLOCK_COMMENT("Entry:");
  1.1898 +    }
  1.1899 +
  1.1900 +    array_overlap_test(nooverlap_target, 1);
  1.1901 +
  1.1902 +    __ sllx(count, LogBytesPerShort, byte_count);
  1.1903 +    __ add(to, byte_count, end_to);  // offset after last copied element
  1.1904 +
  1.1905 +    // for short arrays, just do single element copy
  1.1906 +    __ cmp(count, 11); // 8 + 3  (22 bytes)
  1.1907 +    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
  1.1908 +    __ delayed()->add(from, byte_count, end_from);
  1.1909 +
  1.1910 +    {
  1.1911 +      // Align the ends of the arrays, since they may not be aligned even
  1.1912 +      // when the arrays themselves are aligned.
  1.1913 +
  1.1914 +      // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
  1.1915 +      __ andcc(end_to, 3, G0);
  1.1916 +      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1.1917 +      __ delayed()->lduh(end_from, -2, O3);
  1.1918 +      __ dec(end_from, 2);
  1.1919 +      __ dec(end_to, 2);
  1.1920 +      __ dec(count);
  1.1921 +      __ sth(O3, end_to, 0);
  1.1922 +    __ BIND(L_skip_alignment);
  1.1923 +
  1.1924 +      // copy 2 elements to align 'end_to' on an 8 byte boundary
  1.1925 +      __ andcc(end_to, 7, G0);
  1.1926 +      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
  1.1927 +      __ delayed()->lduh(end_from, -2, O3);
  1.1928 +      __ dec(count, 2);
  1.1929 +      __ lduh(end_from, -4, O4);
  1.1930 +      __ dec(end_from, 4);
  1.1931 +      __ dec(end_to, 4);
  1.1932 +      __ sth(O3, end_to, 2);
  1.1933 +      __ sth(O4, end_to, 0);
  1.1934 +    __ BIND(L_skip_alignment2);
  1.1935 +    }
  1.1936 +#ifdef _LP64
  1.1937 +    if (aligned) {
  1.1938 +      // Both arrays are aligned to 8-bytes in 64-bits VM.
  1.1939 +      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
  1.1940 +      // in unaligned case.
  1.1941 +      __ dec(count, 8);
  1.1942 +    } else
  1.1943 +#endif
  1.1944 +    {
  1.1945 +      // Copy with shift 16 bytes per iteration if arrays do not have
  1.1946 +      // the same alignment mod 8, otherwise jump to the next
  1.1947 +      // code for aligned copy (and subtracting 8 from 'count' before the jump).
  1.1948 +      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
  1.1949 +      // Also jump over aligned copy after the copy with shift completed.
  1.1950 +
  1.1951 +      copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
  1.1952 +                                        L_aligned_copy, L_copy_2_bytes);
  1.1953 +    }
  1.1954 +    // copy 8 elements (16 bytes) at a time
  1.1955 +      __ align(OptoLoopAlignment);
  1.1956 +    __ BIND(L_aligned_copy);
  1.1957 +      __ dec(end_from, 16);
  1.1958 +      __ ldx(end_from, 8, O3);
  1.1959 +      __ ldx(end_from, 0, O4);
  1.1960 +      __ dec(end_to, 16);
  1.1961 +      __ deccc(count, 8);
  1.1962 +      __ stx(O3, end_to, 8);
  1.1963 +      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
  1.1964 +      __ delayed()->stx(O4, end_to, 0);
  1.1965 +      __ inc(count, 8);
  1.1966 +
  1.1967 +    // copy 1 element (2 bytes) at a time
  1.1968 +    __ BIND(L_copy_2_bytes);
  1.1969 +      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  1.1970 +    __ BIND(L_copy_2_bytes_loop);
  1.1971 +      __ dec(end_from, 2);
  1.1972 +      __ dec(end_to, 2);
  1.1973 +      __ lduh(end_from, 0, O4);
  1.1974 +      __ deccc(count);
  1.1975 +      __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
  1.1976 +      __ delayed()->sth(O4, end_to, 0);
  1.1977 +
  1.1978 +    __ BIND(L_exit);
  1.1979 +    // O3, O4 are used as temp registers
  1.1980 +    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
  1.1981 +    __ retl();
  1.1982 +    __ delayed()->mov(G0, O0); // return 0
  1.1983 +    return start;
  1.1984 +  }
  1.1985 +
  1.1986 +  //
  1.1987 +  // Helper methods for generate_disjoint_int_copy_core()
  1.1988 +  //
  1.1989 +  void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
  1.1990 +                          Label& L_loop, bool use_prefetch, bool use_bis) {
  1.1991 +
  1.1992 +    __ align(OptoLoopAlignment);
  1.1993 +    __ BIND(L_loop);
  1.1994 +    if (use_prefetch) {
  1.1995 +      if (ArraycopySrcPrefetchDistance > 0) {
  1.1996 +        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
  1.1997 +      }
  1.1998 +      if (ArraycopyDstPrefetchDistance > 0) {
  1.1999 +        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
  1.2000 +      }
  1.2001 +    }
  1.2002 +    __ ldx(from, 4, O4);
  1.2003 +    __ ldx(from, 12, G4);
  1.2004 +    __ inc(to, 16);
  1.2005 +    __ inc(from, 16);
  1.2006 +    __ deccc(count, 4); // Can we do next iteration after this one?
  1.2007 +
  1.2008 +    __ srlx(O4, 32, G3);
  1.2009 +    __ bset(G3, O3);
  1.2010 +    __ sllx(O4, 32, O4);
  1.2011 +    __ srlx(G4, 32, G3);
  1.2012 +    __ bset(G3, O4);
  1.2013 +    if (use_bis) {
  1.2014 +      __ stxa(O3, to, -16);
  1.2015 +      __ stxa(O4, to, -8);
  1.2016 +    } else {
  1.2017 +      __ stx(O3, to, -16);
  1.2018 +      __ stx(O4, to, -8);
  1.2019 +    }
  1.2020 +    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  1.2021 +    __ delayed()->sllx(G4, 32,  O3);
  1.2022 +
  1.2023 +  }
  1.2024 +
  1.2025 +  //
  1.2026 +  //  Generate core code for disjoint int copy (and oop copy on 32-bit).
  1.2027 +  //  If "aligned" is true, the "from" and "to" addresses are assumed
  1.2028 +  //  to be heapword aligned.
  1.2029 +  //
  1.2030 +  // Arguments:
  1.2031 +  //      from:  O0
  1.2032 +  //      to:    O1
  1.2033 +  //      count: O2 treated as signed
  1.2034 +  //
  1.2035 +  void generate_disjoint_int_copy_core(bool aligned) {
  1.2036 +
  1.2037 +    Label L_skip_alignment, L_aligned_copy;
  1.2038 +    Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
  1.2039 +
  1.2040 +    const Register from      = O0;   // source array address
  1.2041 +    const Register to        = O1;   // destination array address
  1.2042 +    const Register count     = O2;   // elements count
  1.2043 +    const Register offset    = O5;   // offset from start of arrays
  1.2044 +    // O3, O4, G3, G4 are used as temp registers
  1.2045 +
  1.2046 +    // 'aligned' == true when it is known statically during compilation
  1.2047 +    // of this arraycopy call site that both 'from' and 'to' addresses
  1.2048 +    // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
  1.2049 +    //
  1.2050 +    // Aligned arrays have 4-byte alignment in the 32-bit VM
  1.2051 +    // and 8-byte alignment in the 64-bit VM.
  1.2052 +    //
  1.2053 +#ifdef _LP64
  1.2054 +    if (!aligned)
  1.2055 +#endif
  1.2056 +    {
  1.2057 +      // The next check could be put under 'ifndef' since the code in
  1.2058 +      // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
  1.2059 +
  1.2060 +      // for short arrays, just do single element copy
  1.2061 +      __ cmp(count, 5); // 4 + 1 (20 bytes)
  1.2062 +      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
  1.2063 +      __ delayed()->mov(G0, offset);
  1.2064 +
  1.2065 +      // copy 1 element to align 'to' on an 8 byte boundary
  1.2066 +      __ andcc(to, 7, G0);
  1.2067 +      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1.2068 +      __ delayed()->ld(from, 0, O3);
  1.2069 +      __ inc(from, 4);
  1.2070 +      __ inc(to, 4);
  1.2071 +      __ dec(count);
  1.2072 +      __ st(O3, to, -4);
  1.2073 +    __ BIND(L_skip_alignment);
  1.2074 +
  1.2075 +    // if arrays have the same alignment mod 8, do a 4-element copy
  1.2076 +      __ andcc(from, 7, G0);
  1.2077 +      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  1.2078 +      __ delayed()->ld(from, 0, O3);
  1.2079 +
  1.2080 +    //
  1.2081 +    // Load 2 aligned 8-bytes chunks and use one from previous iteration
  1.2082 +    // to form 2 aligned 8-bytes chunks to store.
  1.2083 +    //
  1.2084 +    // copy_16_bytes_forward_with_shift() is not used here since this
  1.2085 +    // code is more efficient.
  1.2086 +
  1.2087 +    // copy with shift 4 elements (16 bytes) at a time
  1.2088 +      __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
  1.2089 +      __ sllx(O3, 32,  O3);
  1.2090 +
  1.2091 +      disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop);
  1.2092 +
  1.2093 +      __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
  1.2094 +      __ delayed()->inc(count, 4); // restore 'count'
  1.2095 +
  1.2096 +    __ BIND(L_aligned_copy);
  1.2097 +    } // !aligned
  1.2098 +
  1.2099 +    // copy 4 elements (16 bytes) at a time
  1.2100 +      __ and3(count, 1, G4); // Save
  1.2101 +      __ srl(count, 1, count);
  1.2102 +     generate_disjoint_long_copy_core(aligned);
  1.2103 +      __ mov(G4, count);     // Restore
  1.2104 +
  1.2105 +    // copy 1 element at a time
  1.2106 +    __ BIND(L_copy_4_bytes);
  1.2107 +      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  1.2108 +    __ BIND(L_copy_4_bytes_loop);
  1.2109 +      __ ld(from, offset, O3);
  1.2110 +      __ deccc(count);
  1.2111 +      __ st(O3, to, offset);
  1.2112 +      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
  1.2113 +      __ delayed()->inc(offset, 4);
  1.2114 +    __ BIND(L_exit);
  1.2115 +  }
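
  // [Editor's note] When 'to' is 8-byte aligned but 'from' is only 4-byte
  // aligned (from == 4 mod 8), the loop above keeps the pending int in the
  // high half of a register and merges it with the next aligned 8-byte load.
  // A hedged big-endian sketch, two ints per step (names hypothetical):
  //
  //   jint*    d    = (jint*)to;                           // 8-byte aligned
  //   jint*    s    = (jint*)from;                         // 4 mod 8
  //   uint64_t prev = (uint64_t)(uint32_t)s[0] << 32;      // pending int
  //   size_t   n    = count;
  //   while (n >= 3) {                     // need 2 more ints beyond 'prev'
  //     uint64_t pair = *(const uint64_t*)(s + 1);         // aligned load
  //     *(uint64_t*)d = prev | (pair >> 32);               // aligned store
  //     prev = pair << 32;
  //     s += 2; d += 2; n -= 2;
  //   }
  //   // the last one or two ints are copied singly, as in L_copy_4_bytes_loop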
  1.2116 +
  1.2117 +  //
  1.2118 +  //  Generate stub for disjoint int copy.  If "aligned" is true, the
  1.2119 +  //  "from" and "to" addresses are assumed to be heapword aligned.
  1.2120 +  //
  1.2121 +  // Arguments for generated stub:
  1.2122 +  //      from:  O0
  1.2123 +  //      to:    O1
  1.2124 +  //      count: O2 treated as signed
  1.2125 +  //
  1.2126 +  address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
  1.2127 +    __ align(CodeEntryAlignment);
  1.2128 +    StubCodeMark mark(this, "StubRoutines", name);
  1.2129 +    address start = __ pc();
  1.2130 +
  1.2131 +    const Register count = O2;
  1.2132 +    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1.2133 +
  1.2134 +    if (entry != NULL) {
  1.2135 +      *entry = __ pc();
  1.2136 +      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1.2137 +      BLOCK_COMMENT("Entry:");
  1.2138 +    }
  1.2139 +
  1.2140 +    generate_disjoint_int_copy_core(aligned);
  1.2141 +
  1.2142 +    // O3, O4 are used as temp registers
  1.2143 +    inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
  1.2144 +    __ retl();
  1.2145 +    __ delayed()->mov(G0, O0); // return 0
  1.2146 +    return start;
  1.2147 +  }
  1.2148 +
  1.2149 +  //
  1.2150 +  //  Generate core code for conjoint int copy (and oop copy on 32-bit).
  1.2151 +  //  If "aligned" is true, the "from" and "to" addresses are assumed
  1.2152 +  //  to be heapword aligned.
  1.2153 +  //
  1.2154 +  // Arguments:
  1.2155 +  //      from:  O0
  1.2156 +  //      to:    O1
  1.2157 +  //      count: O2 treated as signed
  1.2158 +  //
  1.2159 +  void generate_conjoint_int_copy_core(bool aligned) {
  1.2160 +    // Do reverse copy.
  1.2161 +
  1.2162 +    Label L_skip_alignment, L_aligned_copy;
  1.2163 +    Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
  1.2164 +
  1.2165 +    const Register from      = O0;   // source array address
  1.2166 +    const Register to        = O1;   // destination array address
  1.2167 +    const Register count     = O2;   // elements count
  1.2168 +    const Register end_from  = from; // source array end address
  1.2169 +    const Register end_to    = to;   // destination array end address
  1.2170 +    // O3, O4, O5, G3 are used as temp registers
  1.2171 +
  1.2172 +    const Register byte_count = O3;  // bytes count to copy
  1.2173 +
  1.2174 +      __ sllx(count, LogBytesPerInt, byte_count);
  1.2175 +      __ add(to, byte_count, end_to); // offset after last copied element
  1.2176 +
  1.2177 +      __ cmp(count, 5); // for short arrays, just do single element copy
  1.2178 +      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
  1.2179 +      __ delayed()->add(from, byte_count, end_from);
  1.2180 +
  1.2181 +    // copy 1 element to align 'to' on an 8 byte boundary
  1.2182 +      __ andcc(end_to, 7, G0);
  1.2183 +      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
  1.2184 +      __ delayed()->nop();
  1.2185 +      __ dec(count);
  1.2186 +      __ dec(end_from, 4);
  1.2187 +      __ dec(end_to,   4);
  1.2188 +      __ ld(end_from, 0, O4);
  1.2189 +      __ st(O4, end_to, 0);
  1.2190 +    __ BIND(L_skip_alignment);
  1.2191 +
  1.2192 +    // Check if 'end_from' and 'end_to' have the same alignment.
  1.2193 +      __ andcc(end_from, 7, G0);
  1.2194 +      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
  1.2195 +      __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
  1.2196 +
  1.2197 +    // copy with shift 4 elements (16 bytes) at a time
  1.2198 +    //
  1.2199 +    // Load 2 aligned 8-bytes chunks and use one from previous iteration
  1.2200 +    // to form 2 aligned 8-bytes chunks to store.
  1.2201 +    //
  1.2202 +      __ ldx(end_from, -4, O3);
  1.2203 +      __ align(OptoLoopAlignment);
  1.2204 +    __ BIND(L_copy_16_bytes);
  1.2205 +      __ ldx(end_from, -12, O4);
  1.2206 +      __ deccc(count, 4);
  1.2207 +      __ ldx(end_from, -20, O5);
  1.2208 +      __ dec(end_to, 16);
  1.2209 +      __ dec(end_from, 16);
  1.2210 +      __ srlx(O3, 32, O3);
  1.2211 +      __ sllx(O4, 32, G3);
  1.2212 +      __ bset(G3, O3);
  1.2213 +      __ stx(O3, end_to, 8);
  1.2214 +      __ srlx(O4, 32, O4);
  1.2215 +      __ sllx(O5, 32, G3);
  1.2216 +      __ bset(O4, G3);
  1.2217 +      __ stx(G3, end_to, 0);
  1.2218 +      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
  1.2219 +      __ delayed()->mov(O5, O3);
  1.2220 +
  1.2221 +      __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
  1.2222 +      __ delayed()->inc(count, 4);
  1.2223 +
  1.2224 +    // copy 4 elements (16 bytes) at a time
  1.2225 +      __ align(OptoLoopAlignment);
  1.2226 +    __ BIND(L_aligned_copy);
  1.2227 +      __ dec(end_from, 16);
  1.2228 +      __ ldx(end_from, 8, O3);
  1.2229 +      __ ldx(end_from, 0, O4);
  1.2230 +      __ dec(end_to, 16);
  1.2231 +      __ deccc(count, 4);
  1.2232 +      __ stx(O3, end_to, 8);
  1.2233 +      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
  1.2234 +      __ delayed()->stx(O4, end_to, 0);
  1.2235 +      __ inc(count, 4);
  1.2236 +
  1.2237 +    // copy 1 element (4 bytes) at a time
  1.2238 +    __ BIND(L_copy_4_bytes);
  1.2239 +      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
  1.2240 +    __ BIND(L_copy_4_bytes_loop);
  1.2241 +      __ dec(end_from, 4);
  1.2242 +      __ dec(end_to, 4);
  1.2243 +      __ ld(end_from, 0, O4);
  1.2244 +      __ deccc(count);
  1.2245 +      __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
  1.2246 +      __ delayed()->st(O4, end_to, 0);
  1.2247 +    __ BIND(L_exit);
  1.2248 +  }
  1.2249 +
  1.2250 +  //
  1.2251 +  //  Generate stub for conjoint int copy.  If "aligned" is true, the
  1.2252 +  //  "from" and "to" addresses are assumed to be heapword aligned.
  1.2253 +  //
  1.2254 +  // Arguments for generated stub:
  1.2255 +  //      from:  O0
  1.2256 +  //      to:    O1
  1.2257 +  //      count: O2 treated as signed
  1.2258 +  //
  1.2259 +  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
  1.2260 +                                     address *entry, const char *name) {
  1.2261 +    __ align(CodeEntryAlignment);
  1.2262 +    StubCodeMark mark(this, "StubRoutines", name);
  1.2263 +    address start = __ pc();
  1.2264 +
  1.2265 +    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
  1.2266 +
  1.2267 +    if (entry != NULL) {
  1.2268 +      *entry = __ pc();
  1.2269 +      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1.2270 +      BLOCK_COMMENT("Entry:");
  1.2271 +    }
  1.2272 +
  1.2273 +    array_overlap_test(nooverlap_target, 2);
  1.2274 +
  1.2275 +    generate_conjoint_int_copy_core(aligned);
  1.2276 +
  1.2277 +    // O3, O4 are used as temp registers
  1.2278 +    inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
  1.2279 +    __ retl();
  1.2280 +    __ delayed()->mov(G0, O0); // return 0
  1.2281 +    return start;
  1.2282 +  }
  1.2283 +
  1.2284 +  //
  1.2285 +  // Helper methods for generate_disjoint_long_copy_core()
  1.2286 +  //
  1.2287 +  void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
  1.2288 +                          Label& L_loop, bool use_prefetch, bool use_bis) {
  1.2289 +    __ align(OptoLoopAlignment);
  1.2290 +    __ BIND(L_loop);
  1.2291 +    for (int off = 0; off < 64; off += 16) {
  1.2292 +      if (use_prefetch && (off & 31) == 0) {
  1.2293 +        if (ArraycopySrcPrefetchDistance > 0) {
  1.2294 +          __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
  1.2295 +        }
  1.2296 +        if (ArraycopyDstPrefetchDistance > 0) {
  1.2297 +          __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
  1.2298 +        }
  1.2299 +      }
  1.2300 +      __ ldx(from,  off+0, O4);
  1.2301 +      __ ldx(from,  off+8, O5);
  1.2302 +      if (use_bis) {
  1.2303 +        __ stxa(O4, to,  off+0);
  1.2304 +        __ stxa(O5, to,  off+8);
  1.2305 +      } else {
  1.2306 +        __ stx(O4, to,  off+0);
  1.2307 +        __ stx(O5, to,  off+8);
  1.2308 +      }
  1.2309 +    }
  1.2310 +    __ deccc(count, 8);
  1.2311 +    __ inc(from, 64);
  1.2312 +    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  1.2313 +    __ delayed()->inc(to, 64);
  1.2314 +  }
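
  // [Editor's note] A hedged sketch of what the unrolled loop above does per
  // iteration: copy one 64-byte chunk and issue software prefetches some
  // distance ahead.  The distance and the GCC-style __builtin_prefetch are
  // illustrative stand-ins for ArraycopySrcPrefetchDistance /
  // ArraycopyDstPrefetchDistance and the SPARC prefetch instruction; with
  // use_bis the stores become block-initializing (BIS) stores so the
  // destination lines need not be read before being overwritten.
  //
  //   void copy_64_byte_chunks(uint64_t* to, const uint64_t* from, size_t chunks) {
  //     const size_t dist = 512 / sizeof(uint64_t);       // hypothetical distance
  //     for (size_t c = 0; c < chunks; c++, to += 8, from += 8) {
  //       __builtin_prefetch(from + dist, 0);             // read prefetch
  //       __builtin_prefetch(to   + dist, 1);             // write prefetch
  //       for (int j = 0; j < 8; j++) to[j] = from[j];    // 64 bytes
  //     }
  //   }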
  1.2315 +
  1.2316 +  //
  1.2317 +  //  Generate core code for disjoint long copy (and oop copy on 64-bit).
  1.2318 +  //  "aligned" is ignored, because we must make the stronger
  1.2319 +  //  assumption that both addresses are always 64-bit aligned.
  1.2320 +  //
  1.2321 +  // Arguments:
  1.2322 +  //      from:  O0
  1.2323 +  //      to:    O1
  1.2324 +  //      count: O2 treated as signed
  1.2325 +  //
  1.2326 +  // count -= 2;
  1.2327 +  // if ( count >= 0 ) { // >= 2 elements
  1.2328 +  //   if ( count > 6) { // >= 8 elements
  1.2329 +  //     count -= 6; // original count - 8
  1.2330 +  //     do {
  1.2331 +  //       copy_8_elements;
  1.2332 +  //       count -= 8;
  1.2333 +  //     } while ( count >= 0 );
  1.2334 +  //     count += 6;
  1.2335 +  //   }
  1.2336 +  //   if ( count >= 0 ) { // >= 2 elements
  1.2337 +  //     do {
  1.2338 +  //       copy_2_elements;
  1.2339 +  //     } while ( (count=count-2) >= 0 );
  1.2340 +  //   }
  1.2341 +  // }
  1.2342 +  // count += 2;
  1.2343 +  // if ( count != 0 ) { // 1 element left
  1.2344 +  //   copy_1_element;
  1.2345 +  // }
  1.2346 +  //
  1.2347 +  void generate_disjoint_long_copy_core(bool aligned) {
  1.2348 +    Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
  1.2349 +    const Register from    = O0;  // source array address
  1.2350 +    const Register to      = O1;  // destination array address
  1.2351 +    const Register count   = O2;  // elements count
  1.2352 +    const Register offset0 = O4;  // element offset
  1.2353 +    const Register offset8 = O5;  // next element offset
  1.2354 +
  1.2355 +    __ deccc(count, 2);
  1.2356 +    __ mov(G0, offset0);   // offset from start of arrays (0)
  1.2357 +    __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
  1.2358 +    __ delayed()->add(offset0, 8, offset8);
  1.2359 +
  1.2360 +    // Copy by 64-byte chunks
  1.2361 +
  1.2362 +    const Register from64 = O3;  // source address
  1.2363 +    const Register to64   = G3;  // destination address
  1.2364 +    __ subcc(count, 6, O3);
  1.2365 +    __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
  1.2366 +    __ delayed()->mov(to,   to64);
  1.2367 +    // Now we can use O4(offset0), O5(offset8) as temps
  1.2368 +    __ mov(O3, count);
  1.2369 +    // count >= 0 (original count - 8)
  1.2370 +    __ mov(from, from64);
  1.2371 +
  1.2372 +    disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop);
  1.2373 +
  1.2374 +      // Restore O4(offset0), O5(offset8)
  1.2375 +      __ sub(from64, from, offset0);
  1.2376 +      __ inccc(count, 6); // restore count
  1.2377 +      __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
  1.2378 +      __ delayed()->add(offset0, 8, offset8);
  1.2379 +
  1.2380 +      // Copy by 16-byte chunks
  1.2381 +      __ align(OptoLoopAlignment);
  1.2382 +    __ BIND(L_copy_16_bytes);
  1.2383 +      __ ldx(from, offset0, O3);
  1.2384 +      __ ldx(from, offset8, G3);
  1.2385 +      __ deccc(count, 2);
  1.2386 +      __ stx(O3, to, offset0);
  1.2387 +      __ inc(offset0, 16);
  1.2388 +      __ stx(G3, to, offset8);
  1.2389 +      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
  1.2390 +      __ delayed()->inc(offset8, 16);
  1.2391 +
  1.2392 +      // Copy last 8 bytes
  1.2393 +    __ BIND(L_copy_8_bytes);
  1.2394 +      __ inccc(count, 2);
  1.2395 +      __ brx(Assembler::zero, true, Assembler::pn, L_exit );
  1.2396 +      __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
  1.2397 +      __ ldx(from, offset0, O3);
  1.2398 +      __ stx(O3, to, offset0);
  1.2399 +    __ BIND(L_exit);
  1.2400 +  }
  1.2401 +
  1.2402 +  //
  1.2403 +  //  Generate stub for disjoint long copy.
  1.2404 +  //  "aligned" is ignored, because we must make the stronger
  1.2405 +  //  assumption that both addresses are always 64-bit aligned.
  1.2406 +  //
  1.2407 +  // Arguments for generated stub:
  1.2408 +  //      from:  O0
  1.2409 +  //      to:    O1
  1.2410 +  //      count: O2 treated as signed
  1.2411 +  //
  1.2412 +  address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
  1.2413 +    __ align(CodeEntryAlignment);
  1.2414 +    StubCodeMark mark(this, "StubRoutines", name);
  1.2415 +    address start = __ pc();
  1.2416 +
  1.2417 +    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
  1.2418 +
  1.2419 +    if (entry != NULL) {
  1.2420 +      *entry = __ pc();
  1.2421 +      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1.2422 +      BLOCK_COMMENT("Entry:");
  1.2423 +    }
  1.2424 +
  1.2425 +    generate_disjoint_long_copy_core(aligned);
  1.2426 +
  1.2427 +    // O3, O4 are used as temp registers
  1.2428 +    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
  1.2429 +    __ retl();
  1.2430 +    __ delayed()->mov(G0, O0); // return 0
  1.2431 +    return start;
  1.2432 +  }
  1.2433 +
  1.2434 +  //
  1.2435 +  //  Generate core code for conjoint long copy (and oop copy on 64-bit).
  1.2436 +  //  "aligned" is ignored, because we must make the stronger
  1.2437 +  //  assumption that both addresses are always 64-bit aligned.
  1.2438 +  //
  1.2439 +  // Arguments:
  1.2440 +  //      from:  O0
  1.2441 +  //      to:    O1
  1.2442 +  //      count: O2 treated as signed
  1.2443 +  //
  1.2444 +  void generate_conjoint_long_copy_core(bool aligned) {
  1.2445 +    // Do reverse copy.
  1.2446 +    Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
  1.2447 +    const Register from    = O0;  // source array address
  1.2448 +    const Register to      = O1;  // destination array address
  1.2449 +    const Register count   = O2;  // elements count
  1.2450 +    const Register offset8 = O4;  // element offset
  1.2451 +    const Register offset0 = O5;  // previous element offset
  1.2452 +
  1.2453 +      __ subcc(count, 1, count);
  1.2454 +      __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
  1.2455 +      __ delayed()->sllx(count, LogBytesPerLong, offset8);
  1.2456 +      __ sub(offset8, 8, offset0);
  1.2457 +      __ align(OptoLoopAlignment);
  1.2458 +    __ BIND(L_copy_16_bytes);
  1.2459 +      __ ldx(from, offset8, O2);
  1.2460 +      __ ldx(from, offset0, O3);
  1.2461 +      __ stx(O2, to, offset8);
  1.2462 +      __ deccc(offset8, 16);      // use offset8 as counter
  1.2463 +      __ stx(O3, to, offset0);
  1.2464 +      __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
  1.2465 +      __ delayed()->dec(offset0, 16);
  1.2466 +
  1.2467 +    __ BIND(L_copy_8_bytes);
  1.2468 +      __ brx(Assembler::negative, false, Assembler::pn, L_exit );
  1.2469 +      __ delayed()->nop();
  1.2470 +      __ ldx(from, 0, O3);
  1.2471 +      __ stx(O3, to, 0);
  1.2472 +    __ BIND(L_exit);
  1.2473 +  }
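
  // [Editor's note] A hedged sketch of the reverse copy above: two 8-byte
  // elements per step using descending offsets (offset8/offset0 double as the
  // loop counter in the stub), then one final element if the count was odd.
  // Names are hypothetical.
  //
  //   jlong*       d = (jlong*)to;
  //   const jlong* s = (const jlong*)from;
  //   long i = (long)count - 1;               // index of the last element
  //   while (i >= 1) {
  //     d[i]     = s[i];
  //     d[i - 1] = s[i - 1];
  //     i -= 2;
  //   }
  //   if (i == 0) d[0] = s[0];                // odd element left over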
  1.2474 +
  1.2475 +  //  Generate stub for conjoint long copy.
  1.2476 +  //  "aligned" is ignored, because we must make the stronger
  1.2477 +  //  assumption that both addresses are always 64-bit aligned.
  1.2478 +  //
  1.2479 +  // Arguments for generated stub:
  1.2480 +  //      from:  O0
  1.2481 +  //      to:    O1
  1.2482 +  //      count: O2 treated as signed
  1.2483 +  //
  1.2484 +  address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
  1.2485 +                                      address *entry, const char *name) {
  1.2486 +    __ align(CodeEntryAlignment);
  1.2487 +    StubCodeMark mark(this, "StubRoutines", name);
  1.2488 +    address start = __ pc();
  1.2489 +
  1.2490 +    assert(aligned, "Should always be aligned");
  1.2491 +
  1.2492 +    assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
  1.2493 +
  1.2494 +    if (entry != NULL) {
  1.2495 +      *entry = __ pc();
  1.2496 +      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  1.2497 +      BLOCK_COMMENT("Entry:");
  1.2498 +    }
  1.2499 +
  1.2500 +    array_overlap_test(nooverlap_target, 3);
  1.2501 +
  1.2502 +    generate_conjoint_long_copy_core(aligned);
  1.2503 +
  1.2504 +    // O3, O4 are used as temp registers
  1.2505 +    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
  1.2506 +    __ retl();
  1.2507 +    __ delayed()->mov(G0, O0); // return 0
  1.2508 +    return start;
  1.2509 +  }
  1.2510 +
  1.2511 +  //  Generate stub for disjoint oop copy.  If "aligned" is true, the
  1.2512 +  //  "from" and "to" addresses are assumed to be heapword aligned.
  1.2513 +  //
  1.2514 +  // Arguments for generated stub:
  1.2515 +  //      from:  O0
  1.2516 +  //      to:    O1
  1.2517 +  //      count: O2 treated as signed
  1.2518 +  //
  1.2519 +  address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
  1.2520 +                                     bool dest_uninitialized = false) {
  1.2521 +
  1.2522 +    const Register from  = O0;  // source array address
  1.2523 +    const Register to    = O1;  // destination array address
  1.2524 +    const Register count = O2;  // elements count
  1.2525 +
  1.2526 +    __ align(CodeEntryAlignment);
  1.2527 +    StubCodeMark mark(this, "StubRoutines", name);
  1.2528 +    address start = __ pc();
  1.2529 +
  1.2530 +    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1.2531 +
  1.2532 +    if (entry != NULL) {
  1.2533 +      *entry = __ pc();
  1.2534 +      // caller can pass a 64-bit byte count here
  1.2535 +      BLOCK_COMMENT("Entry:");
  1.2536 +    }
  1.2537 +
  1.2538 +    // save arguments for barrier generation
  1.2539 +    __ mov(to, G1);
  1.2540 +    __ mov(count, G5);
  1.2541 +    gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
  1.2542 +  #ifdef _LP64
  1.2543 +    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1.2544 +    if (UseCompressedOops) {
  1.2545 +      generate_disjoint_int_copy_core(aligned);
  1.2546 +    } else {
  1.2547 +      generate_disjoint_long_copy_core(aligned);
  1.2548 +    }
  1.2549 +  #else
  1.2550 +    generate_disjoint_int_copy_core(aligned);
  1.2551 +  #endif
  1.2552 +    // O0 is used as temp register
  1.2553 +    gen_write_ref_array_post_barrier(G1, G5, O0);
  1.2554 +
  1.2555 +    // O3, O4 are used as temp registers
  1.2556 +    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
  1.2557 +    __ retl();
  1.2558 +    __ delayed()->mov(G0, O0); // return 0
  1.2559 +    return start;
  1.2560 +  }
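
  // [Editor's note] Structurally, both oop copy stubs are the int or long
  // copy core (depending on UseCompressedOops) bracketed by the collector's
  // write barriers over the destination range.  In hedged pseudocode:
  //
  //   pre_barrier(to, count);        // e.g. G1 SATB; elided work when
  //                                  // dest_uninitialized
  //   copy_elements(from, to, count);// 32-bit narrow oops or 64-bit oops
  //   post_barrier(to, count);       // e.g. card-table dirtying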
  1.2561 +
  1.2562 +  //  Generate stub for conjoint oop copy.  If "aligned" is true, the
  1.2563 +  //  "from" and "to" addresses are assumed to be heapword aligned.
  1.2564 +  //
  1.2565 +  // Arguments for generated stub:
  1.2566 +  //      from:  O0
  1.2567 +  //      to:    O1
  1.2568 +  //      count: O2 treated as signed
  1.2569 +  //
  1.2570 +  address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
  1.2571 +                                     address *entry, const char *name,
  1.2572 +                                     bool dest_uninitialized = false) {
  1.2573 +
  1.2574 +    const Register from  = O0;  // source array address
  1.2575 +    const Register to    = O1;  // destination array address
  1.2576 +    const Register count = O2;  // elements count
  1.2577 +
  1.2578 +    __ align(CodeEntryAlignment);
  1.2579 +    StubCodeMark mark(this, "StubRoutines", name);
  1.2580 +    address start = __ pc();
  1.2581 +
  1.2582 +    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
  1.2583 +
  1.2584 +    if (entry != NULL) {
  1.2585 +      *entry = __ pc();
  1.2586 +      // caller can pass a 64-bit byte count here
  1.2587 +      BLOCK_COMMENT("Entry:");
  1.2588 +    }
  1.2589 +
  1.2590 +    array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
  1.2591 +
  1.2592 +    // save arguments for barrier generation
  1.2593 +    __ mov(to, G1);
  1.2594 +    __ mov(count, G5);
  1.2595 +    gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
  1.2596 +
  1.2597 +  #ifdef _LP64
  1.2598 +    if (UseCompressedOops) {
  1.2599 +      generate_conjoint_int_copy_core(aligned);
  1.2600 +    } else {
  1.2601 +      generate_conjoint_long_copy_core(aligned);
  1.2602 +    }
  1.2603 +  #else
  1.2604 +    generate_conjoint_int_copy_core(aligned);
  1.2605 +  #endif
  1.2606 +
  1.2607 +    // O0 is used as temp register
  1.2608 +    gen_write_ref_array_post_barrier(G1, G5, O0);
  1.2609 +
  1.2610 +    // O3, O4 are used as temp registers
  1.2611 +    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
  1.2612 +    __ retl();
  1.2613 +    __ delayed()->mov(G0, O0); // return 0
  1.2614 +    return start;
  1.2615 +  }
  1.2616 +
  1.2617 +
  1.2618 +  // Helper for generating a dynamic type check.
  1.2619 +  // Smashes only the given temp registers.
  1.2620 +  void generate_type_check(Register sub_klass,
  1.2621 +                           Register super_check_offset,
  1.2622 +                           Register super_klass,
  1.2623 +                           Register temp,
  1.2624 +                           Label& L_success) {
  1.2625 +    assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
  1.2626 +
  1.2627 +    BLOCK_COMMENT("type_check:");
  1.2628 +
  1.2629 +    Label L_miss, L_pop_to_miss;
  1.2630 +
  1.2631 +    assert_clean_int(super_check_offset, temp);
  1.2632 +
  1.2633 +    __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
  1.2634 +                                     &L_success, &L_miss, NULL,
  1.2635 +                                     super_check_offset);
  1.2636 +
  1.2637 +    BLOCK_COMMENT("type_check_slow_path:");
  1.2638 +    __ save_frame(0);
  1.2639 +    __ check_klass_subtype_slow_path(sub_klass->after_save(),
  1.2640 +                                     super_klass->after_save(),
  1.2641 +                                     L0, L1, L2, L4,
  1.2642 +                                     NULL, &L_pop_to_miss);
  1.2643 +    __ ba(L_success);
  1.2644 +    __ delayed()->restore();
  1.2645 +
  1.2646 +    __ bind(L_pop_to_miss);
  1.2647 +    __ restore();
  1.2648 +
  1.2649 +    // Fall through on failure!
  1.2650 +    __ BIND(L_miss);
  1.2651 +  }
  1.2652 +
  1.2653 +
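  // Reference model (a hedged sketch, not HotSpot code): the checkcast copy
  // stub below implements this loop; 'is_assignable' stands in for the subtype
  // check against ckval, and all names here are illustrative only.
  template <typename Oop, typename IsAssignable>
  static intptr_t checkcast_copy_model(const Oop* from, Oop* to, size_t count,
                                       IsAssignable is_assignable) {
    size_t i = 0;
    for (; i < count; i++) {
      Oop el = from[i];
      if (el != Oop() && !is_assignable(el)) break;  // the failing element is not stored
      to[i] = el;                                    // nulls are always stored
    }
    return (i == count) ? 0 : ~(intptr_t)i;          // 0 on success, (-1 ^ K) after K stores
  }
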
  1.2654 +  //  Generate stub for checked oop copy.
  1.2655 +  //
  1.2656 +  // Arguments for generated stub:
  1.2657 +  //      from:  O0
  1.2658 +  //      to:    O1
  1.2659 +  //      count: O2 treated as signed
  1.2660 +  //      ckoff: O3 (super_check_offset)
  1.2661 +  //      ckval: O4 (super_klass)
  1.2662 +  //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
  1.2663 +  //
  1.2664 +  address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
  1.2665 +
  1.2666 +    const Register O0_from   = O0;      // source array address
  1.2667 +    const Register O1_to     = O1;      // destination array address
  1.2668 +    const Register O2_count  = O2;      // elements count
  1.2669 +    const Register O3_ckoff  = O3;      // super_check_offset
  1.2670 +    const Register O4_ckval  = O4;      // super_klass
  1.2671 +
  1.2672 +    const Register O5_offset = O5;      // loop var, with stride wordSize
  1.2673 +    const Register G1_remain = G1;      // loop var, with stride -1
  1.2674 +    const Register G3_oop    = G3;      // actual oop copied
  1.2675 +    const Register G4_klass  = G4;      // oop._klass
  1.2676 +    const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
  1.2677 +
  1.2678 +    __ align(CodeEntryAlignment);
  1.2679 +    StubCodeMark mark(this, "StubRoutines", name);
  1.2680 +    address start = __ pc();
  1.2681 +
  1.2682 +#ifdef ASSERT
  1.2683 +    // We sometimes save a frame (see generate_type_check below).
  1.2684 +    // If this will cause trouble, let's fail now instead of later.
  1.2685 +    __ save_frame(0);
  1.2686 +    __ restore();
  1.2687 +#endif
  1.2688 +
  1.2689 +    assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
  1.2690 +
  1.2691 +#ifdef ASSERT
  1.2692 +    // caller guarantees that the arrays really are different
  1.2693 +    // otherwise, we would have to make conjoint checks
  1.2694 +    { Label L;
  1.2695 +      __ mov(O3, G1);           // spill: overlap test smashes O3
  1.2696 +      __ mov(O4, G4);           // spill: overlap test smashes O4
  1.2697 +      array_overlap_test(L, LogBytesPerHeapOop);
  1.2698 +      __ stop("checkcast_copy within a single array");
  1.2699 +      __ bind(L);
  1.2700 +      __ mov(G1, O3);
  1.2701 +      __ mov(G4, O4);
  1.2702 +    }
  1.2703 +#endif //ASSERT
  1.2704 +
  1.2705 +    if (entry != NULL) {
  1.2706 +      *entry = __ pc();
  1.2707 +      // caller can pass a 64-bit byte count here (from generic stub)
  1.2708 +      BLOCK_COMMENT("Entry:");
  1.2709 +    }
  1.2710 +    gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
  1.2711 +
  1.2712 +    Label load_element, store_element, do_card_marks, fail, done;
  1.2713 +    __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
  1.2714 +    __ brx(Assembler::notZero, false, Assembler::pt, load_element);
  1.2715 +    __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
  1.2716 +
  1.2717 +    // Empty array:  Nothing to do.
  1.2718 +    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
  1.2719 +    __ retl();
  1.2720 +    __ delayed()->set(0, O0);           // return 0 on (trivial) success
  1.2721 +
  1.2722 +    // ======== begin loop ========
  1.2723 +    // (Loop is rotated; its entry is load_element.)
  1.2724 +    // Loop variables:
  1.2725 +    //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
  1.2726 +    //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
  1.2727 +    //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
  1.2728 +    __ align(OptoLoopAlignment);
  1.2729 +
  1.2730 +    __ BIND(store_element);
  1.2731 +    __ deccc(G1_remain);                // decrement the count
  1.2732 +    __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
  1.2733 +    __ inc(O5_offset, heapOopSize);     // step to next offset
  1.2734 +    __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
   1.2735 +    __ delayed()->set(0, O0);           // return 0 on success
  1.2736 +
  1.2737 +    // ======== loop entry is here ========
  1.2738 +    __ BIND(load_element);
  1.2739 +    __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
  1.2740 +    __ br_null_short(G3_oop, Assembler::pt, store_element);
  1.2741 +
  1.2742 +    __ load_klass(G3_oop, G4_klass); // query the object klass
  1.2743 +
  1.2744 +    generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
  1.2745 +                        // branch to this on success:
  1.2746 +                        store_element);
  1.2747 +    // ======== end loop ========
  1.2748 +
  1.2749 +    // It was a real error; we must depend on the caller to finish the job.
  1.2750 +    // Register G1 has number of *remaining* oops, O2 number of *total* oops.
  1.2751 +    // Emit GC store barriers for the oops we have copied (O2 minus G1),
  1.2752 +    // and report their number to the caller.
  1.2753 +    __ BIND(fail);
  1.2754 +    __ subcc(O2_count, G1_remain, O2_count);
  1.2755 +    __ brx(Assembler::zero, false, Assembler::pt, done);
  1.2756 +    __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
  1.2757 +
  1.2758 +    __ BIND(do_card_marks);
  1.2759 +    gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
  1.2760 +
  1.2761 +    __ BIND(done);
  1.2762 +    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
  1.2763 +    __ retl();
   1.2764 +    __ delayed()->nop();             // return value in O0
  1.2765 +
  1.2766 +    return start;
  1.2767 +  }
  1.2768 +
  1.2769 +
  1.2770 +  //  Generate 'unsafe' array copy stub
  1.2771 +  //  Though just as safe as the other stubs, it takes an unscaled
  1.2772 +  //  size_t argument instead of an element count.
  1.2773 +  //
  1.2774 +  // Arguments for generated stub:
  1.2775 +  //      from:  O0
  1.2776 +  //      to:    O1
  1.2777 +  //      count: O2 byte count, treated as ssize_t, can be zero
  1.2778 +  //
  1.2779 +  // Examines the alignment of the operands and dispatches
  1.2780 +  // to a long, int, short, or byte copy loop.
  1.2781 +  //
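  // Roughly (an editorial sketch; copy_longs/ints/shorts/bytes are illustrative
  // stand-ins for the four entry points passed in):
  //
  //   bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)count;
  //   if      ((bits & (BytesPerLong  - 1)) == 0) copy_longs (from, to, count >> LogBytesPerLong);
  //   else if ((bits & (BytesPerInt   - 1)) == 0) copy_ints  (from, to, count >> LogBytesPerInt);
  //   else if ((bits & (BytesPerShort - 1)) == 0) copy_shorts(from, to, count >> LogBytesPerShort);
  //   else                                        copy_bytes (from, to, count);
  //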
  1.2782 +  address generate_unsafe_copy(const char* name,
  1.2783 +                               address byte_copy_entry,
  1.2784 +                               address short_copy_entry,
  1.2785 +                               address int_copy_entry,
  1.2786 +                               address long_copy_entry) {
  1.2787 +
  1.2788 +    const Register O0_from   = O0;      // source array address
  1.2789 +    const Register O1_to     = O1;      // destination array address
  1.2790 +    const Register O2_count  = O2;      // elements count
  1.2791 +
  1.2792 +    const Register G1_bits   = G1;      // test copy of low bits
  1.2793 +
  1.2794 +    __ align(CodeEntryAlignment);
  1.2795 +    StubCodeMark mark(this, "StubRoutines", name);
  1.2796 +    address start = __ pc();
  1.2797 +
  1.2798 +    // bump this on entry, not on exit:
  1.2799 +    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
  1.2800 +
  1.2801 +    __ or3(O0_from, O1_to, G1_bits);
  1.2802 +    __ or3(O2_count,       G1_bits, G1_bits);
  1.2803 +
  1.2804 +    __ btst(BytesPerLong-1, G1_bits);
  1.2805 +    __ br(Assembler::zero, true, Assembler::pt,
  1.2806 +          long_copy_entry, relocInfo::runtime_call_type);
  1.2807 +    // scale the count on the way out:
  1.2808 +    __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
  1.2809 +
  1.2810 +    __ btst(BytesPerInt-1, G1_bits);
  1.2811 +    __ br(Assembler::zero, true, Assembler::pt,
  1.2812 +          int_copy_entry, relocInfo::runtime_call_type);
  1.2813 +    // scale the count on the way out:
  1.2814 +    __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
  1.2815 +
  1.2816 +    __ btst(BytesPerShort-1, G1_bits);
  1.2817 +    __ br(Assembler::zero, true, Assembler::pt,
  1.2818 +          short_copy_entry, relocInfo::runtime_call_type);
  1.2819 +    // scale the count on the way out:
  1.2820 +    __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
  1.2821 +
  1.2822 +    __ br(Assembler::always, false, Assembler::pt,
  1.2823 +          byte_copy_entry, relocInfo::runtime_call_type);
  1.2824 +    __ delayed()->nop();
  1.2825 +
  1.2826 +    return start;
  1.2827 +  }
  1.2828 +
  1.2829 +
  1.2830 +  // Perform range checks on the proposed arraycopy.
  1.2831 +  // Kills the two temps, but nothing else.
  1.2832 +  // Also, clean the sign bits of src_pos and dst_pos.
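  // In C terms (a hedged sketch; all values are known non-negative here):
  //   if (src_pos + length > src->length()) goto L_failed;
  //   if (dst_pos + length > dst->length()) goto L_failed;
  //   src_pos = (int32_t)src_pos;   // signx below: drop any stale high 32 bits
  //   dst_pos = (int32_t)dst_pos;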
  1.2833 +  void arraycopy_range_checks(Register src,     // source array oop (O0)
  1.2834 +                              Register src_pos, // source position (O1)
   1.2835 +                              Register dst,     // destination array oop (O2)
  1.2836 +                              Register dst_pos, // destination position (O3)
  1.2837 +                              Register length,  // length of copy (O4)
  1.2838 +                              Register temp1, Register temp2,
  1.2839 +                              Label& L_failed) {
  1.2840 +    BLOCK_COMMENT("arraycopy_range_checks:");
  1.2841 +
  1.2842 +    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
  1.2843 +
  1.2844 +    const Register array_length = temp1;  // scratch
  1.2845 +    const Register end_pos      = temp2;  // scratch
  1.2846 +
  1.2847 +    // Note:  This next instruction may be in the delay slot of a branch:
  1.2848 +    __ add(length, src_pos, end_pos);  // src_pos + length
  1.2849 +    __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
  1.2850 +    __ cmp(end_pos, array_length);
  1.2851 +    __ br(Assembler::greater, false, Assembler::pn, L_failed);
  1.2852 +
  1.2853 +    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
  1.2854 +    __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
  1.2855 +    __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
  1.2856 +    __ cmp(end_pos, array_length);
  1.2857 +    __ br(Assembler::greater, false, Assembler::pn, L_failed);
  1.2858 +
  1.2859 +    // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  1.2860 +    // Move with sign extension can be used since they are positive.
  1.2861 +    __ delayed()->signx(src_pos, src_pos);
  1.2862 +    __ signx(dst_pos, dst_pos);
  1.2863 +
  1.2864 +    BLOCK_COMMENT("arraycopy_range_checks done");
  1.2865 +  }
  1.2866 +
  1.2867 +
  1.2868 +  //
  1.2869 +  //  Generate generic array copy stubs
  1.2870 +  //
  1.2871 +  //  Input:
  1.2872 +  //    O0    -  src oop
  1.2873 +  //    O1    -  src_pos
  1.2874 +  //    O2    -  dst oop
  1.2875 +  //    O3    -  dst_pos
  1.2876 +  //    O4    -  element count
  1.2877 +  //
  1.2878 +  //  Output:
  1.2879 +  //    O0 ==  0  -  success
  1.2880 +  //    O0 == -1  -  need to call System.arraycopy
  1.2881 +  //
  1.2882 +  address generate_generic_copy(const char *name,
  1.2883 +                                address entry_jbyte_arraycopy,
  1.2884 +                                address entry_jshort_arraycopy,
  1.2885 +                                address entry_jint_arraycopy,
  1.2886 +                                address entry_oop_arraycopy,
  1.2887 +                                address entry_jlong_arraycopy,
  1.2888 +                                address entry_checkcast_arraycopy) {
  1.2889 +    Label L_failed, L_objArray;
  1.2890 +
  1.2891 +    // Input registers
  1.2892 +    const Register src      = O0;  // source array oop
  1.2893 +    const Register src_pos  = O1;  // source position
  1.2894 +    const Register dst      = O2;  // destination array oop
  1.2895 +    const Register dst_pos  = O3;  // destination position
  1.2896 +    const Register length   = O4;  // elements count
  1.2897 +
  1.2898 +    // registers used as temp
  1.2899 +    const Register G3_src_klass = G3; // source array klass
  1.2900 +    const Register G4_dst_klass = G4; // destination array klass
   1.2901 +    const Register G5_lh        = G5; // layout helper
  1.2902 +    const Register O5_temp      = O5;
  1.2903 +
  1.2904 +    __ align(CodeEntryAlignment);
  1.2905 +    StubCodeMark mark(this, "StubRoutines", name);
  1.2906 +    address start = __ pc();
  1.2907 +
  1.2908 +    // bump this on entry, not on exit:
  1.2909 +    inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
  1.2910 +
  1.2911 +    // In principle, the int arguments could be dirty.
  1.2912 +    //assert_clean_int(src_pos, G1);
  1.2913 +    //assert_clean_int(dst_pos, G1);
  1.2914 +    //assert_clean_int(length, G1);
  1.2915 +
  1.2916 +    //-----------------------------------------------------------------------
  1.2917 +    // Assembler stubs will be used for this call to arraycopy
  1.2918 +    // if the following conditions are met:
  1.2919 +    //
  1.2920 +    // (1) src and dst must not be null.
  1.2921 +    // (2) src_pos must not be negative.
  1.2922 +    // (3) dst_pos must not be negative.
  1.2923 +    // (4) length  must not be negative.
  1.2924 +    // (5) src klass and dst klass should be the same and not NULL.
  1.2925 +    // (6) src and dst should be arrays.
  1.2926 +    // (7) src_pos + length must not exceed length of src.
  1.2927 +    // (8) dst_pos + length must not exceed length of dst.
  1.2928 +    BLOCK_COMMENT("arraycopy initial argument checks");
  1.2929 +
  1.2930 +    //  if (src == NULL) return -1;
  1.2931 +    __ br_null(src, false, Assembler::pn, L_failed);
  1.2932 +
  1.2933 +    //  if (src_pos < 0) return -1;
  1.2934 +    __ delayed()->tst(src_pos);
  1.2935 +    __ br(Assembler::negative, false, Assembler::pn, L_failed);
  1.2936 +    __ delayed()->nop();
  1.2937 +
  1.2938 +    //  if (dst == NULL) return -1;
  1.2939 +    __ br_null(dst, false, Assembler::pn, L_failed);
  1.2940 +
  1.2941 +    //  if (dst_pos < 0) return -1;
  1.2942 +    __ delayed()->tst(dst_pos);
  1.2943 +    __ br(Assembler::negative, false, Assembler::pn, L_failed);
  1.2944 +
  1.2945 +    //  if (length < 0) return -1;
  1.2946 +    __ delayed()->tst(length);
  1.2947 +    __ br(Assembler::negative, false, Assembler::pn, L_failed);
  1.2948 +
  1.2949 +    BLOCK_COMMENT("arraycopy argument klass checks");
  1.2950 +    //  get src->klass()
  1.2951 +    if (UseCompressedClassPointers) {
  1.2952 +      __ delayed()->nop(); // ??? not good
  1.2953 +      __ load_klass(src, G3_src_klass);
  1.2954 +    } else {
  1.2955 +      __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
  1.2956 +    }
  1.2957 +
  1.2958 +#ifdef ASSERT
  1.2959 +    //  assert(src->klass() != NULL);
  1.2960 +    BLOCK_COMMENT("assert klasses not null");
  1.2961 +    { Label L_a, L_b;
  1.2962 +      __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
  1.2963 +      __ bind(L_a);
  1.2964 +      __ stop("broken null klass");
  1.2965 +      __ bind(L_b);
  1.2966 +      __ load_klass(dst, G4_dst_klass);
  1.2967 +      __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
  1.2968 +      __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
  1.2969 +      BLOCK_COMMENT("assert done");
  1.2970 +    }
  1.2971 +#endif
  1.2972 +
  1.2973 +    // Load layout helper
  1.2974 +    //
  1.2975 +    //  |array_tag|     | header_size | element_type |     |log2_element_size|
  1.2976 +    // 32        30    24            16              8     2                 0
  1.2977 +    //
  1.2978 +    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  1.2979 +    //
  1.2980 +
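    // Decoding sketch (the constants are the Klass::_lh_* values used below):
    //   tag        = lh >> Klass::_lh_array_tag_shift;                          // 0x3 typeArray, 0x2 objArray
    //   hsize      = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   log2_esize = lh & Klass::_lh_log2_element_size_mask;
    //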
  1.2981 +    int lh_offset = in_bytes(Klass::layout_helper_offset());
  1.2982 +
   1.2983 +    // Load the 32-bit signed value. Use the br() instruction with it to check icc.
  1.2984 +    __ lduw(G3_src_klass, lh_offset, G5_lh);
  1.2985 +
  1.2986 +    if (UseCompressedClassPointers) {
  1.2987 +      __ load_klass(dst, G4_dst_klass);
  1.2988 +    }
  1.2989 +    // Handle objArrays completely differently...
  1.2990 +    juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  1.2991 +    __ set(objArray_lh, O5_temp);
  1.2992 +    __ cmp(G5_lh,       O5_temp);
  1.2993 +    __ br(Assembler::equal, false, Assembler::pt, L_objArray);
  1.2994 +    if (UseCompressedClassPointers) {
  1.2995 +      __ delayed()->nop();
  1.2996 +    } else {
  1.2997 +      __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
  1.2998 +    }
  1.2999 +
  1.3000 +    //  if (src->klass() != dst->klass()) return -1;
  1.3001 +    __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
  1.3002 +
  1.3003 +    //  if (!src->is_Array()) return -1;
  1.3004 +    __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
  1.3005 +    __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
  1.3006 +
  1.3007 +    // At this point, it is known to be a typeArray (array_tag 0x3).
  1.3008 +#ifdef ASSERT
  1.3009 +    __ delayed()->nop();
  1.3010 +    { Label L;
  1.3011 +      jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
  1.3012 +      __ set(lh_prim_tag_in_place, O5_temp);
  1.3013 +      __ cmp(G5_lh,                O5_temp);
  1.3014 +      __ br(Assembler::greaterEqual, false, Assembler::pt, L);
  1.3015 +      __ delayed()->nop();
  1.3016 +      __ stop("must be a primitive array");
  1.3017 +      __ bind(L);
  1.3018 +    }
  1.3019 +#else
  1.3020 +    __ delayed();                               // match next insn to prev branch
  1.3021 +#endif
  1.3022 +
  1.3023 +    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
  1.3024 +                           O5_temp, G4_dst_klass, L_failed);
  1.3025 +
  1.3026 +    // TypeArrayKlass
  1.3027 +    //
  1.3028 +    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  1.3029 +    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  1.3030 +    //
  1.3031 +
  1.3032 +    const Register G4_offset = G4_dst_klass;    // array offset
  1.3033 +    const Register G3_elsize = G3_src_klass;    // log2 element size
  1.3034 +
  1.3035 +    __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
  1.3036 +    __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
  1.3037 +    __ add(src, G4_offset, src);       // src array offset
  1.3038 +    __ add(dst, G4_offset, dst);       // dst array offset
  1.3039 +    __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
  1.3040 +
  1.3041 +    // next registers should be set before the jump to corresponding stub
  1.3042 +    const Register from     = O0;  // source array address
  1.3043 +    const Register to       = O1;  // destination array address
  1.3044 +    const Register count    = O2;  // elements count
  1.3045 +
  1.3046 +    // 'from', 'to', 'count' registers should be set in this order
  1.3047 +    // since they are the same as 'src', 'src_pos', 'dst'.
  1.3048 +
  1.3049 +    BLOCK_COMMENT("scale indexes to element size");
  1.3050 +    __ sll_ptr(src_pos, G3_elsize, src_pos);
  1.3051 +    __ sll_ptr(dst_pos, G3_elsize, dst_pos);
  1.3052 +    __ add(src, src_pos, from);       // src_addr
  1.3053 +    __ add(dst, dst_pos, to);         // dst_addr
  1.3054 +
  1.3055 +    BLOCK_COMMENT("choose copy loop based on element size");
  1.3056 +    __ cmp(G3_elsize, 0);
  1.3057 +    __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
  1.3058 +    __ delayed()->signx(length, count); // length
  1.3059 +
  1.3060 +    __ cmp(G3_elsize, LogBytesPerShort);
  1.3061 +    __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
  1.3062 +    __ delayed()->signx(length, count); // length
  1.3063 +
  1.3064 +    __ cmp(G3_elsize, LogBytesPerInt);
  1.3065 +    __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
  1.3066 +    __ delayed()->signx(length, count); // length
  1.3067 +#ifdef ASSERT
  1.3068 +    { Label L;
  1.3069 +      __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
  1.3070 +      __ stop("must be long copy, but elsize is wrong");
  1.3071 +      __ bind(L);
  1.3072 +    }
  1.3073 +#endif
  1.3074 +    __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
  1.3075 +    __ delayed()->signx(length, count); // length
  1.3076 +
  1.3077 +    // ObjArrayKlass
  1.3078 +  __ BIND(L_objArray);
  1.3079 +    // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
  1.3080 +
  1.3081 +    Label L_plain_copy, L_checkcast_copy;
  1.3082 +    //  test array classes for subtyping
  1.3083 +    __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
  1.3084 +    __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
  1.3085 +    __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
  1.3086 +
  1.3087 +    // Identically typed arrays can be copied without element-wise checks.
  1.3088 +    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
  1.3089 +                           O5_temp, G5_lh, L_failed);
  1.3090 +
  1.3091 +    __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
  1.3092 +    __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
  1.3093 +    __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
  1.3094 +    __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
  1.3095 +    __ add(src, src_pos, from);       // src_addr
  1.3096 +    __ add(dst, dst_pos, to);         // dst_addr
  1.3097 +  __ BIND(L_plain_copy);
  1.3098 +    __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
  1.3099 +    __ delayed()->signx(length, count); // length
  1.3100 +
  1.3101 +  __ BIND(L_checkcast_copy);
  1.3102 +    // live at this point:  G3_src_klass, G4_dst_klass
  1.3103 +    {
  1.3104 +      // Before looking at dst.length, make sure dst is also an objArray.
  1.3105 +      // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
  1.3106 +      __ cmp(G5_lh,                    O5_temp);
  1.3107 +      __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
  1.3108 +
  1.3109 +      // It is safe to examine both src.length and dst.length.
  1.3110 +      __ delayed();                             // match next insn to prev branch
  1.3111 +      arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
  1.3112 +                             O5_temp, G5_lh, L_failed);
  1.3113 +
  1.3114 +      // Marshal the base address arguments now, freeing registers.
  1.3115 +      __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
  1.3116 +      __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
  1.3117 +      __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
  1.3118 +      __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
  1.3119 +      __ add(src, src_pos, from);               // src_addr
  1.3120 +      __ add(dst, dst_pos, to);                 // dst_addr
  1.3121 +      __ signx(length, count);                  // length (reloaded)
  1.3122 +
  1.3123 +      Register sco_temp = O3;                   // this register is free now
  1.3124 +      assert_different_registers(from, to, count, sco_temp,
  1.3125 +                                 G4_dst_klass, G3_src_klass);
  1.3126 +
  1.3127 +      // Generate the type check.
  1.3128 +      int sco_offset = in_bytes(Klass::super_check_offset_offset());
  1.3129 +      __ lduw(G4_dst_klass, sco_offset, sco_temp);
  1.3130 +      generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
  1.3131 +                          O5_temp, L_plain_copy);
  1.3132 +
  1.3133 +      // Fetch destination element klass from the ObjArrayKlass header.
  1.3134 +      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
  1.3135 +
  1.3136 +      // the checkcast_copy loop needs two extra arguments:
  1.3137 +      __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
  1.3138 +      // lduw(O4, sco_offset, O3);              // sco of elem klass
  1.3139 +
  1.3140 +      __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
  1.3141 +      __ delayed()->lduw(O4, sco_offset, O3);
  1.3142 +    }
  1.3143 +
  1.3144 +  __ BIND(L_failed);
  1.3145 +    __ retl();
  1.3146 +    __ delayed()->sub(G0, 1, O0); // return -1
  1.3147 +    return start;
  1.3148 +  }
  1.3149 +
  1.3150 +  //
  1.3151 +  //  Generate stub for heap zeroing.
  1.3152 +  //  "to" address is aligned to jlong (8 bytes).
  1.3153 +  //
  1.3154 +  // Arguments for generated stub:
  1.3155 +  //      to:    O0
   1.3156 +  //      count: O1 treated as signed (count of HeapWords)
  1.3157 +  //             count could be 0
  1.3158 +  //
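  //  Conceptually equivalent to memset(to, 0, count * HeapWordSize), done with
  //  block-initializing stores (see bis_zeroing below).
  //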
  1.3159 +  address generate_zero_aligned_words(const char* name) {
  1.3160 +    __ align(CodeEntryAlignment);
  1.3161 +    StubCodeMark mark(this, "StubRoutines", name);
  1.3162 +    address start = __ pc();
  1.3163 +
   1.3164 +    const Register to    = O0;   // destination array address
  1.3165 +    const Register count = O1;   // HeapWords count
  1.3166 +    const Register temp  = O2;   // scratch
  1.3167 +
  1.3168 +    Label Ldone;
  1.3169 +    __ sllx(count, LogHeapWordSize, count); // to bytes count
  1.3170 +    // Use BIS for zeroing
  1.3171 +    __ bis_zeroing(to, count, temp, Ldone);
  1.3172 +    __ bind(Ldone);
  1.3173 +    __ retl();
  1.3174 +    __ delayed()->nop();
  1.3175 +    return start;
   1.3176 +  }
  1.3177 +
  1.3178 +  void generate_arraycopy_stubs() {
  1.3179 +    address entry;
  1.3180 +    address entry_jbyte_arraycopy;
  1.3181 +    address entry_jshort_arraycopy;
  1.3182 +    address entry_jint_arraycopy;
  1.3183 +    address entry_oop_arraycopy;
  1.3184 +    address entry_jlong_arraycopy;
  1.3185 +    address entry_checkcast_arraycopy;
  1.3186 +
  1.3187 +    //*** jbyte
  1.3188 +    // Always need aligned and unaligned versions
  1.3189 +    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
  1.3190 +                                                                                  "jbyte_disjoint_arraycopy");
  1.3191 +    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
  1.3192 +                                                                                  &entry_jbyte_arraycopy,
  1.3193 +                                                                                  "jbyte_arraycopy");
  1.3194 +    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
  1.3195 +                                                                                  "arrayof_jbyte_disjoint_arraycopy");
  1.3196 +    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
  1.3197 +                                                                                  "arrayof_jbyte_arraycopy");
  1.3198 +
  1.3199 +    //*** jshort
  1.3200 +    // Always need aligned and unaligned versions
  1.3201 +    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
  1.3202 +                                                                                    "jshort_disjoint_arraycopy");
  1.3203 +    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
  1.3204 +                                                                                    &entry_jshort_arraycopy,
  1.3205 +                                                                                    "jshort_arraycopy");
  1.3206 +    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
  1.3207 +                                                                                    "arrayof_jshort_disjoint_arraycopy");
  1.3208 +    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
  1.3209 +                                                                                    "arrayof_jshort_arraycopy");
  1.3210 +
  1.3211 +    //*** jint
  1.3212 +    // Aligned versions
  1.3213 +    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
  1.3214 +                                                                                "arrayof_jint_disjoint_arraycopy");
  1.3215 +    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
  1.3216 +                                                                                "arrayof_jint_arraycopy");
  1.3217 +#ifdef _LP64
  1.3218 +    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
  1.3219 +    // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
  1.3220 +    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
  1.3221 +                                                                                "jint_disjoint_arraycopy");
  1.3222 +    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
  1.3223 +                                                                                &entry_jint_arraycopy,
  1.3224 +                                                                                "jint_arraycopy");
  1.3225 +#else
  1.3226 +    // In 32 bit jints are always HeapWordSize aligned, so always use the aligned version
  1.3227 +    // (in fact in 32bit we always have a pre-loop part even in the aligned version,
  1.3228 +    //  because it uses 64-bit loads/stores, so the aligned flag is actually ignored).
  1.3229 +    StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy;
  1.3230 +    StubRoutines::_jint_arraycopy          = StubRoutines::_arrayof_jint_arraycopy;
  1.3231 +#endif
  1.3232 +
  1.3233 +
  1.3234 +    //*** jlong
  1.3235 +    // It is always aligned
  1.3236 +    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
  1.3237 +                                                                                  "arrayof_jlong_disjoint_arraycopy");
  1.3238 +    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
  1.3239 +                                                                                  "arrayof_jlong_arraycopy");
  1.3240 +    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
  1.3241 +    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
  1.3242 +
  1.3243 +
  1.3244 +    //*** oops
  1.3245 +    // Aligned versions
  1.3246 +    StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
  1.3247 +                                                                                      "arrayof_oop_disjoint_arraycopy");
  1.3248 +    StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
  1.3249 +                                                                                      "arrayof_oop_arraycopy");
  1.3250 +    // Aligned versions without pre-barriers
  1.3251 +    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
  1.3252 +                                                                                      "arrayof_oop_disjoint_arraycopy_uninit",
  1.3253 +                                                                                      /*dest_uninitialized*/true);
  1.3254 +    StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
  1.3255 +                                                                                      "arrayof_oop_arraycopy_uninit",
  1.3256 +                                                                                      /*dest_uninitialized*/true);
  1.3257 +#ifdef _LP64
  1.3258 +    if (UseCompressedOops) {
  1.3259 +      // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy.
  1.3260 +      StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
  1.3261 +                                                                                    "oop_disjoint_arraycopy");
  1.3262 +      StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
  1.3263 +                                                                                    "oop_arraycopy");
  1.3264 +      // Unaligned versions without pre-barriers
  1.3265 +      StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
  1.3266 +                                                                                    "oop_disjoint_arraycopy_uninit",
  1.3267 +                                                                                    /*dest_uninitialized*/true);
  1.3268 +      StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
  1.3269 +                                                                                    "oop_arraycopy_uninit",
  1.3270 +                                                                                    /*dest_uninitialized*/true);
  1.3271 +    } else
  1.3272 +#endif
  1.3273 +    {
  1.3274 +      // oop arraycopy is always aligned on 32bit and 64bit without compressed oops
  1.3275 +      StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
  1.3276 +      StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
  1.3277 +      StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
  1.3278 +      StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
  1.3279 +    }
  1.3280 +
  1.3281 +    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
  1.3282 +    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
  1.3283 +                                                                        /*dest_uninitialized*/true);
  1.3284 +
  1.3285 +    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
  1.3286 +                                                              entry_jbyte_arraycopy,
  1.3287 +                                                              entry_jshort_arraycopy,
  1.3288 +                                                              entry_jint_arraycopy,
  1.3289 +                                                              entry_jlong_arraycopy);
  1.3290 +    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
  1.3291 +                                                               entry_jbyte_arraycopy,
  1.3292 +                                                               entry_jshort_arraycopy,
  1.3293 +                                                               entry_jint_arraycopy,
  1.3294 +                                                               entry_oop_arraycopy,
  1.3295 +                                                               entry_jlong_arraycopy,
  1.3296 +                                                               entry_checkcast_arraycopy);
  1.3297 +
  1.3298 +    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
  1.3299 +    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
  1.3300 +    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
  1.3301 +    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
  1.3302 +    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  1.3303 +    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
  1.3304 +
  1.3305 +    if (UseBlockZeroing) {
  1.3306 +      StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
  1.3307 +    }
  1.3308 +  }
  1.3309 +
  1.3310 +  address generate_aescrypt_encryptBlock() {
  1.3311 +    // required since we read expanded key 'int' array starting first element without alignment considerations
  1.3312 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
  1.3313 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
  1.3314 +    __ align(CodeEntryAlignment);
  1.3315 +    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
  1.3316 +    Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
  1.3317 +    address start = __ pc();
  1.3318 +    Register from = O0; // source byte array
  1.3319 +    Register to = O1;   // destination byte array
  1.3320 +    Register key = O2;  // expanded key array
  1.3321 +    const Register keylen = O4; //reg for storing expanded key array length
  1.3322 +
  1.3323 +    // read expanded key length
  1.3324 +    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
  1.3325 +
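    // The expanded key holds 4*(rounds+1) 32-bit words, so keylen identifies the
    // original key size: 128-bit -> 10 rounds -> 44 words, 192-bit -> 12 -> 52,
    // 256-bit -> 14 -> 60. The stub tests for 44 and 52 below; anything else is
    // treated as a 256-bit key.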
  1.3326 +    // Method to address arbitrary alignment for load instructions:
  1.3327 +    // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
  1.3328 +    // If zero/aligned then continue with double FP load instructions
  1.3329 +    // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
  1.3330 +    // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
  1.3331 +    // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
  1.3332 +    // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
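    // In byte terms (a sketch; big-endian lanes, n = from & 7, with n != 0 on this path):
    //   out0 = (w0 << 8*n) | (w1 >> (64 - 8*n));   // faligndata(F54, F56, F54)
    //   out1 = (w1 << 8*n) | (w2 >> (64 - 8*n));   // faligndata(F56, F58, F56)
    // where w0..w2 are the three aligned 8-byte loads starting at (from & ~7).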
  1.3333 +
  1.3334 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  1.3335 +    __ andcc(from, 7, G0);
  1.3336 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
  1.3337 +    __ delayed()->alignaddr(from, G0, from);
  1.3338 +
  1.3339 +    // aligned case: load input into F54-F56
  1.3340 +    __ ldf(FloatRegisterImpl::D, from, 0, F54);
  1.3341 +    __ ldf(FloatRegisterImpl::D, from, 8, F56);
  1.3342 +    __ ba_short(L_load_expanded_key);
  1.3343 +
  1.3344 +    __ BIND(L_load_misaligned_input);
  1.3345 +    __ ldf(FloatRegisterImpl::D, from, 0, F54);
  1.3346 +    __ ldf(FloatRegisterImpl::D, from, 8, F56);
  1.3347 +    __ ldf(FloatRegisterImpl::D, from, 16, F58);
  1.3348 +    __ faligndata(F54, F56, F54);
  1.3349 +    __ faligndata(F56, F58, F56);
  1.3350 +
  1.3351 +    __ BIND(L_load_expanded_key);
  1.3352 +    // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed
  1.3353 +    for ( int i = 0;  i <= 38; i += 2 ) {
  1.3354 +      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
  1.3355 +    }
  1.3356 +
  1.3357 +    // perform cipher transformation
  1.3358 +    __ fxor(FloatRegisterImpl::D, F0, F54, F54);
  1.3359 +    __ fxor(FloatRegisterImpl::D, F2, F56, F56);
  1.3360 +    // rounds 1 through 8
  1.3361 +    for ( int i = 4;  i <= 28; i += 8 ) {
  1.3362 +      __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
  1.3363 +      __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
  1.3364 +      __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
  1.3365 +      __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
  1.3366 +    }
  1.3367 +    __ aes_eround01(F36, F54, F56, F58); //round 9
  1.3368 +    __ aes_eround23(F38, F54, F56, F60);
  1.3369 +
  1.3370 +    // 128-bit original key size
  1.3371 +    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
  1.3372 +
  1.3373 +    for ( int i = 40;  i <= 50; i += 2 ) {
  1.3374 +      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
  1.3375 +    }
  1.3376 +    __ aes_eround01(F40, F58, F60, F54); //round 10
  1.3377 +    __ aes_eround23(F42, F58, F60, F56);
  1.3378 +    __ aes_eround01(F44, F54, F56, F58); //round 11
  1.3379 +    __ aes_eround23(F46, F54, F56, F60);
  1.3380 +
  1.3381 +    // 192-bit original key size
  1.3382 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
  1.3383 +
  1.3384 +    __ ldf(FloatRegisterImpl::D, key, 208, F52);
  1.3385 +    __ aes_eround01(F48, F58, F60, F54); //round 12
  1.3386 +    __ aes_eround23(F50, F58, F60, F56);
  1.3387 +    __ ldf(FloatRegisterImpl::D, key, 216, F46);
  1.3388 +    __ ldf(FloatRegisterImpl::D, key, 224, F48);
  1.3389 +    __ ldf(FloatRegisterImpl::D, key, 232, F50);
  1.3390 +    __ aes_eround01(F52, F54, F56, F58); //round 13
  1.3391 +    __ aes_eround23(F46, F54, F56, F60);
  1.3392 +    __ ba_short(L_storeOutput);
  1.3393 +
  1.3394 +    __ BIND(L_doLast128bit);
  1.3395 +    __ ldf(FloatRegisterImpl::D, key, 160, F48);
  1.3396 +    __ ldf(FloatRegisterImpl::D, key, 168, F50);
  1.3397 +
  1.3398 +    __ BIND(L_storeOutput);
  1.3399 +    // perform last round of encryption common for all key sizes
  1.3400 +    __ aes_eround01_l(F48, F58, F60, F54); //last round
  1.3401 +    __ aes_eround23_l(F50, F58, F60, F56);
  1.3402 +
  1.3403 +    // Method to address arbitrary alignment for store instructions:
  1.3404 +    // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
  1.3405 +    // If zero/aligned then continue with double FP store instructions
  1.3406 +    // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
  1.3407 +    // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
  1.3408 +    // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
  1.3409 +    // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
  1.3410 +    // Set GSR.align to (8-n) using alignaddr
  1.3411 +    // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
  1.3412 +    // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
  1.3413 +    // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
  1.3414 +    // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address
  1.3415 +    // We need to execute this process for both the 8-byte result values
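    // Net effect per 8-byte value, in byte terms (a sketch; n = to & 7, n != 0 on this path):
    //   bytes 0..(7-n) land in the aligned word containing 'to' (first stpartialf, edge8n mask),
    //   bytes (8-n)..7 land in the following aligned word (second stpartialf, negated mask),
    // after faligndata has rotated the value by n byte positions.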
  1.3416 +
  1.3417 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  1.3418 +    __ andcc(to, 7, O5);
  1.3419 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
  1.3420 +    __ delayed()->edge8n(to, G0, O3);
  1.3421 +
  1.3422 +    // aligned case: store output into the destination array
  1.3423 +    __ stf(FloatRegisterImpl::D, F54, to, 0);
  1.3424 +    __ retl();
  1.3425 +    __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
  1.3426 +
  1.3427 +    __ BIND(L_store_misaligned_output);
  1.3428 +    __ add(to, 8, O4);
  1.3429 +    __ mov(8, O2);
  1.3430 +    __ sub(O2, O5, O2);
  1.3431 +    __ alignaddr(O2, G0, O2);
  1.3432 +    __ faligndata(F54, F54, F54);
  1.3433 +    __ faligndata(F56, F56, F56);
  1.3434 +    __ and3(to, -8, to);
  1.3435 +    __ and3(O4, -8, O4);
  1.3436 +    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
  1.3437 +    __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
  1.3438 +    __ add(to, 8, to);
  1.3439 +    __ add(O4, 8, O4);
  1.3440 +    __ orn(G0, O3, O3);
  1.3441 +    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
  1.3442 +    __ retl();
  1.3443 +    __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
  1.3444 +
  1.3445 +    return start;
  1.3446 +  }
  1.3447 +
  1.3448 +  address generate_aescrypt_decryptBlock() {
  1.3449 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
  1.3450 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
  1.3451 +    // required since we read original key 'byte' array as well in the decryption stubs
  1.3452 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
  1.3453 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
  1.3454 +    __ align(CodeEntryAlignment);
  1.3455 +    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
  1.3456 +    address start = __ pc();
  1.3457 +    Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
  1.3458 +    Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
  1.3459 +    Register from = O0; // source byte array
  1.3460 +    Register to = O1;   // destination byte array
  1.3461 +    Register key = O2;  // expanded key array
  1.3462 +    Register original_key = O3;  // original key array only required during decryption
  1.3463 +    const Register keylen = O4;  // reg for storing expanded key array length
  1.3464 +
  1.3465 +    // read expanded key array length
  1.3466 +    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
  1.3467 +
  1.3468 +    // save 'from' since we may need to recheck alignment in case of 256-bit decryption
  1.3469 +    __ mov(from, G1);
  1.3470 +
  1.3471 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  1.3472 +    __ andcc(from, 7, G0);
  1.3473 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
  1.3474 +    __ delayed()->alignaddr(from, G0, from);
  1.3475 +
  1.3476 +    // aligned case: load input into F52-F54
  1.3477 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
  1.3478 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
  1.3479 +    __ ba_short(L_load_original_key);
  1.3480 +
  1.3481 +    __ BIND(L_load_misaligned_input);
  1.3482 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
  1.3483 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
  1.3484 +    __ ldf(FloatRegisterImpl::D, from, 16, F56);
  1.3485 +    __ faligndata(F52, F54, F52);
  1.3486 +    __ faligndata(F54, F56, F54);
  1.3487 +
  1.3488 +    __ BIND(L_load_original_key);
  1.3489 +    // load original key from SunJCE expanded decryption key
  1.3490 +    // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
  1.3491 +    for ( int i = 0;  i <= 3; i++ ) {
  1.3492 +      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
  1.3493 +    }
  1.3494 +
  1.3495 +    // 256-bit original key size
  1.3496 +    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
  1.3497 +
  1.3498 +    // 192-bit original key size
  1.3499 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
  1.3500 +
  1.3501 +    // 128-bit original key size
  1.3502 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
  1.3503 +    for ( int i = 0;  i <= 36; i += 4 ) {
  1.3504 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
  1.3505 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
  1.3506 +    }
  1.3507 +
  1.3508 +    // perform 128-bit key specific inverse cipher transformation
  1.3509 +    __ fxor(FloatRegisterImpl::D, F42, F54, F54);
  1.3510 +    __ fxor(FloatRegisterImpl::D, F40, F52, F52);
  1.3511 +    __ ba_short(L_common_transform);
  1.3512 +
  1.3513 +    __ BIND(L_expand192bit);
  1.3514 +
  1.3515 +    // start loading rest of the 192-bit key
  1.3516 +    __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
  1.3517 +    __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
  1.3518 +
  1.3519 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
  1.3520 +    for ( int i = 0;  i <= 36; i += 6 ) {
  1.3521 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
  1.3522 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
  1.3523 +      __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
  1.3524 +    }
  1.3525 +    __ aes_kexpand1(F42, F46, 7, F48);
  1.3526 +    __ aes_kexpand2(F44, F48, F50);
  1.3527 +
  1.3528 +    // perform 192-bit key specific inverse cipher transformation
  1.3529 +    __ fxor(FloatRegisterImpl::D, F50, F54, F54);
  1.3530 +    __ fxor(FloatRegisterImpl::D, F48, F52, F52);
  1.3531 +    __ aes_dround23(F46, F52, F54, F58);
  1.3532 +    __ aes_dround01(F44, F52, F54, F56);
  1.3533 +    __ aes_dround23(F42, F56, F58, F54);
  1.3534 +    __ aes_dround01(F40, F56, F58, F52);
  1.3535 +    __ ba_short(L_common_transform);
  1.3536 +
  1.3537 +    __ BIND(L_expand256bit);
  1.3538 +
  1.3539 +    // load rest of the 256-bit key
  1.3540 +    for ( int i = 4;  i <= 7; i++ ) {
  1.3541 +      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
  1.3542 +    }
  1.3543 +
  1.3544 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
  1.3545 +    for ( int i = 0;  i <= 40; i += 8 ) {
  1.3546 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
  1.3547 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
  1.3548 +      __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
  1.3549 +      __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
  1.3550 +    }
  1.3551 +    __ aes_kexpand1(F48, F54, 6, F56);
  1.3552 +    __ aes_kexpand2(F50, F56, F58);
  1.3553 +
  1.3554 +    for ( int i = 0;  i <= 6; i += 2 ) {
  1.3555 +      __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
  1.3556 +    }
  1.3557 +
  1.3558 +    // reload original 'from' address
  1.3559 +    __ mov(G1, from);
  1.3560 +
  1.3561 +    // re-check 8-byte alignment
  1.3562 +    __ andcc(from, 7, G0);
  1.3563 +    __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
  1.3564 +    __ delayed()->alignaddr(from, G0, from);
  1.3565 +
  1.3566 +    // aligned case: load input into F52-F54
  1.3567 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
  1.3568 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
  1.3569 +    __ ba_short(L_256bit_transform);
  1.3570 +
  1.3571 +    __ BIND(L_reload_misaligned_input);
  1.3572 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
  1.3573 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
  1.3574 +    __ ldf(FloatRegisterImpl::D, from, 16, F56);
  1.3575 +    __ faligndata(F52, F54, F52);
  1.3576 +    __ faligndata(F54, F56, F54);
  1.3577 +
  1.3578 +    // perform 256-bit key specific inverse cipher transformation
  1.3579 +    __ BIND(L_256bit_transform);
  1.3580 +    __ fxor(FloatRegisterImpl::D, F0, F54, F54);
  1.3581 +    __ fxor(FloatRegisterImpl::D, F2, F52, F52);
  1.3582 +    __ aes_dround23(F4, F52, F54, F58);
  1.3583 +    __ aes_dround01(F6, F52, F54, F56);
  1.3584 +    __ aes_dround23(F50, F56, F58, F54);
  1.3585 +    __ aes_dround01(F48, F56, F58, F52);
  1.3586 +    __ aes_dround23(F46, F52, F54, F58);
  1.3587 +    __ aes_dround01(F44, F52, F54, F56);
  1.3588 +    __ aes_dround23(F42, F56, F58, F54);
  1.3589 +    __ aes_dround01(F40, F56, F58, F52);
  1.3590 +
  1.3591 +    for ( int i = 0;  i <= 7; i++ ) {
  1.3592 +      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
  1.3593 +    }
  1.3594 +
  1.3595 +    // perform inverse cipher transformations common for all key sizes
  1.3596 +    __ BIND(L_common_transform);
  1.3597 +    for ( int i = 38;  i >= 6; i -= 8 ) {
  1.3598 +      __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
  1.3599 +      __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
  1.3600 +      if ( i != 6) {
  1.3601 +        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
  1.3602 +        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
  1.3603 +      } else {
  1.3604 +        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
  1.3605 +        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
  1.3606 +      }
  1.3607 +    }
  1.3608 +
  1.3609 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  1.3610 +    __ andcc(to, 7, O5);
  1.3611 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
  1.3612 +    __ delayed()->edge8n(to, G0, O3);
  1.3613 +
  1.3614 +    // aligned case: store output into the destination array
  1.3615 +    __ stf(FloatRegisterImpl::D, F52, to, 0);
  1.3616 +    __ retl();
  1.3617 +    __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
  1.3618 +
  1.3619 +    __ BIND(L_store_misaligned_output);
  1.3620 +    __ add(to, 8, O4);
  1.3621 +    __ mov(8, O2);
  1.3622 +    __ sub(O2, O5, O2);
  1.3623 +    __ alignaddr(O2, G0, O2);
  1.3624 +    __ faligndata(F52, F52, F52);
  1.3625 +    __ faligndata(F54, F54, F54);
  1.3626 +    __ and3(to, -8, to);
  1.3627 +    __ and3(O4, -8, O4);
  1.3628 +    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
  1.3629 +    __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
  1.3630 +    __ add(to, 8, to);
  1.3631 +    __ add(O4, 8, O4);
  1.3632 +    __ orn(G0, O3, O3);
  1.3633 +    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
  1.3634 +    __ retl();
  1.3635 +    __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
  1.3636 +
  1.3637 +    return start;
  1.3638 +  }
  1.3639 +
  1.3640 +  address generate_cipherBlockChaining_encryptAESCrypt() {
  1.3641 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
  1.3642 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
  1.3643 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
  1.3644 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
  1.3645 +    __ align(CodeEntryAlignment);
  1.3646 +    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
  1.3647 +    Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
  1.3648 +    Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
  1.3649 +    Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
  1.3650 +    Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
  1.3651 +    address start = __ pc();
  1.3652 +    Register from = I0; // source byte array
  1.3653 +    Register to = I1;   // destination byte array
  1.3654 +    Register key = I2;  // expanded key array
  1.3655 +    Register rvec = I3; // init vector
  1.3656 +    const Register len_reg = I4; // cipher length
  1.3657 +    const Register keylen = I5;  // reg for storing expanded key array length
  1.3658 +
  1.3659 +    __ save_frame(0);
  1.3660 +    // save cipher length so it can be returned at the end
  1.3661 +    __ mov(len_reg, L0);
  1.3662 +
  1.3663 +    // read expanded key length
  1.3664 +    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
  1.3665 +
  1.3666 +    // load initial vector, 8-byte alignment is guaranteed
  1.3667 +    __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
  1.3668 +    __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
  1.3669 +    // load key, 8-byte alignment is guaranteed
  1.3670 +    __ ldx(key,0,G1);
  1.3671 +    __ ldx(key,8,G5);
  1.3672 +
  1.3673 +    // start loading expanded key, 8-byte alignment is guaranteed
  1.3674 +    for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
  1.3675 +      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
  1.3676 +    }
  1.3677 +
  1.3678 +    // 128-bit original key size
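         +    // (expanded key length is 4*(rounds+1) ints: 44 for AES-128 / 10 rounds,
         +    //  52 for AES-192 / 12 rounds, 60 for AES-256 / 14 rounds)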
  1.3679 +    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128);
  1.3680 +
  1.3681 +    for ( int i = 40, j = 176;  i <= 46; i += 2, j += 8 ) {
  1.3682 +      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
  1.3683 +    }
  1.3684 +
  1.3685 +    // 192-bit original key size
  1.3686 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192);
  1.3687 +
  1.3688 +    for ( int i = 48, j = 208;  i <= 54; i += 2, j += 8 ) {
  1.3689 +      __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
  1.3690 +    }
  1.3691 +
  1.3692 +    // 256-bit original key size
  1.3693 +    __ ba_short(L_cbcenc256);
  1.3694 +
  1.3695 +    __ align(OptoLoopAlignment);
  1.3696 +    __ BIND(L_cbcenc128);
  1.3697 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  1.3698 +    __ andcc(from, 7, G0);
  1.3699 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
  1.3700 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
  1.3701 +
  1.3702 +    // aligned case: load input into G3 and G4
  1.3703 +    __ ldx(from,0,G3);
  1.3704 +    __ ldx(from,8,G4);
  1.3705 +    __ ba_short(L_128bit_transform);
  1.3706 +
  1.3707 +    __ BIND(L_load_misaligned_input_128bit);
  1.3708 +    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
  1.3709 +    __ alignaddr(from, G0, from);
  1.3710 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
  1.3711 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
  1.3712 +    __ ldf(FloatRegisterImpl::D, from, 16, F52);
  1.3713 +    __ faligndata(F48, F50, F48);
  1.3714 +    __ faligndata(F50, F52, F50);
  1.3715 +    __ movdtox(F48, G3);
  1.3716 +    __ movdtox(F50, G4);
  1.3717 +    __ mov(L1, from);
  1.3718 +
  1.3719 +    __ BIND(L_128bit_transform);
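         +    // G1:G5 hold round key 0 and F60:F62 hold the IV (or the previous
         +    // ciphertext block), so the xors below perform both the CBC chaining and
         +    // the initial AddRoundKey; the round loop then starts at round key 1 (F0:F2)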
  1.3720 +    __ xor3(G1,G3,G3);
  1.3721 +    __ xor3(G5,G4,G4);
  1.3722 +    __ movxtod(G3,F56);
  1.3723 +    __ movxtod(G4,F58);
  1.3724 +    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
  1.3725 +    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
  1.3726 +
  1.3727 +    // TEN_EROUNDS
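         +    // ten encryption rounds: each iteration performs two rounds (one
         +    // eround01/eround23 pair per 16-byte round key), alternating the state
         +    // between F60:F62 and F56:F58; the last round uses the *_l forms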
  1.3728 +    for ( int i = 0;  i <= 32; i += 8 ) {
  1.3729 +      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
  1.3730 +      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
  1.3731 +      if (i != 32 ) {
  1.3732 +        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
  1.3733 +        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
  1.3734 +      } else {
  1.3735 +        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
  1.3736 +        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
  1.3737 +      }
  1.3738 +    }
  1.3739 +
  1.3740 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  1.3741 +    __ andcc(to, 7, L1);
  1.3742 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
  1.3743 +    __ delayed()->edge8n(to, G0, L2);
  1.3744 +
  1.3745 +    // aligned case: store output into the destination array
  1.3746 +    __ stf(FloatRegisterImpl::D, F60, to, 0);
  1.3747 +    __ stf(FloatRegisterImpl::D, F62, to, 8);
  1.3748 +    __ ba_short(L_check_loop_end_128bit);
  1.3749 +
  1.3750 +    __ BIND(L_store_misaligned_output_128bit);
  1.3751 +    __ add(to, 8, L3);
  1.3752 +    __ mov(8, L4);
  1.3753 +    __ sub(L4, L1, L4);
  1.3754 +    __ alignaddr(L4, G0, L4);
  1.3755 +    // save the ciphertext before the circular right shift,
  1.3756 +    // since it needs to be stored as the IV for the next block (see the code before the next ret)
  1.3757 +    __ movdtox(F60, L6);
  1.3758 +    __ movdtox(F62, L7);
  1.3759 +    __ faligndata(F60, F60, F60);
  1.3760 +    __ faligndata(F62, F62, F62);
  1.3761 +    __ mov(to, L5);
  1.3762 +    __ and3(to, -8, to);
  1.3763 +    __ and3(L3, -8, L3);
  1.3764 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  1.3765 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  1.3766 +    __ add(to, 8, to);
  1.3767 +    __ add(L3, 8, L3);
  1.3768 +    __ orn(G0, L2, L2);
  1.3769 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  1.3770 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  1.3771 +    __ mov(L5, to);
  1.3772 +    __ movxtod(L6, F60);
  1.3773 +    __ movxtod(L7, F62);
  1.3774 +
  1.3775 +    __ BIND(L_check_loop_end_128bit);
  1.3776 +    __ add(from, 16, from);
  1.3777 +    __ add(to, 16, to);
  1.3778 +    __ subcc(len_reg, 16, len_reg);
  1.3779 +    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
  1.3780 +    __ delayed()->nop();
  1.3781 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
  1.3782 +    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
  1.3783 +    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
  1.3784 +    __ mov(L0, I0);
  1.3785 +    __ ret();
  1.3786 +    __ delayed()->restore();
  1.3787 +
  1.3788 +    __ align(OptoLoopAlignment);
  1.3789 +    __ BIND(L_cbcenc192);
  1.3790 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  1.3791 +    __ andcc(from, 7, G0);
  1.3792 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
  1.3793 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
  1.3794 +
  1.3795 +    // aligned case: load input into G3 and G4
  1.3796 +    __ ldx(from,0,G3);
  1.3797 +    __ ldx(from,8,G4);
  1.3798 +    __ ba_short(L_192bit_transform);
  1.3799 +
  1.3800 +    __ BIND(L_load_misaligned_input_192bit);
  1.3801 +    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
  1.3802 +    __ alignaddr(from, G0, from);
  1.3803 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
  1.3804 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
  1.3805 +    __ ldf(FloatRegisterImpl::D, from, 16, F52);
  1.3806 +    __ faligndata(F48, F50, F48);
  1.3807 +    __ faligndata(F50, F52, F50);
  1.3808 +    __ movdtox(F48, G3);
  1.3809 +    __ movdtox(F50, G4);
  1.3810 +    __ mov(L1, from);
  1.3811 +
  1.3812 +    __ BIND(L_192bit_transform);
  1.3813 +    __ xor3(G1,G3,G3);
  1.3814 +    __ xor3(G5,G4,G4);
  1.3815 +    __ movxtod(G3,F56);
  1.3816 +    __ movxtod(G4,F58);
  1.3817 +    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
  1.3818 +    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
  1.3819 +
  1.3820 +    // TWELVE_EROUNDS
  1.3821 +    for ( int i = 0;  i <= 40; i += 8 ) {
  1.3822 +      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
  1.3823 +      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
  1.3824 +      if (i != 40 ) {
  1.3825 +        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
  1.3826 +        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
  1.3827 +      } else {
  1.3828 +        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
  1.3829 +        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
  1.3830 +      }
  1.3831 +    }
  1.3832 +
  1.3833 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  1.3834 +    __ andcc(to, 7, L1);
  1.3835 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
  1.3836 +    __ delayed()->edge8n(to, G0, L2);
  1.3837 +
  1.3838 +    // aligned case: store output into the destination array
  1.3839 +    __ stf(FloatRegisterImpl::D, F60, to, 0);
  1.3840 +    __ stf(FloatRegisterImpl::D, F62, to, 8);
  1.3841 +    __ ba_short(L_check_loop_end_192bit);
  1.3842 +
  1.3843 +    __ BIND(L_store_misaligned_output_192bit);
  1.3844 +    __ add(to, 8, L3);
  1.3845 +    __ mov(8, L4);
  1.3846 +    __ sub(L4, L1, L4);
  1.3847 +    __ alignaddr(L4, G0, L4);
  1.3848 +    __ movdtox(F60, L6);
  1.3849 +    __ movdtox(F62, L7);
  1.3850 +    __ faligndata(F60, F60, F60);
  1.3851 +    __ faligndata(F62, F62, F62);
  1.3852 +    __ mov(to, L5);
  1.3853 +    __ and3(to, -8, to);
  1.3854 +    __ and3(L3, -8, L3);
  1.3855 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  1.3856 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  1.3857 +    __ add(to, 8, to);
  1.3858 +    __ add(L3, 8, L3);
  1.3859 +    __ orn(G0, L2, L2);
  1.3860 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  1.3861 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  1.3862 +    __ mov(L5, to);
  1.3863 +    __ movxtod(L6, F60);
  1.3864 +    __ movxtod(L7, F62);
  1.3865 +
  1.3866 +    __ BIND(L_check_loop_end_192bit);
  1.3867 +    __ add(from, 16, from);
  1.3868 +    __ subcc(len_reg, 16, len_reg);
  1.3869 +    __ add(to, 16, to);
  1.3870 +    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
  1.3871 +    __ delayed()->nop();
  1.3872 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
  1.3873 +    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
  1.3874 +    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
  1.3875 +    __ mov(L0, I0);
  1.3876 +    __ ret();
  1.3877 +    __ delayed()->restore();
  1.3878 +
  1.3879 +    __ align(OptoLoopAlignment);
  1.3880 +    __ BIND(L_cbcenc256);
  1.3881 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  1.3882 +    __ andcc(from, 7, G0);
  1.3883 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
  1.3884 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
  1.3885 +
  1.3886 +    // aligned case: load input into G3 and G4
  1.3887 +    __ ldx(from,0,G3);
  1.3888 +    __ ldx(from,8,G4);
  1.3889 +    __ ba_short(L_256bit_transform);
  1.3890 +
  1.3891 +    __ BIND(L_load_misaligned_input_256bit);
  1.3892 +    // F48, F50 and F52 must be preserved here; F56 and F58 can be used as temporaries
  1.3893 +    __ alignaddr(from, G0, from);
  1.3894 +    __ movdtox(F60, L2); // save F60 before overwriting
  1.3895 +    __ ldf(FloatRegisterImpl::D, from, 0, F56);
  1.3896 +    __ ldf(FloatRegisterImpl::D, from, 8, F58);
  1.3897 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
  1.3898 +    __ faligndata(F56, F58, F56);
  1.3899 +    __ faligndata(F58, F60, F58);
  1.3900 +    __ movdtox(F56, G3);
  1.3901 +    __ movdtox(F58, G4);
  1.3902 +    __ mov(L1, from);
  1.3903 +    __ movxtod(L2, F60);
  1.3904 +
  1.3905 +    __ BIND(L_256bit_transform);
  1.3906 +    __ xor3(G1,G3,G3);
  1.3907 +    __ xor3(G5,G4,G4);
  1.3908 +    __ movxtod(G3,F56);
  1.3909 +    __ movxtod(G4,F58);
  1.3910 +    __ fxor(FloatRegisterImpl::D, F60, F56, F60);
  1.3911 +    __ fxor(FloatRegisterImpl::D, F62, F58, F62);
  1.3912 +
  1.3913 +    // FOURTEEN_EROUNDS
  1.3914 +    for ( int i = 0;  i <= 48; i += 8 ) {
  1.3915 +      __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
  1.3916 +      __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
  1.3917 +      if (i != 48 ) {
  1.3918 +        __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
  1.3919 +        __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
  1.3920 +      } else {
  1.3921 +        __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
  1.3922 +        __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
  1.3923 +      }
  1.3924 +    }
  1.3925 +
  1.3926 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  1.3927 +    __ andcc(to, 7, L1);
  1.3928 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
  1.3929 +    __ delayed()->edge8n(to, G0, L2);
  1.3930 +
  1.3931 +    // aligned case: store output into the destination array
  1.3932 +    __ stf(FloatRegisterImpl::D, F60, to, 0);
  1.3933 +    __ stf(FloatRegisterImpl::D, F62, to, 8);
  1.3934 +    __ ba_short(L_check_loop_end_256bit);
  1.3935 +
  1.3936 +    __ BIND(L_store_misaligned_output_256bit);
  1.3937 +    __ add(to, 8, L3);
  1.3938 +    __ mov(8, L4);
  1.3939 +    __ sub(L4, L1, L4);
  1.3940 +    __ alignaddr(L4, G0, L4);
  1.3941 +    __ movdtox(F60, L6);
  1.3942 +    __ movdtox(F62, L7);
  1.3943 +    __ faligndata(F60, F60, F60);
  1.3944 +    __ faligndata(F62, F62, F62);
  1.3945 +    __ mov(to, L5);
  1.3946 +    __ and3(to, -8, to);
  1.3947 +    __ and3(L3, -8, L3);
  1.3948 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  1.3949 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  1.3950 +    __ add(to, 8, to);
  1.3951 +    __ add(L3, 8, L3);
  1.3952 +    __ orn(G0, L2, L2);
  1.3953 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  1.3954 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  1.3955 +    __ mov(L5, to);
  1.3956 +    __ movxtod(L6, F60);
  1.3957 +    __ movxtod(L7, F62);
  1.3958 +
  1.3959 +    __ BIND(L_check_loop_end_256bit);
  1.3960 +    __ add(from, 16, from);
  1.3961 +    __ subcc(len_reg, 16, len_reg);
  1.3962 +    __ add(to, 16, to);
  1.3963 +    __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
  1.3964 +    __ delayed()->nop();
  1.3965 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
  1.3966 +    __ stf(FloatRegisterImpl::D, F60, rvec, 0);
  1.3967 +    __ stf(FloatRegisterImpl::D, F62, rvec, 8);
  1.3968 +    __ mov(L0, I0);
  1.3969 +    __ ret();
  1.3970 +    __ delayed()->restore();
  1.3971 +
  1.3972 +    return start;
  1.3973 +  }
  1.3974 +
  1.3975 +  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
  1.3976 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
  1.3977 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
  1.3978 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
  1.3979 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
  1.3980 +    __ align(CodeEntryAlignment);
  1.3981 +    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
  1.3982 +    Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
  1.3983 +    Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
  1.3984 +    Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
  1.3985 +    Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
  1.3986 +    Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
  1.3987 +    Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
  1.3988 +    Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
  1.3989 +    address start = __ pc();
  1.3990 +    Register from = I0; // source byte array
  1.3991 +    Register to = I1;   // destination byte array
  1.3992 +    Register key = I2;  // expanded key array
  1.3993 +    Register rvec = I3; // init vector
  1.3994 +    const Register len_reg = I4; // cipher length
  1.3995 +    const Register original_key = I5;  // original key array only required during decryption
  1.3996 +    const Register keylen = L6;  // reg for storing expanded key array length
  1.3997 +
  1.3998 +    __ save_frame(0); // args are read from the I* registers since we save a frame at the beginning
  1.3999 +    // save cipher length so it can be returned at the end
  1.4000 +    __ mov(len_reg, L7);
  1.4001 +
  1.4002 +    // load original key from SunJCE expanded decryption key
  1.4003 +    // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
  1.4004 +    for ( int i = 0;  i <= 3; i++ ) {
  1.4005 +      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
  1.4006 +    }
  1.4007 +
  1.4008 +    // load initial vector, 8-byte alignment is guaranteed
  1.4009 +    __ ldx(rvec,0,L0);
  1.4010 +    __ ldx(rvec,8,L1);
  1.4011 +
  1.4012 +    // read expanded key array length
  1.4013 +    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
  1.4014 +
  1.4015 +    // 256-bit original key size
  1.4016 +    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
  1.4017 +
  1.4018 +    // 192-bit original key size
  1.4019 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
  1.4020 +
  1.4021 +    // 128-bit original key size
  1.4022 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
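         +    // each iteration derives one 16-byte round key into F(i+4):F(i+6) from the
         +    // one in F(i):F(i+2), with i/4 selecting the round constant; after the loop
         +    // F0:F42 hold the complete 44-int AES-128 key schedule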
  1.4023 +    for ( int i = 0;  i <= 36; i += 4 ) {
  1.4024 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
  1.4025 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
  1.4026 +    }
  1.4027 +
  1.4028 +    // load expanded key[last-1] and key[last] elements
  1.4029 +    __ movdtox(F40,L2);
  1.4030 +    __ movdtox(F42,L3);
  1.4031 +
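         +    // if the length is an odd multiple of 16 bytes, decrypt a single block first
         +    // so the remainder is a multiple of 32 bytes for the two-block loop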
  1.4032 +    __ and3(len_reg, 16, L4);
  1.4033 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
  1.4034 +    __ nop();
  1.4035 +
  1.4036 +    __ ba_short(L_dec_first_block_start);
  1.4037 +
  1.4038 +    __ BIND(L_expand192bit);
  1.4039 +    // load rest of the 192-bit key
  1.4040 +    __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
  1.4041 +    __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
  1.4042 +
  1.4043 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
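         +    // each iteration derives the next 24 key bytes (6 ints) of the AES-192
         +    // schedule; the kexpand1/kexpand2 pair after the loop completes the
         +    // 52-int schedule in F0:F50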
  1.4044 +    for ( int i = 0;  i <= 36; i += 6 ) {
  1.4045 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
  1.4046 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
  1.4047 +      __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
  1.4048 +    }
  1.4049 +    __ aes_kexpand1(F42, F46, 7, F48);
  1.4050 +    __ aes_kexpand2(F44, F48, F50);
  1.4051 +
  1.4052 +    // load expanded key[last-1] and key[last] elements
  1.4053 +    __ movdtox(F48,L2);
  1.4054 +    __ movdtox(F50,L3);
  1.4055 +
  1.4056 +    __ and3(len_reg, 16, L4);
  1.4057 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
  1.4058 +    __ nop();
  1.4059 +
  1.4060 +    __ ba_short(L_dec_first_block_start);
  1.4061 +
  1.4062 +    __ BIND(L_expand256bit);
  1.4063 +    // load rest of the 256-bit key
  1.4064 +    for ( int i = 4;  i <= 7; i++ ) {
  1.4065 +      __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
  1.4066 +    }
  1.4067 +
  1.4068 +    // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
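         +    // each iteration derives the next 32 key bytes (8 ints) of the AES-256
         +    // schedule; the kexpand1/kexpand2 pair after the loop completes the
         +    // 60-int schedule in F0:F58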
  1.4069 +    for ( int i = 0;  i <= 40; i += 8 ) {
  1.4070 +      __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
  1.4071 +      __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
  1.4072 +      __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
  1.4073 +      __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
  1.4074 +    }
  1.4075 +    __ aes_kexpand1(F48, F54, 6, F56);
  1.4076 +    __ aes_kexpand2(F50, F56, F58);
  1.4077 +
  1.4078 +    // load expanded key[last-1] and key[last] elements
  1.4079 +    __ movdtox(F56,L2);
  1.4080 +    __ movdtox(F58,L3);
  1.4081 +
  1.4082 +    __ and3(len_reg, 16, L4);
  1.4083 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
  1.4084 +
  1.4085 +    __ BIND(L_dec_first_block_start);
  1.4086 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  1.4087 +    __ andcc(from, 7, G0);
  1.4088 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
  1.4089 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
  1.4090 +
  1.4091 +    // aligned case: load input into L4 and L5
  1.4092 +    __ ldx(from,0,L4);
  1.4093 +    __ ldx(from,8,L5);
  1.4094 +    __ ba_short(L_transform_first_block);
  1.4095 +
  1.4096 +    __ BIND(L_load_misaligned_input_first_block);
  1.4097 +    __ alignaddr(from, G0, from);
  1.4098 +    // F58, F60, F62 can be clobbered
  1.4099 +    __ ldf(FloatRegisterImpl::D, from, 0, F58);
  1.4100 +    __ ldf(FloatRegisterImpl::D, from, 8, F60);
  1.4101 +    __ ldf(FloatRegisterImpl::D, from, 16, F62);
  1.4102 +    __ faligndata(F58, F60, F58);
  1.4103 +    __ faligndata(F60, F62, F60);
  1.4104 +    __ movdtox(F58, L4);
  1.4105 +    __ movdtox(F60, L5);
  1.4106 +    __ mov(G1, from);
  1.4107 +
  1.4108 +    __ BIND(L_transform_first_block);
  1.4109 +    __ xor3(L2,L4,G1);
  1.4110 +    __ movxtod(G1,F60);
  1.4111 +    __ xor3(L3,L5,G1);
  1.4112 +    __ movxtod(G1,F62);
  1.4113 +
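         +    // dispatch on key size: 256-bit keys fall through and run all rounds below,
         +    // 192-bit keys skip the first two rounds, 128-bit keys skip the first four,
         +    // so every key size ends in the shared round loop at L_dec_first_block128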
  1.4114 +    // 128-bit original key size
  1.4115 +    __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
  1.4116 +
  1.4117 +    // 192-bit original key size
  1.4118 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
  1.4119 +
  1.4120 +    __ aes_dround23(F54, F60, F62, F58);
  1.4121 +    __ aes_dround01(F52, F60, F62, F56);
  1.4122 +    __ aes_dround23(F50, F56, F58, F62);
  1.4123 +    __ aes_dround01(F48, F56, F58, F60);
  1.4124 +
  1.4125 +    __ BIND(L_dec_first_block192);
  1.4126 +    __ aes_dround23(F46, F60, F62, F58);
  1.4127 +    __ aes_dround01(F44, F60, F62, F56);
  1.4128 +    __ aes_dround23(F42, F56, F58, F62);
  1.4129 +    __ aes_dround01(F40, F56, F58, F60);
  1.4130 +
  1.4131 +    __ BIND(L_dec_first_block128);
  1.4132 +    for ( int i = 38;  i >= 6; i -= 8 ) {
  1.4133 +      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
  1.4134 +      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
  1.4135 +      if ( i != 6) {
  1.4136 +        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
  1.4137 +        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
  1.4138 +      } else {
  1.4139 +        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
  1.4140 +        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
  1.4141 +      }
  1.4142 +    }
  1.4143 +
  1.4144 +    __ movxtod(L0,F56);
  1.4145 +    __ movxtod(L1,F58);
  1.4146 +    __ mov(L4,L0);
  1.4147 +    __ mov(L5,L1);
  1.4148 +    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
  1.4149 +    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
  1.4150 +
  1.4151 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  1.4152 +    __ andcc(to, 7, G1);
  1.4153 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
  1.4154 +    __ delayed()->edge8n(to, G0, G2);
  1.4155 +
  1.4156 +    // aligned case: store output into the destination array
  1.4157 +    __ stf(FloatRegisterImpl::D, F60, to, 0);
  1.4158 +    __ stf(FloatRegisterImpl::D, F62, to, 8);
  1.4159 +    __ ba_short(L_check_decrypt_end);
  1.4160 +
  1.4161 +    __ BIND(L_store_misaligned_output_first_block);
  1.4162 +    __ add(to, 8, G3);
  1.4163 +    __ mov(8, G4);
  1.4164 +    __ sub(G4, G1, G4);
  1.4165 +    __ alignaddr(G4, G0, G4);
  1.4166 +    __ faligndata(F60, F60, F60);
  1.4167 +    __ faligndata(F62, F62, F62);
  1.4168 +    __ mov(to, G1);
  1.4169 +    __ and3(to, -8, to);
  1.4170 +    __ and3(G3, -8, G3);
  1.4171 +    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
  1.4172 +    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
  1.4173 +    __ add(to, 8, to);
  1.4174 +    __ add(G3, 8, G3);
  1.4175 +    __ orn(G0, G2, G2);
  1.4176 +    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
  1.4177 +    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
  1.4178 +    __ mov(G1, to);
  1.4179 +
  1.4180 +    __ BIND(L_check_decrypt_end);
  1.4181 +    __ add(from, 16, from);
  1.4182 +    __ add(to, 16, to);
  1.4183 +    __ subcc(len_reg, 16, len_reg);
  1.4184 +    __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
  1.4185 +    __ delayed()->nop();
  1.4186 +
  1.4187 +    // 256-bit original key size
  1.4188 +    __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
  1.4189 +
  1.4190 +    // 192-bit original key size
  1.4191 +    __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
  1.4192 +
  1.4193 +    __ align(OptoLoopAlignment);
  1.4194 +    __ BIND(L_dec_next2_blocks128);
  1.4195 +    __ nop();
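         +    // decrypt two independent 16-byte blocks per iteration; their dround
         +    // instructions are interleaved so the two dependency chains can overlap
         +    // (hence the 'Parallel' in the stub name)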
  1.4196 +
  1.4197 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  1.4198 +    __ andcc(from, 7, G0);
  1.4199 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
  1.4200 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
  1.4201 +
  1.4202 +    // aligned case: load input into G4, G5, L4 and L5
  1.4203 +    __ ldx(from,0,G4);
  1.4204 +    __ ldx(from,8,G5);
  1.4205 +    __ ldx(from,16,L4);
  1.4206 +    __ ldx(from,24,L5);
  1.4207 +    __ ba_short(L_transform_next2_blocks128);
  1.4208 +
  1.4209 +    __ BIND(L_load_misaligned_next2_blocks128);
  1.4210 +    __ alignaddr(from, G0, from);
  1.4211 +    // F40, F42, F58, F60, F62 can be clobbered
  1.4212 +    __ ldf(FloatRegisterImpl::D, from, 0, F40);
  1.4213 +    __ ldf(FloatRegisterImpl::D, from, 8, F42);
  1.4214 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
  1.4215 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
  1.4216 +    __ ldf(FloatRegisterImpl::D, from, 32, F58);
  1.4217 +    __ faligndata(F40, F42, F40);
  1.4218 +    __ faligndata(F42, F60, F42);
  1.4219 +    __ faligndata(F60, F62, F60);
  1.4220 +    __ faligndata(F62, F58, F62);
  1.4221 +    __ movdtox(F40, G4);
  1.4222 +    __ movdtox(F42, G5);
  1.4223 +    __ movdtox(F60, L4);
  1.4224 +    __ movdtox(F62, L5);
  1.4225 +    __ mov(G1, from);
  1.4226 +
  1.4227 +    __ BIND(L_transform_next2_blocks128);
  1.4228 +    // F40:F42 used for first 16-bytes
  1.4229 +    __ xor3(L2,G4,G1);
  1.4230 +    __ movxtod(G1,F40);
  1.4231 +    __ xor3(L3,G5,G1);
  1.4232 +    __ movxtod(G1,F42);
  1.4233 +
  1.4234 +    // F60:F62 used for next 16-bytes
  1.4235 +    __ xor3(L2,L4,G1);
  1.4236 +    __ movxtod(G1,F60);
  1.4237 +    __ xor3(L3,L5,G1);
  1.4238 +    __ movxtod(G1,F62);
  1.4239 +
  1.4240 +    for ( int i = 38;  i >= 6; i -= 8 ) {
  1.4241 +      __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
  1.4242 +      __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
  1.4243 +      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
  1.4244 +      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
  1.4245 +      if (i != 6 ) {
  1.4246 +        __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
  1.4247 +        __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
  1.4248 +        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
  1.4249 +        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
  1.4250 +      } else {
  1.4251 +        __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
  1.4252 +        __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
  1.4253 +        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
  1.4254 +        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
  1.4255 +      }
  1.4256 +    }
  1.4257 +
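         +    // CBC chaining: xor the first decrypted block with the previous ciphertext
         +    // (IV) in L0:L1, xor the second with the first block's ciphertext (G4:G5),
         +    // and keep the second block's ciphertext (L4:L5) as the IV for the next iteration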
  1.4258 +    __ movxtod(L0,F46);
  1.4259 +    __ movxtod(L1,F44);
  1.4260 +    __ fxor(FloatRegisterImpl::D, F46, F40, F40);
  1.4261 +    __ fxor(FloatRegisterImpl::D, F44, F42, F42);
  1.4262 +
  1.4263 +    __ movxtod(G4,F56);
  1.4264 +    __ movxtod(G5,F58);
  1.4265 +    __ mov(L4,L0);
  1.4266 +    __ mov(L5,L1);
  1.4267 +    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
  1.4268 +    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
  1.4269 +
  1.4270 +    // For a misaligned store of the 32 bytes of result we can:
  1.4271 +    // circular right-shift all 4 FP registers so that the 'head' and 'tail'
  1.4272 +    // bytes that must be stored starting at the misaligned address land in one FP reg;
  1.4273 +    // the other 3 FP regs can then be stored with regular 8-byte stores,
  1.4274 +    // and the edge-mask + partial-store mechanism stores the 'head' and 'tail' parts
  1.4275 +
  1.4276 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  1.4277 +    __ andcc(to, 7, G1);
  1.4278 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
  1.4279 +    __ delayed()->edge8n(to, G0, G2);
  1.4280 +
  1.4281 +    // aligned case: store output into the destination array
  1.4282 +    __ stf(FloatRegisterImpl::D, F40, to, 0);
  1.4283 +    __ stf(FloatRegisterImpl::D, F42, to, 8);
  1.4284 +    __ stf(FloatRegisterImpl::D, F60, to, 16);
  1.4285 +    __ stf(FloatRegisterImpl::D, F62, to, 24);
  1.4286 +    __ ba_short(L_check_decrypt_loop_end128);
  1.4287 +
  1.4288 +    __ BIND(L_store_misaligned_output_next2_blocks128);
  1.4289 +    __ mov(8, G4);
  1.4290 +    __ sub(G4, G1, G4);
  1.4291 +    __ alignaddr(G4, G0, G4);
  1.4292 +    __ faligndata(F40, F42, F56); // F56 can be clobbered
  1.4293 +    __ faligndata(F42, F60, F42);
  1.4294 +    __ faligndata(F60, F62, F60);
  1.4295 +    __ faligndata(F62, F40, F40);
  1.4296 +    __ mov(to, G1);
  1.4297 +    __ and3(to, -8, to);
  1.4298 +    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
  1.4299 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
  1.4300 +    __ stf(FloatRegisterImpl::D, F42, to, 16);
  1.4301 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
  1.4302 +    __ add(to, 32, to);
  1.4303 +    __ orn(G0, G2, G2);
  1.4304 +    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
  1.4305 +    __ mov(G1, to);
  1.4306 +
  1.4307 +    __ BIND(L_check_decrypt_loop_end128);
  1.4308 +    __ add(from, 32, from);
  1.4309 +    __ add(to, 32, to);
  1.4310 +    __ subcc(len_reg, 32, len_reg);
  1.4311 +    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
  1.4312 +    __ delayed()->nop();
  1.4313 +    __ ba_short(L_cbcdec_end);
  1.4314 +
  1.4315 +    __ align(OptoLoopAlignment);
  1.4316 +    __ BIND(L_dec_next2_blocks192);
  1.4317 +    __ nop();
  1.4318 +
  1.4319 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  1.4320 +    __ andcc(from, 7, G0);
  1.4321 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
  1.4322 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
  1.4323 +
  1.4324 +    // aligned case: load input into G4, G5, L4 and L5
  1.4325 +    __ ldx(from,0,G4);
  1.4326 +    __ ldx(from,8,G5);
  1.4327 +    __ ldx(from,16,L4);
  1.4328 +    __ ldx(from,24,L5);
  1.4329 +    __ ba_short(L_transform_next2_blocks192);
  1.4330 +
  1.4331 +    __ BIND(L_load_misaligned_next2_blocks192);
  1.4332 +    __ alignaddr(from, G0, from);
  1.4333 +    // F48, F50, F52, F60, F62 can be clobbered
  1.4334 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
  1.4335 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
  1.4336 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
  1.4337 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
  1.4338 +    __ ldf(FloatRegisterImpl::D, from, 32, F52);
  1.4339 +    __ faligndata(F48, F50, F48);
  1.4340 +    __ faligndata(F50, F60, F50);
  1.4341 +    __ faligndata(F60, F62, F60);
  1.4342 +    __ faligndata(F62, F52, F62);
  1.4343 +    __ movdtox(F48, G4);
  1.4344 +    __ movdtox(F50, G5);
  1.4345 +    __ movdtox(F60, L4);
  1.4346 +    __ movdtox(F62, L5);
  1.4347 +    __ mov(G1, from);
  1.4348 +
  1.4349 +    __ BIND(L_transform_next2_blocks192);
  1.4350 +    // F48:F50 used for first 16-bytes
  1.4351 +    __ xor3(L2,G4,G1);
  1.4352 +    __ movxtod(G1,F48);
  1.4353 +    __ xor3(L3,G5,G1);
  1.4354 +    __ movxtod(G1,F50);
  1.4355 +
  1.4356 +    // F60:F62 used for next 16-bytes
  1.4357 +    __ xor3(L2,L4,G1);
  1.4358 +    __ movxtod(G1,F60);
  1.4359 +    __ xor3(L3,L5,G1);
  1.4360 +    __ movxtod(G1,F62);
  1.4361 +
  1.4362 +    for ( int i = 46;  i >= 6; i -= 8 ) {
  1.4363 +      __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
  1.4364 +      __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
  1.4365 +      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
  1.4366 +      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
  1.4367 +      if (i != 6 ) {
  1.4368 +        __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
  1.4369 +        __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
  1.4370 +        __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
  1.4371 +        __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
  1.4372 +      } else {
  1.4373 +        __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
  1.4374 +        __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
  1.4375 +        __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
  1.4376 +        __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
  1.4377 +      }
  1.4378 +    }
  1.4379 +
  1.4380 +    __ movxtod(L0,F54);
  1.4381 +    __ movxtod(L1,F52);
  1.4382 +    __ fxor(FloatRegisterImpl::D, F54, F48, F48);
  1.4383 +    __ fxor(FloatRegisterImpl::D, F52, F50, F50);
  1.4384 +
  1.4385 +    __ movxtod(G4,F56);
  1.4386 +    __ movxtod(G5,F58);
  1.4387 +    __ mov(L4,L0);
  1.4388 +    __ mov(L5,L1);
  1.4389 +    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
  1.4390 +    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
  1.4391 +
  1.4392 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  1.4393 +    __ andcc(to, 7, G1);
  1.4394 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
  1.4395 +    __ delayed()->edge8n(to, G0, G2);
  1.4396 +
  1.4397 +    // aligned case: store output into the destination array
  1.4398 +    __ stf(FloatRegisterImpl::D, F48, to, 0);
  1.4399 +    __ stf(FloatRegisterImpl::D, F50, to, 8);
  1.4400 +    __ stf(FloatRegisterImpl::D, F60, to, 16);
  1.4401 +    __ stf(FloatRegisterImpl::D, F62, to, 24);
  1.4402 +    __ ba_short(L_check_decrypt_loop_end192);
  1.4403 +
  1.4404 +    __ BIND(L_store_misaligned_output_next2_blocks192);
  1.4405 +    __ mov(8, G4);
  1.4406 +    __ sub(G4, G1, G4);
  1.4407 +    __ alignaddr(G4, G0, G4);
  1.4408 +    __ faligndata(F48, F50, F56); // F56 can be clobbered
  1.4409 +    __ faligndata(F50, F60, F50);
  1.4410 +    __ faligndata(F60, F62, F60);
  1.4411 +    __ faligndata(F62, F48, F48);
  1.4412 +    __ mov(to, G1);
  1.4413 +    __ and3(to, -8, to);
  1.4414 +    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
  1.4415 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
  1.4416 +    __ stf(FloatRegisterImpl::D, F50, to, 16);
  1.4417 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
  1.4418 +    __ add(to, 32, to);
  1.4419 +    __ orn(G0, G2, G2);
  1.4420 +    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
  1.4421 +    __ mov(G1, to);
  1.4422 +
  1.4423 +    __ BIND(L_check_decrypt_loop_end192);
  1.4424 +    __ add(from, 32, from);
  1.4425 +    __ add(to, 32, to);
  1.4426 +    __ subcc(len_reg, 32, len_reg);
  1.4427 +    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
  1.4428 +    __ delayed()->nop();
  1.4429 +    __ ba_short(L_cbcdec_end);
  1.4430 +
  1.4431 +    __ align(OptoLoopAlignment);
  1.4432 +    __ BIND(L_dec_next2_blocks256);
  1.4433 +    __ nop();
  1.4434 +
  1.4435 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  1.4436 +    __ andcc(from, 7, G0);
  1.4437 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
  1.4438 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
  1.4439 +
  1.4440 +    // aligned case: load input into G4, G5, L4 and L5
  1.4441 +    __ ldx(from,0,G4);
  1.4442 +    __ ldx(from,8,G5);
  1.4443 +    __ ldx(from,16,L4);
  1.4444 +    __ ldx(from,24,L5);
  1.4445 +    __ ba_short(L_transform_next2_blocks256);
  1.4446 +
  1.4447 +    __ BIND(L_load_misaligned_next2_blocks256);
  1.4448 +    __ alignaddr(from, G0, from);
  1.4449 +    // F0, F2, F4, F60, F62 can be clobbered
  1.4450 +    __ ldf(FloatRegisterImpl::D, from, 0, F0);
  1.4451 +    __ ldf(FloatRegisterImpl::D, from, 8, F2);
  1.4452 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
  1.4453 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
  1.4454 +    __ ldf(FloatRegisterImpl::D, from, 32, F4);
  1.4455 +    __ faligndata(F0, F2, F0);
  1.4456 +    __ faligndata(F2, F60, F2);
  1.4457 +    __ faligndata(F60, F62, F60);
  1.4458 +    __ faligndata(F62, F4, F62);
  1.4459 +    __ movdtox(F0, G4);
  1.4460 +    __ movdtox(F2, G5);
  1.4461 +    __ movdtox(F60, L4);
  1.4462 +    __ movdtox(F62, L5);
  1.4463 +    __ mov(G1, from);
  1.4464 +
  1.4465 +    __ BIND(L_transform_next2_blocks256);
  1.4466 +    // F0:F2 used for first 16-bytes
  1.4467 +    __ xor3(L2,G4,G1);
  1.4468 +    __ movxtod(G1,F0);
  1.4469 +    __ xor3(L3,G5,G1);
  1.4470 +    __ movxtod(G1,F2);
  1.4471 +
  1.4472 +    // F60:F62 used for next 16-bytes
  1.4473 +    __ xor3(L2,L4,G1);
  1.4474 +    __ movxtod(G1,F60);
  1.4475 +    __ xor3(L3,L5,G1);
  1.4476 +    __ movxtod(G1,F62);
  1.4477 +
  1.4478 +    __ aes_dround23(F54, F0, F2, F4);
  1.4479 +    __ aes_dround01(F52, F0, F2, F6);
  1.4480 +    __ aes_dround23(F54, F60, F62, F58);
  1.4481 +    __ aes_dround01(F52, F60, F62, F56);
  1.4482 +    __ aes_dround23(F50, F6, F4, F2);
  1.4483 +    __ aes_dround01(F48, F6, F4, F0);
  1.4484 +    __ aes_dround23(F50, F56, F58, F62);
  1.4485 +    __ aes_dround01(F48, F56, F58, F60);
  1.4486 +    // save F48:F54 in temp registers
  1.4487 +    __ movdtox(F54,G2);
  1.4488 +    __ movdtox(F52,G3);
  1.4489 +    __ movdtox(F50,G6);
  1.4490 +    __ movdtox(F48,G1);
  1.4491 +    for ( int i = 46;  i >= 14; i -= 8 ) {
  1.4492 +      __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
  1.4493 +      __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
  1.4494 +      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
  1.4495 +      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
  1.4496 +      __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
  1.4497 +      __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
  1.4498 +      __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
  1.4499 +      __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
  1.4500 +    }
  1.4501 +    // reload F48:F54 with the original key words (normally kept in F0:F6, which hold cipher state here)
  1.4502 +    __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
  1.4503 +    __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
  1.4504 +    __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
  1.4505 +    __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
  1.4506 +    __ aes_dround23(F54, F0, F2, F4);
  1.4507 +    __ aes_dround01(F52, F0, F2, F6);
  1.4508 +    __ aes_dround23(F54, F60, F62, F58);
  1.4509 +    __ aes_dround01(F52, F60, F62, F56);
  1.4510 +    __ aes_dround23_l(F50, F6, F4, F2);
  1.4511 +    __ aes_dround01_l(F48, F6, F4, F0);
  1.4512 +    __ aes_dround23_l(F50, F56, F58, F62);
  1.4513 +    __ aes_dround01_l(F48, F56, F58, F60);
  1.4514 +    // re-init F48:F54 with their original values
  1.4515 +    __ movxtod(G2,F54);
  1.4516 +    __ movxtod(G3,F52);
  1.4517 +    __ movxtod(G6,F50);
  1.4518 +    __ movxtod(G1,F48);
  1.4519 +
  1.4520 +    __ movxtod(L0,F6);
  1.4521 +    __ movxtod(L1,F4);
  1.4522 +    __ fxor(FloatRegisterImpl::D, F6, F0, F0);
  1.4523 +    __ fxor(FloatRegisterImpl::D, F4, F2, F2);
  1.4524 +
  1.4525 +    __ movxtod(G4,F56);
  1.4526 +    __ movxtod(G5,F58);
  1.4527 +    __ mov(L4,L0);
  1.4528 +    __ mov(L5,L1);
  1.4529 +    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
  1.4530 +    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
  1.4531 +
  1.4532 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  1.4533 +    __ andcc(to, 7, G1);
  1.4534 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
  1.4535 +    __ delayed()->edge8n(to, G0, G2);
  1.4536 +
  1.4537 +    // aligned case: store output into the destination array
  1.4538 +    __ stf(FloatRegisterImpl::D, F0, to, 0);
  1.4539 +    __ stf(FloatRegisterImpl::D, F2, to, 8);
  1.4540 +    __ stf(FloatRegisterImpl::D, F60, to, 16);
  1.4541 +    __ stf(FloatRegisterImpl::D, F62, to, 24);
  1.4542 +    __ ba_short(L_check_decrypt_loop_end256);
  1.4543 +
  1.4544 +    __ BIND(L_store_misaligned_output_next2_blocks256);
  1.4545 +    __ mov(8, G4);
  1.4546 +    __ sub(G4, G1, G4);
  1.4547 +    __ alignaddr(G4, G0, G4);
  1.4548 +    __ faligndata(F0, F2, F56); // F56 can be clobbered
  1.4549 +    __ faligndata(F2, F60, F2);
  1.4550 +    __ faligndata(F60, F62, F60);
  1.4551 +    __ faligndata(F62, F0, F0);
  1.4552 +    __ mov(to, G1);
  1.4553 +    __ and3(to, -8, to);
  1.4554 +    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
  1.4555 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
  1.4556 +    __ stf(FloatRegisterImpl::D, F2, to, 16);
  1.4557 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
  1.4558 +    __ add(to, 32, to);
  1.4559 +    __ orn(G0, G2, G2);
  1.4560 +    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
  1.4561 +    __ mov(G1, to);
  1.4562 +
  1.4563 +    __ BIND(L_check_decrypt_loop_end256);
  1.4564 +    __ add(from, 32, from);
  1.4565 +    __ add(to, 32, to);
  1.4566 +    __ subcc(len_reg, 32, len_reg);
  1.4567 +    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
  1.4568 +    __ delayed()->nop();
  1.4569 +
  1.4570 +    __ BIND(L_cbcdec_end);
  1.4571 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
  1.4572 +    __ stx(L0, rvec, 0);
  1.4573 +    __ stx(L1, rvec, 8);
  1.4574 +    __ mov(L7, I0);
  1.4575 +    __ ret();
  1.4576 +    __ delayed()->restore();
  1.4577 +
  1.4578 +    return start;
  1.4579 +  }
  1.4580 +
  1.4581 +  void generate_initial() {
  1.4582 +    // Generates the initial set of stubs and initializes the entry points
  1.4583 +
  1.4584 +    //------------------------------------------------------------------------------------------------------------------------
  1.4585 +    // entry points that exist in all platforms
  1.4586 +    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
  1.4587 +    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
  1.4588 +    StubRoutines::_forward_exception_entry                 = generate_forward_exception();
  1.4589 +
  1.4590 +    StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
  1.4591 +    StubRoutines::_catch_exception_entry                   = generate_catch_exception();
  1.4592 +
  1.4593 +    //------------------------------------------------------------------------------------------------------------------------
  1.4594 +    // entry points that are platform specific
  1.4595 +    StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
  1.4596 +
  1.4597 +    StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
  1.4598 +    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
  1.4599 +
  1.4600 +#if !defined(COMPILER2) && !defined(_LP64)
  1.4601 +    StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
  1.4602 +    StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
  1.4603 +    StubRoutines::_atomic_add_entry          = generate_atomic_add();
  1.4604 +    StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
  1.4605 +    StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
  1.4606 +    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  1.4607 +    StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
  1.4608 +#endif  // !COMPILER2 && !_LP64
  1.4609 +
  1.4610 +    // Build this early so it's available for the interpreter.
  1.4611 +    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
  1.4612 +  }
  1.4613 +
  1.4614 +
  1.4615 +  void generate_all() {
  1.4616 +    // Generates all stubs and initializes the entry points
  1.4617 +
  1.4618 +    // Generate partial_subtype_check first here since its code depends on
  1.4619 +    // UseZeroBaseCompressedOops which is defined after heap initialization.
  1.4620 +    StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
  1.4621 +    // These entry points require SharedInfo::stack0 to be set up in non-core builds
  1.4622 +    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
  1.4623 +    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
  1.4624 +    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
  1.4625 +
  1.4626 +    StubRoutines::_handler_for_unsafe_access_entry =
  1.4627 +      generate_handler_for_unsafe_access();
  1.4628 +
  1.4629 +    // support for verify_oop (must happen after universe_init)
  1.4630 +    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
  1.4631 +
  1.4632 +    // arraycopy stubs used by compilers
  1.4633 +    generate_arraycopy_stubs();
  1.4634 +
  1.4635 +    // Don't initialize the platform math functions since sparc
  1.4636 +    // doesn't have intrinsics for these operations.
  1.4637 +
  1.4638 +    // Safefetch stubs.
  1.4639 +    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
  1.4640 +                                                       &StubRoutines::_safefetch32_fault_pc,
  1.4641 +                                                       &StubRoutines::_safefetch32_continuation_pc);
  1.4642 +    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
  1.4643 +                                                       &StubRoutines::_safefetchN_fault_pc,
  1.4644 +                                                       &StubRoutines::_safefetchN_continuation_pc);
  1.4645 +
  1.4646 +    // generate AES intrinsics code
  1.4647 +    if (UseAESIntrinsics) {
  1.4648 +      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
  1.4649 +      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  1.4650 +      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
  1.4651 +      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
  1.4652 +    }
  1.4653 +  }
  1.4654 +
  1.4655 +
  1.4656 + public:
  1.4657 +  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  1.4658 +    // replace the standard masm with a special one:
  1.4659 +    _masm = new MacroAssembler(code);
  1.4660 +
  1.4661 +    _stub_count = !all ? 0x100 : 0x200;
  1.4662 +    if (all) {
  1.4663 +      generate_all();
  1.4664 +    } else {
  1.4665 +      generate_initial();
  1.4666 +    }
  1.4667 +
  1.4668 +    // make sure this stub is available for all local calls
  1.4669 +    if (_atomic_add_stub.is_unbound()) {
  1.4670 +      // generate a second time, if necessary
  1.4671 +      (void) generate_atomic_add();
  1.4672 +    }
  1.4673 +  }
  1.4674 +
  1.4675 +
  1.4676 + private:
  1.4677 +  int _stub_count;
  1.4678 +  void stub_prolog(StubCodeDesc* cdesc) {
  1.4679 +    # ifdef ASSERT
  1.4680 +      // put extra information in the stub code, to make it more readable
  1.4681 +#ifdef _LP64
  1.4682 +      // Write the high part of the address
  1.4683 +      // [RGV] Check if there is a dependency on the size of this prolog
  1.4684 +      __ emit_data((intptr_t)cdesc >> 32,    relocInfo::none);
  1.4685 +#endif
  1.4686 +      __ emit_data((intptr_t)cdesc,    relocInfo::none);
  1.4687 +      __ emit_data(++_stub_count, relocInfo::none);
  1.4688 +    # endif
  1.4689 +    align(true);
  1.4690 +  }
  1.4691 +
  1.4692 +  void align(bool at_header = false) {
  1.4693 +    // %%%%% move this constant somewhere else
  1.4694 +    // UltraSPARC cache line size is 8 instructions:
  1.4695 +    const unsigned int icache_line_size = 32;
  1.4696 +    const unsigned int icache_half_line_size = 16;
  1.4697 +
  1.4698 +    if (at_header) {
  1.4699 +      while ((intptr_t)(__ pc()) % icache_line_size != 0) {
  1.4700 +        __ emit_data(0, relocInfo::none);
  1.4701 +      }
  1.4702 +    } else {
  1.4703 +      while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
  1.4704 +        __ nop();
  1.4705 +      }
  1.4706 +    }
  1.4707 +  }
  1.4708 +
  1.4709 +}; // end class declaration
  1.4710 +
  1.4711 +void StubGenerator_generate(CodeBuffer* code, bool all) {
  1.4712 +  StubGenerator g(code, all);
  1.4713 +}
