1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Wed Apr 27 01:25:04 2016 +0800 1.3 @@ -0,0 +1,4710 @@ 1.4 +/* 1.5 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. 1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 1.7 + * 1.8 + * This code is free software; you can redistribute it and/or modify it 1.9 + * under the terms of the GNU General Public License version 2 only, as 1.10 + * published by the Free Software Foundation. 1.11 + * 1.12 + * This code is distributed in the hope that it will be useful, but WITHOUT 1.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 1.14 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 1.15 + * version 2 for more details (a copy is included in the LICENSE file that 1.16 + * accompanied this code). 1.17 + * 1.18 + * You should have received a copy of the GNU General Public License version 1.19 + * 2 along with this work; if not, write to the Free Software Foundation, 1.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 1.21 + * 1.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 1.23 + * or visit www.oracle.com if you need additional information or have any 1.24 + * questions. 1.25 + * 1.26 + */ 1.27 + 1.28 +#include "precompiled.hpp" 1.29 +#include "asm/macroAssembler.inline.hpp" 1.30 +#include "interpreter/interpreter.hpp" 1.31 +#include "nativeInst_sparc.hpp" 1.32 +#include "oops/instanceOop.hpp" 1.33 +#include "oops/method.hpp" 1.34 +#include "oops/objArrayKlass.hpp" 1.35 +#include "oops/oop.inline.hpp" 1.36 +#include "prims/methodHandles.hpp" 1.37 +#include "runtime/frame.inline.hpp" 1.38 +#include "runtime/handles.inline.hpp" 1.39 +#include "runtime/sharedRuntime.hpp" 1.40 +#include "runtime/stubCodeGenerator.hpp" 1.41 +#include "runtime/stubRoutines.hpp" 1.42 +#include "runtime/thread.inline.hpp" 1.43 +#include "utilities/top.hpp" 1.44 +#ifdef COMPILER2 1.45 +#include "opto/runtime.hpp" 1.46 +#endif 1.47 + 1.48 +// Declaration and definition of StubGenerator (no .hpp file). 1.49 +// For a more detailed description of the stub routine structure 1.50 +// see the comment in stubRoutines.hpp. 1.51 + 1.52 +#define __ _masm-> 1.53 + 1.54 +#ifdef PRODUCT 1.55 +#define BLOCK_COMMENT(str) /* nothing */ 1.56 +#else 1.57 +#define BLOCK_COMMENT(str) __ block_comment(str) 1.58 +#endif 1.59 + 1.60 +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 1.61 + 1.62 +// Note: The register L7 is used as L7_thread_cache, and may not be used 1.63 +// any other way within this module. 
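+
+// For reference: the VM enters the call stub generated below through the
+// CallStub function-pointer type. A sketch of that signature, assuming the
+// JDK 8-era declaration in stubRoutines.hpp:
+//
+//   typedef void (*CallStub)(
+//     address   link,               // call wrapper address
+//     intptr_t* result,
+//     BasicType result_type,
+//     Method*   method,
+//     address   entry_point,        // (interpreter) entry point
+//     intptr_t* parameters,
+//     int       size_of_parameters, // in words
+//     TRAPS                         // current thread
+//   );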
1.64 + 1.65 + 1.66 +static const Register& Lstub_temp = L2; 1.67 + 1.68 +// ------------------------------------------------------------------------------------------------------------------------- 1.69 +// Stub Code definitions 1.70 + 1.71 +static address handle_unsafe_access() { 1.72 + JavaThread* thread = JavaThread::current(); 1.73 + address pc = thread->saved_exception_pc(); 1.74 + address npc = thread->saved_exception_npc(); 1.75 + // pc is the instruction which we must emulate 1.76 + // doing a no-op is fine: return garbage from the load 1.77 + 1.78 + // request an async exception 1.79 + thread->set_pending_unsafe_access_error(); 1.80 + 1.81 + // return address of next instruction to execute 1.82 + return npc; 1.83 +} 1.84 + 1.85 +class StubGenerator: public StubCodeGenerator { 1.86 + private: 1.87 + 1.88 +#ifdef PRODUCT 1.89 +#define inc_counter_np(a,b,c) 1.90 +#else 1.91 +#define inc_counter_np(counter, t1, t2) \ 1.92 + BLOCK_COMMENT("inc_counter " #counter); \ 1.93 + __ inc_counter(&counter, t1, t2); 1.94 +#endif 1.95 + 1.96 + //---------------------------------------------------------------------------------------------------- 1.97 + // Call stubs are used to call Java from C 1.98 + 1.99 + address generate_call_stub(address& return_pc) { 1.100 + StubCodeMark mark(this, "StubRoutines", "call_stub"); 1.101 + address start = __ pc(); 1.102 + 1.103 + // Incoming arguments: 1.104 + // 1.105 + // o0 : call wrapper address 1.106 + // o1 : result (address) 1.107 + // o2 : result type 1.108 + // o3 : method 1.109 + // o4 : (interpreter) entry point 1.110 + // o5 : parameters (address) 1.111 + // [sp + 0x5c]: parameter size (in words) 1.112 + // [sp + 0x60]: thread 1.113 + // 1.114 + // +---------------+ <--- sp + 0 1.115 + // | | 1.116 + // . reg save area . 1.117 + // | | 1.118 + // +---------------+ <--- sp + 0x40 1.119 + // | | 1.120 + // . extra 7 slots . 1.121 + // | | 1.122 + // +---------------+ <--- sp + 0x5c 1.123 + // | param. 
size | 1.124 + // +---------------+ <--- sp + 0x60 1.125 + // | thread | 1.126 + // +---------------+ 1.127 + // | | 1.128 + 1.129 + // note: if the link argument position changes, adjust 1.130 + // the code in frame::entry_frame_call_wrapper() 1.131 + 1.132 + const Argument link = Argument(0, false); // used only for GC 1.133 + const Argument result = Argument(1, false); 1.134 + const Argument result_type = Argument(2, false); 1.135 + const Argument method = Argument(3, false); 1.136 + const Argument entry_point = Argument(4, false); 1.137 + const Argument parameters = Argument(5, false); 1.138 + const Argument parameter_size = Argument(6, false); 1.139 + const Argument thread = Argument(7, false); 1.140 + 1.141 + // setup thread register 1.142 + __ ld_ptr(thread.as_address(), G2_thread); 1.143 + __ reinit_heapbase(); 1.144 + 1.145 +#ifdef ASSERT 1.146 + // make sure we have no pending exceptions 1.147 + { const Register t = G3_scratch; 1.148 + Label L; 1.149 + __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t); 1.150 + __ br_null_short(t, Assembler::pt, L); 1.151 + __ stop("StubRoutines::call_stub: entered with pending exception"); 1.152 + __ bind(L); 1.153 + } 1.154 +#endif 1.155 + 1.156 + // create activation frame & allocate space for parameters 1.157 + { const Register t = G3_scratch; 1.158 + __ ld_ptr(parameter_size.as_address(), t); // get parameter size (in words) 1.159 + __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words) 1.160 + __ round_to(t, WordsPerLong); // make sure it is multiple of 2 (in words) 1.161 + __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes 1.162 + __ neg(t); // negate so it can be used with save 1.163 + __ save(SP, t, SP); // setup new frame 1.164 + } 1.165 + 1.166 + // +---------------+ <--- sp + 0 1.167 + // | | 1.168 + // . reg save area . 1.169 + // | | 1.170 + // +---------------+ <--- sp + 0x40 1.171 + // | | 1.172 + // . extra 7 slots . 1.173 + // | | 1.174 + // +---------------+ <--- sp + 0x5c 1.175 + // | empty slot | (only if parameter size is even) 1.176 + // +---------------+ 1.177 + // | | 1.178 + // . parameters . 1.179 + // | | 1.180 + // +---------------+ <--- fp + 0 1.181 + // | | 1.182 + // . reg save area . 1.183 + // | | 1.184 + // +---------------+ <--- fp + 0x40 1.185 + // | | 1.186 + // . extra 7 slots . 1.187 + // | | 1.188 + // +---------------+ <--- fp + 0x5c 1.189 + // | param. 
size |
+    //    +---------------+ <--- fp + 0x60
+    //    |    thread     |
+    //    +---------------+
+    //    |               |
+
+    // pass parameters if any
+    BLOCK_COMMENT("pass parameters if any");
+    { const Register src = parameters.as_in().as_register();
+      const Register dst = Lentry_args;
+      const Register tmp = G3_scratch;
+      const Register cnt = G4_scratch;
+
+      // test if any parameters & setup of Lentry_args
+      Label exit;
+      __ ld_ptr(parameter_size.as_in().as_address(), cnt);  // parameter counter
+      __ add(FP, STACK_BIAS, dst);
+      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
+      __ delayed()->sub(dst, BytesPerWord, dst);            // setup Lentry_args
+
+      // copy parameters if any
+      Label loop;
+      __ BIND(loop);
+      // Store parameter value
+      __ ld_ptr(src, 0, tmp);
+      __ add(src, BytesPerWord, src);
+      __ st_ptr(tmp, dst, 0);
+      __ deccc(cnt);
+      __ br(Assembler::greater, false, Assembler::pt, loop);
+      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
+
+      // done
+      __ BIND(exit);
+    }
+
+    // setup parameters, method & call Java function
+#ifdef ASSERT
+    // layout_activation_impl checks its notion of saved SP against
+    // this register, so if this changes update it as well.
+    const Register saved_SP = Lscratch;
+    __ mov(SP, saved_SP);                               // keep track of SP before call
+#endif
+
+    // setup parameters
+    const Register t = G3_scratch;
+    __ ld_ptr(parameter_size.as_in().as_address(), t);  // get parameter size (in words)
+    __ sll(t, Interpreter::logStackElementSize, t);     // compute number of bytes
+    __ sub(FP, t, Gargs);                               // setup parameter pointer
+#ifdef _LP64
+    __ add(Gargs, STACK_BIAS, Gargs);                   // account for LP64 stack bias
+#endif
+    __ mov(SP, O5_savedSP);
+
+
+    // do the call
+    //
+    // the following registers must be set up:
+    //
+    // G2_thread
+    // G5_method
+    // Gargs
+    BLOCK_COMMENT("call Java function");
+    __ jmpl(entry_point.as_in().as_register(), G0, O7);
+    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
+
+    BLOCK_COMMENT("call_stub_return_address:");
+    return_pc = __ pc();
+
+    // The callee, if it wasn't interpreted, can return with SP changed, so
+    // we can no longer assert that SP is unchanged across the call.
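+
+    // In C-like pseudo-code, the result store emitted below is roughly
+    // (a sketch of intent; everything that is not T_OBJECT, T_LONG,
+    // T_FLOAT or T_DOUBLE is stored as a 32-bit int):
+    //
+    //   switch (type) {
+    //     case T_OBJECT: *(oop*)     addr = (oop)  O0;                  break;
+    //     case T_FLOAT:  *(jfloat*)  addr =        F0;                  break;
+    //     case T_DOUBLE: *(jdouble*) addr =        F0;                  break;
+    //     case T_LONG:   *(jlong*)   addr = /* O0 or G1, see below */;  break;
+    //     default:       *(jint*)    addr = (jint) O0;                  break;
+    //   }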
+
+    // store result depending on type
+    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
+    //  is treated as T_INT)
+    { const Register addr = result.as_in().as_register();
+      const Register type = result_type.as_in().as_register();
+      Label is_long, is_float, is_double, is_object, exit;
+      __ cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
+      __ delayed()->cmp(type, T_FLOAT);  __ br(Assembler::equal, false, Assembler::pn, is_float);
+      __ delayed()->cmp(type, T_DOUBLE); __ br(Assembler::equal, false, Assembler::pn, is_double);
+      __ delayed()->cmp(type, T_LONG);   __ br(Assembler::equal, false, Assembler::pn, is_long);
+      __ delayed()->nop();
+
+      // store int result
+      __ st(O0, addr, G0);
+
+      __ BIND(exit);
+      __ ret();
+      __ delayed()->restore();
+
+      __ BIND(is_object);
+      __ ba(exit);
+      __ delayed()->st_ptr(O0, addr, G0);
+
+      __ BIND(is_float);
+      __ ba(exit);
+      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
+
+      __ BIND(is_double);
+      __ ba(exit);
+      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
+
+      __ BIND(is_long);
+#ifdef _LP64
+      __ ba(exit);
+      __ delayed()->st_long(O0, addr, G0);  // store entire long
+#else
+#if defined(COMPILER2)
+      // All return values are where we want them, except for Longs.  C2 returns
+      // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
+      // Since the interpreter will return longs in G1 and O0/O1 in the 32-bit
+      // build we simply always use G1.
+      // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
+      // do this here. Unfortunately if we did a rethrow we'd see a MachEpilog node
+      // first which would move g1 -> O0/O1 and destroy the exception we were throwing.
+
+      __ ba(exit);
+      __ delayed()->stx(G1, addr, G0);  // store entire long
+#else
+      __ st(O1, addr, BytesPerInt);
+      __ ba(exit);
+      __ delayed()->st(O0, addr, G0);
+#endif /* COMPILER2 */
+#endif /* _LP64 */
+    }
+    return start;
+  }
+
+
+  //----------------------------------------------------------------------------------------------------
+  // Return point for a Java call if there's an exception thrown in Java code.
+  // The exception is caught and transformed into a pending exception stored in
+  // JavaThread that can be tested from within the VM.
1.322 + // 1.323 + // Oexception: exception oop 1.324 + 1.325 + address generate_catch_exception() { 1.326 + StubCodeMark mark(this, "StubRoutines", "catch_exception"); 1.327 + 1.328 + address start = __ pc(); 1.329 + // verify that thread corresponds 1.330 + __ verify_thread(); 1.331 + 1.332 + const Register& temp_reg = Gtemp; 1.333 + Address pending_exception_addr (G2_thread, Thread::pending_exception_offset()); 1.334 + Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset ()); 1.335 + Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset ()); 1.336 + 1.337 + // set pending exception 1.338 + __ verify_oop(Oexception); 1.339 + __ st_ptr(Oexception, pending_exception_addr); 1.340 + __ set((intptr_t)__FILE__, temp_reg); 1.341 + __ st_ptr(temp_reg, exception_file_offset_addr); 1.342 + __ set((intptr_t)__LINE__, temp_reg); 1.343 + __ st(temp_reg, exception_line_offset_addr); 1.344 + 1.345 + // complete return to VM 1.346 + assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before"); 1.347 + 1.348 + AddressLiteral stub_ret(StubRoutines::_call_stub_return_address); 1.349 + __ jump_to(stub_ret, temp_reg); 1.350 + __ delayed()->nop(); 1.351 + 1.352 + return start; 1.353 + } 1.354 + 1.355 + 1.356 + //---------------------------------------------------------------------------------------------------- 1.357 + // Continuation point for runtime calls returning with a pending exception 1.358 + // The pending exception check happened in the runtime or native call stub 1.359 + // The pending exception in Thread is converted into a Java-level exception 1.360 + // 1.361 + // Contract with Java-level exception handler: O0 = exception 1.362 + // O1 = throwing pc 1.363 + 1.364 + address generate_forward_exception() { 1.365 + StubCodeMark mark(this, "StubRoutines", "forward_exception"); 1.366 + address start = __ pc(); 1.367 + 1.368 + // Upon entry, O7 has the return address returning into Java 1.369 + // (interpreted or compiled) code; i.e. the return address 1.370 + // becomes the throwing pc. 
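+    //
+    // In pseudo-code, the stub below does roughly (a sketch of the control
+    // flow, not the exact emitted sequence):
+    //
+    //   issuing_pc = O7 + frame::pc_return_offset;
+    //   handler    = SharedRuntime::exception_handler_for_return_address(thread, issuing_pc);
+    //   Oexception = thread->pending_exception();
+    //   thread->clear_pending_exception();   // done in the branch delay slot
+    //   goto handler;                        // with O0 = exception, O1 = issuing pc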
+
+    const Register& handler_reg = Gtemp;
+
+    Address exception_addr(G2_thread, Thread::pending_exception_offset());
+
+#ifdef ASSERT
+    // make sure that this code is only executed if there is a pending exception
+    { Label L;
+      __ ld_ptr(exception_addr, Gtemp);
+      __ br_notnull_short(Gtemp, Assembler::pt, L);
+      __ stop("StubRoutines::forward exception: no pending exception (1)");
+      __ bind(L);
+    }
+#endif
+
+    // compute exception handler into handler_reg
+    __ get_thread();
+    __ ld_ptr(exception_addr, Oexception);
+    __ verify_oop(Oexception);
+    __ save_frame(0);             // compensates for compiler weakness
+    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
+    BLOCK_COMMENT("call exception_handler_for_return_address");
+    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
+    __ mov(O0, handler_reg);
+    __ restore();                 // compensates for compiler weakness
+
+    __ ld_ptr(exception_addr, Oexception);
+    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
+
+#ifdef ASSERT
+    // make sure exception is set
+    { Label L;
+      __ br_notnull_short(Oexception, Assembler::pt, L);
+      __ stop("StubRoutines::forward exception: no pending exception (2)");
+      __ bind(L);
+    }
+#endif
+    // jump to exception handler
+    __ jmp(handler_reg, 0);
+    // clear pending exception
+    __ delayed()->st_ptr(G0, exception_addr);
+
+    return start;
+  }
+
+  // Safefetch stubs.
+  void generate_safefetch(const char* name, int size, address* entry,
+                          address* fault_pc, address* continuation_pc) {
+    // safefetch signatures:
+    //   int      SafeFetch32(int*      adr, int      errValue);
+    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
+    //
+    // arguments:
+    //   o0 = adr
+    //   o1 = errValue
+    //
+    // result:
+    //   o0 = *adr or errValue
+
+    StubCodeMark mark(this, "StubRoutines", name);
+
+    // Entry point, pc or function descriptor.
+    __ align(CodeEntryAlignment);
+    *entry = __ pc();
+
+    __ mov(O0, G1);  // g1 = o0
+    __ mov(O1, O0);  // o0 = o1
+    // Load *adr into O0; this load may fault.
+    *fault_pc = __ pc();
+    switch (size) {
+      case 4:
+        // int32_t
+        __ ldsw(G1, 0, O0);  // o0 = [g1]
+        break;
+      case 8:
+        // int64_t
+        __ ldx(G1, 0, O0);   // o0 = [g1]
+        break;
+      default:
+        ShouldNotReachHere();
+    }
+
+    // return errValue or *adr
+    *continuation_pc = __ pc();
+    // By convention with the trap handler we ensure there is a non-CTI
+    // instruction in the trap shadow.
+    __ nop();
+    __ retl();
+    __ delayed()->nop();
+  }
+
+  //------------------------------------------------------------------------------------------------------------------------
+  // Continuation point for throwing of implicit exceptions that are not handled in
+  // the current activation. Fabricates an exception oop and initiates normal
+  // exception dispatching in this frame. Only callee-saved registers are preserved
+  // (through the normal register window / RegisterMap handling).
1.467 + // If the compiler needs all registers to be preserved between the fault 1.468 + // point and the exception handler then it must assume responsibility for that in 1.469 + // AbstractCompiler::continuation_for_implicit_null_exception or 1.470 + // continuation_for_implicit_division_by_zero_exception. All other implicit 1.471 + // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are 1.472 + // either at call sites or otherwise assume that stack unwinding will be initiated, 1.473 + // so caller saved registers were assumed volatile in the compiler. 1.474 + 1.475 + // Note that we generate only this stub into a RuntimeStub, because it needs to be 1.476 + // properly traversed and ignored during GC, so we change the meaning of the "__" 1.477 + // macro within this method. 1.478 +#undef __ 1.479 +#define __ masm-> 1.480 + 1.481 + address generate_throw_exception(const char* name, address runtime_entry, 1.482 + Register arg1 = noreg, Register arg2 = noreg) { 1.483 +#ifdef ASSERT 1.484 + int insts_size = VerifyThread ? 1 * K : 600; 1.485 +#else 1.486 + int insts_size = VerifyThread ? 1 * K : 256; 1.487 +#endif /* ASSERT */ 1.488 + int locs_size = 32; 1.489 + 1.490 + CodeBuffer code(name, insts_size, locs_size); 1.491 + MacroAssembler* masm = new MacroAssembler(&code); 1.492 + 1.493 + __ verify_thread(); 1.494 + 1.495 + // This is an inlined and slightly modified version of call_VM 1.496 + // which has the ability to fetch the return PC out of thread-local storage 1.497 + __ assert_not_delayed(); 1.498 + 1.499 + // Note that we always push a frame because on the SPARC 1.500 + // architecture, for all of our implicit exception kinds at call 1.501 + // sites, the implicit exception is taken before the callee frame 1.502 + // is pushed. 1.503 + __ save_frame(0); 1.504 + 1.505 + int frame_complete = __ offset(); 1.506 + 1.507 + // Note that we always have a runtime stub frame on the top of stack by this point 1.508 + Register last_java_sp = SP; 1.509 + // 64-bit last_java_sp is biased! 1.510 + __ set_last_Java_frame(last_java_sp, G0); 1.511 + if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early 1.512 + __ save_thread(noreg); 1.513 + if (arg1 != noreg) { 1.514 + assert(arg2 != O1, "clobbered"); 1.515 + __ mov(arg1, O1); 1.516 + } 1.517 + if (arg2 != noreg) { 1.518 + __ mov(arg2, O2); 1.519 + } 1.520 + // do the call 1.521 + BLOCK_COMMENT("call runtime_entry"); 1.522 + __ call(runtime_entry, relocInfo::runtime_call_type); 1.523 + if (!VerifyThread) 1.524 + __ delayed()->mov(G2_thread, O0); // pass thread as first argument 1.525 + else 1.526 + __ delayed()->nop(); // (thread already passed) 1.527 + __ restore_thread(noreg); 1.528 + __ reset_last_Java_frame(); 1.529 + 1.530 + // check for pending exceptions. use Gtemp as scratch register. 
1.531 +#ifdef ASSERT 1.532 + Label L; 1.533 + 1.534 + Address exception_addr(G2_thread, Thread::pending_exception_offset()); 1.535 + Register scratch_reg = Gtemp; 1.536 + __ ld_ptr(exception_addr, scratch_reg); 1.537 + __ br_notnull_short(scratch_reg, Assembler::pt, L); 1.538 + __ should_not_reach_here(); 1.539 + __ bind(L); 1.540 +#endif // ASSERT 1.541 + BLOCK_COMMENT("call forward_exception_entry"); 1.542 + __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type); 1.543 + // we use O7 linkage so that forward_exception_entry has the issuing PC 1.544 + __ delayed()->restore(); 1.545 + 1.546 + RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false); 1.547 + return stub->entry_point(); 1.548 + } 1.549 + 1.550 +#undef __ 1.551 +#define __ _masm-> 1.552 + 1.553 + 1.554 + // Generate a routine that sets all the registers so we 1.555 + // can tell if the stop routine prints them correctly. 1.556 + address generate_test_stop() { 1.557 + StubCodeMark mark(this, "StubRoutines", "test_stop"); 1.558 + address start = __ pc(); 1.559 + 1.560 + int i; 1.561 + 1.562 + __ save_frame(0); 1.563 + 1.564 + static jfloat zero = 0.0, one = 1.0; 1.565 + 1.566 + // put addr in L0, then load through L0 to F0 1.567 + __ set((intptr_t)&zero, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F0); 1.568 + __ set((intptr_t)&one, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1 1.569 + 1.570 + // use add to put 2..18 in F2..F18 1.571 + for ( i = 2; i <= 18; ++i ) { 1.572 + __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1), as_FloatRegister(i)); 1.573 + } 1.574 + 1.575 + // Now put double 2 in F16, double 18 in F18 1.576 + __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 ); 1.577 + __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 ); 1.578 + 1.579 + // use add to put 20..32 in F20..F32 1.580 + for (i = 20; i < 32; i += 2) { 1.581 + __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2), as_FloatRegister(i)); 1.582 + } 1.583 + 1.584 + // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's 1.585 + for ( i = 0; i < 8; ++i ) { 1.586 + if (i < 6) { 1.587 + __ set( i, as_iRegister(i)); 1.588 + __ set(16 + i, as_oRegister(i)); 1.589 + __ set(24 + i, as_gRegister(i)); 1.590 + } 1.591 + __ set( 8 + i, as_lRegister(i)); 1.592 + } 1.593 + 1.594 + __ stop("testing stop"); 1.595 + 1.596 + 1.597 + __ ret(); 1.598 + __ delayed()->restore(); 1.599 + 1.600 + return start; 1.601 + } 1.602 + 1.603 + 1.604 + address generate_stop_subroutine() { 1.605 + StubCodeMark mark(this, "StubRoutines", "stop_subroutine"); 1.606 + address start = __ pc(); 1.607 + 1.608 + __ stop_subroutine(); 1.609 + 1.610 + return start; 1.611 + } 1.612 + 1.613 + address generate_flush_callers_register_windows() { 1.614 + StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows"); 1.615 + address start = __ pc(); 1.616 + 1.617 + __ flushw(); 1.618 + __ retl(false); 1.619 + __ delayed()->add( FP, STACK_BIAS, O0 ); 1.620 + // The returned value must be a stack pointer whose register save area 1.621 + // is flushed, and will stay flushed while the caller executes. 1.622 + 1.623 + return start; 1.624 + } 1.625 + 1.626 + // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest). 
+  //
+  // Arguments:
+  //
+  //      exchange_value: O0
+  //      dest:           O1
+  //
+  // Results:
+  //
+  //      O0: the value previously stored in dest
+  //
+  address generate_atomic_xchg() {
+    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
+    address start = __ pc();
+
+    if (UseCASForSwap) {
+      // Use CAS instead of swap, just in case the MP hardware
+      // prefers to work with just one kind of synch. instruction.
+      Label retry;
+      __ BIND(retry);
+      __ mov(O0, O3);       // scratch copy of exchange value
+      __ ld(O1, 0, O2);     // observe the previous value
+      // try to replace O2 with O3
+      __ cas(O1, O2, O3);
+      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
+
+      __ retl(false);
+      __ delayed()->mov(O2, O0);  // report previous value to caller
+    } else {
+      __ retl(false);
+      __ delayed()->swap(O1, 0, O0);
+    }
+
+    return start;
+  }
+
+
+  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
+  //
+  // Arguments:
+  //
+  //      exchange_value: O0
+  //      dest:           O1
+  //      compare_value:  O2
+  //
+  // Results:
+  //
+  //      O0: the value previously stored in dest
+  //
+  address generate_atomic_cmpxchg() {
+    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
+    address start = __ pc();
+
+    // cmpxchg(dest, compare_value, exchange_value)
+    __ cas(O1, O2, O0);
+    __ retl(false);
+    __ delayed()->nop();
+
+    return start;
+  }
+
+  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
+  //
+  // Arguments:
+  //
+  //      exchange_value: O1:O0
+  //      dest:           O2
+  //      compare_value:  O4:O3
+  //
+  // Results:
+  //
+  //      O1:O0: the value previously stored in dest
+  //
+  // Overwrites: G1,G2,G3
+  //
+  address generate_atomic_cmpxchg_long() {
+    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
+    address start = __ pc();
+
+    __ sllx(O0, 32, O0);
+    __ srl(O1, 0, O1);
+    __ or3(O0, O1, O0);   // O0 holds the 64-bit value from exchange_value
+    __ sllx(O3, 32, O3);
+    __ srl(O4, 0, O4);
+    __ or3(O3, O4, O3);   // O3 holds the 64-bit value from compare_value
+    __ casx(O2, O3, O0);
+    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
+    __ retl(false);
+    __ delayed()->srlx(O0, 32, O0);
+
+    return start;
+  }
+
+
+  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
+  //
+  // Arguments:
+  //
+  //      add_value: O0   (e.g., +1 or -1)
+  //      dest:      O1
+  //
+  // Results:
+  //
+  //      O0: the new value stored in dest
+  //
+  // Overwrites: O3
+  //
+  address generate_atomic_add() {
+    StubCodeMark mark(this, "StubRoutines", "atomic_add");
+    address start = __ pc();
+    __ BIND(_atomic_add_stub);
+
+    Label retry;
+    __ BIND(retry);
+
+    __ lduw(O1, 0, O2);
+    __ add(O0, O2, O3);
+    __ cas(O1, O2, O3);
+    __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
+    __ retl(false);
+    __ delayed()->add(O0, O2, O0);  // note that cas made O2 == O3
+
+    return start;
+  }
+  Label _atomic_add_stub;  // called from other stubs
+
+
+  //------------------------------------------------------------------------------------------------------------------------
+  // The following routine generates a subroutine to throw an asynchronous
+  // UnknownError when an unsafe access gets a fault that could not be
+  // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
+  //
+  // Arguments:
+  //
+  //      trapping PC: O7
+  //
+  // Results:
+  //      posts an asynchronous exception, skips the trapping instruction
+  //
+
+  address generate_handler_for_unsafe_access() {
+    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
+    address start = __ pc();
+
+    const int preserve_register_words = (64 * 2);
+    Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);
+
+    Register Lthread = L7_thread_cache;
+    int i;
+
+    __ save_frame(0);
+    __ mov(G1, L1);
+    __ mov(G2, L2);
+    __ mov(G3, L3);
+    __ mov(G4, L4);
+    __ mov(G5, L5);
+    for (i = 0; i < 64; i += 2) {
+      __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
+    }
+
+    address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
+    BLOCK_COMMENT("call handle_unsafe_access");
+    __ call(entry_point, relocInfo::runtime_call_type);
+    __ delayed()->nop();
+
+    __ mov(L1, G1);
+    __ mov(L2, G2);
+    __ mov(L3, G3);
+    __ mov(L4, G4);
+    __ mov(L5, G5);
+    for (i = 0; i < 64; i += 2) {
+      __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
+    }
+
+    __ verify_thread();
+
+    __ jmp(O0, 0);
+    __ delayed()->restore();
+
+    return start;
+  }
+
+
+  // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
+  // Arguments:
+  //
+  //      ret    : O0, returned
+  //      icc/xcc: set as O0 (depending on wordSize)
+  //      sub    : O1, argument, not changed
+  //      super  : O2, argument, not changed
+  //      raddr  : O7, blown by call
+  address generate_partial_subtype_check() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
+    address start = __ pc();
+    Label miss;
+
+#if defined(COMPILER2) && !defined(_LP64)
+    // Do not use a 'save' because it blows the 64-bit O registers.
1.825 + __ add(SP,-4*wordSize,SP); // Make space for 4 temps (stack must be 2 words aligned) 1.826 + __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize); 1.827 + __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize); 1.828 + __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize); 1.829 + __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize); 1.830 + Register Rret = O0; 1.831 + Register Rsub = O1; 1.832 + Register Rsuper = O2; 1.833 +#else 1.834 + __ save_frame(0); 1.835 + Register Rret = I0; 1.836 + Register Rsub = I1; 1.837 + Register Rsuper = I2; 1.838 +#endif 1.839 + 1.840 + Register L0_ary_len = L0; 1.841 + Register L1_ary_ptr = L1; 1.842 + Register L2_super = L2; 1.843 + Register L3_index = L3; 1.844 + 1.845 + __ check_klass_subtype_slow_path(Rsub, Rsuper, 1.846 + L0, L1, L2, L3, 1.847 + NULL, &miss); 1.848 + 1.849 + // Match falls through here. 1.850 + __ addcc(G0,0,Rret); // set Z flags, Z result 1.851 + 1.852 +#if defined(COMPILER2) && !defined(_LP64) 1.853 + __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0); 1.854 + __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1); 1.855 + __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2); 1.856 + __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3); 1.857 + __ retl(); // Result in Rret is zero; flags set to Z 1.858 + __ delayed()->add(SP,4*wordSize,SP); 1.859 +#else 1.860 + __ ret(); // Result in Rret is zero; flags set to Z 1.861 + __ delayed()->restore(); 1.862 +#endif 1.863 + 1.864 + __ BIND(miss); 1.865 + __ addcc(G0,1,Rret); // set NZ flags, NZ result 1.866 + 1.867 +#if defined(COMPILER2) && !defined(_LP64) 1.868 + __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0); 1.869 + __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1); 1.870 + __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2); 1.871 + __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3); 1.872 + __ retl(); // Result in Rret is != 0; flags set to NZ 1.873 + __ delayed()->add(SP,4*wordSize,SP); 1.874 +#else 1.875 + __ ret(); // Result in Rret is != 0; flags set to NZ 1.876 + __ delayed()->restore(); 1.877 +#endif 1.878 + 1.879 + return start; 1.880 + } 1.881 + 1.882 + 1.883 + // Called from MacroAssembler::verify_oop 1.884 + // 1.885 + address generate_verify_oop_subroutine() { 1.886 + StubCodeMark mark(this, "StubRoutines", "verify_oop_stub"); 1.887 + 1.888 + address start = __ pc(); 1.889 + 1.890 + __ verify_oop_subroutine(); 1.891 + 1.892 + return start; 1.893 + } 1.894 + 1.895 + 1.896 + // 1.897 + // Verify that a register contains clean 32-bits positive value 1.898 + // (high 32-bits are 0) so it could be used in 64-bits shifts (sllx, srax). 
+  //
+  //  Input:
+  //    Rint  -  32-bits value
+  //    Rtmp  -  scratch
+  //
+  void assert_clean_int(Register Rint, Register Rtmp) {
+#if defined(ASSERT) && defined(_LP64)
+    __ signx(Rint, Rtmp);
+    __ cmp(Rint, Rtmp);
+    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
+#endif
+  }
+
+  //
+  //  Generate overlap test for array copy stubs
+  //
+  //  Input:
+  //    O0    -  array1
+  //    O1    -  array2
+  //    O2    -  element count
+  //
+  //  Kills temps:  O3, O4
+  //
+  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
+    assert(no_overlap_target != NULL, "must be generated");
+    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
+  }
+  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
+    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
+  }
+  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
+    const Register from       = O0;
+    const Register to         = O1;
+    const Register count      = O2;
+    const Register to_from    = O3; // to - from
+    const Register byte_count = O4; // count << log2_elem_size
+
+    __ subcc(to, from, to_from);
+    __ sll_ptr(count, log2_elem_size, byte_count);
+    if (NOLp == NULL)
+      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
+    else
+      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
+    __ delayed()->cmp(to_from, byte_count);
+    if (NOLp == NULL)
+      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
+    else
+      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
+    __ delayed()->nop();
+  }
+
+  //
+  //  Generate pre-write barrier for array.
+  //
+  //  Input:
+  //     addr  - register containing starting address
+  //     count - register containing element count
+  //     tmp   - scratch register
+  //
+  //  The input registers are overwritten.
+  //
+  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
+    BarrierSet* bs = Universe::heap()->barrier_set();
+    switch (bs->kind()) {
+      case BarrierSet::G1SATBCT:
+      case BarrierSet::G1SATBCTLogging:
+        // With G1, don't generate the call if we statically know that the target is uninitialized
+        if (!dest_uninitialized) {
+          __ save_frame(0);
+          // Save the necessary global regs... will be used after.
+          if (addr->is_global()) {
+            __ mov(addr, L0);
+          }
+          if (count->is_global()) {
+            __ mov(count, L1);
+          }
+          __ mov(addr->after_save(), O0);
+          // Get the count into O1
+          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
+          __ delayed()->mov(count->after_save(), O1);
+          if (addr->is_global()) {
+            __ mov(L0, addr);
+          }
+          if (count->is_global()) {
+            __ mov(L1, count);
+          }
+          __ restore();
+        }
+        break;
+      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableExtension:
+      case BarrierSet::ModRef:
+        break;
+      default:
+        ShouldNotReachHere();
+    }
+  }
+  //
+  //  Generate post-write barrier for array.
1.998 + // 1.999 + // Input: 1.1000 + // addr - register containing starting address 1.1001 + // count - register containing element count 1.1002 + // tmp - scratch register 1.1003 + // 1.1004 + // The input registers are overwritten. 1.1005 + // 1.1006 + void gen_write_ref_array_post_barrier(Register addr, Register count, 1.1007 + Register tmp) { 1.1008 + BarrierSet* bs = Universe::heap()->barrier_set(); 1.1009 + 1.1010 + switch (bs->kind()) { 1.1011 + case BarrierSet::G1SATBCT: 1.1012 + case BarrierSet::G1SATBCTLogging: 1.1013 + { 1.1014 + // Get some new fresh output registers. 1.1015 + __ save_frame(0); 1.1016 + __ mov(addr->after_save(), O0); 1.1017 + __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post)); 1.1018 + __ delayed()->mov(count->after_save(), O1); 1.1019 + __ restore(); 1.1020 + } 1.1021 + break; 1.1022 + case BarrierSet::CardTableModRef: 1.1023 + case BarrierSet::CardTableExtension: 1.1024 + { 1.1025 + CardTableModRefBS* ct = (CardTableModRefBS*)bs; 1.1026 + assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); 1.1027 + assert_different_registers(addr, count, tmp); 1.1028 + 1.1029 + Label L_loop; 1.1030 + 1.1031 + __ sll_ptr(count, LogBytesPerHeapOop, count); 1.1032 + __ sub(count, BytesPerHeapOop, count); 1.1033 + __ add(count, addr, count); 1.1034 + // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.) 1.1035 + __ srl_ptr(addr, CardTableModRefBS::card_shift, addr); 1.1036 + __ srl_ptr(count, CardTableModRefBS::card_shift, count); 1.1037 + __ sub(count, addr, count); 1.1038 + AddressLiteral rs(ct->byte_map_base); 1.1039 + __ set(rs, tmp); 1.1040 + __ BIND(L_loop); 1.1041 + __ stb(G0, tmp, addr); 1.1042 + __ subcc(count, 1, count); 1.1043 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1.1044 + __ delayed()->add(addr, 1, addr); 1.1045 + } 1.1046 + break; 1.1047 + case BarrierSet::ModRef: 1.1048 + break; 1.1049 + default: 1.1050 + ShouldNotReachHere(); 1.1051 + } 1.1052 + } 1.1053 + 1.1054 + // 1.1055 + // Generate main code for disjoint arraycopy 1.1056 + // 1.1057 + typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec, 1.1058 + Label& L_loop, bool use_prefetch, bool use_bis); 1.1059 + 1.1060 + void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size, 1.1061 + int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) { 1.1062 + Label L_copy; 1.1063 + 1.1064 + assert(log2_elem_size <= 3, "the following code should be changed"); 1.1065 + int count_dec = 16>>log2_elem_size; 1.1066 + 1.1067 + int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance); 1.1068 + assert(prefetch_dist < 4096, "invalid value"); 1.1069 + prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size 1.1070 + int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count 1.1071 + 1.1072 + if (UseBlockCopy) { 1.1073 + Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy; 1.1074 + 1.1075 + // 64 bytes tail + bytes copied in one loop iteration 1.1076 + int tail_size = 64 + iter_size; 1.1077 + int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size; 1.1078 + // Use BIS copy only for big arrays since it requires membar. 
+      __ set(block_copy_count, O4);
+      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
+      // This code is for disjoint source and destination:
+      //   to <= from || to >= from+count
+      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
+      __ sub(from, to, O4);
+      __ srax(O4, 4, O4);  // divide by 16 since the following short branch has only 5 bits for the immediate
+      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
+
+      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
+      // BIS should not be used to copy the tail (64 bytes + iter_size)
+      // to avoid zeroing of following values.
+      __ sub(count, (tail_size>>log2_elem_size), count); // count is still >= 0
+
+      if (prefetch_count > 0) { // rounded up to one iteration count
+        // Do prefetching only if copy size is bigger
+        // than prefetch distance.
+        __ set(prefetch_count, O4);
+        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
+        __ sub(count, prefetch_count, count);
+
+        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
+        __ add(count, prefetch_count, count); // restore count
+
+      } // prefetch_count > 0
+
+      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
+      __ add(count, (tail_size>>log2_elem_size), count); // restore count
+
+      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
+      // BIS needs membar.
+      __ membar(Assembler::StoreLoad);
+      // Copy tail
+      __ ba_short(L_copy);
+
+      __ BIND(L_skip_block_copy);
+    } // UseBlockCopy
+
+    if (prefetch_count > 0) { // rounded up to one iteration count
+      // Do prefetching only if copy size is bigger
+      // than prefetch distance.
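+      // For example (a sketch, assuming long-sized elements, i.e.
+      // log2_elem_size == 3 and iter_size == 16): a requested prefetch
+      // distance of 100 bytes rounds up to (100 + 15) & -16 == 112 bytes,
+      // so prefetch_count == 112 >> 3 == 14 elements.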
1.1120 + __ set(prefetch_count, O4); 1.1121 + __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy); 1.1122 + __ sub(count, prefetch_count, count); 1.1123 + 1.1124 + Label L_copy_prefetch; 1.1125 + (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false); 1.1126 + __ add(count, prefetch_count, count); // restore count 1.1127 + 1.1128 + } // prefetch_count > 0 1.1129 + 1.1130 + (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false); 1.1131 + } 1.1132 + 1.1133 + 1.1134 + 1.1135 + // 1.1136 + // Helper methods for copy_16_bytes_forward_with_shift() 1.1137 + // 1.1138 + void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec, 1.1139 + Label& L_loop, bool use_prefetch, bool use_bis) { 1.1140 + 1.1141 + const Register left_shift = G1; // left shift bit counter 1.1142 + const Register right_shift = G5; // right shift bit counter 1.1143 + 1.1144 + __ align(OptoLoopAlignment); 1.1145 + __ BIND(L_loop); 1.1146 + if (use_prefetch) { 1.1147 + if (ArraycopySrcPrefetchDistance > 0) { 1.1148 + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 1.1149 + } 1.1150 + if (ArraycopyDstPrefetchDistance > 0) { 1.1151 + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 1.1152 + } 1.1153 + } 1.1154 + __ ldx(from, 0, O4); 1.1155 + __ ldx(from, 8, G4); 1.1156 + __ inc(to, 16); 1.1157 + __ inc(from, 16); 1.1158 + __ deccc(count, count_dec); // Can we do next iteration after this one? 1.1159 + __ srlx(O4, right_shift, G3); 1.1160 + __ bset(G3, O3); 1.1161 + __ sllx(O4, left_shift, O4); 1.1162 + __ srlx(G4, right_shift, G3); 1.1163 + __ bset(G3, O4); 1.1164 + if (use_bis) { 1.1165 + __ stxa(O3, to, -16); 1.1166 + __ stxa(O4, to, -8); 1.1167 + } else { 1.1168 + __ stx(O3, to, -16); 1.1169 + __ stx(O4, to, -8); 1.1170 + } 1.1171 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1.1172 + __ delayed()->sllx(G4, left_shift, O3); 1.1173 + } 1.1174 + 1.1175 + // Copy big chunks forward with shift 1.1176 + // 1.1177 + // Inputs: 1.1178 + // from - source arrays 1.1179 + // to - destination array aligned to 8-bytes 1.1180 + // count - elements count to copy >= the count equivalent to 16 bytes 1.1181 + // count_dec - elements count's decrement equivalent to 16 bytes 1.1182 + // L_copy_bytes - copy exit label 1.1183 + // 1.1184 + void copy_16_bytes_forward_with_shift(Register from, Register to, 1.1185 + Register count, int log2_elem_size, Label& L_copy_bytes) { 1.1186 + Label L_aligned_copy, L_copy_last_bytes; 1.1187 + assert(log2_elem_size <= 3, "the following code should be changed"); 1.1188 + int count_dec = 16>>log2_elem_size; 1.1189 + 1.1190 + // if both arrays have the same alignment mod 8, do 8 bytes aligned copy 1.1191 + __ andcc(from, 7, G1); // misaligned bytes 1.1192 + __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1.1193 + __ delayed()->nop(); 1.1194 + 1.1195 + const Register left_shift = G1; // left shift bit counter 1.1196 + const Register right_shift = G5; // right shift bit counter 1.1197 + 1.1198 + __ sll(G1, LogBitsPerByte, left_shift); 1.1199 + __ mov(64, right_shift); 1.1200 + __ sub(right_shift, left_shift, right_shift); 1.1201 + 1.1202 + // 1.1203 + // Load 2 aligned 8-bytes chunks and use one from previous iteration 1.1204 + // to form 2 aligned 8-bytes chunks to store. 
1.1205 + // 1.1206 + __ dec(count, count_dec); // Pre-decrement 'count' 1.1207 + __ andn(from, 7, from); // Align address 1.1208 + __ ldx(from, 0, O3); 1.1209 + __ inc(from, 8); 1.1210 + __ sllx(O3, left_shift, O3); 1.1211 + 1.1212 + disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop); 1.1213 + 1.1214 + __ inccc(count, count_dec>>1 ); // + 8 bytes 1.1215 + __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); 1.1216 + __ delayed()->inc(count, count_dec>>1); // restore 'count' 1.1217 + 1.1218 + // copy 8 bytes, part of them already loaded in O3 1.1219 + __ ldx(from, 0, O4); 1.1220 + __ inc(to, 8); 1.1221 + __ inc(from, 8); 1.1222 + __ srlx(O4, right_shift, G3); 1.1223 + __ bset(O3, G3); 1.1224 + __ stx(G3, to, -8); 1.1225 + 1.1226 + __ BIND(L_copy_last_bytes); 1.1227 + __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes 1.1228 + __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); 1.1229 + __ delayed()->sub(from, right_shift, from); // restore address 1.1230 + 1.1231 + __ BIND(L_aligned_copy); 1.1232 + } 1.1233 + 1.1234 + // Copy big chunks backward with shift 1.1235 + // 1.1236 + // Inputs: 1.1237 + // end_from - source arrays end address 1.1238 + // end_to - destination array end address aligned to 8-bytes 1.1239 + // count - elements count to copy >= the count equivalent to 16 bytes 1.1240 + // count_dec - elements count's decrement equivalent to 16 bytes 1.1241 + // L_aligned_copy - aligned copy exit label 1.1242 + // L_copy_bytes - copy exit label 1.1243 + // 1.1244 + void copy_16_bytes_backward_with_shift(Register end_from, Register end_to, 1.1245 + Register count, int count_dec, 1.1246 + Label& L_aligned_copy, Label& L_copy_bytes) { 1.1247 + Label L_loop, L_copy_last_bytes; 1.1248 + 1.1249 + // if both arrays have the same alignment mod 8, do 8 bytes aligned copy 1.1250 + __ andcc(end_from, 7, G1); // misaligned bytes 1.1251 + __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1.1252 + __ delayed()->deccc(count, count_dec); // Pre-decrement 'count' 1.1253 + 1.1254 + const Register left_shift = G1; // left shift bit counter 1.1255 + const Register right_shift = G5; // right shift bit counter 1.1256 + 1.1257 + __ sll(G1, LogBitsPerByte, left_shift); 1.1258 + __ mov(64, right_shift); 1.1259 + __ sub(right_shift, left_shift, right_shift); 1.1260 + 1.1261 + // 1.1262 + // Load 2 aligned 8-bytes chunks and use one from previous iteration 1.1263 + // to form 2 aligned 8-bytes chunks to store. 1.1264 + // 1.1265 + __ andn(end_from, 7, end_from); // Align address 1.1266 + __ ldx(end_from, 0, O3); 1.1267 + __ align(OptoLoopAlignment); 1.1268 + __ BIND(L_loop); 1.1269 + __ ldx(end_from, -8, O4); 1.1270 + __ deccc(count, count_dec); // Can we do next iteration after this one? 
1.1271 + __ ldx(end_from, -16, G4); 1.1272 + __ dec(end_to, 16); 1.1273 + __ dec(end_from, 16); 1.1274 + __ srlx(O3, right_shift, O3); 1.1275 + __ sllx(O4, left_shift, G3); 1.1276 + __ bset(G3, O3); 1.1277 + __ stx(O3, end_to, 8); 1.1278 + __ srlx(O4, right_shift, O4); 1.1279 + __ sllx(G4, left_shift, G3); 1.1280 + __ bset(G3, O4); 1.1281 + __ stx(O4, end_to, 0); 1.1282 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1.1283 + __ delayed()->mov(G4, O3); 1.1284 + 1.1285 + __ inccc(count, count_dec>>1 ); // + 8 bytes 1.1286 + __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); 1.1287 + __ delayed()->inc(count, count_dec>>1); // restore 'count' 1.1288 + 1.1289 + // copy 8 bytes, part of them already loaded in O3 1.1290 + __ ldx(end_from, -8, O4); 1.1291 + __ dec(end_to, 8); 1.1292 + __ dec(end_from, 8); 1.1293 + __ srlx(O3, right_shift, O3); 1.1294 + __ sllx(O4, left_shift, G3); 1.1295 + __ bset(O3, G3); 1.1296 + __ stx(G3, end_to, 0); 1.1297 + 1.1298 + __ BIND(L_copy_last_bytes); 1.1299 + __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes 1.1300 + __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); 1.1301 + __ delayed()->add(end_from, left_shift, end_from); // restore address 1.1302 + } 1.1303 + 1.1304 + // 1.1305 + // Generate stub for disjoint byte copy. If "aligned" is true, the 1.1306 + // "from" and "to" addresses are assumed to be heapword aligned. 1.1307 + // 1.1308 + // Arguments for generated stub: 1.1309 + // from: O0 1.1310 + // to: O1 1.1311 + // count: O2 treated as signed 1.1312 + // 1.1313 + address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) { 1.1314 + __ align(CodeEntryAlignment); 1.1315 + StubCodeMark mark(this, "StubRoutines", name); 1.1316 + address start = __ pc(); 1.1317 + 1.1318 + Label L_skip_alignment, L_align; 1.1319 + Label L_copy_byte, L_copy_byte_loop, L_exit; 1.1320 + 1.1321 + const Register from = O0; // source array address 1.1322 + const Register to = O1; // destination array address 1.1323 + const Register count = O2; // elements count 1.1324 + const Register offset = O5; // offset from start of arrays 1.1325 + // O3, O4, G3, G4 are used as temp registers 1.1326 + 1.1327 + assert_clean_int(count, O3); // Make sure 'count' is clean int. 1.1328 + 1.1329 + if (entry != NULL) { 1.1330 + *entry = __ pc(); 1.1331 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1.1332 + BLOCK_COMMENT("Entry:"); 1.1333 + } 1.1334 + 1.1335 + // for short arrays, just do single element copy 1.1336 + __ cmp(count, 23); // 16 + 7 1.1337 + __ brx(Assembler::less, false, Assembler::pn, L_copy_byte); 1.1338 + __ delayed()->mov(G0, offset); 1.1339 + 1.1340 + if (aligned) { 1.1341 + // 'aligned' == true when it is known statically during compilation 1.1342 + // of this arraycopy call site that both 'from' and 'to' addresses 1.1343 + // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()). 1.1344 + // 1.1345 + // Aligned arrays have 4 bytes alignment in 32-bits VM 1.1346 + // and 8 bytes - in 64-bits VM. 
So we do it only for 32-bits VM.
+      //
+#ifndef _LP64
+      // copy a 4-bytes word if necessary to align 'to' to 8 bytes
+      __ andcc(to, 7, G0);
+      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
+      __ delayed()->ld(from, 0, O3);
+      __ inc(from, 4);
+      __ inc(to, 4);
+      __ dec(count, 4);
+      __ st(O3, to, -4);
+      __ BIND(L_skip_alignment);
+#endif
+    } else {
+      // copy bytes to align 'to' on 8 byte boundary
+      __ andcc(to, 7, G1); // misaligned bytes
+      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
+      __ delayed()->neg(G1);
+      __ inc(G1, 8);       // bytes needed to reach the next 8-byte alignment
+      __ sub(count, G1, count);
+      __ BIND(L_align);
+      __ ldub(from, 0, O3);
+      __ deccc(G1);
+      __ inc(from);
+      __ stb(O3, to, 0);
+      __ br(Assembler::notZero, false, Assembler::pt, L_align);
+      __ delayed()->inc(to);
+      __ BIND(L_skip_alignment);
+    }
+#ifdef _LP64
+    if (!aligned)
+#endif
+    {
+      // Copy with shift 16 bytes per iteration if arrays do not have
+      // the same alignment mod 8, otherwise fall through to the next
+      // code for aligned copy.
+      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
+      // Also jump over aligned copy after the copy with shift completed.
+
+      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
+    }
+
+    // Both arrays are 8 bytes aligned, copy 16 bytes at a time
+    __ and3(count, 7, G4); // Save count
+    __ srl(count, 3, count);
+    generate_disjoint_long_copy_core(aligned);
+    __ mov(G4, count);     // Restore count
+
+    // copy trailing bytes
+    __ BIND(L_copy_byte);
+    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_copy_byte_loop);
+    __ ldub(from, offset, O3);
+    __ deccc(count);
+    __ stb(O3, to, offset);
+    __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
+    __ delayed()->inc(offset);
+
+    __ BIND(L_exit);
+    // O3, O4 are used as temp registers
+    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
+    __ retl();
+    __ delayed()->mov(G0, O0); // return 0
+    return start;
+  }
+
+  //
+  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
+  //  "from" and "to" addresses are assumed to be heapword aligned.
+  //
+  // Arguments for generated stub:
+  //      from:  O0
+  //      to:    O1
+  //      count: O2 treated as signed
+  //
+  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
+                                      address *entry, const char *name) {
+    // Do reverse copy.
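+    // In C terms, what follows is roughly (a sketch of the intent, not the
+    // emitted code): if the regions are disjoint, tail-call the forward stub;
+    // otherwise copy from the highest address down so the overlap is safe:
+    //
+    //   if (disjoint(from, to, count)) goto nooverlap_target;
+    //   for (i = count - 1; i >= 0; i--) to[i] = from[i];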
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    Label L_skip_alignment, L_align, L_aligned_copy;
+    Label L_copy_byte, L_copy_byte_loop, L_exit;
+
+    const Register from     = O0;   // source array address
+    const Register to       = O1;   // destination array address
+    const Register count    = O2;   // elements count
+    const Register end_from = from; // source array end address
+    const Register end_to   = to;   // destination array end address
+
+    assert_clean_int(count, O3);  // Make sure 'count' is clean int.
+
+    if (entry != NULL) {
+      *entry = __ pc();
+      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
+      BLOCK_COMMENT("Entry:");
+    }
+
+    array_overlap_test(nooverlap_target, 0);
+
+    __ add(to, count, end_to);  // offset after last copied element
+
+    // for short arrays, just do single element copy
+    __ cmp(count, 23); // 16 + 7
+    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
+    __ delayed()->add(from, count, end_from);
+
+    {
+      // Align the end of the arrays since they could be not aligned even
+      // when the arrays themselves are aligned.
+
+      // copy bytes to align 'end_to' on 8 byte boundary
+      __ andcc(end_to, 7, G1); // misaligned bytes
+      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
+      __ delayed()->nop();
+      __ sub(count, G1, count);
+      __ BIND(L_align);
+      __ dec(end_from);
+      __ dec(end_to);
+      __ ldub(end_from, 0, O3);
+      __ deccc(G1);
+      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
+      __ delayed()->stb(O3, end_to, 0);
+      __ BIND(L_skip_alignment);
+    }
+#ifdef _LP64
+    if (aligned) {
+      // Both arrays are aligned to 8-bytes in 64-bits VM.
+      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
+      // in unaligned case.
+      __ dec(count, 16);
+    } else
+#endif
+    {
+      // Copy with shift 16 bytes per iteration if arrays do not have
+      // the same alignment mod 8, otherwise jump to the next
+      // code for aligned copy (subtracting 16 from 'count' before the jump).
+      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
+      // Also jump over aligned copy after the copy with shift completed.
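+      // The "copy with shift" trick, sketched in C: each 8-byte store is
+      // assembled from two aligned 8-byte loads, where (illustratively)
+      //   left_shift  = 8 * (address & 7);   // bits, kept in G1
+      //   right_shift = 64 - left_shift;     // bits, kept in G5
+      //   dst64 = (prev64 << left_shift) | (next64 >> right_shift);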
+
+      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
+                                        L_aligned_copy, L_copy_byte);
+    }
+    // copy 16 bytes at a time
+    __ align(OptoLoopAlignment);
+    __ BIND(L_aligned_copy);
+    __ dec(end_from, 16);
+    __ ldx(end_from, 8, O3);
+    __ ldx(end_from, 0, O4);
+    __ dec(end_to, 16);
+    __ deccc(count, 16);
+    __ stx(O3, end_to, 8);
+    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
+    __ delayed()->stx(O4, end_to, 0);
+    __ inc(count, 16);
+
+    // copy 1 element (1 byte) at a time
+    __ BIND(L_copy_byte);
+    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_copy_byte_loop);
+    __ dec(end_from);
+    __ dec(end_to);
+    __ ldub(end_from, 0, O4);
+    __ deccc(count);
+    __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
+    __ delayed()->stb(O4, end_to, 0);
+
+    __ BIND(L_exit);
+    // O3, O4 are used as temp registers
+    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
+    __ retl();
+    __ delayed()->mov(G0, O0); // return 0
+    return start;
+  }
+
+  //
+  //  Generate stub for disjoint short copy.  If "aligned" is true, the
+  //  "from" and "to" addresses are assumed to be heapword aligned.
+  //
+  // Arguments for generated stub:
+  //      from:  O0
+  //      to:    O1
+  //      count: O2 treated as signed
+  //
+  address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    Label L_skip_alignment, L_skip_alignment2;
+    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
+
+    const Register from   = O0; // source array address
+    const Register to     = O1; // destination array address
+    const Register count  = O2; // elements count
+    const Register offset = O5; // offset from start of arrays
+    // O3, O4, G3, G4 are used as temp registers
+
+    assert_clean_int(count, O3);  // Make sure 'count' is clean int.
+
+    if (entry != NULL) {
+      *entry = __ pc();
+      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
+      BLOCK_COMMENT("Entry:");
+    }
+
+    // for short arrays, just do single element copy
+    __ cmp(count, 11); // 8 + 3  (22 bytes)
+    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
+    __ delayed()->mov(G0, offset);
+
+    if (aligned) {
+      // 'aligned' == true when it is known statically during compilation
+      // of this arraycopy call site that both 'from' and 'to' addresses
+      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
+      //
+      // Aligned arrays have 4 bytes alignment in 32-bits VM
+      // and 8 bytes - in 64-bits VM.
1.1568 + // 1.1569 +#ifndef _LP64 1.1570 + // copy one 2-element word if necessary to align 'to' to 8 bytes 1.1571 + __ andcc(to, 7, G0); 1.1572 + __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1.1573 + __ delayed()->ld(from, 0, O3); 1.1574 + __ inc(from, 4); 1.1575 + __ inc(to, 4); 1.1576 + __ dec(count, 2); 1.1577 + __ st(O3, to, -4); 1.1578 + __ BIND(L_skip_alignment); 1.1579 +#endif 1.1580 + } else { 1.1581 + // copy 1 element if necessary to align 'to' on a 4-byte boundary 1.1582 + __ andcc(to, 3, G0); 1.1583 + __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1.1584 + __ delayed()->lduh(from, 0, O3); 1.1585 + __ inc(from, 2); 1.1586 + __ inc(to, 2); 1.1587 + __ dec(count); 1.1588 + __ sth(O3, to, -2); 1.1589 + __ BIND(L_skip_alignment); 1.1590 + 1.1591 + // copy 2 elements to align 'to' on an 8-byte boundary 1.1592 + __ andcc(to, 7, G0); 1.1593 + __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2); 1.1594 + __ delayed()->lduh(from, 0, O3); 1.1595 + __ dec(count, 2); 1.1596 + __ lduh(from, 2, O4); 1.1597 + __ inc(from, 4); 1.1598 + __ inc(to, 4); 1.1599 + __ sth(O3, to, -4); 1.1600 + __ sth(O4, to, -2); 1.1601 + __ BIND(L_skip_alignment2); 1.1602 + } 1.1603 +#ifdef _LP64 1.1604 + if (!aligned) 1.1605 +#endif 1.1606 + { 1.1607 + // Copy with shift 16 bytes per iteration if arrays do not have 1.1608 + // the same alignment mod 8, otherwise fall through to the next 1.1609 + // code for aligned copy. 1.1610 + // The compare above (count >= 11) guarantees 'count' >= 16 bytes. 1.1611 + // Also jump over aligned copy after the copy with shift completed. 1.1612 + 1.1613 + copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes); 1.1614 + } 1.1615 + 1.1616 + // Both arrays are 8-byte aligned; copy 16 bytes at a time 1.1617 + __ and3(count, 3, G4); // Save 1.1618 + __ srl(count, 2, count); 1.1619 + generate_disjoint_long_copy_core(aligned); 1.1620 + __ mov(G4, count); // restore 1.1621 + 1.1622 + // copy 1 element at a time 1.1623 + __ BIND(L_copy_2_bytes); 1.1624 + __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1.1625 + __ align(OptoLoopAlignment); 1.1626 + __ BIND(L_copy_2_bytes_loop); 1.1627 + __ lduh(from, offset, O3); 1.1628 + __ deccc(count); 1.1629 + __ sth(O3, to, offset); 1.1630 + __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop); 1.1631 + __ delayed()->inc(offset, 2); 1.1632 + 1.1633 + __ BIND(L_exit); 1.1634 + // O3, O4 are used as temp registers 1.1635 + inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 1.1636 + __ retl(); 1.1637 + __ delayed()->mov(G0, O0); // return 0 1.1638 + return start; 1.1639 + } 1.1640 + 1.1641 + // 1.1642 + // Generate stub for array fill (byte, short, or int). If "aligned" is true, the 1.1643 + // "to" address is assumed to be heapword aligned. 1.1644 + // 1.1645 + // Arguments for generated stub: 1.1646 + // to: O0 1.1647 + // value: O1 1.1648 + // count: O2 treated as signed 1.1649 + // 1.1650 + address generate_fill(BasicType t, bool aligned, const char* name) { 1.1651 + __ align(CodeEntryAlignment); 1.1652 + StubCodeMark mark(this, "StubRoutines", name); 1.1653 + address start = __ pc(); 1.1654 + 1.1655 + const Register to = O0; // destination array address 1.1656 + const Register value = O1; // fill value 1.1657 + const Register count = O2; // elements count 1.1658 + // O3 is used as a temp register 1.1659 + 1.1660 + assert_clean_int(count, O3); // Make sure 'count' is clean int.
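The and3/sllx/or3 sequences at the top of the stub body widen the fill value so that a single 64-bit store pattern can serve all element types. In portable C++ terms the effect is roughly the following (a sketch; byte case shown, the short and int cases are analogous with fewer steps):

    #include <cstdint>
    // Replicate a fill byte across all 8 byte lanes of a 64-bit word,
    // mirroring the and3/sllx/or3 steps emitted below.
    static inline uint64_t broadcast_byte(uint8_t v) {
      uint64_t x = v;   // zero-extended, as and3(value, 0xff, value)
      x |= x << 8;      // 2 identical bytes
      x |= x << 16;     // 4 identical bytes
      x |= x << 32;     // 8 identical bytes
      return x;
    }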
1.1661 + 1.1662 + Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte; 1.1663 + Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes; 1.1664 + 1.1665 + int shift = -1; 1.1666 + switch (t) { 1.1667 + case T_BYTE: 1.1668 + shift = 2; 1.1669 + break; 1.1670 + case T_SHORT: 1.1671 + shift = 1; 1.1672 + break; 1.1673 + case T_INT: 1.1674 + shift = 0; 1.1675 + break; 1.1676 + default: ShouldNotReachHere(); 1.1677 + } 1.1678 + 1.1679 + BLOCK_COMMENT("Entry:"); 1.1680 + 1.1681 + if (t == T_BYTE) { 1.1682 + // Zero extend value 1.1683 + __ and3(value, 0xff, value); 1.1684 + __ sllx(value, 8, O3); 1.1685 + __ or3(value, O3, value); 1.1686 + } 1.1687 + if (t == T_SHORT) { 1.1688 + // Zero extend value 1.1689 + __ sllx(value, 48, value); 1.1690 + __ srlx(value, 48, value); 1.1691 + } 1.1692 + if (t == T_BYTE || t == T_SHORT) { 1.1693 + __ sllx(value, 16, O3); 1.1694 + __ or3(value, O3, value); 1.1695 + } 1.1696 + 1.1697 + __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element 1.1698 + __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp 1.1699 + __ delayed()->andcc(count, 1, G0); 1.1700 + 1.1701 + if (!aligned && (t == T_BYTE || t == T_SHORT)) { 1.1702 + // align the destination address on a 4-byte boundary 1.1703 + if (t == T_BYTE) { 1.1704 + // One-byte misalignment happens only for byte arrays 1.1705 + __ andcc(to, 1, G0); 1.1706 + __ br(Assembler::zero, false, Assembler::pt, L_skip_align1); 1.1707 + __ delayed()->nop(); 1.1708 + __ stb(value, to, 0); 1.1709 + __ inc(to, 1); 1.1710 + __ dec(count, 1); 1.1711 + __ BIND(L_skip_align1); 1.1712 + } 1.1713 + // Two-byte misalignment happens only for byte and short (char) arrays 1.1714 + __ andcc(to, 2, G0); 1.1715 + __ br(Assembler::zero, false, Assembler::pt, L_skip_align2); 1.1716 + __ delayed()->nop(); 1.1717 + __ sth(value, to, 0); 1.1718 + __ inc(to, 2); 1.1719 + __ dec(count, 1 << (shift - 1)); 1.1720 + __ BIND(L_skip_align2); 1.1721 + } 1.1722 +#ifdef _LP64 1.1723 + if (!aligned) { 1.1724 +#endif 1.1725 + // align to 8 bytes, we know we are 4 byte aligned to start 1.1726 + __ andcc(to, 7, G0); 1.1727 + __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes); 1.1728 + __ delayed()->nop(); 1.1729 + __ stw(value, to, 0); 1.1730 + __ inc(to, 4); 1.1731 + __ dec(count, 1 << shift); 1.1732 + __ BIND(L_fill_32_bytes); 1.1733 +#ifdef _LP64 1.1734 + } 1.1735 +#endif 1.1736 + 1.1737 + if (t == T_INT) { 1.1738 + // Zero extend value 1.1739 + __ srl(value, 0, value); 1.1740 + } 1.1741 + if (t == T_BYTE || t == T_SHORT || t == T_INT) { 1.1742 + __ sllx(value, 32, O3); 1.1743 + __ or3(value, O3, value); 1.1744 + } 1.1745 + 1.1746 + Label L_check_fill_8_bytes; 1.1747 + // Fill 32-byte chunks 1.1748 + __ subcc(count, 8 << shift, count); 1.1749 + __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes); 1.1750 + __ delayed()->nop(); 1.1751 + 1.1752 + Label L_fill_32_bytes_loop, L_fill_4_bytes; 1.1753 + __ align(16); 1.1754 + __ BIND(L_fill_32_bytes_loop); 1.1755 + 1.1756 + __ stx(value, to, 0); 1.1757 + __ stx(value, to, 8); 1.1758 + __ stx(value, to, 16); 1.1759 + __ stx(value, to, 24); 1.1760 + 1.1761 + __ subcc(count, 8 << shift, count); 1.1762 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop); 1.1763 + __ delayed()->add(to, 32, to); 1.1764 + 1.1765 + __ BIND(L_check_fill_8_bytes); 1.1766 + __ addcc(count, 8 << shift, count); 1.1767 + __ brx(Assembler::zero, false, Assembler::pn, L_exit); 1.1768 + __ delayed()->subcc(count, 1 << (shift + 1), count); 1.1769 +
__ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes); 1.1770 + __ delayed()->andcc(count, 1<<shift, G0); 1.1771 + 1.1772 + // 1.1773 + // length is too short, just fill 8 bytes at a time 1.1774 + // 1.1775 + Label L_fill_8_bytes_loop; 1.1776 + __ BIND(L_fill_8_bytes_loop); 1.1777 + __ stx(value, to, 0); 1.1778 + __ subcc(count, 1 << (shift + 1), count); 1.1779 + __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop); 1.1780 + __ delayed()->add(to, 8, to); 1.1781 + 1.1782 + // fill trailing 4 bytes 1.1783 + __ andcc(count, 1<<shift, G0); // in delay slot of branches 1.1784 + if (t == T_INT) { 1.1785 + __ BIND(L_fill_elements); 1.1786 + } 1.1787 + __ BIND(L_fill_4_bytes); 1.1788 + __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes); 1.1789 + if (t == T_BYTE || t == T_SHORT) { 1.1790 + __ delayed()->andcc(count, 1<<(shift-1), G0); 1.1791 + } else { 1.1792 + __ delayed()->nop(); 1.1793 + } 1.1794 + __ stw(value, to, 0); 1.1795 + if (t == T_BYTE || t == T_SHORT) { 1.1796 + __ inc(to, 4); 1.1797 + // fill trailing 2 bytes 1.1798 + __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches 1.1799 + __ BIND(L_fill_2_bytes); 1.1800 + __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte); 1.1801 + __ delayed()->andcc(count, 1, count); 1.1802 + __ sth(value, to, 0); 1.1803 + if (t == T_BYTE) { 1.1804 + __ inc(to, 2); 1.1805 + // fill trailing byte 1.1806 + __ andcc(count, 1, count); // in delay slot of branches 1.1807 + __ BIND(L_fill_byte); 1.1808 + __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1.1809 + __ delayed()->nop(); 1.1810 + __ stb(value, to, 0); 1.1811 + } else { 1.1812 + __ BIND(L_fill_byte); 1.1813 + } 1.1814 + } else { 1.1815 + __ BIND(L_fill_2_bytes); 1.1816 + } 1.1817 + __ BIND(L_exit); 1.1818 + __ retl(); 1.1819 + __ delayed()->nop(); 1.1820 + 1.1821 + // Handle copies less than 8 bytes. Int is handled elsewhere. 1.1822 + if (t == T_BYTE) { 1.1823 + __ BIND(L_fill_elements); 1.1824 + Label L_fill_2, L_fill_4; 1.1825 + // in delay slot __ andcc(count, 1, G0); 1.1826 + __ brx(Assembler::zero, false, Assembler::pt, L_fill_2); 1.1827 + __ delayed()->andcc(count, 2, G0); 1.1828 + __ stb(value, to, 0); 1.1829 + __ inc(to, 1); 1.1830 + __ BIND(L_fill_2); 1.1831 + __ brx(Assembler::zero, false, Assembler::pt, L_fill_4); 1.1832 + __ delayed()->andcc(count, 4, G0); 1.1833 + __ stb(value, to, 0); 1.1834 + __ stb(value, to, 1); 1.1835 + __ inc(to, 2); 1.1836 + __ BIND(L_fill_4); 1.1837 + __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1.1838 + __ delayed()->nop(); 1.1839 + __ stb(value, to, 0); 1.1840 + __ stb(value, to, 1); 1.1841 + __ stb(value, to, 2); 1.1842 + __ retl(); 1.1843 + __ delayed()->stb(value, to, 3); 1.1844 + } 1.1845 + 1.1846 + if (t == T_SHORT) { 1.1847 + Label L_fill_2; 1.1848 + __ BIND(L_fill_elements); 1.1849 + // in delay slot __ andcc(count, 1, G0); 1.1850 + __ brx(Assembler::zero, false, Assembler::pt, L_fill_2); 1.1851 + __ delayed()->andcc(count, 2, G0); 1.1852 + __ sth(value, to, 0); 1.1853 + __ inc(to, 2); 1.1854 + __ BIND(L_fill_2); 1.1855 + __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1.1856 + __ delayed()->nop(); 1.1857 + __ sth(value, to, 0); 1.1858 + __ retl(); 1.1859 + __ delayed()->sth(value, to, 2); 1.1860 + } 1.1861 + return start; 1.1862 + } 1.1863 + 1.1864 + // 1.1865 + // Generate stub for conjoint short copy. If "aligned" is true, the 1.1866 + // "from" and "to" addresses are assumed to be heapword aligned. 
1.1867 + // 1.1868 + // Arguments for generated stub: 1.1869 + // from: O0 1.1870 + // to: O1 1.1871 + // count: O2 treated as signed 1.1872 + // 1.1873 + address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1.1874 + address *entry, const char *name) { 1.1875 + // Do reverse copy. 1.1876 + 1.1877 + __ align(CodeEntryAlignment); 1.1878 + StubCodeMark mark(this, "StubRoutines", name); 1.1879 + address start = __ pc(); 1.1880 + 1.1881 + Label L_skip_alignment, L_skip_alignment2, L_aligned_copy; 1.1882 + Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit; 1.1883 + 1.1884 + const Register from = O0; // source array address 1.1885 + const Register to = O1; // destination array address 1.1886 + const Register count = O2; // elements count 1.1887 + const Register end_from = from; // source array end address 1.1888 + const Register end_to = to; // destination array end address 1.1889 + 1.1890 + const Register byte_count = O3; // bytes count to copy 1.1891 + 1.1892 + assert_clean_int(count, O3); // Make sure 'count' is clean int. 1.1893 + 1.1894 + if (entry != NULL) { 1.1895 + *entry = __ pc(); 1.1896 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1.1897 + BLOCK_COMMENT("Entry:"); 1.1898 + } 1.1899 + 1.1900 + array_overlap_test(nooverlap_target, 1); 1.1901 + 1.1902 + __ sllx(count, LogBytesPerShort, byte_count); 1.1903 + __ add(to, byte_count, end_to); // offset after last copied element 1.1904 + 1.1905 + // for short arrays, just do single element copy 1.1906 + __ cmp(count, 11); // 8 + 3 (22 bytes) 1.1907 + __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes); 1.1908 + __ delayed()->add(from, byte_count, end_from); 1.1909 + 1.1910 + { 1.1911 + // Align the array ends, since they may be unaligned even 1.1912 + // when the arrays themselves are aligned. 1.1913 + 1.1914 + // copy 1 element if necessary to align 'end_to' on a 4-byte boundary 1.1915 + __ andcc(end_to, 3, G0); 1.1916 + __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1.1917 + __ delayed()->lduh(end_from, -2, O3); 1.1918 + __ dec(end_from, 2); 1.1919 + __ dec(end_to, 2); 1.1920 + __ dec(count); 1.1921 + __ sth(O3, end_to, 0); 1.1922 + __ BIND(L_skip_alignment); 1.1923 + 1.1924 + // copy 2 elements to align 'end_to' on an 8-byte boundary 1.1925 + __ andcc(end_to, 7, G0); 1.1926 + __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2); 1.1927 + __ delayed()->lduh(end_from, -2, O3); 1.1928 + __ dec(count, 2); 1.1929 + __ lduh(end_from, -4, O4); 1.1930 + __ dec(end_from, 4); 1.1931 + __ dec(end_to, 4); 1.1932 + __ sth(O3, end_to, 2); 1.1933 + __ sth(O4, end_to, 0); 1.1934 + __ BIND(L_skip_alignment2); 1.1935 + } 1.1936 +#ifdef _LP64 1.1937 + if (aligned) { 1.1938 + // Both arrays are aligned to 8 bytes in the 64-bit VM. 1.1939 + // The 'count' is decremented in copy_16_bytes_backward_with_shift() 1.1940 + // in the unaligned case. 1.1941 + __ dec(count, 8); 1.1942 + } else 1.1943 +#endif 1.1944 + { 1.1945 + // Copy with shift 16 bytes per iteration if arrays do not have 1.1946 + // the same alignment mod 8, otherwise jump to the next 1.1947 + // code for aligned copy (and subtracting 8 from 'count' before the jump). 1.1948 + // The compare above (count >= 11) guarantees 'count' >= 16 bytes. 1.1949 + // Also jump over aligned copy after the copy with shift completed.
1.1950 + 1.1951 + copy_16_bytes_backward_with_shift(end_from, end_to, count, 8, 1.1952 + L_aligned_copy, L_copy_2_bytes); 1.1953 + } 1.1954 + // copy 8 elements (16 bytes) at a time 1.1955 + __ align(OptoLoopAlignment); 1.1956 + __ BIND(L_aligned_copy); 1.1957 + __ dec(end_from, 16); 1.1958 + __ ldx(end_from, 8, O3); 1.1959 + __ ldx(end_from, 0, O4); 1.1960 + __ dec(end_to, 16); 1.1961 + __ deccc(count, 8); 1.1962 + __ stx(O3, end_to, 8); 1.1963 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 1.1964 + __ delayed()->stx(O4, end_to, 0); 1.1965 + __ inc(count, 8); 1.1966 + 1.1967 + // copy 1 element (2 bytes) at a time 1.1968 + __ BIND(L_copy_2_bytes); 1.1969 + __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1.1970 + __ BIND(L_copy_2_bytes_loop); 1.1971 + __ dec(end_from, 2); 1.1972 + __ dec(end_to, 2); 1.1973 + __ lduh(end_from, 0, O4); 1.1974 + __ deccc(count); 1.1975 + __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop); 1.1976 + __ delayed()->sth(O4, end_to, 0); 1.1977 + 1.1978 + __ BIND(L_exit); 1.1979 + // O3, O4 are used as temp registers 1.1980 + inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 1.1981 + __ retl(); 1.1982 + __ delayed()->mov(G0, O0); // return 0 1.1983 + return start; 1.1984 + } 1.1985 + 1.1986 + // 1.1987 + // Helper methods for generate_disjoint_int_copy_core() 1.1988 + // 1.1989 + void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec, 1.1990 + Label& L_loop, bool use_prefetch, bool use_bis) { 1.1991 + 1.1992 + __ align(OptoLoopAlignment); 1.1993 + __ BIND(L_loop); 1.1994 + if (use_prefetch) { 1.1995 + if (ArraycopySrcPrefetchDistance > 0) { 1.1996 + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 1.1997 + } 1.1998 + if (ArraycopyDstPrefetchDistance > 0) { 1.1999 + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 1.2000 + } 1.2001 + } 1.2002 + __ ldx(from, 4, O4); 1.2003 + __ ldx(from, 12, G4); 1.2004 + __ inc(to, 16); 1.2005 + __ inc(from, 16); 1.2006 + __ deccc(count, 4); // Can we do next iteration after this one? 1.2007 + 1.2008 + __ srlx(O4, 32, G3); 1.2009 + __ bset(G3, O3); 1.2010 + __ sllx(O4, 32, O4); 1.2011 + __ srlx(G4, 32, G3); 1.2012 + __ bset(G3, O4); 1.2013 + if (use_bis) { 1.2014 + __ stxa(O3, to, -16); 1.2015 + __ stxa(O4, to, -8); 1.2016 + } else { 1.2017 + __ stx(O3, to, -16); 1.2018 + __ stx(O4, to, -8); 1.2019 + } 1.2020 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1.2021 + __ delayed()->sllx(G4, 32, O3); 1.2022 + 1.2023 + } 1.2024 + 1.2025 + // 1.2026 + // Generate core code for disjoint int copy (and oop copy on 32-bit). 1.2027 + // If "aligned" is true, the "from" and "to" addresses are assumed 1.2028 + // to be heapword aligned.
1.2029 + // 1.2030 + // Arguments: 1.2031 + // from: O0 1.2032 + // to: O1 1.2033 + // count: O2 treated as signed 1.2034 + // 1.2035 + void generate_disjoint_int_copy_core(bool aligned) { 1.2036 + 1.2037 + Label L_skip_alignment, L_aligned_copy; 1.2038 + Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 1.2039 + 1.2040 + const Register from = O0; // source array address 1.2041 + const Register to = O1; // destination array address 1.2042 + const Register count = O2; // elements count 1.2043 + const Register offset = O5; // offset from start of arrays 1.2044 + // O3, O4, G3, G4 are used as temp registers 1.2045 + 1.2046 + // 'aligned' == true when it is known statically during compilation 1.2047 + // of this arraycopy call site that both 'from' and 'to' addresses 1.2048 + // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()). 1.2049 + // 1.2050 + // Aligned arrays have 4-byte alignment in the 32-bit VM 1.2051 + // and 8-byte alignment in the 64-bit VM. 1.2052 + // 1.2053 +#ifdef _LP64 1.2054 + if (!aligned) 1.2055 +#endif 1.2056 + { 1.2057 + // The next check could be put under 'ifndef' since the code in 1.2058 + // generate_disjoint_long_copy_core() has its own checks and sets 'offset'. 1.2059 + 1.2060 + // for short arrays, just do single element copy 1.2061 + __ cmp(count, 5); // 4 + 1 (20 bytes) 1.2062 + __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 1.2063 + __ delayed()->mov(G0, offset); 1.2064 + 1.2065 + // copy 1 element to align 'to' on an 8 byte boundary 1.2066 + __ andcc(to, 7, G0); 1.2067 + __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1.2068 + __ delayed()->ld(from, 0, O3); 1.2069 + __ inc(from, 4); 1.2070 + __ inc(to, 4); 1.2071 + __ dec(count); 1.2072 + __ st(O3, to, -4); 1.2073 + __ BIND(L_skip_alignment); 1.2074 + 1.2075 + // if arrays have same alignment mod 8, do 4 elements copy 1.2076 + __ andcc(from, 7, G0); 1.2077 + __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1.2078 + __ delayed()->ld(from, 0, O3); 1.2079 + 1.2080 + // 1.2081 + // Load 2 aligned 8-bytes chunks and use one from previous iteration 1.2082 + // to form 2 aligned 8-bytes chunks to store. 1.2083 + // 1.2084 + // copy_16_bytes_forward_with_shift() is not used here since this 1.2085 + // code is more efficient. 1.2086 + 1.2087 + // copy with shift 4 elements (16 bytes) at a time 1.2088 + __ dec(count, 4); // The cmp at the beginning guarantees count >= 4 1.2089 + __ sllx(O3, 32, O3); 1.2090 + 1.2091 + disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop); 1.2092 + 1.2093 + __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 1.2094 + __ delayed()->inc(count, 4); // restore 'count' 1.2095 + 1.2096 + __ BIND(L_aligned_copy); 1.2097 + } // !aligned 1.2098 + 1.2099 + // copy 4 elements (16 bytes) at a time 1.2100 + __ and3(count, 1, G4); // Save 1.2101 + __ srl(count, 1, count); 1.2102 + generate_disjoint_long_copy_core(aligned); 1.2103 + __ mov(G4, count); // Restore 1.2104 + 1.2105 + // copy 1 element at a time 1.2106 + __ BIND(L_copy_4_bytes); 1.2107 + __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1.2108 + __ BIND(L_copy_4_bytes_loop); 1.2109 + __ ld(from, offset, O3); 1.2110 + __ deccc(count); 1.2111 + __ st(O3, to, offset); 1.2112 + __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop); 1.2113 + __ delayed()->inc(offset, 4); 1.2114 + __ BIND(L_exit); 1.2115 + } 1.2116 + 1.2117 + // 1.2118 + // Generate stub for disjoint int copy.
If "aligned" is true, the 1.2119 + // "from" and "to" addresses are assumed to be heapword aligned. 1.2120 + // 1.2121 + // Arguments for generated stub: 1.2122 + // from: O0 1.2123 + // to: O1 1.2124 + // count: O2 treated as signed 1.2125 + // 1.2126 + address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) { 1.2127 + __ align(CodeEntryAlignment); 1.2128 + StubCodeMark mark(this, "StubRoutines", name); 1.2129 + address start = __ pc(); 1.2130 + 1.2131 + const Register count = O2; 1.2132 + assert_clean_int(count, O3); // Make sure 'count' is clean int. 1.2133 + 1.2134 + if (entry != NULL) { 1.2135 + *entry = __ pc(); 1.2136 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1.2137 + BLOCK_COMMENT("Entry:"); 1.2138 + } 1.2139 + 1.2140 + generate_disjoint_int_copy_core(aligned); 1.2141 + 1.2142 + // O3, O4 are used as temp registers 1.2143 + inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 1.2144 + __ retl(); 1.2145 + __ delayed()->mov(G0, O0); // return 0 1.2146 + return start; 1.2147 + } 1.2148 + 1.2149 + // 1.2150 + // Generate core code for conjoint int copy (and oop copy on 32-bit). 1.2151 + // If "aligned" is true, the "from" and "to" addresses are assumed 1.2152 + // to be heapword aligned. 1.2153 + // 1.2154 + // Arguments: 1.2155 + // from: O0 1.2156 + // to: O1 1.2157 + // count: O2 treated as signed 1.2158 + // 1.2159 + void generate_conjoint_int_copy_core(bool aligned) { 1.2160 + // Do reverse copy. 1.2161 + 1.2162 + Label L_skip_alignment, L_aligned_copy; 1.2163 + Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 1.2164 + 1.2165 + const Register from = O0; // source array address 1.2166 + const Register to = O1; // destination array address 1.2167 + const Register count = O2; // elements count 1.2168 + const Register end_from = from; // source array end address 1.2169 + const Register end_to = to; // destination array end address 1.2170 + // O3, O4, O5, G3 are used as temp registers 1.2171 + 1.2172 + const Register byte_count = O3; // bytes count to copy 1.2173 + 1.2174 + __ sllx(count, LogBytesPerInt, byte_count); 1.2175 + __ add(to, byte_count, end_to); // offset after last copied element 1.2176 + 1.2177 + __ cmp(count, 5); // for short arrays, just do single element copy 1.2178 + __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 1.2179 + __ delayed()->add(from, byte_count, end_from); 1.2180 + 1.2181 + // copy 1 element to align 'to' on an 8 byte boundary 1.2182 + __ andcc(end_to, 7, G0); 1.2183 + __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1.2184 + __ delayed()->nop(); 1.2185 + __ dec(count); 1.2186 + __ dec(end_from, 4); 1.2187 + __ dec(end_to, 4); 1.2188 + __ ld(end_from, 0, O4); 1.2189 + __ st(O4, end_to, 0); 1.2190 + __ BIND(L_skip_alignment); 1.2191 + 1.2192 + // Check if 'end_from' and 'end_to' have the same alignment. 1.2193 + __ andcc(end_from, 7, G0); 1.2194 + __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1.2195 + __ delayed()->dec(count, 4); // The cmp at the start guarantees cnt >= 4 1.2196 + 1.2197 + // copy with shift 4 elements (16 bytes) at a time 1.2198 + // 1.2199 + // Load 2 aligned 8-bytes chunks and use one from previous iteration 1.2200 + // to form 2 aligned 8-bytes chunks to store.
1.2201 + // 1.2202 + __ ldx(end_from, -4, O3); 1.2203 + __ align(OptoLoopAlignment); 1.2204 + __ BIND(L_copy_16_bytes); 1.2205 + __ ldx(end_from, -12, O4); 1.2206 + __ deccc(count, 4); 1.2207 + __ ldx(end_from, -20, O5); 1.2208 + __ dec(end_to, 16); 1.2209 + __ dec(end_from, 16); 1.2210 + __ srlx(O3, 32, O3); 1.2211 + __ sllx(O4, 32, G3); 1.2212 + __ bset(G3, O3); 1.2213 + __ stx(O3, end_to, 8); 1.2214 + __ srlx(O4, 32, O4); 1.2215 + __ sllx(O5, 32, G3); 1.2216 + __ bset(O4, G3); 1.2217 + __ stx(G3, end_to, 0); 1.2218 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 1.2219 + __ delayed()->mov(O5, O3); 1.2220 + 1.2221 + __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 1.2222 + __ delayed()->inc(count, 4); 1.2223 + 1.2224 + // copy 4 elements (16 bytes) at a time 1.2225 + __ align(OptoLoopAlignment); 1.2226 + __ BIND(L_aligned_copy); 1.2227 + __ dec(end_from, 16); 1.2228 + __ ldx(end_from, 8, O3); 1.2229 + __ ldx(end_from, 0, O4); 1.2230 + __ dec(end_to, 16); 1.2231 + __ deccc(count, 4); 1.2232 + __ stx(O3, end_to, 8); 1.2233 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 1.2234 + __ delayed()->stx(O4, end_to, 0); 1.2235 + __ inc(count, 4); 1.2236 + 1.2237 + // copy 1 element (4 bytes) at a time 1.2238 + __ BIND(L_copy_4_bytes); 1.2239 + __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1.2240 + __ BIND(L_copy_4_bytes_loop); 1.2241 + __ dec(end_from, 4); 1.2242 + __ dec(end_to, 4); 1.2243 + __ ld(end_from, 0, O4); 1.2244 + __ deccc(count); 1.2245 + __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop); 1.2246 + __ delayed()->st(O4, end_to, 0); 1.2247 + __ BIND(L_exit); 1.2248 + } 1.2249 + 1.2250 + // 1.2251 + // Generate stub for conjoint int copy. If "aligned" is true, the 1.2252 + // "from" and "to" addresses are assumed to be heapword aligned. 1.2253 + // 1.2254 + // Arguments for generated stub: 1.2255 + // from: O0 1.2256 + // to: O1 1.2257 + // count: O2 treated as signed 1.2258 + // 1.2259 + address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1.2260 + address *entry, const char *name) { 1.2261 + __ align(CodeEntryAlignment); 1.2262 + StubCodeMark mark(this, "StubRoutines", name); 1.2263 + address start = __ pc(); 1.2264 + 1.2265 + assert_clean_int(O2, O3); // Make sure 'count' is clean int. 
1.2266 + 1.2267 + if (entry != NULL) { 1.2268 + *entry = __ pc(); 1.2269 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1.2270 + BLOCK_COMMENT("Entry:"); 1.2271 + } 1.2272 + 1.2273 + array_overlap_test(nooverlap_target, 2); 1.2274 + 1.2275 + generate_conjoint_int_copy_core(aligned); 1.2276 + 1.2277 + // O3, O4 are used as temp registers 1.2278 + inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 1.2279 + __ retl(); 1.2280 + __ delayed()->mov(G0, O0); // return 0 1.2281 + return start; 1.2282 + } 1.2283 + 1.2284 + // 1.2285 + // Helper methods for generate_disjoint_long_copy_core() 1.2286 + // 1.2287 + void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec, 1.2288 + Label& L_loop, bool use_prefetch, bool use_bis) { 1.2289 + __ align(OptoLoopAlignment); 1.2290 + __ BIND(L_loop); 1.2291 + for (int off = 0; off < 64; off += 16) { 1.2292 + if (use_prefetch && (off & 31) == 0) { 1.2293 + if (ArraycopySrcPrefetchDistance > 0) { 1.2294 + __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads); 1.2295 + } 1.2296 + if (ArraycopyDstPrefetchDistance > 0) { 1.2297 + __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads); 1.2298 + } 1.2299 + } 1.2300 + __ ldx(from, off+0, O4); 1.2301 + __ ldx(from, off+8, O5); 1.2302 + if (use_bis) { 1.2303 + __ stxa(O4, to, off+0); 1.2304 + __ stxa(O5, to, off+8); 1.2305 + } else { 1.2306 + __ stx(O4, to, off+0); 1.2307 + __ stx(O5, to, off+8); 1.2308 + } 1.2309 + } 1.2310 + __ deccc(count, 8); 1.2311 + __ inc(from, 64); 1.2312 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1.2313 + __ delayed()->inc(to, 64); 1.2314 + } 1.2315 + 1.2316 + // 1.2317 + // Generate core code for disjoint long copy (and oop copy on 64-bit). 1.2318 + // "aligned" is ignored, because we must make the stronger 1.2319 + // assumption that both addresses are always 64-bit aligned. 
1.2320 + // 1.2321 + // Arguments: 1.2322 + // from: O0 1.2323 + // to: O1 1.2324 + // count: O2 treated as signed 1.2325 + // 1.2326 + // count -= 2; 1.2327 + // if ( count >= 0 ) { // >= 2 elements 1.2328 + // if ( count >= 6) { // >= 8 elements 1.2329 + // count -= 6; // original count - 8 1.2330 + // do { 1.2331 + // copy_8_elements; 1.2332 + // count -= 8; 1.2333 + // } while ( count >= 0 ); 1.2334 + // count += 6; 1.2335 + // } 1.2336 + // if ( count >= 0 ) { // >= 2 elements 1.2337 + // do { 1.2338 + // copy_2_elements; 1.2339 + // } while ( (count=count-2) >= 0 ); 1.2340 + // } 1.2341 + // } 1.2342 + // count += 2; 1.2343 + // if ( count != 0 ) { // 1 element left 1.2344 + // copy_1_element; 1.2345 + // } 1.2346 + // 1.2347 + void generate_disjoint_long_copy_core(bool aligned) { 1.2348 + Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 1.2349 + const Register from = O0; // source array address 1.2350 + const Register to = O1; // destination array address 1.2351 + const Register count = O2; // elements count 1.2352 + const Register offset0 = O4; // element offset 1.2353 + const Register offset8 = O5; // next element offset 1.2354 + 1.2355 + __ deccc(count, 2); 1.2356 + __ mov(G0, offset0); // offset from start of arrays (0) 1.2357 + __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 1.2358 + __ delayed()->add(offset0, 8, offset8); 1.2359 + 1.2360 + // Copy in 64-byte chunks 1.2361 + 1.2362 + const Register from64 = O3; // source address 1.2363 + const Register to64 = G3; // destination address 1.2364 + __ subcc(count, 6, O3); 1.2365 + __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 1.2366 + __ delayed()->mov(to, to64); 1.2367 + // Now we can use O4(offset0), O5(offset8) as temps 1.2368 + __ mov(O3, count); 1.2369 + // count >= 0 (original count - 8) 1.2370 + __ mov(from, from64); 1.2371 + 1.2372 + disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop); 1.2373 + 1.2374 + // Restore O4(offset0), O5(offset8) 1.2375 + __ sub(from64, from, offset0); 1.2376 + __ inccc(count, 6); // restore count 1.2377 + __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 1.2378 + __ delayed()->add(offset0, 8, offset8); 1.2379 + 1.2380 + // Copy in 16-byte chunks 1.2381 + __ align(OptoLoopAlignment); 1.2382 + __ BIND(L_copy_16_bytes); 1.2383 + __ ldx(from, offset0, O3); 1.2384 + __ ldx(from, offset8, G3); 1.2385 + __ deccc(count, 2); 1.2386 + __ stx(O3, to, offset0); 1.2387 + __ inc(offset0, 16); 1.2388 + __ stx(G3, to, offset8); 1.2389 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 1.2390 + __ delayed()->inc(offset8, 16); 1.2391 + 1.2392 + // Copy last 8 bytes 1.2393 + __ BIND(L_copy_8_bytes); 1.2394 + __ inccc(count, 2); 1.2395 + __ brx(Assembler::zero, true, Assembler::pn, L_exit ); 1.2396 + __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs 1.2397 + __ ldx(from, offset0, O3); 1.2398 + __ stx(O3, to, offset0); 1.2399 + __ BIND(L_exit); 1.2400 + } 1.2401 + 1.2402 + // 1.2403 + // Generate stub for disjoint long copy. 1.2404 + // "aligned" is ignored, because we must make the stronger 1.2405 + // assumption that both addresses are always 64-bit aligned.
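For cross-checking, the pseudo-code above corresponds to roughly this C++ control flow, treating from/to as 8-byte element pointers and count as a signed element count. It is a sketch of the loop structure only, not of the stub's register-level behavior:

    #include <cstdint>
    static void disjoint_long_copy_sketch(const uint64_t* from, uint64_t* to,
                                          intptr_t count) {
      count -= 2;
      if (count >= 0) {                    // >= 2 elements
        if (count >= 6) {                  // >= 8 elements
          count -= 6;                      // original count - 8
          do {                             // 64-byte (8-element) chunks
            for (int i = 0; i < 8; i++) to[i] = from[i];
            from += 8; to += 8; count -= 8;
          } while (count >= 0);
          count += 6;
        }
        while (count >= 0) {               // 16-byte (2-element) chunks
          to[0] = from[0]; to[1] = from[1];
          from += 2; to += 2; count -= 2;
        }
      }
      count += 2;
      if (count != 0) *to = *from;         // one trailing element
    }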
1.2406 + // 1.2407 + // Arguments for generated stub: 1.2408 + // from: O0 1.2409 + // to: O1 1.2410 + // count: O2 treated as signed 1.2411 + // 1.2412 + address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) { 1.2413 + __ align(CodeEntryAlignment); 1.2414 + StubCodeMark mark(this, "StubRoutines", name); 1.2415 + address start = __ pc(); 1.2416 + 1.2417 + assert_clean_int(O2, O3); // Make sure 'count' is clean int. 1.2418 + 1.2419 + if (entry != NULL) { 1.2420 + *entry = __ pc(); 1.2421 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1.2422 + BLOCK_COMMENT("Entry:"); 1.2423 + } 1.2424 + 1.2425 + generate_disjoint_long_copy_core(aligned); 1.2426 + 1.2427 + // O3, O4 are used as temp registers 1.2428 + inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 1.2429 + __ retl(); 1.2430 + __ delayed()->mov(G0, O0); // return 0 1.2431 + return start; 1.2432 + } 1.2433 + 1.2434 + // 1.2435 + // Generate core code for conjoint long copy (and oop copy on 64-bit). 1.2436 + // "aligned" is ignored, because we must make the stronger 1.2437 + // assumption that both addresses are always 64-bit aligned. 1.2438 + // 1.2439 + // Arguments: 1.2440 + // from: O0 1.2441 + // to: O1 1.2442 + // count: O2 treated as signed 1.2443 + // 1.2444 + void generate_conjoint_long_copy_core(bool aligned) { 1.2445 + // Do reverse copy. 1.2446 + Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 1.2447 + const Register from = O0; // source array address 1.2448 + const Register to = O1; // destination array address 1.2449 + const Register count = O2; // elements count 1.2450 + const Register offset8 = O4; // element offset 1.2451 + const Register offset0 = O5; // previous element offset 1.2452 + 1.2453 + __ subcc(count, 1, count); 1.2454 + __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); 1.2455 + __ delayed()->sllx(count, LogBytesPerLong, offset8); 1.2456 + __ sub(offset8, 8, offset0); 1.2457 + __ align(OptoLoopAlignment); 1.2458 + __ BIND(L_copy_16_bytes); 1.2459 + __ ldx(from, offset8, O2); 1.2460 + __ ldx(from, offset0, O3); 1.2461 + __ stx(O2, to, offset8); 1.2462 + __ deccc(offset8, 16); // use offset8 as counter 1.2463 + __ stx(O3, to, offset0); 1.2464 + __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes); 1.2465 + __ delayed()->dec(offset0, 16); 1.2466 + 1.2467 + __ BIND(L_copy_8_bytes); 1.2468 + __ brx(Assembler::negative, false, Assembler::pn, L_exit ); 1.2469 + __ delayed()->nop(); 1.2470 + __ ldx(from, 0, O3); 1.2471 + __ stx(O3, to, 0); 1.2472 + __ BIND(L_exit); 1.2473 + } 1.2474 + 1.2475 + // Generate stub for conjoint long copy. 1.2476 + // "aligned" is ignored, because we must make the stronger 1.2477 + // assumption that both addresses are always 64-bit aligned. 1.2478 + // 1.2479 + // Arguments for generated stub: 1.2480 + // from: O0 1.2481 + // to: O1 1.2482 + // count: O2 treated as signed 1.2483 + // 1.2484 + address generate_conjoint_long_copy(bool aligned, address nooverlap_target, 1.2485 + address *entry, const char *name) { 1.2486 + __ align(CodeEntryAlignment); 1.2487 + StubCodeMark mark(this, "StubRoutines", name); 1.2488 + address start = __ pc(); 1.2489 + 1.2490 + assert(aligned, "Should always be aligned"); 1.2491 + 1.2492 + assert_clean_int(O2, O3); // Make sure 'count' is clean int. 
1.2493 + 1.2494 + if (entry != NULL) { 1.2495 + *entry = __ pc(); 1.2496 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1.2497 + BLOCK_COMMENT("Entry:"); 1.2498 + } 1.2499 + 1.2500 + array_overlap_test(nooverlap_target, 3); 1.2501 + 1.2502 + generate_conjoint_long_copy_core(aligned); 1.2503 + 1.2504 + // O3, O4 are used as temp registers 1.2505 + inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 1.2506 + __ retl(); 1.2507 + __ delayed()->mov(G0, O0); // return 0 1.2508 + return start; 1.2509 + } 1.2510 + 1.2511 + // Generate stub for disjoint oop copy. If "aligned" is true, the 1.2512 + // "from" and "to" addresses are assumed to be heapword aligned. 1.2513 + // 1.2514 + // Arguments for generated stub: 1.2515 + // from: O0 1.2516 + // to: O1 1.2517 + // count: O2 treated as signed 1.2518 + // 1.2519 + address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name, 1.2520 + bool dest_uninitialized = false) { 1.2521 + 1.2522 + const Register from = O0; // source array address 1.2523 + const Register to = O1; // destination array address 1.2524 + const Register count = O2; // elements count 1.2525 + 1.2526 + __ align(CodeEntryAlignment); 1.2527 + StubCodeMark mark(this, "StubRoutines", name); 1.2528 + address start = __ pc(); 1.2529 + 1.2530 + assert_clean_int(count, O3); // Make sure 'count' is clean int. 1.2531 + 1.2532 + if (entry != NULL) { 1.2533 + *entry = __ pc(); 1.2534 + // caller can pass a 64-bit byte count here 1.2535 + BLOCK_COMMENT("Entry:"); 1.2536 + } 1.2537 + 1.2538 + // save arguments for barrier generation 1.2539 + __ mov(to, G1); 1.2540 + __ mov(count, G5); 1.2541 + gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 1.2542 + #ifdef _LP64 1.2543 + assert_clean_int(count, O3); // Make sure 'count' is clean int. 1.2544 + if (UseCompressedOops) { 1.2545 + generate_disjoint_int_copy_core(aligned); 1.2546 + } else { 1.2547 + generate_disjoint_long_copy_core(aligned); 1.2548 + } 1.2549 + #else 1.2550 + generate_disjoint_int_copy_core(aligned); 1.2551 + #endif 1.2552 + // O0 is used as temp register 1.2553 + gen_write_ref_array_post_barrier(G1, G5, O0); 1.2554 + 1.2555 + // O3, O4 are used as temp registers 1.2556 + inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 1.2557 + __ retl(); 1.2558 + __ delayed()->mov(G0, O0); // return 0 1.2559 + return start; 1.2560 + } 1.2561 + 1.2562 + // Generate stub for conjoint oop copy. If "aligned" is true, the 1.2563 + // "from" and "to" addresses are assumed to be heapword aligned. 1.2564 + // 1.2565 + // Arguments for generated stub: 1.2566 + // from: O0 1.2567 + // to: O1 1.2568 + // count: O2 treated as signed 1.2569 + // 1.2570 + address generate_conjoint_oop_copy(bool aligned, address nooverlap_target, 1.2571 + address *entry, const char *name, 1.2572 + bool dest_uninitialized = false) { 1.2573 + 1.2574 + const Register from = O0; // source array address 1.2575 + const Register to = O1; // destination array address 1.2576 + const Register count = O2; // elements count 1.2577 + 1.2578 + __ align(CodeEntryAlignment); 1.2579 + StubCodeMark mark(this, "StubRoutines", name); 1.2580 + address start = __ pc(); 1.2581 + 1.2582 + assert_clean_int(count, O3); // Make sure 'count' is clean int. 
1.2583 + 1.2584 + if (entry != NULL) { 1.2585 + *entry = __ pc(); 1.2586 + // caller can pass a 64-bit byte count here 1.2587 + BLOCK_COMMENT("Entry:"); 1.2588 + } 1.2589 + 1.2590 + array_overlap_test(nooverlap_target, LogBytesPerHeapOop); 1.2591 + 1.2592 + // save arguments for barrier generation 1.2593 + __ mov(to, G1); 1.2594 + __ mov(count, G5); 1.2595 + gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 1.2596 + 1.2597 + #ifdef _LP64 1.2598 + if (UseCompressedOops) { 1.2599 + generate_conjoint_int_copy_core(aligned); 1.2600 + } else { 1.2601 + generate_conjoint_long_copy_core(aligned); 1.2602 + } 1.2603 + #else 1.2604 + generate_conjoint_int_copy_core(aligned); 1.2605 + #endif 1.2606 + 1.2607 + // O0 is used as temp register 1.2608 + gen_write_ref_array_post_barrier(G1, G5, O0); 1.2609 + 1.2610 + // O3, O4 are used as temp registers 1.2611 + inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 1.2612 + __ retl(); 1.2613 + __ delayed()->mov(G0, O0); // return 0 1.2614 + return start; 1.2615 + } 1.2616 + 1.2617 + 1.2618 + // Helper for generating a dynamic type check. 1.2619 + // Smashes only the given temp registers. 1.2620 + void generate_type_check(Register sub_klass, 1.2621 + Register super_check_offset, 1.2622 + Register super_klass, 1.2623 + Register temp, 1.2624 + Label& L_success) { 1.2625 + assert_different_registers(sub_klass, super_check_offset, super_klass, temp); 1.2626 + 1.2627 + BLOCK_COMMENT("type_check:"); 1.2628 + 1.2629 + Label L_miss, L_pop_to_miss; 1.2630 + 1.2631 + assert_clean_int(super_check_offset, temp); 1.2632 + 1.2633 + __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg, 1.2634 + &L_success, &L_miss, NULL, 1.2635 + super_check_offset); 1.2636 + 1.2637 + BLOCK_COMMENT("type_check_slow_path:"); 1.2638 + __ save_frame(0); 1.2639 + __ check_klass_subtype_slow_path(sub_klass->after_save(), 1.2640 + super_klass->after_save(), 1.2641 + L0, L1, L2, L4, 1.2642 + NULL, &L_pop_to_miss); 1.2643 + __ ba(L_success); 1.2644 + __ delayed()->restore(); 1.2645 + 1.2646 + __ bind(L_pop_to_miss); 1.2647 + __ restore(); 1.2648 + 1.2649 + // Fall through on failure! 1.2650 + __ BIND(L_miss); 1.2651 + } 1.2652 + 1.2653 + 1.2654 + // Generate stub for checked oop copy. 1.2655 + // 1.2656 + // Arguments for generated stub: 1.2657 + // from: O0 1.2658 + // to: O1 1.2659 + // count: O2 treated as signed 1.2660 + // ckoff: O3 (super_check_offset) 1.2661 + // ckval: O4 (super_klass) 1.2662 + // ret: O0 zero for success; (-1^K) where K is partial transfer count 1.2663 + // 1.2664 + address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) { 1.2665 + 1.2666 + const Register O0_from = O0; // source array address 1.2667 + const Register O1_to = O1; // destination array address 1.2668 + const Register O2_count = O2; // elements count 1.2669 + const Register O3_ckoff = O3; // super_check_offset 1.2670 + const Register O4_ckval = O4; // super_klass 1.2671 + 1.2672 + const Register O5_offset = O5; // loop var, with stride wordSize 1.2673 + const Register G1_remain = G1; // loop var, with stride -1 1.2674 + const Register G3_oop = G3; // actual oop copied 1.2675 + const Register G4_klass = G4; // oop._klass 1.2676 + const Register G5_super = G5; // oop._klass._primary_supers[ckval] 1.2677 + 1.2678 + __ align(CodeEntryAlignment); 1.2679 + StubCodeMark mark(this, "StubRoutines", name); 1.2680 + address start = __ pc(); 1.2681 + 1.2682 +#ifdef ASSERT 1.2683 + // We sometimes save a frame (see generate_type_check below). 
1.2684 + // If this will cause trouble, let's fail now instead of later. 1.2685 + __ save_frame(0); 1.2686 + __ restore(); 1.2687 +#endif 1.2688 + 1.2689 + assert_clean_int(O2_count, G1); // Make sure 'count' is clean int. 1.2690 + 1.2691 +#ifdef ASSERT 1.2692 + // caller guarantees that the arrays really are different 1.2693 + // otherwise, we would have to make conjoint checks 1.2694 + { Label L; 1.2695 + __ mov(O3, G1); // spill: overlap test smashes O3 1.2696 + __ mov(O4, G4); // spill: overlap test smashes O4 1.2697 + array_overlap_test(L, LogBytesPerHeapOop); 1.2698 + __ stop("checkcast_copy within a single array"); 1.2699 + __ bind(L); 1.2700 + __ mov(G1, O3); 1.2701 + __ mov(G4, O4); 1.2702 + } 1.2703 +#endif //ASSERT 1.2704 + 1.2705 + if (entry != NULL) { 1.2706 + *entry = __ pc(); 1.2707 + // caller can pass a 64-bit byte count here (from generic stub) 1.2708 + BLOCK_COMMENT("Entry:"); 1.2709 + } 1.2710 + gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized); 1.2711 + 1.2712 + Label load_element, store_element, do_card_marks, fail, done; 1.2713 + __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it 1.2714 + __ brx(Assembler::notZero, false, Assembler::pt, load_element); 1.2715 + __ delayed()->mov(G0, O5_offset); // offset from start of arrays 1.2716 + 1.2717 + // Empty array: Nothing to do. 1.2718 + inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 1.2719 + __ retl(); 1.2720 + __ delayed()->set(0, O0); // return 0 on (trivial) success 1.2721 + 1.2722 + // ======== begin loop ======== 1.2723 + // (Loop is rotated; its entry is load_element.) 1.2724 + // Loop variables: 1.2725 + // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays 1.2726 + // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* 1.2727 + // G3, G4, G5 --- current oop, oop.klass, oop.klass.super 1.2728 + __ align(OptoLoopAlignment); 1.2729 + 1.2730 + __ BIND(store_element); 1.2731 + __ deccc(G1_remain); // decrement the count 1.2732 + __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop 1.2733 + __ inc(O5_offset, heapOopSize); // step to next offset 1.2734 + __ brx(Assembler::zero, true, Assembler::pt, do_card_marks); 1.2735 + __ delayed()->set(0, O0); // return 0 on success 1.2736 + 1.2737 + // ======== loop entry is here ======== 1.2738 + __ BIND(load_element); 1.2739 + __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop 1.2740 + __ br_null_short(G3_oop, Assembler::pt, store_element); 1.2741 + 1.2742 + __ load_klass(G3_oop, G4_klass); // query the object klass 1.2743 + 1.2744 + generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super, 1.2745 + // branch to this on success: 1.2746 + store_element); 1.2747 + // ======== end loop ======== 1.2748 + 1.2749 + // It was a real error; we must depend on the caller to finish the job. 1.2750 + // Register G1 has number of *remaining* oops, O2 number of *total* oops. 1.2751 + // Emit GC store barriers for the oops we have copied (O2 minus G1), 1.2752 + // and report their number to the caller.
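The return convention described here is worth spelling out: the stub returns 0 when every element was transferred, and otherwise -1 XOR K (i.e. ~K), where K is the number of elements stored before the type check failed. A caller-side decoding sketch (elements_copied is a hypothetical helper, not HotSpot code):

    #include <cstddef>
    #include <cstdint>
    // Decode the checkcast stub's result: 0 means all 'total' elements were
    // copied; otherwise ret == -1 ^ K == ~K, so K == ~ret.
    static size_t elements_copied(intptr_t ret, size_t total) {
      return (ret == 0) ? total : (size_t)~ret;
    }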
1.2753 + __ BIND(fail); 1.2754 + __ subcc(O2_count, G1_remain, O2_count); 1.2755 + __ brx(Assembler::zero, false, Assembler::pt, done); 1.2756 + __ delayed()->not1(O2_count, O0); // report (-1^K) to caller 1.2757 + 1.2758 + __ BIND(do_card_marks); 1.2759 + gen_write_ref_array_post_barrier(O1_to, O2_count, O3); // store check on O1[0..O2] 1.2760 + 1.2761 + __ BIND(done); 1.2762 + inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 1.2763 + __ retl(); 1.2764 + __ delayed()->nop(); // return value in O0 1.2765 + 1.2766 + return start; 1.2767 + } 1.2768 + 1.2769 + 1.2770 + // Generate 'unsafe' array copy stub 1.2771 + // Though just as safe as the other stubs, it takes an unscaled 1.2772 + // size_t argument instead of an element count. 1.2773 + // 1.2774 + // Arguments for generated stub: 1.2775 + // from: O0 1.2776 + // to: O1 1.2777 + // count: O2 byte count, treated as ssize_t, can be zero 1.2778 + // 1.2779 + // Examines the alignment of the operands and dispatches 1.2780 + // to a long, int, short, or byte copy loop. 1.2781 + // 1.2782 + address generate_unsafe_copy(const char* name, 1.2783 + address byte_copy_entry, 1.2784 + address short_copy_entry, 1.2785 + address int_copy_entry, 1.2786 + address long_copy_entry) { 1.2787 + 1.2788 + const Register O0_from = O0; // source array address 1.2789 + const Register O1_to = O1; // destination array address 1.2790 + const Register O2_count = O2; // byte count 1.2791 + 1.2792 + const Register G1_bits = G1; // test copy of low bits 1.2793 + 1.2794 + __ align(CodeEntryAlignment); 1.2795 + StubCodeMark mark(this, "StubRoutines", name); 1.2796 + address start = __ pc(); 1.2797 + 1.2798 + // bump this on entry, not on exit: 1.2799 + inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3); 1.2800 + 1.2801 + __ or3(O0_from, O1_to, G1_bits); 1.2802 + __ or3(O2_count, G1_bits, G1_bits); 1.2803 + 1.2804 + __ btst(BytesPerLong-1, G1_bits); 1.2805 + __ br(Assembler::zero, true, Assembler::pt, 1.2806 + long_copy_entry, relocInfo::runtime_call_type); 1.2807 + // scale the count on the way out: 1.2808 + __ delayed()->srax(O2_count, LogBytesPerLong, O2_count); 1.2809 + 1.2810 + __ btst(BytesPerInt-1, G1_bits); 1.2811 + __ br(Assembler::zero, true, Assembler::pt, 1.2812 + int_copy_entry, relocInfo::runtime_call_type); 1.2813 + // scale the count on the way out: 1.2814 + __ delayed()->srax(O2_count, LogBytesPerInt, O2_count); 1.2815 + 1.2816 + __ btst(BytesPerShort-1, G1_bits); 1.2817 + __ br(Assembler::zero, true, Assembler::pt, 1.2818 + short_copy_entry, relocInfo::runtime_call_type); 1.2819 + // scale the count on the way out: 1.2820 + __ delayed()->srax(O2_count, LogBytesPerShort, O2_count); 1.2821 + 1.2822 + __ br(Assembler::always, false, Assembler::pt, 1.2823 + byte_copy_entry, relocInfo::runtime_call_type); 1.2824 + __ delayed()->nop(); 1.2825 + 1.2826 + return start; 1.2827 + } 1.2828 + 1.2829 + 1.2830 + // Perform range checks on the proposed arraycopy. 1.2831 + // Kills the two temps, but nothing else. 1.2832 + // Also, clean the sign bits of src_pos and dst_pos.
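In C++ terms, the checks this helper emits reduce to roughly the following (a sketch; src_len and dst_len stand in for the arrayOop length fields that the lduw instructions load, and the positions and length have already been checked to be non-negative by the caller):

    #include <cstdint>
    // Range checks performed by arraycopy_range_checks(), in C++ terms.
    static bool range_checks_ok(int64_t src_pos, int64_t dst_pos, int64_t length,
                                uint32_t src_len, uint32_t dst_len) {
      if (src_pos + length > src_len) return false;  // source overrun
      if (dst_pos + length > dst_len) return false;  // destination overrun
      return true;  // the stub then sign-extends src_pos/dst_pos via signx
    }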
1.2833 + void arraycopy_range_checks(Register src, // source array oop (O0) 1.2834 + Register src_pos, // source position (O1) 1.2835 + Register dst, // destination array oop (O2) 1.2836 + Register dst_pos, // destination position (O3) 1.2837 + Register length, // length of copy (O4) 1.2838 + Register temp1, Register temp2, 1.2839 + Label& L_failed) { 1.2840 + BLOCK_COMMENT("arraycopy_range_checks:"); 1.2841 + 1.2842 + // if (src_pos + length > arrayOop(src)->length() ) FAIL; 1.2843 + 1.2844 + const Register array_length = temp1; // scratch 1.2845 + const Register end_pos = temp2; // scratch 1.2846 + 1.2847 + // Note: This next instruction may be in the delay slot of a branch: 1.2848 + __ add(length, src_pos, end_pos); // src_pos + length 1.2849 + __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length); 1.2850 + __ cmp(end_pos, array_length); 1.2851 + __ br(Assembler::greater, false, Assembler::pn, L_failed); 1.2852 + 1.2853 + // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 1.2854 + __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length 1.2855 + __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length); 1.2856 + __ cmp(end_pos, array_length); 1.2857 + __ br(Assembler::greater, false, Assembler::pn, L_failed); 1.2858 + 1.2859 + // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 1.2860 + // Move with sign extension can be used since they are positive. 1.2861 + __ delayed()->signx(src_pos, src_pos); 1.2862 + __ signx(dst_pos, dst_pos); 1.2863 + 1.2864 + BLOCK_COMMENT("arraycopy_range_checks done"); 1.2865 + } 1.2866 + 1.2867 + 1.2868 + // 1.2869 + // Generate generic array copy stubs 1.2870 + // 1.2871 + // Input: 1.2872 + // O0 - src oop 1.2873 + // O1 - src_pos 1.2874 + // O2 - dst oop 1.2875 + // O3 - dst_pos 1.2876 + // O4 - element count 1.2877 + // 1.2878 + // Output: 1.2879 + // O0 == 0 - success 1.2880 + // O0 == -1 - need to call System.arraycopy 1.2881 + // 1.2882 + address generate_generic_copy(const char *name, 1.2883 + address entry_jbyte_arraycopy, 1.2884 + address entry_jshort_arraycopy, 1.2885 + address entry_jint_arraycopy, 1.2886 + address entry_oop_arraycopy, 1.2887 + address entry_jlong_arraycopy, 1.2888 + address entry_checkcast_arraycopy) { 1.2889 + Label L_failed, L_objArray; 1.2890 + 1.2891 + // Input registers 1.2892 + const Register src = O0; // source array oop 1.2893 + const Register src_pos = O1; // source position 1.2894 + const Register dst = O2; // destination array oop 1.2895 + const Register dst_pos = O3; // destination position 1.2896 + const Register length = O4; // elements count 1.2897 + 1.2898 + // registers used as temp 1.2899 + const Register G3_src_klass = G3; // source array klass 1.2900 + const Register G4_dst_klass = G4; // destination array klass 1.2901 + const Register G5_lh = G5; // layout helper 1.2902 + const Register O5_temp = O5; 1.2903 + 1.2904 + __ align(CodeEntryAlignment); 1.2905 + StubCodeMark mark(this, "StubRoutines", name); 1.2906 + address start = __ pc(); 1.2907 + 1.2908 + // bump this on entry, not on exit: 1.2909 + inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3); 1.2910 + 1.2911 + // In principle, the int arguments could be dirty.
1.2912 + //assert_clean_int(src_pos, G1); 1.2913 + //assert_clean_int(dst_pos, G1); 1.2914 + //assert_clean_int(length, G1); 1.2915 + 1.2916 + //----------------------------------------------------------------------- 1.2917 + // Assembler stubs will be used for this call to arraycopy 1.2918 + // if the following conditions are met: 1.2919 + // 1.2920 + // (1) src and dst must not be null. 1.2921 + // (2) src_pos must not be negative. 1.2922 + // (3) dst_pos must not be negative. 1.2923 + // (4) length must not be negative. 1.2924 + // (5) src klass and dst klass should be the same and not NULL. 1.2925 + // (6) src and dst should be arrays. 1.2926 + // (7) src_pos + length must not exceed length of src. 1.2927 + // (8) dst_pos + length must not exceed length of dst. 1.2928 + BLOCK_COMMENT("arraycopy initial argument checks"); 1.2929 + 1.2930 + // if (src == NULL) return -1; 1.2931 + __ br_null(src, false, Assembler::pn, L_failed); 1.2932 + 1.2933 + // if (src_pos < 0) return -1; 1.2934 + __ delayed()->tst(src_pos); 1.2935 + __ br(Assembler::negative, false, Assembler::pn, L_failed); 1.2936 + __ delayed()->nop(); 1.2937 + 1.2938 + // if (dst == NULL) return -1; 1.2939 + __ br_null(dst, false, Assembler::pn, L_failed); 1.2940 + 1.2941 + // if (dst_pos < 0) return -1; 1.2942 + __ delayed()->tst(dst_pos); 1.2943 + __ br(Assembler::negative, false, Assembler::pn, L_failed); 1.2944 + 1.2945 + // if (length < 0) return -1; 1.2946 + __ delayed()->tst(length); 1.2947 + __ br(Assembler::negative, false, Assembler::pn, L_failed); 1.2948 + 1.2949 + BLOCK_COMMENT("arraycopy argument klass checks"); 1.2950 + // get src->klass() 1.2951 + if (UseCompressedClassPointers) { 1.2952 + __ delayed()->nop(); // ??? not good 1.2953 + __ load_klass(src, G3_src_klass); 1.2954 + } else { 1.2955 + __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass); 1.2956 + } 1.2957 + 1.2958 +#ifdef ASSERT 1.2959 + // assert(src->klass() != NULL); 1.2960 + BLOCK_COMMENT("assert klasses not null"); 1.2961 + { Label L_a, L_b; 1.2962 + __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL 1.2963 + __ bind(L_a); 1.2964 + __ stop("broken null klass"); 1.2965 + __ bind(L_b); 1.2966 + __ load_klass(dst, G4_dst_klass); 1.2967 + __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also 1.2968 + __ delayed()->mov(G0, G4_dst_klass); // scribble the temp 1.2969 + BLOCK_COMMENT("assert done"); 1.2970 + } 1.2971 +#endif 1.2972 + 1.2973 + // Load layout helper 1.2974 + // 1.2975 + // |array_tag| | header_size | element_type | |log2_element_size| 1.2976 + // 32 30 24 16 8 2 0 1.2977 + // 1.2978 + // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 1.2979 + // 1.2980 + 1.2981 + int lh_offset = in_bytes(Klass::layout_helper_offset()); 1.2982 + 1.2983 + // Load 32-bits signed value. Use br() instruction with it to check icc. 1.2984 + __ lduw(G3_src_klass, lh_offset, G5_lh); 1.2985 + 1.2986 + if (UseCompressedClassPointers) { 1.2987 + __ load_klass(dst, G4_dst_klass); 1.2988 + } 1.2989 + // Handle objArrays completely differently... 
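Given the layout-helper encoding diagrammed above, the srl/and3 field extraction performed further down corresponds to roughly this C++, using the Klass::_lh_* shift/mask constants that the code itself references (a sketch, not HotSpot's accessors):

    #include <cstdint>
    // Decode a Klass layout-helper word (see the bit diagram above).
    struct ArrayLayout { int tag; int header_size; int log2_elem_size; };
    static ArrayLayout decode_lh(int32_t lh) {
      ArrayLayout r;
      r.tag            = (uint32_t)lh >> Klass::_lh_array_tag_shift; // 0x3 typeArray, 0x2 objArray
      r.header_size    = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
      r.log2_elem_size = lh & Klass::_lh_log2_element_size_mask;
      return r;
    }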
1.2990 + juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 1.2991 + __ set(objArray_lh, O5_temp); 1.2992 + __ cmp(G5_lh, O5_temp); 1.2993 + __ br(Assembler::equal, false, Assembler::pt, L_objArray); 1.2994 + if (UseCompressedClassPointers) { 1.2995 + __ delayed()->nop(); 1.2996 + } else { 1.2997 + __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass); 1.2998 + } 1.2999 + 1.3000 + // if (src->klass() != dst->klass()) return -1; 1.3001 + __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed); 1.3002 + 1.3003 + // if (!src->is_Array()) return -1; 1.3004 + __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0 1.3005 + __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed); 1.3006 + 1.3007 + // At this point, it is known to be a typeArray (array_tag 0x3). 1.3008 +#ifdef ASSERT 1.3009 + __ delayed()->nop(); 1.3010 + { Label L; 1.3011 + jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 1.3012 + __ set(lh_prim_tag_in_place, O5_temp); 1.3013 + __ cmp(G5_lh, O5_temp); 1.3014 + __ br(Assembler::greaterEqual, false, Assembler::pt, L); 1.3015 + __ delayed()->nop(); 1.3016 + __ stop("must be a primitive array"); 1.3017 + __ bind(L); 1.3018 + } 1.3019 +#else 1.3020 + __ delayed(); // match next insn to prev branch 1.3021 +#endif 1.3022 + 1.3023 + arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 1.3024 + O5_temp, G4_dst_klass, L_failed); 1.3025 + 1.3026 + // TypeArrayKlass 1.3027 + // 1.3028 + // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 1.3029 + // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 1.3030 + // 1.3031 + 1.3032 + const Register G4_offset = G4_dst_klass; // array offset 1.3033 + const Register G3_elsize = G3_src_klass; // log2 element size 1.3034 + 1.3035 + __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset); 1.3036 + __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset 1.3037 + __ add(src, G4_offset, src); // src array offset 1.3038 + __ add(dst, G4_offset, dst); // dst array offset 1.3039 + __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size 1.3040 + 1.3041 + // next registers should be set before the jump to corresponding stub 1.3042 + const Register from = O0; // source array address 1.3043 + const Register to = O1; // destination array address 1.3044 + const Register count = O2; // elements count 1.3045 + 1.3046 + // 'from', 'to', 'count' registers should be set in this order 1.3047 + // since they are the same as 'src', 'src_pos', 'dst'. 
1.3048 + 1.3049 + BLOCK_COMMENT("scale indexes to element size"); 1.3050 + __ sll_ptr(src_pos, G3_elsize, src_pos); 1.3051 + __ sll_ptr(dst_pos, G3_elsize, dst_pos); 1.3052 + __ add(src, src_pos, from); // src_addr 1.3053 + __ add(dst, dst_pos, to); // dst_addr 1.3054 + 1.3055 + BLOCK_COMMENT("choose copy loop based on element size"); 1.3056 + __ cmp(G3_elsize, 0); 1.3057 + __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy); 1.3058 + __ delayed()->signx(length, count); // length 1.3059 + 1.3060 + __ cmp(G3_elsize, LogBytesPerShort); 1.3061 + __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy); 1.3062 + __ delayed()->signx(length, count); // length 1.3063 + 1.3064 + __ cmp(G3_elsize, LogBytesPerInt); 1.3065 + __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy); 1.3066 + __ delayed()->signx(length, count); // length 1.3067 +#ifdef ASSERT 1.3068 + { Label L; 1.3069 + __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L); 1.3070 + __ stop("must be long copy, but elsize is wrong"); 1.3071 + __ bind(L); 1.3072 + } 1.3073 +#endif 1.3074 + __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy); 1.3075 + __ delayed()->signx(length, count); // length 1.3076 + 1.3077 + // ObjArrayKlass 1.3078 + __ BIND(L_objArray); 1.3079 + // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length 1.3080 + 1.3081 + Label L_plain_copy, L_checkcast_copy; 1.3082 + // test array classes for subtyping 1.3083 + __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality 1.3084 + __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy); 1.3085 + __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below 1.3086 + 1.3087 + // Identically typed arrays can be copied without element-wise checks. 1.3088 + arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 1.3089 + O5_temp, G5_lh, L_failed); 1.3090 + 1.3091 + __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 1.3092 + __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 1.3093 + __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 1.3094 + __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 1.3095 + __ add(src, src_pos, from); // src_addr 1.3096 + __ add(dst, dst_pos, to); // dst_addr 1.3097 + __ BIND(L_plain_copy); 1.3098 + __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy); 1.3099 + __ delayed()->signx(length, count); // length 1.3100 + 1.3101 + __ BIND(L_checkcast_copy); 1.3102 + // live at this point: G3_src_klass, G4_dst_klass 1.3103 + { 1.3104 + // Before looking at dst.length, make sure dst is also an objArray. 1.3105 + // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot 1.3106 + __ cmp(G5_lh, O5_temp); 1.3107 + __ br(Assembler::notEqual, false, Assembler::pn, L_failed); 1.3108 + 1.3109 + // It is safe to examine both src.length and dst.length. 1.3110 + __ delayed(); // match next insn to prev branch 1.3111 + arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 1.3112 + O5_temp, G5_lh, L_failed); 1.3113 + 1.3114 + // Marshal the base address arguments now, freeing registers. 
1.3115 + __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
1.3116 + __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
1.3117 + __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
1.3118 + __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
1.3119 + __ add(src, src_pos, from); // src_addr
1.3120 + __ add(dst, dst_pos, to); // dst_addr
1.3121 + __ signx(length, count); // length (reloaded)
1.3122 +
1.3123 + Register sco_temp = O3; // this register is free now
1.3124 + assert_different_registers(from, to, count, sco_temp,
1.3125 + G4_dst_klass, G3_src_klass);
1.3126 +
1.3127 + // Generate the type check.
1.3128 + int sco_offset = in_bytes(Klass::super_check_offset_offset());
1.3129 + __ lduw(G4_dst_klass, sco_offset, sco_temp);
1.3130 + generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
1.3131 + O5_temp, L_plain_copy);
1.3132 +
1.3133 + // Fetch destination element klass from the ObjArrayKlass header.
1.3134 + int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1.3135 +
1.3136 + // the checkcast_copy loop needs two extra arguments:
1.3137 + __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass
1.3138 + // lduw(O4, sco_offset, O3); // sco of elem klass
1.3139 +
1.3140 + __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
1.3141 + __ delayed()->lduw(O4, sco_offset, O3);
1.3142 + }
1.3143 +
1.3144 + __ BIND(L_failed);
1.3145 + __ retl();
1.3146 + __ delayed()->sub(G0, 1, O0); // return -1
1.3147 + return start;
1.3148 + }
1.3149 +
1.3150 + //
1.3151 + // Generate stub for heap zeroing.
1.3152 + // "to" address is aligned to jlong (8 bytes).
1.3153 + //
1.3154 + // Arguments for generated stub:
1.3155 + // to: O0
1.3156 + // count: O1 treated as signed (count of HeapWords)
1.3157 + // count could be 0
1.3158 + //
1.3159 + address generate_zero_aligned_words(const char* name) {
1.3160 + __ align(CodeEntryAlignment);
1.3161 + StubCodeMark mark(this, "StubRoutines", name);
1.3162 + address start = __ pc();
1.3163 +
1.3164 + const Register to = O0; // destination address
1.3165 + const Register count = O1; // HeapWords count
1.3166 + const Register temp = O2; // scratch
1.3167 +
1.3168 + Label Ldone;
1.3169 + __ sllx(count, LogHeapWordSize, count); // to bytes count
1.3170 + // Use BIS for zeroing
1.3171 + __ bis_zeroing(to, count, temp, Ldone);
1.3172 + __ bind(Ldone);
1.3173 + __ retl();
1.3174 + __ delayed()->nop();
1.3175 + return start;
1.3176 + }
1.3177 +
1.3178 + void generate_arraycopy_stubs() {
1.3179 + address entry;
1.3180 + address entry_jbyte_arraycopy;
1.3181 + address entry_jshort_arraycopy;
1.3182 + address entry_jint_arraycopy;
1.3183 + address entry_oop_arraycopy;
1.3184 + address entry_jlong_arraycopy;
1.3185 + address entry_checkcast_arraycopy;
1.3186 +
1.3187 + //*** jbyte
1.3188 + // Always need aligned and unaligned versions
1.3189 + StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
1.3190 + "jbyte_disjoint_arraycopy");
1.3191 + StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry,
1.3192 + &entry_jbyte_arraycopy,
1.3193 + "jbyte_arraycopy");
1.3194 + StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
1.3195 + "arrayof_jbyte_disjoint_arraycopy");
1.3196 + StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL,
1.3197 + "arrayof_jbyte_arraycopy");
1.3198 +
1.3199 + //*** jshort
1.3200 + // Always need aligned and unaligned versions
1.3201 +
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 1.3202 + "jshort_disjoint_arraycopy"); 1.3203 + StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 1.3204 + &entry_jshort_arraycopy, 1.3205 + "jshort_arraycopy"); 1.3206 + StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 1.3207 + "arrayof_jshort_disjoint_arraycopy"); 1.3208 + StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 1.3209 + "arrayof_jshort_arraycopy"); 1.3210 + 1.3211 + //*** jint 1.3212 + // Aligned versions 1.3213 + StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 1.3214 + "arrayof_jint_disjoint_arraycopy"); 1.3215 + StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 1.3216 + "arrayof_jint_arraycopy"); 1.3217 +#ifdef _LP64 1.3218 + // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 1.3219 + // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it). 1.3220 + StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 1.3221 + "jint_disjoint_arraycopy"); 1.3222 + StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 1.3223 + &entry_jint_arraycopy, 1.3224 + "jint_arraycopy"); 1.3225 +#else 1.3226 + // In 32 bit jints are always HeapWordSize aligned, so always use the aligned version 1.3227 + // (in fact in 32bit we always have a pre-loop part even in the aligned version, 1.3228 + // because it uses 64-bit loads/stores, so the aligned flag is actually ignored). 1.3229 + StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy; 1.3230 + StubRoutines::_jint_arraycopy = StubRoutines::_arrayof_jint_arraycopy; 1.3231 +#endif 1.3232 + 1.3233 + 1.3234 + //*** jlong 1.3235 + // It is always aligned 1.3236 + StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 1.3237 + "arrayof_jlong_disjoint_arraycopy"); 1.3238 + StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 1.3239 + "arrayof_jlong_arraycopy"); 1.3240 + StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 1.3241 + StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 1.3242 + 1.3243 + 1.3244 + //*** oops 1.3245 + // Aligned versions 1.3246 + StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry, 1.3247 + "arrayof_oop_disjoint_arraycopy"); 1.3248 + StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy, 1.3249 + "arrayof_oop_arraycopy"); 1.3250 + // Aligned versions without pre-barriers 1.3251 + StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry, 1.3252 + "arrayof_oop_disjoint_arraycopy_uninit", 1.3253 + /*dest_uninitialized*/true); 1.3254 + StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL, 1.3255 + "arrayof_oop_arraycopy_uninit", 1.3256 + /*dest_uninitialized*/true); 1.3257 +#ifdef _LP64 1.3258 + if (UseCompressedOops) { 1.3259 + // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy. 
1.3260 + StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry, 1.3261 + "oop_disjoint_arraycopy"); 1.3262 + StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy, 1.3263 + "oop_arraycopy"); 1.3264 + // Unaligned versions without pre-barriers 1.3265 + StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry, 1.3266 + "oop_disjoint_arraycopy_uninit", 1.3267 + /*dest_uninitialized*/true); 1.3268 + StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL, 1.3269 + "oop_arraycopy_uninit", 1.3270 + /*dest_uninitialized*/true); 1.3271 + } else 1.3272 +#endif 1.3273 + { 1.3274 + // oop arraycopy is always aligned on 32bit and 64bit without compressed oops 1.3275 + StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 1.3276 + StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 1.3277 + StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 1.3278 + StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 1.3279 + } 1.3280 + 1.3281 + StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 1.3282 + StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 1.3283 + /*dest_uninitialized*/true); 1.3284 + 1.3285 + StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 1.3286 + entry_jbyte_arraycopy, 1.3287 + entry_jshort_arraycopy, 1.3288 + entry_jint_arraycopy, 1.3289 + entry_jlong_arraycopy); 1.3290 + StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 1.3291 + entry_jbyte_arraycopy, 1.3292 + entry_jshort_arraycopy, 1.3293 + entry_jint_arraycopy, 1.3294 + entry_oop_arraycopy, 1.3295 + entry_jlong_arraycopy, 1.3296 + entry_checkcast_arraycopy); 1.3297 + 1.3298 + StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 1.3299 + StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 1.3300 + StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 1.3301 + StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 1.3302 + StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 1.3303 + StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 1.3304 + 1.3305 + if (UseBlockZeroing) { 1.3306 + StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words"); 1.3307 + } 1.3308 + } 1.3309 + 1.3310 + address generate_aescrypt_encryptBlock() { 1.3311 + // required since we read expanded key 'int' array starting first element without alignment considerations 1.3312 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 1.3313 + "the following code assumes that first element of an int array is aligned to 8 bytes"); 1.3314 + __ align(CodeEntryAlignment); 1.3315 + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 1.3316 + Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output; 1.3317 + address start = __ pc(); 1.3318 + Register from = O0; // source byte array 1.3319 + Register to = O1; // destination byte array 1.3320 + Register key = O2; // expanded key array 1.3321 + const Register keylen = O4; //reg for storing expanded key array length 1.3322 + 1.3323 + // read expanded key length 1.3324 + 
__ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 1.3325 + 1.3326 + // Method to address arbitrary alignment for load instructions: 1.3327 + // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary 1.3328 + // If zero/aligned then continue with double FP load instructions 1.3329 + // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata 1.3330 + // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address 1.3331 + // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address 1.3332 + // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs 1.3333 + 1.3334 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.3335 + __ andcc(from, 7, G0); 1.3336 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 1.3337 + __ delayed()->alignaddr(from, G0, from); 1.3338 + 1.3339 + // aligned case: load input into F54-F56 1.3340 + __ ldf(FloatRegisterImpl::D, from, 0, F54); 1.3341 + __ ldf(FloatRegisterImpl::D, from, 8, F56); 1.3342 + __ ba_short(L_load_expanded_key); 1.3343 + 1.3344 + __ BIND(L_load_misaligned_input); 1.3345 + __ ldf(FloatRegisterImpl::D, from, 0, F54); 1.3346 + __ ldf(FloatRegisterImpl::D, from, 8, F56); 1.3347 + __ ldf(FloatRegisterImpl::D, from, 16, F58); 1.3348 + __ faligndata(F54, F56, F54); 1.3349 + __ faligndata(F56, F58, F56); 1.3350 + 1.3351 + __ BIND(L_load_expanded_key); 1.3352 + // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed 1.3353 + for ( int i = 0; i <= 38; i += 2 ) { 1.3354 + __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); 1.3355 + } 1.3356 + 1.3357 + // perform cipher transformation 1.3358 + __ fxor(FloatRegisterImpl::D, F0, F54, F54); 1.3359 + __ fxor(FloatRegisterImpl::D, F2, F56, F56); 1.3360 + // rounds 1 through 8 1.3361 + for ( int i = 4; i <= 28; i += 8 ) { 1.3362 + __ aes_eround01(as_FloatRegister(i), F54, F56, F58); 1.3363 + __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60); 1.3364 + __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54); 1.3365 + __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56); 1.3366 + } 1.3367 + __ aes_eround01(F36, F54, F56, F58); //round 9 1.3368 + __ aes_eround23(F38, F54, F56, F60); 1.3369 + 1.3370 + // 128-bit original key size 1.3371 + __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit); 1.3372 + 1.3373 + for ( int i = 40; i <= 50; i += 2 ) { 1.3374 + __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) ); 1.3375 + } 1.3376 + __ aes_eround01(F40, F58, F60, F54); //round 10 1.3377 + __ aes_eround23(F42, F58, F60, F56); 1.3378 + __ aes_eround01(F44, F54, F56, F58); //round 11 1.3379 + __ aes_eround23(F46, F54, F56, F60); 1.3380 + 1.3381 + // 192-bit original key size 1.3382 + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput); 1.3383 + 1.3384 + __ ldf(FloatRegisterImpl::D, key, 208, F52); 1.3385 + __ aes_eround01(F48, F58, F60, F54); //round 12 1.3386 + __ aes_eround23(F50, F58, F60, F56); 1.3387 + __ ldf(FloatRegisterImpl::D, key, 216, F46); 1.3388 + __ ldf(FloatRegisterImpl::D, key, 224, F48); 1.3389 + __ ldf(FloatRegisterImpl::D, key, 232, F50); 1.3390 + __ aes_eround01(F52, F54, F56, F58); //round 13 1.3391 + __ aes_eround23(F46, F54, F56, F60); 
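// ---------------------------------------------------------------------
// Editor's note (not part of the patch): the keylen comparisons above key
// off the length of the SunJCE expanded-key int[]: an AES key schedule
// holds 4*(rounds+1) 32-bit words, so length 44 / 52 / 60 identifies
// AES-128 / AES-192 / AES-256 (10 / 12 / 14 rounds). As a C++ sketch:
//
// static int aes_rounds_for_expanded_key(int keylen_in_ints) {
//   // assumes keylen_in_ints is one of 44, 52, 60
//   return keylen_in_ints / 4 - 1;   // 10, 12 or 14
// }
// ---------------------------------------------------------------------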
1.3392 + __ ba_short(L_storeOutput); 1.3393 + 1.3394 + __ BIND(L_doLast128bit); 1.3395 + __ ldf(FloatRegisterImpl::D, key, 160, F48); 1.3396 + __ ldf(FloatRegisterImpl::D, key, 168, F50); 1.3397 + 1.3398 + __ BIND(L_storeOutput); 1.3399 + // perform last round of encryption common for all key sizes 1.3400 + __ aes_eround01_l(F48, F58, F60, F54); //last round 1.3401 + __ aes_eround23_l(F50, F58, F60, F56); 1.3402 + 1.3403 + // Method to address arbitrary alignment for store instructions: 1.3404 + // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary 1.3405 + // If zero/aligned then continue with double FP store instructions 1.3406 + // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case) 1.3407 + // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001 1.3408 + // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case 1.3409 + // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case. 1.3410 + // Set GSR.align to (8-n) using alignaddr 1.3411 + // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf 1.3412 + // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address 1.3413 + // Store (partial) the original first (8-n) bytes starting at the original 'dest' address 1.3414 + // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address 1.3415 + // We need to execute this process for both the 8-byte result values 1.3416 + 1.3417 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.3418 + __ andcc(to, 7, O5); 1.3419 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 1.3420 + __ delayed()->edge8n(to, G0, O3); 1.3421 + 1.3422 + // aligned case: store output into the destination array 1.3423 + __ stf(FloatRegisterImpl::D, F54, to, 0); 1.3424 + __ retl(); 1.3425 + __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8); 1.3426 + 1.3427 + __ BIND(L_store_misaligned_output); 1.3428 + __ add(to, 8, O4); 1.3429 + __ mov(8, O2); 1.3430 + __ sub(O2, O5, O2); 1.3431 + __ alignaddr(O2, G0, O2); 1.3432 + __ faligndata(F54, F54, F54); 1.3433 + __ faligndata(F56, F56, F56); 1.3434 + __ and3(to, -8, to); 1.3435 + __ and3(O4, -8, O4); 1.3436 + __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 1.3437 + __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 1.3438 + __ add(to, 8, to); 1.3439 + __ add(O4, 8, O4); 1.3440 + __ orn(G0, O3, O3); 1.3441 + __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 1.3442 + __ retl(); 1.3443 + __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 1.3444 + 1.3445 + return start; 1.3446 + } 1.3447 + 1.3448 + address generate_aescrypt_decryptBlock() { 1.3449 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 1.3450 + "the following code assumes that first element of an int array is aligned to 8 bytes"); 1.3451 + // required since we read original key 'byte' array as well in the decryption stubs 1.3452 + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 1.3453 + "the following code assumes that first element of a byte array is aligned to 8 bytes"); 1.3454 + __ align(CodeEntryAlignment); 1.3455 + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 1.3456 + address start = 
__ pc(); 1.3457 + Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input; 1.3458 + Label L_256bit_transform, L_common_transform, L_store_misaligned_output; 1.3459 + Register from = O0; // source byte array 1.3460 + Register to = O1; // destination byte array 1.3461 + Register key = O2; // expanded key array 1.3462 + Register original_key = O3; // original key array only required during decryption 1.3463 + const Register keylen = O4; // reg for storing expanded key array length 1.3464 + 1.3465 + // read expanded key array length 1.3466 + __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 1.3467 + 1.3468 + // save 'from' since we may need to recheck alignment in case of 256-bit decryption 1.3469 + __ mov(from, G1); 1.3470 + 1.3471 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.3472 + __ andcc(from, 7, G0); 1.3473 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 1.3474 + __ delayed()->alignaddr(from, G0, from); 1.3475 + 1.3476 + // aligned case: load input into F52-F54 1.3477 + __ ldf(FloatRegisterImpl::D, from, 0, F52); 1.3478 + __ ldf(FloatRegisterImpl::D, from, 8, F54); 1.3479 + __ ba_short(L_load_original_key); 1.3480 + 1.3481 + __ BIND(L_load_misaligned_input); 1.3482 + __ ldf(FloatRegisterImpl::D, from, 0, F52); 1.3483 + __ ldf(FloatRegisterImpl::D, from, 8, F54); 1.3484 + __ ldf(FloatRegisterImpl::D, from, 16, F56); 1.3485 + __ faligndata(F52, F54, F52); 1.3486 + __ faligndata(F54, F56, F54); 1.3487 + 1.3488 + __ BIND(L_load_original_key); 1.3489 + // load original key from SunJCE expanded decryption key 1.3490 + // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 1.3491 + for ( int i = 0; i <= 3; i++ ) { 1.3492 + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 1.3493 + } 1.3494 + 1.3495 + // 256-bit original key size 1.3496 + __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); 1.3497 + 1.3498 + // 192-bit original key size 1.3499 + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); 1.3500 + 1.3501 + // 128-bit original key size 1.3502 + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 1.3503 + for ( int i = 0; i <= 36; i += 4 ) { 1.3504 + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); 1.3505 + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); 1.3506 + } 1.3507 + 1.3508 + // perform 128-bit key specific inverse cipher transformation 1.3509 + __ fxor(FloatRegisterImpl::D, F42, F54, F54); 1.3510 + __ fxor(FloatRegisterImpl::D, F40, F52, F52); 1.3511 + __ ba_short(L_common_transform); 1.3512 + 1.3513 + __ BIND(L_expand192bit); 1.3514 + 1.3515 + // start loading rest of the 192-bit key 1.3516 + __ ldf(FloatRegisterImpl::S, original_key, 16, F4); 1.3517 + __ ldf(FloatRegisterImpl::S, original_key, 20, F5); 1.3518 + 1.3519 + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 1.3520 + for ( int i = 0; i <= 36; i += 6 ) { 1.3521 + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); 1.3522 + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); 1.3523 + __ aes_kexpand2(as_FloatRegister(i+4), 
as_FloatRegister(i+8), as_FloatRegister(i+10)); 1.3524 + } 1.3525 + __ aes_kexpand1(F42, F46, 7, F48); 1.3526 + __ aes_kexpand2(F44, F48, F50); 1.3527 + 1.3528 + // perform 192-bit key specific inverse cipher transformation 1.3529 + __ fxor(FloatRegisterImpl::D, F50, F54, F54); 1.3530 + __ fxor(FloatRegisterImpl::D, F48, F52, F52); 1.3531 + __ aes_dround23(F46, F52, F54, F58); 1.3532 + __ aes_dround01(F44, F52, F54, F56); 1.3533 + __ aes_dround23(F42, F56, F58, F54); 1.3534 + __ aes_dround01(F40, F56, F58, F52); 1.3535 + __ ba_short(L_common_transform); 1.3536 + 1.3537 + __ BIND(L_expand256bit); 1.3538 + 1.3539 + // load rest of the 256-bit key 1.3540 + for ( int i = 4; i <= 7; i++ ) { 1.3541 + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 1.3542 + } 1.3543 + 1.3544 + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 1.3545 + for ( int i = 0; i <= 40; i += 8 ) { 1.3546 + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); 1.3547 + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); 1.3548 + __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); 1.3549 + __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); 1.3550 + } 1.3551 + __ aes_kexpand1(F48, F54, 6, F56); 1.3552 + __ aes_kexpand2(F50, F56, F58); 1.3553 + 1.3554 + for ( int i = 0; i <= 6; i += 2 ) { 1.3555 + __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); 1.3556 + } 1.3557 + 1.3558 + // reload original 'from' address 1.3559 + __ mov(G1, from); 1.3560 + 1.3561 + // re-check 8-byte alignment 1.3562 + __ andcc(from, 7, G0); 1.3563 + __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input); 1.3564 + __ delayed()->alignaddr(from, G0, from); 1.3565 + 1.3566 + // aligned case: load input into F52-F54 1.3567 + __ ldf(FloatRegisterImpl::D, from, 0, F52); 1.3568 + __ ldf(FloatRegisterImpl::D, from, 8, F54); 1.3569 + __ ba_short(L_256bit_transform); 1.3570 + 1.3571 + __ BIND(L_reload_misaligned_input); 1.3572 + __ ldf(FloatRegisterImpl::D, from, 0, F52); 1.3573 + __ ldf(FloatRegisterImpl::D, from, 8, F54); 1.3574 + __ ldf(FloatRegisterImpl::D, from, 16, F56); 1.3575 + __ faligndata(F52, F54, F52); 1.3576 + __ faligndata(F54, F56, F54); 1.3577 + 1.3578 + // perform 256-bit key specific inverse cipher transformation 1.3579 + __ BIND(L_256bit_transform); 1.3580 + __ fxor(FloatRegisterImpl::D, F0, F54, F54); 1.3581 + __ fxor(FloatRegisterImpl::D, F2, F52, F52); 1.3582 + __ aes_dround23(F4, F52, F54, F58); 1.3583 + __ aes_dround01(F6, F52, F54, F56); 1.3584 + __ aes_dround23(F50, F56, F58, F54); 1.3585 + __ aes_dround01(F48, F56, F58, F52); 1.3586 + __ aes_dround23(F46, F52, F54, F58); 1.3587 + __ aes_dround01(F44, F52, F54, F56); 1.3588 + __ aes_dround23(F42, F56, F58, F54); 1.3589 + __ aes_dround01(F40, F56, F58, F52); 1.3590 + 1.3591 + for ( int i = 0; i <= 7; i++ ) { 1.3592 + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 1.3593 + } 1.3594 + 1.3595 + // perform inverse cipher transformations common for all key sizes 1.3596 + __ BIND(L_common_transform); 1.3597 + for ( int i = 38; i >= 6; i -= 8 ) { 1.3598 + __ aes_dround23(as_FloatRegister(i), F52, F54, F58); 1.3599 + __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56); 1.3600 + if ( i != 6) { 1.3601 + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54); 1.3602 + __ aes_dround01(as_FloatRegister(i-6), 
F56, F58, F52);
1.3603 + } else {
1.3604 + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
1.3605 + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
1.3606 + }
1.3607 + }
1.3608 +
1.3609 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
1.3610 + __ andcc(to, 7, O5);
1.3611 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
1.3612 + __ delayed()->edge8n(to, G0, O3);
1.3613 +
1.3614 + // aligned case: store output into the destination array
1.3615 + __ stf(FloatRegisterImpl::D, F52, to, 0);
1.3616 + __ retl();
1.3617 + __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
1.3618 +
1.3619 + __ BIND(L_store_misaligned_output);
1.3620 + __ add(to, 8, O4);
1.3621 + __ mov(8, O2);
1.3622 + __ sub(O2, O5, O2);
1.3623 + __ alignaddr(O2, G0, O2);
1.3624 + __ faligndata(F52, F52, F52);
1.3625 + __ faligndata(F54, F54, F54);
1.3626 + __ and3(to, -8, to);
1.3627 + __ and3(O4, -8, O4);
1.3628 + __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
1.3629 + __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
1.3630 + __ add(to, 8, to);
1.3631 + __ add(O4, 8, O4);
1.3632 + __ orn(G0, O3, O3);
1.3633 + __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
1.3634 + __ retl();
1.3635 + __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
1.3636 +
1.3637 + return start;
1.3638 + }
1.3639 +
1.3640 + address generate_cipherBlockChaining_encryptAESCrypt() {
1.3641 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
1.3642 + "the following code assumes that first element of an int array is aligned to 8 bytes");
1.3643 + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
1.3644 + "the following code assumes that first element of a byte array is aligned to 8 bytes");
1.3645 + __ align(CodeEntryAlignment);
1.3646 + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
1.3647 + Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
1.3648 + Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
1.3649 + Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
1.3650 + Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
1.3651 + address start = __ pc();
1.3652 + Register from = I0; // source byte array
1.3653 + Register to = I1; // destination byte array
1.3654 + Register key = I2; // expanded key array
1.3655 + Register rvec = I3; // init vector
1.3656 + const Register len_reg = I4; // cipher length
1.3657 + const Register keylen = I5; // reg for storing expanded key array length
1.3658 +
1.3659 + __ save_frame(0);
1.3660 + // save cipher len to return in the end
1.3661 + __ mov(len_reg, L0);
1.3662 +
1.3663 + // read expanded key length
1.3664 + __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
1.3665 +
1.3666 + // load initial vector, 8-byte alignment is guaranteed
1.3667 + __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
1.3668 + __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
1.3669 + // load key, 8-byte alignment is guaranteed
1.3670 + __ ldx(key,0,G1);
1.3671 + __ ldx(key,8,G5);
1.3672 +
1.3673 + // start loading expanded key, 8-byte alignment is guaranteed
1.3674 + for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) {
1.3675 + __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
1.3676 + }
1.3677 +
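// ---------------------------------------------------------------------
// Editor's sketch (not part of the patch): the effect of the alignaddr/
// faligndata pattern used by the L_load_misaligned_input_* paths below
// (and described in the "Method to address arbitrary alignment for load
// instructions" comment earlier), in portable C++. For a pointer p with
// n = p & 7, the stub loads the three aligned doublewords covering
// p..p+15 and extracts the 16 payload bytes; plain byte copies stand in
// for the D-register loads here (assumes <cstring>/<cstdint>):
//
// static void load16_unaligned(const unsigned char* p, unsigned char out[16]) {
//   const unsigned char* base = (const unsigned char*)((uintptr_t)p & ~(uintptr_t)7);
//   size_t n = (uintptr_t)p & 7;      // what alignaddr leaves in GSR.align
//   unsigned char buf[24];
//   memcpy(buf, base, sizeof(buf));   // the three ldf D loads
//   memcpy(out, buf + n, 16);         // the two faligndata extracts
// }
// ---------------------------------------------------------------------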
1.3678 + // 128-bit original key size 1.3679 + __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128); 1.3680 + 1.3681 + for ( int i = 40, j = 176; i <= 46; i += 2, j += 8 ) { 1.3682 + __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 1.3683 + } 1.3684 + 1.3685 + // 192-bit original key size 1.3686 + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192); 1.3687 + 1.3688 + for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) { 1.3689 + __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 1.3690 + } 1.3691 + 1.3692 + // 256-bit original key size 1.3693 + __ ba_short(L_cbcenc256); 1.3694 + 1.3695 + __ align(OptoLoopAlignment); 1.3696 + __ BIND(L_cbcenc128); 1.3697 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.3698 + __ andcc(from, 7, G0); 1.3699 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit); 1.3700 + __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 1.3701 + 1.3702 + // aligned case: load input into G3 and G4 1.3703 + __ ldx(from,0,G3); 1.3704 + __ ldx(from,8,G4); 1.3705 + __ ba_short(L_128bit_transform); 1.3706 + 1.3707 + __ BIND(L_load_misaligned_input_128bit); 1.3708 + // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 1.3709 + __ alignaddr(from, G0, from); 1.3710 + __ ldf(FloatRegisterImpl::D, from, 0, F48); 1.3711 + __ ldf(FloatRegisterImpl::D, from, 8, F50); 1.3712 + __ ldf(FloatRegisterImpl::D, from, 16, F52); 1.3713 + __ faligndata(F48, F50, F48); 1.3714 + __ faligndata(F50, F52, F50); 1.3715 + __ movdtox(F48, G3); 1.3716 + __ movdtox(F50, G4); 1.3717 + __ mov(L1, from); 1.3718 + 1.3719 + __ BIND(L_128bit_transform); 1.3720 + __ xor3(G1,G3,G3); 1.3721 + __ xor3(G5,G4,G4); 1.3722 + __ movxtod(G3,F56); 1.3723 + __ movxtod(G4,F58); 1.3724 + __ fxor(FloatRegisterImpl::D, F60, F56, F60); 1.3725 + __ fxor(FloatRegisterImpl::D, F62, F58, F62); 1.3726 + 1.3727 + // TEN_EROUNDS 1.3728 + for ( int i = 0; i <= 32; i += 8 ) { 1.3729 + __ aes_eround01(as_FloatRegister(i), F60, F62, F56); 1.3730 + __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); 1.3731 + if (i != 32 ) { 1.3732 + __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); 1.3733 + __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); 1.3734 + } else { 1.3735 + __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 1.3736 + __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 1.3737 + } 1.3738 + } 1.3739 + 1.3740 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.3741 + __ andcc(to, 7, L1); 1.3742 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit); 1.3743 + __ delayed()->edge8n(to, G0, L2); 1.3744 + 1.3745 + // aligned case: store output into the destination array 1.3746 + __ stf(FloatRegisterImpl::D, F60, to, 0); 1.3747 + __ stf(FloatRegisterImpl::D, F62, to, 8); 1.3748 + __ ba_short(L_check_loop_end_128bit); 1.3749 + 1.3750 + __ BIND(L_store_misaligned_output_128bit); 1.3751 + __ add(to, 8, L3); 1.3752 + __ mov(8, L4); 1.3753 + __ sub(L4, L1, L4); 1.3754 + __ alignaddr(L4, G0, L4); 1.3755 + // save cipher text before circular right shift 1.3756 + // as it needs to be stored as iv for next block (see code before next retl) 1.3757 + __ movdtox(F60, L6); 1.3758 + __ movdtox(F62, L7); 1.3759 + __ faligndata(F60, F60, F60); 1.3760 + __ faligndata(F62, F62, F62); 1.3761 + __ mov(to, L5); 1.3762 + __ 
and3(to, -8, to);
1.3763 + __ and3(L3, -8, L3);
1.3764 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
1.3765 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
1.3766 + __ add(to, 8, to);
1.3767 + __ add(L3, 8, L3);
1.3768 + __ orn(G0, L2, L2);
1.3769 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
1.3770 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
1.3771 + __ mov(L5, to);
1.3772 + __ movxtod(L6, F60);
1.3773 + __ movxtod(L7, F62);
1.3774 +
1.3775 + __ BIND(L_check_loop_end_128bit);
1.3776 + __ add(from, 16, from);
1.3777 + __ add(to, 16, to);
1.3778 + __ subcc(len_reg, 16, len_reg);
1.3779 + __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
1.3780 + __ delayed()->nop();
1.3781 + // re-init initial vector for next block, 8-byte alignment is guaranteed
1.3782 + __ stf(FloatRegisterImpl::D, F60, rvec, 0);
1.3783 + __ stf(FloatRegisterImpl::D, F62, rvec, 8);
1.3784 + __ mov(L0, I0);
1.3785 + __ ret();
1.3786 + __ delayed()->restore();
1.3787 +
1.3788 + __ align(OptoLoopAlignment);
1.3789 + __ BIND(L_cbcenc192);
1.3790 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
1.3791 + __ andcc(from, 7, G0);
1.3792 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
1.3793 + __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
1.3794 +
1.3795 + // aligned case: load input into G3 and G4
1.3796 + __ ldx(from,0,G3);
1.3797 + __ ldx(from,8,G4);
1.3798 + __ ba_short(L_192bit_transform);
1.3799 +
1.3800 + __ BIND(L_load_misaligned_input_192bit);
1.3801 + // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
1.3802 + __ alignaddr(from, G0, from);
1.3803 + __ ldf(FloatRegisterImpl::D, from, 0, F48);
1.3804 + __ ldf(FloatRegisterImpl::D, from, 8, F50);
1.3805 + __ ldf(FloatRegisterImpl::D, from, 16, F52);
1.3806 + __ faligndata(F48, F50, F48);
1.3807 + __ faligndata(F50, F52, F50);
1.3808 + __ movdtox(F48, G3);
1.3809 + __ movdtox(F50, G4);
1.3810 + __ mov(L1, from);
1.3811 +
1.3812 + __ BIND(L_192bit_transform);
1.3813 + __ xor3(G1,G3,G3);
1.3814 + __ xor3(G5,G4,G4);
1.3815 + __ movxtod(G3,F56);
1.3816 + __ movxtod(G4,F58);
1.3817 + __ fxor(FloatRegisterImpl::D, F60, F56, F60);
1.3818 + __ fxor(FloatRegisterImpl::D, F62, F58, F62);
1.3819 +
1.3820 + // TWELVE_EROUNDS
1.3821 + for ( int i = 0; i <= 40; i += 8 ) {
1.3822 + __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
1.3823 + __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
1.3824 + if (i != 40 ) {
1.3825 + __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
1.3826 + __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
1.3827 + } else {
1.3828 + __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
1.3829 + __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
1.3830 + }
1.3831 + }
1.3832 +
1.3833 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
1.3834 + __ andcc(to, 7, L1);
1.3835 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
1.3836 + __ delayed()->edge8n(to, G0, L2);
1.3837 +
1.3838 + // aligned case: store output into the destination array
1.3839 + __ stf(FloatRegisterImpl::D, F60, to, 0);
1.3840 + __ stf(FloatRegisterImpl::D, F62, to, 8);
1.3841 + __ ba_short(L_check_loop_end_192bit);
1.3842 +
1.3843 + __ BIND(L_store_misaligned_output_192bit);
1.3844 + __ add(to, 8, L3);
1.3845 + __ mov(8, L4);
1.3846 + __ sub(L4, L1, L4);
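// ---------------------------------------------------------------------
// Editor's note (not part of the patch): at this point L4 = 8 - n with
// n = to & 7. The alignaddr below loads 8 - n into GSR.align, which makes
// faligndata(X, X, X) rotate each doubleword right by n byte positions
// (big-endian byte order), so stpartialf with the edge8n mask in L2 can
// store the first 8 - n bytes at the unaligned 'to', and the complemented
// mask (orn) stores the remaining n bytes. A C++ sketch of the rotation,
// assuming <cstdint>:
//
// static uint64_t rotate_bytes_right(uint64_t v, unsigned n) {  // n = 0..7
//   return n == 0 ? v : (v >> (8 * n)) | (v << (64 - 8 * n));
// }
// ---------------------------------------------------------------------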
1.3847 + __ alignaddr(L4, G0, L4);
1.3848 + __ movdtox(F60, L6);
1.3849 + __ movdtox(F62, L7);
1.3850 + __ faligndata(F60, F60, F60);
1.3851 + __ faligndata(F62, F62, F62);
1.3852 + __ mov(to, L5);
1.3853 + __ and3(to, -8, to);
1.3854 + __ and3(L3, -8, L3);
1.3855 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
1.3856 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
1.3857 + __ add(to, 8, to);
1.3858 + __ add(L3, 8, L3);
1.3859 + __ orn(G0, L2, L2);
1.3860 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
1.3861 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
1.3862 + __ mov(L5, to);
1.3863 + __ movxtod(L6, F60);
1.3864 + __ movxtod(L7, F62);
1.3865 +
1.3866 + __ BIND(L_check_loop_end_192bit);
1.3867 + __ add(from, 16, from);
1.3868 + __ subcc(len_reg, 16, len_reg);
1.3869 + __ add(to, 16, to);
1.3870 + __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
1.3871 + __ delayed()->nop();
1.3872 + // re-init initial vector for next block, 8-byte alignment is guaranteed
1.3873 + __ stf(FloatRegisterImpl::D, F60, rvec, 0);
1.3874 + __ stf(FloatRegisterImpl::D, F62, rvec, 8);
1.3875 + __ mov(L0, I0);
1.3876 + __ ret();
1.3877 + __ delayed()->restore();
1.3878 +
1.3879 + __ align(OptoLoopAlignment);
1.3880 + __ BIND(L_cbcenc256);
1.3881 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
1.3882 + __ andcc(from, 7, G0);
1.3883 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
1.3884 + __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
1.3885 +
1.3886 + // aligned case: load input into G3 and G4
1.3887 + __ ldx(from,0,G3);
1.3888 + __ ldx(from,8,G4);
1.3889 + __ ba_short(L_256bit_transform);
1.3890 +
1.3891 + __ BIND(L_load_misaligned_input_256bit);
1.3892 + // cannot clobber F48, F50 and F52. F56, F58 can be used though
1.3893 + __ alignaddr(from, G0, from);
1.3894 + __ movdtox(F60, L2); // save F60 before overwriting
1.3895 + __ ldf(FloatRegisterImpl::D, from, 0, F56);
1.3896 + __ ldf(FloatRegisterImpl::D, from, 8, F58);
1.3897 + __ ldf(FloatRegisterImpl::D, from, 16, F60);
1.3898 + __ faligndata(F56, F58, F56);
1.3899 + __ faligndata(F58, F60, F58);
1.3900 + __ movdtox(F56, G3);
1.3901 + __ movdtox(F58, G4);
1.3902 + __ mov(L1, from);
1.3903 + __ movxtod(L2, F60);
1.3904 +
1.3905 + __ BIND(L_256bit_transform);
1.3906 + __ xor3(G1,G3,G3);
1.3907 + __ xor3(G5,G4,G4);
1.3908 + __ movxtod(G3,F56);
1.3909 + __ movxtod(G4,F58);
1.3910 + __ fxor(FloatRegisterImpl::D, F60, F56, F60);
1.3911 + __ fxor(FloatRegisterImpl::D, F62, F58, F62);
1.3912 +
1.3913 + // FOURTEEN_EROUNDS
1.3914 + for ( int i = 0; i <= 48; i += 8 ) {
1.3915 + __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
1.3916 + __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
1.3917 + if (i != 48 ) {
1.3918 + __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
1.3919 + __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
1.3920 + } else {
1.3921 + __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
1.3922 + __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
1.3923 + }
1.3924 + }
1.3925 +
1.3926 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
1.3927 + __ andcc(to, 7, L1);
1.3928 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
1.3929 + __ delayed()->edge8n(to, G0, L2);
1.3930 +
1.3931 + // aligned case: store output into the destination array
1.3932 + __ stf(FloatRegisterImpl::D, F60, to, 0);
1.3933 + __ stf(FloatRegisterImpl::D, F62, to, 8);
1.3934 + __ ba_short(L_check_loop_end_256bit);
1.3935 +
1.3936 + __ BIND(L_store_misaligned_output_256bit);
1.3937 + __ add(to, 8, L3);
1.3938 + __ mov(8, L4);
1.3939 + __ sub(L4, L1, L4);
1.3940 + __ alignaddr(L4, G0, L4);
1.3941 + __ movdtox(F60, L6);
1.3942 + __ movdtox(F62, L7);
1.3943 + __ faligndata(F60, F60, F60);
1.3944 + __ faligndata(F62, F62, F62);
1.3945 + __ mov(to, L5);
1.3946 + __ and3(to, -8, to);
1.3947 + __ and3(L3, -8, L3);
1.3948 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
1.3949 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
1.3950 + __ add(to, 8, to);
1.3951 + __ add(L3, 8, L3);
1.3952 + __ orn(G0, L2, L2);
1.3953 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
1.3954 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
1.3955 + __ mov(L5, to);
1.3956 + __ movxtod(L6, F60);
1.3957 + __ movxtod(L7, F62);
1.3958 +
1.3959 + __ BIND(L_check_loop_end_256bit);
1.3960 + __ add(from, 16, from);
1.3961 + __ subcc(len_reg, 16, len_reg);
1.3962 + __ add(to, 16, to);
1.3963 + __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
1.3964 + __ delayed()->nop();
1.3965 + // re-init initial vector for next block, 8-byte alignment is guaranteed
1.3966 + __ stf(FloatRegisterImpl::D, F60, rvec, 0);
1.3967 + __ stf(FloatRegisterImpl::D, F62, rvec, 8);
1.3968 + __ mov(L0, I0);
1.3969 + __ ret();
1.3970 + __ delayed()->restore();
1.3971 +
1.3972 + return start;
1.3973 + }
1.3974 +
1.3975 + address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
1.3976 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
1.3977 + "the following code assumes that first element of an int array is aligned to 8 bytes");
1.3978 + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
1.3979 + "the following code
assumes that first element of a byte array is aligned to 8 bytes"); 1.3980 + __ align(CodeEntryAlignment); 1.3981 + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 1.3982 + Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; 1.3983 + Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256; 1.3984 + Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128; 1.3985 + Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256; 1.3986 + Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128; 1.3987 + Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192; 1.3988 + Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256; 1.3989 + address start = __ pc(); 1.3990 + Register from = I0; // source byte array 1.3991 + Register to = I1; // destination byte array 1.3992 + Register key = I2; // expanded key array 1.3993 + Register rvec = I3; // init vector 1.3994 + const Register len_reg = I4; // cipher length 1.3995 + const Register original_key = I5; // original key array only required during decryption 1.3996 + const Register keylen = L6; // reg for storing expanded key array length 1.3997 + 1.3998 + __ save_frame(0); //args are read from I* registers since we save the frame in the beginning 1.3999 + // save cipher len to return in the end 1.4000 + __ mov(len_reg, L7); 1.4001 + 1.4002 + // load original key from SunJCE expanded decryption key 1.4003 + // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 1.4004 + for ( int i = 0; i <= 3; i++ ) { 1.4005 + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 1.4006 + } 1.4007 + 1.4008 + // load initial vector, 8-byte alignment is guaranteed 1.4009 + __ ldx(rvec,0,L0); 1.4010 + __ ldx(rvec,8,L1); 1.4011 + 1.4012 + // read expanded key array length 1.4013 + __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 1.4014 + 1.4015 + // 256-bit original key size 1.4016 + __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); 1.4017 + 1.4018 + // 192-bit original key size 1.4019 + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); 1.4020 + 1.4021 + // 128-bit original key size 1.4022 + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 1.4023 + for ( int i = 0; i <= 36; i += 4 ) { 1.4024 + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); 1.4025 + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); 1.4026 + } 1.4027 + 1.4028 + // load expanded key[last-1] and key[last] elements 1.4029 + __ movdtox(F40,L2); 1.4030 + __ movdtox(F42,L3); 1.4031 + 1.4032 + __ and3(len_reg, 16, L4); 1.4033 + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128); 1.4034 + __ nop(); 1.4035 + 1.4036 + __ ba_short(L_dec_first_block_start); 1.4037 + 1.4038 + __ BIND(L_expand192bit); 1.4039 + // load rest of the 192-bit key 1.4040 + __ ldf(FloatRegisterImpl::S, original_key, 16, F4); 1.4041 + __ ldf(FloatRegisterImpl::S, original_key, 20, F5); 1.4042 + 1.4043 + // perform key expansion since 
SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 1.4044 + for ( int i = 0; i <= 36; i += 6 ) { 1.4045 + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); 1.4046 + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); 1.4047 + __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); 1.4048 + } 1.4049 + __ aes_kexpand1(F42, F46, 7, F48); 1.4050 + __ aes_kexpand2(F44, F48, F50); 1.4051 + 1.4052 + // load expanded key[last-1] and key[last] elements 1.4053 + __ movdtox(F48,L2); 1.4054 + __ movdtox(F50,L3); 1.4055 + 1.4056 + __ and3(len_reg, 16, L4); 1.4057 + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192); 1.4058 + __ nop(); 1.4059 + 1.4060 + __ ba_short(L_dec_first_block_start); 1.4061 + 1.4062 + __ BIND(L_expand256bit); 1.4063 + // load rest of the 256-bit key 1.4064 + for ( int i = 4; i <= 7; i++ ) { 1.4065 + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 1.4066 + } 1.4067 + 1.4068 + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 1.4069 + for ( int i = 0; i <= 40; i += 8 ) { 1.4070 + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); 1.4071 + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); 1.4072 + __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); 1.4073 + __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); 1.4074 + } 1.4075 + __ aes_kexpand1(F48, F54, 6, F56); 1.4076 + __ aes_kexpand2(F50, F56, F58); 1.4077 + 1.4078 + // load expanded key[last-1] and key[last] elements 1.4079 + __ movdtox(F56,L2); 1.4080 + __ movdtox(F58,L3); 1.4081 + 1.4082 + __ and3(len_reg, 16, L4); 1.4083 + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256); 1.4084 + 1.4085 + __ BIND(L_dec_first_block_start); 1.4086 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.4087 + __ andcc(from, 7, G0); 1.4088 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block); 1.4089 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 1.4090 + 1.4091 + // aligned case: load input into L4 and L5 1.4092 + __ ldx(from,0,L4); 1.4093 + __ ldx(from,8,L5); 1.4094 + __ ba_short(L_transform_first_block); 1.4095 + 1.4096 + __ BIND(L_load_misaligned_input_first_block); 1.4097 + __ alignaddr(from, G0, from); 1.4098 + // F58, F60, F62 can be clobbered 1.4099 + __ ldf(FloatRegisterImpl::D, from, 0, F58); 1.4100 + __ ldf(FloatRegisterImpl::D, from, 8, F60); 1.4101 + __ ldf(FloatRegisterImpl::D, from, 16, F62); 1.4102 + __ faligndata(F58, F60, F58); 1.4103 + __ faligndata(F60, F62, F60); 1.4104 + __ movdtox(F58, L4); 1.4105 + __ movdtox(F60, L5); 1.4106 + __ mov(G1, from); 1.4107 + 1.4108 + __ BIND(L_transform_first_block); 1.4109 + __ xor3(L2,L4,G1); 1.4110 + __ movxtod(G1,F60); 1.4111 + __ xor3(L3,L5,G1); 1.4112 + __ movxtod(G1,F62); 1.4113 + 1.4114 + // 128-bit original key size 1.4115 + __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128); 1.4116 + 1.4117 + // 192-bit original key size 1.4118 + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192); 1.4119 + 1.4120 + __ aes_dround23(F54, F60, F62, F58); 1.4121 + __ aes_dround01(F52, F60, F62, F56); 1.4122 + __ aes_dround23(F50, 
F56, F58, F62); 1.4123 + __ aes_dround01(F48, F56, F58, F60); 1.4124 + 1.4125 + __ BIND(L_dec_first_block192); 1.4126 + __ aes_dround23(F46, F60, F62, F58); 1.4127 + __ aes_dround01(F44, F60, F62, F56); 1.4128 + __ aes_dround23(F42, F56, F58, F62); 1.4129 + __ aes_dround01(F40, F56, F58, F60); 1.4130 + 1.4131 + __ BIND(L_dec_first_block128); 1.4132 + for ( int i = 38; i >= 6; i -= 8 ) { 1.4133 + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 1.4134 + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 1.4135 + if ( i != 6) { 1.4136 + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 1.4137 + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 1.4138 + } else { 1.4139 + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 1.4140 + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 1.4141 + } 1.4142 + } 1.4143 + 1.4144 + __ movxtod(L0,F56); 1.4145 + __ movxtod(L1,F58); 1.4146 + __ mov(L4,L0); 1.4147 + __ mov(L5,L1); 1.4148 + __ fxor(FloatRegisterImpl::D, F56, F60, F60); 1.4149 + __ fxor(FloatRegisterImpl::D, F58, F62, F62); 1.4150 + 1.4151 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.4152 + __ andcc(to, 7, G1); 1.4153 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block); 1.4154 + __ delayed()->edge8n(to, G0, G2); 1.4155 + 1.4156 + // aligned case: store output into the destination array 1.4157 + __ stf(FloatRegisterImpl::D, F60, to, 0); 1.4158 + __ stf(FloatRegisterImpl::D, F62, to, 8); 1.4159 + __ ba_short(L_check_decrypt_end); 1.4160 + 1.4161 + __ BIND(L_store_misaligned_output_first_block); 1.4162 + __ add(to, 8, G3); 1.4163 + __ mov(8, G4); 1.4164 + __ sub(G4, G1, G4); 1.4165 + __ alignaddr(G4, G0, G4); 1.4166 + __ faligndata(F60, F60, F60); 1.4167 + __ faligndata(F62, F62, F62); 1.4168 + __ mov(to, G1); 1.4169 + __ and3(to, -8, to); 1.4170 + __ and3(G3, -8, G3); 1.4171 + __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 1.4172 + __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 1.4173 + __ add(to, 8, to); 1.4174 + __ add(G3, 8, G3); 1.4175 + __ orn(G0, G2, G2); 1.4176 + __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 1.4177 + __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 1.4178 + __ mov(G1, to); 1.4179 + 1.4180 + __ BIND(L_check_decrypt_end); 1.4181 + __ add(from, 16, from); 1.4182 + __ add(to, 16, to); 1.4183 + __ subcc(len_reg, 16, len_reg); 1.4184 + __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end); 1.4185 + __ delayed()->nop(); 1.4186 + 1.4187 + // 256-bit original key size 1.4188 + __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256); 1.4189 + 1.4190 + // 192-bit original key size 1.4191 + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192); 1.4192 + 1.4193 + __ align(OptoLoopAlignment); 1.4194 + __ BIND(L_dec_next2_blocks128); 1.4195 + __ nop(); 1.4196 + 1.4197 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.4198 + __ andcc(from, 7, G0); 1.4199 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128); 1.4200 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 1.4201 + 1.4202 + // aligned case: load input into G4, G5, L4 and L5 1.4203 + __ ldx(from,0,G4); 1.4204 + __ ldx(from,8,G5); 1.4205 + __ ldx(from,16,L4); 1.4206 + __ ldx(from,24,L5); 1.4207 + __ ba_short(L_transform_next2_blocks128); 1.4208 + 1.4209 + __ 
BIND(L_load_misaligned_next2_blocks128); 1.4210 + __ alignaddr(from, G0, from); 1.4211 + // F40, F42, F58, F60, F62 can be clobbered 1.4212 + __ ldf(FloatRegisterImpl::D, from, 0, F40); 1.4213 + __ ldf(FloatRegisterImpl::D, from, 8, F42); 1.4214 + __ ldf(FloatRegisterImpl::D, from, 16, F60); 1.4215 + __ ldf(FloatRegisterImpl::D, from, 24, F62); 1.4216 + __ ldf(FloatRegisterImpl::D, from, 32, F58); 1.4217 + __ faligndata(F40, F42, F40); 1.4218 + __ faligndata(F42, F60, F42); 1.4219 + __ faligndata(F60, F62, F60); 1.4220 + __ faligndata(F62, F58, F62); 1.4221 + __ movdtox(F40, G4); 1.4222 + __ movdtox(F42, G5); 1.4223 + __ movdtox(F60, L4); 1.4224 + __ movdtox(F62, L5); 1.4225 + __ mov(G1, from); 1.4226 + 1.4227 + __ BIND(L_transform_next2_blocks128); 1.4228 + // F40:F42 used for first 16-bytes 1.4229 + __ xor3(L2,G4,G1); 1.4230 + __ movxtod(G1,F40); 1.4231 + __ xor3(L3,G5,G1); 1.4232 + __ movxtod(G1,F42); 1.4233 + 1.4234 + // F60:F62 used for next 16-bytes 1.4235 + __ xor3(L2,L4,G1); 1.4236 + __ movxtod(G1,F60); 1.4237 + __ xor3(L3,L5,G1); 1.4238 + __ movxtod(G1,F62); 1.4239 + 1.4240 + for ( int i = 38; i >= 6; i -= 8 ) { 1.4241 + __ aes_dround23(as_FloatRegister(i), F40, F42, F44); 1.4242 + __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46); 1.4243 + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 1.4244 + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 1.4245 + if (i != 6 ) { 1.4246 + __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42); 1.4247 + __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40); 1.4248 + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 1.4249 + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 1.4250 + } else { 1.4251 + __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42); 1.4252 + __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40); 1.4253 + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 1.4254 + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 1.4255 + } 1.4256 + } 1.4257 + 1.4258 + __ movxtod(L0,F46); 1.4259 + __ movxtod(L1,F44); 1.4260 + __ fxor(FloatRegisterImpl::D, F46, F40, F40); 1.4261 + __ fxor(FloatRegisterImpl::D, F44, F42, F42); 1.4262 + 1.4263 + __ movxtod(G4,F56); 1.4264 + __ movxtod(G5,F58); 1.4265 + __ mov(L4,L0); 1.4266 + __ mov(L5,L1); 1.4267 + __ fxor(FloatRegisterImpl::D, F56, F60, F60); 1.4268 + __ fxor(FloatRegisterImpl::D, F58, F62, F62); 1.4269 + 1.4270 + // For mis-aligned store of 32 bytes of result we can do: 1.4271 + // Circular right-shift all 4 FP registers so that 'head' and 'tail' 1.4272 + // parts that need to be stored starting at mis-aligned address are in a FP reg 1.4273 + // the other 3 FP regs can thus be stored using regular store 1.4274 + // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts 1.4275 + 1.4276 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 1.4277 + __ andcc(to, 7, G1); 1.4278 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128); 1.4279 + __ delayed()->edge8n(to, G0, G2); 1.4280 + 1.4281 + // aligned case: store output into the destination array 1.4282 + __ stf(FloatRegisterImpl::D, F40, to, 0); 1.4283 + __ stf(FloatRegisterImpl::D, F42, to, 8); 1.4284 + __ stf(FloatRegisterImpl::D, F60, to, 16); 1.4285 + __ stf(FloatRegisterImpl::D, F62, to, 24); 1.4286 + __ ba_short(L_check_decrypt_loop_end128); 1.4287 + 1.4288 + __ BIND(L_store_misaligned_output_next2_blocks128); 1.4289 + __ mov(8, G4); 1.4290 + __ sub(G4, G1, G4); 1.4291 + __ 
alignaddr(G4, G0, G4); 1.4292 + __ faligndata(F40, F42, F56); // F56 can be clobbered 1.4293 + __ faligndata(F42, F60, F42); 1.4294 + __ faligndata(F60, F62, F60); 1.4295 + __ faligndata(F62, F40, F40); 1.4296 + __ mov(to, G1); 1.4297 + __ and3(to, -8, to); 1.4298 + __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 1.4299 + __ stf(FloatRegisterImpl::D, F56, to, 8); 1.4300 + __ stf(FloatRegisterImpl::D, F42, to, 16); 1.4301 + __ stf(FloatRegisterImpl::D, F60, to, 24); 1.4302 + __ add(to, 32, to); 1.4303 + __ orn(G0, G2, G2); 1.4304 + __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 1.4305 + __ mov(G1, to); 1.4306 + 1.4307 + __ BIND(L_check_decrypt_loop_end128); 1.4308 + __ add(from, 32, from); 1.4309 + __ add(to, 32, to); 1.4310 + __ subcc(len_reg, 32, len_reg); 1.4311 + __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); 1.4312 + __ delayed()->nop(); 1.4313 + __ ba_short(L_cbcdec_end); 1.4314 + 1.4315 + __ align(OptoLoopAlignment); 1.4316 + __ BIND(L_dec_next2_blocks192); 1.4317 + __ nop(); 1.4318 + 1.4319 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 1.4320 + __ andcc(from, 7, G0); 1.4321 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192); 1.4322 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 1.4323 + 1.4324 + // aligned case: load input into G4, G5, L4 and L5 1.4325 + __ ldx(from,0,G4); 1.4326 + __ ldx(from,8,G5); 1.4327 + __ ldx(from,16,L4); 1.4328 + __ ldx(from,24,L5); 1.4329 + __ ba_short(L_transform_next2_blocks192); 1.4330 + 1.4331 + __ BIND(L_load_misaligned_next2_blocks192); 1.4332 + __ alignaddr(from, G0, from); 1.4333 + // F48, F50, F52, F60, F62 can be clobbered 1.4334 + __ ldf(FloatRegisterImpl::D, from, 0, F48); 1.4335 + __ ldf(FloatRegisterImpl::D, from, 8, F50); 1.4336 + __ ldf(FloatRegisterImpl::D, from, 16, F60); 1.4337 + __ ldf(FloatRegisterImpl::D, from, 24, F62); 1.4338 + __ ldf(FloatRegisterImpl::D, from, 32, F52); 1.4339 + __ faligndata(F48, F50, F48); 1.4340 + __ faligndata(F50, F60, F50); 1.4341 + __ faligndata(F60, F62, F60); 1.4342 + __ faligndata(F62, F52, F62); 1.4343 + __ movdtox(F48, G4); 1.4344 + __ movdtox(F50, G5); 1.4345 + __ movdtox(F60, L4); 1.4346 + __ movdtox(F62, L5); 1.4347 + __ mov(G1, from); 1.4348 + 1.4349 + __ BIND(L_transform_next2_blocks192); 1.4350 + // F48:F50 used for first 16-bytes 1.4351 + __ xor3(L2,G4,G1); 1.4352 + __ movxtod(G1,F48); 1.4353 + __ xor3(L3,G5,G1); 1.4354 + __ movxtod(G1,F50); 1.4355 + 1.4356 + // F60:F62 used for next 16-bytes 1.4357 + __ xor3(L2,L4,G1); 1.4358 + __ movxtod(G1,F60); 1.4359 + __ xor3(L3,L5,G1); 1.4360 + __ movxtod(G1,F62); 1.4361 + 1.4362 + for ( int i = 46; i >= 6; i -= 8 ) { 1.4363 + __ aes_dround23(as_FloatRegister(i), F48, F50, F52); 1.4364 + __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54); 1.4365 + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 1.4366 + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 1.4367 + if (i != 6 ) { 1.4368 + __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50); 1.4369 + __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48); 1.4370 + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 1.4371 + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 1.4372 + } else { 1.4373 + __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50); 1.4374 + __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48); 1.4375 + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 1.4376 + __ 
    __ align(OptoLoopAlignment);
    __ BIND(L_dec_next2_blocks256);
    __ nop();

    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
    __ andcc(from, 7, G0);
    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr

    // aligned case: load input into G4, G5, L4 and L5
    __ ldx(from, 0, G4);
    __ ldx(from, 8, G5);
    __ ldx(from, 16, L4);
    __ ldx(from, 24, L5);
    __ ba_short(L_transform_next2_blocks256);

    __ BIND(L_load_misaligned_next2_blocks256);
    __ alignaddr(from, G0, from);
    // F0, F2, F4, F60, F62 can be clobbered
    __ ldf(FloatRegisterImpl::D, from, 0, F0);
    __ ldf(FloatRegisterImpl::D, from, 8, F2);
    __ ldf(FloatRegisterImpl::D, from, 16, F60);
    __ ldf(FloatRegisterImpl::D, from, 24, F62);
    __ ldf(FloatRegisterImpl::D, from, 32, F4);
    __ faligndata(F0, F2, F0);
    __ faligndata(F2, F60, F2);
    __ faligndata(F60, F62, F60);
    __ faligndata(F62, F4, F62);
    __ movdtox(F0, G4);
    __ movdtox(F2, G5);
    __ movdtox(F60, L4);
    __ movdtox(F62, L5);
    __ mov(G1, from);

    __ BIND(L_transform_next2_blocks256);
    // F0:F2 used for first 16-bytes
    __ xor3(L2, G4, G1);
    __ movxtod(G1, F0);
    __ xor3(L3, G5, G1);
    __ movxtod(G1, F2);

    // F60:F62 used for next 16-bytes
    __ xor3(L2, L4, G1);
    __ movxtod(G1, F60);
    __ xor3(L3, L5, G1);
    __ movxtod(G1, F62);

    __ aes_dround23(F54, F0, F2, F4);
    __ aes_dround01(F52, F0, F2, F6);
    __ aes_dround23(F54, F60, F62, F58);
    __ aes_dround01(F52, F60, F62, F56);
    __ aes_dround23(F50, F6, F4, F2);
    __ aes_dround01(F48, F6, F4, F0);
    __ aes_dround23(F50, F56, F58, F62);
    __ aes_dround01(F48, F56, F58, F60);
    // save F48:F54 in temp registers
    __ movdtox(F54, G2);
    __ movdtox(F52, G3);
    __ movdtox(F50, G6);
    __ movdtox(F48, G1);
    for (int i = 46; i >= 14; i -= 8) {
      __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
      __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
      __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
      __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
      __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
      __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
      __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
      __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
    }
    // init F48:F54 with F0:F6 values (original key)
    __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
    __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
    __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
    __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
    __ aes_dround23(F54, F0, F2, F4);
    __ aes_dround01(F52, F0, F2, F6);
    __ aes_dround23(F54, F60, F62, F58);
    __ aes_dround01(F52, F60, F62, F56);
    __ aes_dround23_l(F50, F6, F4, F2);
    __ aes_dround01_l(F48, F6, F4, F0);
    __ aes_dround23_l(F50, F56, F58, F62);
    __ aes_dround01_l(F48, F56, F58, F60);
    // re-init F48:F54 with their original values
    __ movxtod(G2, F54);
    __ movxtod(G3, F52);
    __ movxtod(G6, F50);
    __ movxtod(G1, F48);

    __ movxtod(L0, F6);
    __ movxtod(L1, F4);
    __ fxor(FloatRegisterImpl::D, F6, F0, F0);
    __ fxor(FloatRegisterImpl::D, F4, F2, F2);

    __ movxtod(G4, F56);
    __ movxtod(G5, F58);
    __ mov(L4, L0);
    __ mov(L5, L1);
    __ fxor(FloatRegisterImpl::D, F56, F60, F60);
    __ fxor(FloatRegisterImpl::D, F58, F62, F62);
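
    // Register-pressure note for the 256-bit path above: a 256-bit expanded key
    // is 15 x 16 = 240 bytes, i.e. 30 doublewords, which together with the data
    // and scratch registers exceeds the 32 double-precision FP registers. The
    // code therefore parks F48:F54 in integer registers (movdtox), reloads the
    // first 32 key bytes from 'original_key' for the two final rounds, and then
    // restores F48:F54 (movxtod). Sketched as a spill/fill pair:
    //
    //   movdtox(F54, F52, F50, F48  ->  G2, G3, G6, G1)   // spill: FP -> integer
    //   ldf(original_key + 0..24    ->  F48..F54)         // final round keys
    //   ... two final dround pairs ...
    //   movxtod(G2, G3, G6, G1      ->  F54..F48)         // fill: integer -> FP
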
    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
    __ andcc(to, 7, G1);
    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
    __ delayed()->edge8n(to, G0, G2);

    // aligned case: store output into the destination array
    __ stf(FloatRegisterImpl::D, F0, to, 0);
    __ stf(FloatRegisterImpl::D, F2, to, 8);
    __ stf(FloatRegisterImpl::D, F60, to, 16);
    __ stf(FloatRegisterImpl::D, F62, to, 24);
    __ ba_short(L_check_decrypt_loop_end256);

    __ BIND(L_store_misaligned_output_next2_blocks256);
    __ mov(8, G4);
    __ sub(G4, G1, G4);
    __ alignaddr(G4, G0, G4);
    __ faligndata(F0, F2, F56); // F56 can be clobbered
    __ faligndata(F2, F60, F2);
    __ faligndata(F60, F62, F60);
    __ faligndata(F62, F0, F0);
    __ mov(to, G1);
    __ and3(to, -8, to);
    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
    __ stf(FloatRegisterImpl::D, F56, to, 8);
    __ stf(FloatRegisterImpl::D, F2, to, 16);
    __ stf(FloatRegisterImpl::D, F60, to, 24);
    __ add(to, 32, to);
    __ orn(G0, G2, G2);
    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
    __ mov(G1, to);

    __ BIND(L_check_decrypt_loop_end256);
    __ add(from, 32, from);
    __ add(to, 32, to);
    __ subcc(len_reg, 32, len_reg);
    __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
    __ delayed()->nop();

    __ BIND(L_cbcdec_end);
    // re-init initial vector for next block; 8-byte alignment is guaranteed
    __ stx(L0, rvec, 0);
    __ stx(L1, rvec, 8);
    __ mov(L7, I0);
    __ ret();
    __ delayed()->restore();

    return start;
  }
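
  // The misaligned-output paths above rely on the VIS edge8n + stpartialf pair:
  // edge8n(to, G0, G2) computes a byte-select mask for the partial 8-byte group
  // at 'to', stpartialf stores only the mask-selected bytes of an FP register,
  // and orn() inverts the mask for the trailing partial group. A C model of one
  // masked 8-byte store (a sketch; store8_partial is hypothetical, and the bit
  // order shown is the big-endian PST8 convention as I read it):
  //
  //   static inline void store8_partial(uint8_t* aligned8, uint64_t v, uint8_t mask) {
  //     for (int i = 0; i < 8; i++)              // byte 0 = lowest address = MSB
  //       if (mask & (0x80 >> i)) aligned8[i] = (uint8_t)(v >> (56 - 8*i));
  //   }
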
  void generate_initial() {
    // Generates the initial stubs and initializes the entry points

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a much more
    // complicated generator structure. See also comment in stubRoutines.hpp.
    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry   = generate_catch_exception();

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific
    StubRoutines::Sparc::_test_stop_entry  = generate_test_stop();

    StubRoutines::Sparc::_stop_subroutine_entry                = generate_stop_subroutine();
    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();

#if !defined(COMPILER2) && !defined(_LP64)
    StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
    StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
    StubRoutines::_atomic_add_entry          = generate_atomic_add();
    StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
    StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
    StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
#endif // !COMPILER2 && !_LP64

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
  }
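
  // The atomic entry points registered above exist because 32-bit builds
  // without COMPILER2 have no inline atomics; the stub implements the add with
  // a compare-and-swap loop. Its effect, modeled in portable C (a sketch using
  // a GCC builtin, not the stub itself; the return-of-new-value is assumed to
  // match Atomic::add semantics):
  //
  //   static int atomic_add_model(int add_value, volatile int* dest) {
  //     int old_value, new_value;
  //     do {
  //       old_value = *dest;
  //       new_value = old_value + add_value;
  //     } while (!__sync_bool_compare_and_swap(dest, old_value, new_value));
  //     return new_value;
  //   }
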

  void generate_all() {
    // Generates all stubs and initializes the entry points

    // Generate partial_subtype_check first here since its code depends on
    // UseZeroBaseCompressedOops which is defined after heap initialization.
    StubRoutines::Sparc::_partial_subtype_check = generate_partial_subtype_check();
    // These entry points require SharedInfo::stack0 to be set up in non-core builds
    StubRoutines::_throw_AbstractMethodError_entry          = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry = generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry = generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));

    StubRoutines::_handler_for_unsafe_access_entry =
      generate_handler_for_unsafe_access();

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // Don't initialize the platform math functions since sparc
    // doesn't have intrinsics for these operations.

    // SafeFetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);

    // generate AES intrinsics code
    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock               = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock               = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
    }
  }
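
  // Usage note for the SafeFetch stubs generated above: they let VM code read
  // from a possibly-unmapped address without crashing; a faulting load resumes
  // at the recorded continuation pc and yields the caller-supplied error value.
  // Typical call shape (via the inline wrappers in stubRoutines.hpp):
  //
  //   int v      = SafeFetch32(addr, 0xBAD);      // 0xBAD if the load faults
  //   intptr_t p = SafeFetchN(ptr_addr, -1);      // -1 if the load faults
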
 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    // replace the standard masm with a special one:
    _masm = new MacroAssembler(code);

    _stub_count = !all ? 0x100 : 0x200;
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }

    // make sure this stub is available for all local calls
    if (_atomic_add_stub.is_unbound()) {
      // generate a second time, if necessary
      (void) generate_atomic_add();
    }
  }

 private:
  int _stub_count;
  void stub_prolog(StubCodeDesc* cdesc) {
#ifdef ASSERT
    // put extra information in the stub code, to make it more readable
#ifdef _LP64
    // Write the high part of the address
    // [RGV] Check if there is a dependency on the size of this prolog
    __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
#endif
    __ emit_data((intptr_t)cdesc, relocInfo::none);
    __ emit_data(++_stub_count, relocInfo::none);
#endif
    align(true);
  }

  void align(bool at_header = false) {
    // %%%%% move this constant somewhere else
    // UltraSPARC cache line size is 8 instructions:
    const unsigned int icache_line_size = 32;
    const unsigned int icache_half_line_size = 16;

    if (at_header) {
      while ((intptr_t)(__ pc()) % icache_line_size != 0) {
        __ emit_data(0, relocInfo::none);
      }
    } else {
      while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
        __ nop();
      }
    }
  }

}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
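
// Driver note (a sketch of the call pattern, as I read the shared code in
// runtime/stubRoutines.cpp): StubGenerator_generate is invoked twice - once
// early with all == false, which runs generate_initial(), and once after
// universe initialization with all == true, which runs generate_all():
//
//   StubRoutines::initialize1():  StubGenerator_generate(&buffer, false);
//   StubRoutines::initialize2():  StubGenerator_generate(&buffer, true);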