Fri, 02 Sep 2011 12:13:33 -0700
7039731: arraycopy could use prefetch on SPARC
Summary: Use BIS and prefetch in arraycopy stubs for Sparc (BIS for T4 only).
Reviewed-by: never, iveresov
1 /*
2 * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
25 #include "precompiled.hpp"
26 #include "asm/assembler.hpp"
27 #include "assembler_sparc.inline.hpp"
28 #include "interpreter/interpreter.hpp"
29 #include "nativeInst_sparc.hpp"
30 #include "oops/instanceOop.hpp"
31 #include "oops/methodOop.hpp"
32 #include "oops/objArrayKlass.hpp"
33 #include "oops/oop.inline.hpp"
34 #include "prims/methodHandles.hpp"
35 #include "runtime/frame.inline.hpp"
36 #include "runtime/handles.inline.hpp"
37 #include "runtime/sharedRuntime.hpp"
38 #include "runtime/stubCodeGenerator.hpp"
39 #include "runtime/stubRoutines.hpp"
40 #include "utilities/top.hpp"
41 #ifdef TARGET_OS_FAMILY_linux
42 # include "thread_linux.inline.hpp"
43 #endif
44 #ifdef TARGET_OS_FAMILY_solaris
45 # include "thread_solaris.inline.hpp"
46 #endif
47 #ifdef COMPILER2
48 #include "opto/runtime.hpp"
49 #endif
51 // Declaration and definition of StubGenerator (no .hpp file).
52 // For a more detailed description of the stub routine structure
53 // see the comment in stubRoutines.hpp.
55 #define __ _masm->
57 #ifdef PRODUCT
58 #define BLOCK_COMMENT(str) /* nothing */
59 #else
60 #define BLOCK_COMMENT(str) __ block_comment(str)
61 #endif
63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
65 // Note: The register L7 is used as L7_thread_cache, and may not be used
66 // any other way within this module.
69 static const Register& Lstub_temp = L2;
71 // -------------------------------------------------------------------------------------------------------------------------
72 // Stub Code definitions
74 static address handle_unsafe_access() {
75 JavaThread* thread = JavaThread::current();
76 address pc = thread->saved_exception_pc();
77 address npc = thread->saved_exception_npc();
78 // pc is the instruction which we must emulate
79 // doing a no-op is fine: return garbage from the load
81 // request an async exception
82 thread->set_pending_unsafe_access_error();
84 // return address of next instruction to execute
85 return npc;
86 }
88 class StubGenerator: public StubCodeGenerator {
89 private:
91 #ifdef PRODUCT
92 #define inc_counter_np(a,b,c) (0)
93 #else
94 #define inc_counter_np(counter, t1, t2) \
95 BLOCK_COMMENT("inc_counter " #counter); \
96 __ inc_counter(&counter, t1, t2);
97 #endif
99 //----------------------------------------------------------------------------------------------------
100 // Call stubs are used to call Java from C
102 address generate_call_stub(address& return_pc) {
103 StubCodeMark mark(this, "StubRoutines", "call_stub");
104 address start = __ pc();
106 // Incoming arguments:
107 //
108 // o0 : call wrapper address
109 // o1 : result (address)
110 // o2 : result type
111 // o3 : method
112 // o4 : (interpreter) entry point
113 // o5 : parameters (address)
114 // [sp + 0x5c]: parameter size (in words)
115 // [sp + 0x60]: thread
116 //
117 // +---------------+ <--- sp + 0
118 // | |
119 // . reg save area .
120 // | |
121 // +---------------+ <--- sp + 0x40
122 // | |
123 // . extra 7 slots .
124 // | |
125 // +---------------+ <--- sp + 0x5c
126 // | param. size |
127 // +---------------+ <--- sp + 0x60
128 // | thread |
129 // +---------------+
130 // | |
132 // note: if the link argument position changes, adjust
133 // the code in frame::entry_frame_call_wrapper()
135 const Argument link = Argument(0, false); // used only for GC
136 const Argument result = Argument(1, false);
137 const Argument result_type = Argument(2, false);
138 const Argument method = Argument(3, false);
139 const Argument entry_point = Argument(4, false);
140 const Argument parameters = Argument(5, false);
141 const Argument parameter_size = Argument(6, false);
142 const Argument thread = Argument(7, false);
144 // setup thread register
145 __ ld_ptr(thread.as_address(), G2_thread);
146 __ reinit_heapbase();
148 #ifdef ASSERT
149 // make sure we have no pending exceptions
150 { const Register t = G3_scratch;
151 Label L;
152 __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
153 __ br_null_short(t, Assembler::pt, L);
154 __ stop("StubRoutines::call_stub: entered with pending exception");
155 __ bind(L);
156 }
157 #endif
159 // create activation frame & allocate space for parameters
160 { const Register t = G3_scratch;
161 __ ld_ptr(parameter_size.as_address(), t); // get parameter size (in words)
162 __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words)
163 __ round_to(t, WordsPerLong); // make sure it is multiple of 2 (in words)
164 __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes
165 __ neg(t); // negate so it can be used with save
166 __ save(SP, t, SP); // setup new frame
167 }
169 // +---------------+ <--- sp + 0
170 // | |
171 // . reg save area .
172 // | |
173 // +---------------+ <--- sp + 0x40
174 // | |
175 // . extra 7 slots .
176 // | |
177 // +---------------+ <--- sp + 0x5c
178 // | empty slot | (only if parameter size is even)
179 // +---------------+
180 // | |
181 // . parameters .
182 // | |
183 // +---------------+ <--- fp + 0
184 // | |
185 // . reg save area .
186 // | |
187 // +---------------+ <--- fp + 0x40
188 // | |
189 // . extra 7 slots .
190 // | |
191 // +---------------+ <--- fp + 0x5c
192 // | param. size |
193 // +---------------+ <--- fp + 0x60
194 // | thread |
195 // +---------------+
196 // | |
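// Editor's sketch (illustrative only, not part of this changeset): the frame
// setup above sizes the new register window from the incoming parameter count.
// Roughly, in C++ terms (the helper name is hypothetical; the constants are the
// ones used by the generated code):
//
//   int call_stub_frame_bytes(int param_words) {
//     int words = param_words + frame::memory_parameter_word_sp_offset; // params + reg save area + extra slots
//     words = round_to(words, WordsPerLong);                            // keep the frame 2-word aligned
//     return words << Interpreter::logStackElementSize;                 // bytes; 'save' is given the negated value
//   }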
198 // pass parameters if any
199 BLOCK_COMMENT("pass parameters if any");
200 { const Register src = parameters.as_in().as_register();
201 const Register dst = Lentry_args;
202 const Register tmp = G3_scratch;
203 const Register cnt = G4_scratch;
205 // test if any parameters & setup of Lentry_args
206 Label exit;
207 __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter
208 __ add( FP, STACK_BIAS, dst );
209 __ cmp_zero_and_br(Assembler::zero, cnt, exit);
210 __ delayed()->sub(dst, BytesPerWord, dst); // setup Lentry_args
212 // copy parameters if any
213 Label loop;
214 __ BIND(loop);
215 // Store parameter value
216 __ ld_ptr(src, 0, tmp);
217 __ add(src, BytesPerWord, src);
218 __ st_ptr(tmp, dst, 0);
219 __ deccc(cnt);
220 __ br(Assembler::greater, false, Assembler::pt, loop);
221 __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
223 // done
224 __ BIND(exit);
225 }
227 // setup parameters, method & call Java function
228 #ifdef ASSERT
229 // layout_activation_impl checks its notion of saved SP against
230 // this register, so if this changes update it as well.
231 const Register saved_SP = Lscratch;
232 __ mov(SP, saved_SP); // keep track of SP before call
233 #endif
235 // setup parameters
236 const Register t = G3_scratch;
237 __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
238 __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes
239 __ sub(FP, t, Gargs); // setup parameter pointer
240 #ifdef _LP64
241 __ add( Gargs, STACK_BIAS, Gargs ); // Account for LP64 stack bias
242 #endif
243 __ mov(SP, O5_savedSP);
246 // do the call
247 //
248 // the following registers must be set up:
249 //
250 // G2_thread
251 // G5_method
252 // Gargs
253 BLOCK_COMMENT("call Java function");
254 __ jmpl(entry_point.as_in().as_register(), G0, O7);
255 __ delayed()->mov(method.as_in().as_register(), G5_method); // setup method
257 BLOCK_COMMENT("call_stub_return_address:");
258 return_pc = __ pc();
260 // The callee, if it wasn't interpreted, can return with SP changed so
261 // we can no longer assert anything about changes to SP.
263 // store result depending on type
264 // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
265 // is treated as T_INT)
266 { const Register addr = result .as_in().as_register();
267 const Register type = result_type.as_in().as_register();
268 Label is_long, is_float, is_double, is_object, exit;
269 __ cmp(type, T_OBJECT); __ br(Assembler::equal, false, Assembler::pn, is_object);
270 __ delayed()->cmp(type, T_FLOAT); __ br(Assembler::equal, false, Assembler::pn, is_float);
271 __ delayed()->cmp(type, T_DOUBLE); __ br(Assembler::equal, false, Assembler::pn, is_double);
272 __ delayed()->cmp(type, T_LONG); __ br(Assembler::equal, false, Assembler::pn, is_long);
273 __ delayed()->nop();
275 // store int result
276 __ st(O0, addr, G0);
278 __ BIND(exit);
279 __ ret();
280 __ delayed()->restore();
282 __ BIND(is_object);
283 __ ba(exit);
284 __ delayed()->st_ptr(O0, addr, G0);
286 __ BIND(is_float);
287 __ ba(exit);
288 __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
290 __ BIND(is_double);
291 __ ba(exit);
292 __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
294 __ BIND(is_long);
295 #ifdef _LP64
296 __ ba(exit);
297 __ delayed()->st_long(O0, addr, G0); // store entire long
298 #else
299 #if defined(COMPILER2)
300 // All return values are where we want them, except for Longs. C2 returns
301 // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
302 // Since the interpreter will return longs in G1 and O0/O1 in the 32bit
303 // build we simply always use G1.
304 // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
305 // do this here. Unfortunately if we did a rethrow we'd see a MachEpilog node
306 // first which would move g1 -> O0/O1 and destroy the exception we were throwing.
308 __ ba(exit);
309 __ delayed()->stx(G1, addr, G0); // store entire long
310 #else
311 __ st(O1, addr, BytesPerInt);
312 __ ba(exit);
313 __ delayed()->st(O0, addr, G0);
314 #endif /* COMPILER2 */
315 #endif /* _LP64 */
316 }
317 return start;
318 }
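// Editor's sketch (illustrative only, not part of this changeset): the tail of
// generate_call_stub() above is a dispatch on the result type. What the branches
// store, modeled in C++ (helper and parameter names are hypothetical):
//
//   void store_result(void* addr, BasicType type, jlong int_result, jdouble fp_result) {
//     switch (type) {
//       case T_OBJECT: *(oop*)    addr = (oop)(intptr_t)int_result; break; // from O0
//       case T_LONG:   *(jlong*)  addr = int_result;                break; // from O0 (G1 in the 32-bit C2 build)
//       case T_FLOAT:  *(jfloat*) addr = (jfloat)fp_result;         break; // from F0
//       case T_DOUBLE: *(jdouble*)addr = fp_result;                 break; // from F0
//       default:       *(jint*)   addr = (jint)int_result;          break; // everything else is treated as T_INT
//     }
//   }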
321 //----------------------------------------------------------------------------------------------------
322 // Return point for a Java call if there's an exception thrown in Java code.
323 // The exception is caught and transformed into a pending exception stored in
324 // JavaThread that can be tested from within the VM.
325 //
326 // Oexception: exception oop
328 address generate_catch_exception() {
329 StubCodeMark mark(this, "StubRoutines", "catch_exception");
331 address start = __ pc();
332 // verify that thread corresponds
333 __ verify_thread();
335 const Register& temp_reg = Gtemp;
336 Address pending_exception_addr (G2_thread, Thread::pending_exception_offset());
337 Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset ());
338 Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset ());
340 // set pending exception
341 __ verify_oop(Oexception);
342 __ st_ptr(Oexception, pending_exception_addr);
343 __ set((intptr_t)__FILE__, temp_reg);
344 __ st_ptr(temp_reg, exception_file_offset_addr);
345 __ set((intptr_t)__LINE__, temp_reg);
346 __ st(temp_reg, exception_line_offset_addr);
348 // complete return to VM
349 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
351 AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
352 __ jump_to(stub_ret, temp_reg);
353 __ delayed()->nop();
355 return start;
356 }
359 //----------------------------------------------------------------------------------------------------
360 // Continuation point for runtime calls returning with a pending exception
361 // The pending exception check happened in the runtime or native call stub
362 // The pending exception in Thread is converted into a Java-level exception
363 //
364 // Contract with Java-level exception handler: O0 = exception
365 // O1 = throwing pc
367 address generate_forward_exception() {
368 StubCodeMark mark(this, "StubRoutines", "forward_exception");
369 address start = __ pc();
371 // Upon entry, O7 has the return address returning into Java
372 // (interpreted or compiled) code; i.e. the return address
373 // becomes the throwing pc.
375 const Register& handler_reg = Gtemp;
377 Address exception_addr(G2_thread, Thread::pending_exception_offset());
379 #ifdef ASSERT
380 // make sure that this code is only executed if there is a pending exception
381 { Label L;
382 __ ld_ptr(exception_addr, Gtemp);
383 __ br_notnull_short(Gtemp, Assembler::pt, L);
384 __ stop("StubRoutines::forward exception: no pending exception (1)");
385 __ bind(L);
386 }
387 #endif
389 // compute exception handler into handler_reg
390 __ get_thread();
391 __ ld_ptr(exception_addr, Oexception);
392 __ verify_oop(Oexception);
393 __ save_frame(0); // compensates for compiler weakness
394 __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
395 BLOCK_COMMENT("call exception_handler_for_return_address");
396 __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
397 __ mov(O0, handler_reg);
398 __ restore(); // compensates for compiler weakness
400 __ ld_ptr(exception_addr, Oexception);
401 __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
403 #ifdef ASSERT
404 // make sure exception is set
405 { Label L;
406 __ br_notnull_short(Oexception, Assembler::pt, L);
407 __ stop("StubRoutines::forward exception: no pending exception (2)");
408 __ bind(L);
409 }
410 #endif
411 // jump to exception handler
412 __ jmp(handler_reg, 0);
413 // clear pending exception
414 __ delayed()->st_ptr(G0, exception_addr);
416 return start;
417 }
420 //------------------------------------------------------------------------------------------------------------------------
421 // Continuation point for throwing of implicit exceptions that are not handled in
422 // the current activation. Fabricates an exception oop and initiates normal
423 // exception dispatching in this frame. Only callee-saved registers are preserved
424 // (through the normal register window / RegisterMap handling).
425 // If the compiler needs all registers to be preserved between the fault
426 // point and the exception handler then it must assume responsibility for that in
427 // AbstractCompiler::continuation_for_implicit_null_exception or
428 // continuation_for_implicit_division_by_zero_exception. All other implicit
429 // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
430 // either at call sites or otherwise assume that stack unwinding will be initiated,
431 // so caller saved registers were assumed volatile in the compiler.
433 // Note that we generate only this stub into a RuntimeStub, because it needs to be
434 // properly traversed and ignored during GC, so we change the meaning of the "__"
435 // macro within this method.
436 #undef __
437 #define __ masm->
439 address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
440 Register arg1 = noreg, Register arg2 = noreg) {
441 #ifdef ASSERT
442 int insts_size = VerifyThread ? 1 * K : 600;
443 #else
444 int insts_size = VerifyThread ? 1 * K : 256;
445 #endif /* ASSERT */
446 int locs_size = 32;
448 CodeBuffer code(name, insts_size, locs_size);
449 MacroAssembler* masm = new MacroAssembler(&code);
451 __ verify_thread();
453 // This is an inlined and slightly modified version of call_VM
454 // which has the ability to fetch the return PC out of thread-local storage
455 __ assert_not_delayed();
457 // Note that we always push a frame because on the SPARC
458 // architecture, for all of our implicit exception kinds at call
459 // sites, the implicit exception is taken before the callee frame
460 // is pushed.
461 __ save_frame(0);
463 int frame_complete = __ offset();
465 if (restore_saved_exception_pc) {
466 __ ld_ptr(G2_thread, JavaThread::saved_exception_pc_offset(), I7);
467 __ sub(I7, frame::pc_return_offset, I7);
468 }
470 // Note that we always have a runtime stub frame on the top of stack by this point
471 Register last_java_sp = SP;
472 // 64-bit last_java_sp is biased!
473 __ set_last_Java_frame(last_java_sp, G0);
474 if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early
475 __ save_thread(noreg);
476 if (arg1 != noreg) {
477 assert(arg2 != O1, "clobbered");
478 __ mov(arg1, O1);
479 }
480 if (arg2 != noreg) {
481 __ mov(arg2, O2);
482 }
483 // do the call
484 BLOCK_COMMENT("call runtime_entry");
485 __ call(runtime_entry, relocInfo::runtime_call_type);
486 if (!VerifyThread)
487 __ delayed()->mov(G2_thread, O0); // pass thread as first argument
488 else
489 __ delayed()->nop(); // (thread already passed)
490 __ restore_thread(noreg);
491 __ reset_last_Java_frame();
493 // check for pending exceptions. use Gtemp as scratch register.
494 #ifdef ASSERT
495 Label L;
497 Address exception_addr(G2_thread, Thread::pending_exception_offset());
498 Register scratch_reg = Gtemp;
499 __ ld_ptr(exception_addr, scratch_reg);
500 __ br_notnull_short(scratch_reg, Assembler::pt, L);
501 __ should_not_reach_here();
502 __ bind(L);
503 #endif // ASSERT
504 BLOCK_COMMENT("call forward_exception_entry");
505 __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
506 // we use O7 linkage so that forward_exception_entry has the issuing PC
507 __ delayed()->restore();
509 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
510 return stub->entry_point();
511 }
513 #undef __
514 #define __ _masm->
517 // Generate a routine that sets all the registers so we
518 // can tell if the stop routine prints them correctly.
519 address generate_test_stop() {
520 StubCodeMark mark(this, "StubRoutines", "test_stop");
521 address start = __ pc();
523 int i;
525 __ save_frame(0);
527 static jfloat zero = 0.0, one = 1.0;
529 // put addr in L0, then load through L0 to F0
530 __ set((intptr_t)&zero, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F0);
531 __ set((intptr_t)&one, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
533 // use add to put 2..18 in F2..F18
534 for ( i = 2; i <= 18; ++i ) {
535 __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1), as_FloatRegister(i));
536 }
538 // Now put double 2 in F16, double 18 in F18
539 __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
540 __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
542 // use add to put 20..32 in F20..F32
543 for (i = 20; i < 32; i += 2) {
544 __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2), as_FloatRegister(i));
545 }
547 // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
548 for ( i = 0; i < 8; ++i ) {
549 if (i < 6) {
550 __ set( i, as_iRegister(i));
551 __ set(16 + i, as_oRegister(i));
552 __ set(24 + i, as_gRegister(i));
553 }
554 __ set( 8 + i, as_lRegister(i));
555 }
557 __ stop("testing stop");
560 __ ret();
561 __ delayed()->restore();
563 return start;
564 }
567 address generate_stop_subroutine() {
568 StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
569 address start = __ pc();
571 __ stop_subroutine();
573 return start;
574 }
576 address generate_flush_callers_register_windows() {
577 StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
578 address start = __ pc();
580 __ flush_windows();
581 __ retl(false);
582 __ delayed()->add( FP, STACK_BIAS, O0 );
583 // The returned value must be a stack pointer whose register save area
584 // is flushed, and will stay flushed while the caller executes.
586 return start;
587 }
589 // Helper functions for v8 atomic operations.
590 //
591 void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
592 if (mark_oop_reg == noreg) {
593 address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
594 __ set((intptr_t)lock_ptr, lock_ptr_reg);
595 } else {
596 assert(scratch_reg != noreg, "just checking");
597 address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
598 __ set((intptr_t)lock_ptr, lock_ptr_reg);
599 __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
600 __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
601 }
602 }
604 void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
606 get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
607 __ set(StubRoutines::Sparc::locked, lock_reg);
608 // Initialize yield counter
609 __ mov(G0,yield_reg);
611 __ BIND(retry);
612 __ cmp_and_br_short(yield_reg, V8AtomicOperationUnderLockSpinCount, Assembler::less, Assembler::pt, dontyield);
614 // This code can only be called from inside the VM, this
615 // stub is only invoked from Atomic::add(). We do not
616 // want to use call_VM, because _last_java_sp and such
617 // must already be set.
618 //
619 // Save the regs and make space for a C call
620 __ save(SP, -96, SP);
621 __ save_all_globals_into_locals();
622 BLOCK_COMMENT("call os::naked_sleep");
623 __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
624 __ delayed()->nop();
625 __ restore_globals_from_locals();
626 __ restore();
627 // reset the counter
628 __ mov(G0,yield_reg);
630 __ BIND(dontyield);
632 // try to get lock
633 __ swap(lock_ptr_reg, 0, lock_reg);
635 // did we get the lock?
636 __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
637 __ br(Assembler::notEqual, true, Assembler::pn, retry);
638 __ delayed()->add(yield_reg,1,yield_reg);
640 // yes, got lock. do the operation here.
641 }
643 void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
644 __ st(lock_reg, lock_ptr_reg, 0); // unlock
645 }
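// Editor's sketch (illustrative only, not part of this changeset): the V8 lock
// prologue/epilogue above form a bounded-spin test-and-set lock. In C-like
// pseudocode, where swap() stands for the atomic SPARC 'swap' instruction used
// by the prologue:
//
//   int yields = 0;
//   for (;;) {
//     if (yields >= V8AtomicOperationUnderLockSpinCount) { os::naked_sleep(); yields = 0; }
//     if (swap(lock_ptr, StubRoutines::Sparc::locked) == StubRoutines::Sparc::unlocked) break;
//     yields++;
//   }
//   // ... caller performs the guarded operation ...
//   *lock_ptr = StubRoutines::Sparc::unlocked;  // epilogue: a plain store releases the lock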
647 // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
648 //
649 // Arguments :
650 //
651 // exchange_value: O0
652 // dest: O1
653 //
654 // Results:
655 //
656 // O0: the value previously stored in dest
657 //
658 address generate_atomic_xchg() {
659 StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
660 address start = __ pc();
662 if (UseCASForSwap) {
663 // Use CAS instead of swap, just in case the MP hardware
664 // prefers to work with just one kind of synch. instruction.
665 Label retry;
666 __ BIND(retry);
667 __ mov(O0, O3); // scratch copy of exchange value
668 __ ld(O1, 0, O2); // observe the previous value
669 // try to replace O2 with O3
670 __ cas_under_lock(O1, O2, O3,
671 (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
672 __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
674 __ retl(false);
675 __ delayed()->mov(O2, O0); // report previous value to caller
677 } else {
678 if (VM_Version::v9_instructions_work()) {
679 __ retl(false);
680 __ delayed()->swap(O1, 0, O0);
681 } else {
682 const Register& lock_reg = O2;
683 const Register& lock_ptr_reg = O3;
684 const Register& yield_reg = O4;
686 Label retry;
687 Label dontyield;
689 generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
690 // got the lock, do the swap
691 __ swap(O1, 0, O0);
693 generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
694 __ retl(false);
695 __ delayed()->nop();
696 }
697 }
699 return start;
700 }
703 // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
704 //
705 // Arguments :
706 //
707 // exchange_value: O0
708 // dest: O1
709 // compare_value: O2
710 //
711 // Results:
712 //
713 // O0: the value previously stored in dest
714 //
715 // Overwrites (v8): O3,O4,O5
716 //
717 address generate_atomic_cmpxchg() {
718 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
719 address start = __ pc();
721 // cmpxchg(dest, compare_value, exchange_value)
722 __ cas_under_lock(O1, O2, O0,
723 (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
724 __ retl(false);
725 __ delayed()->nop();
727 return start;
728 }
730 // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
731 //
732 // Arguments :
733 //
734 // exchange_value: O1:O0
735 // dest: O2
736 // compare_value: O4:O3
737 //
738 // Results:
739 //
740 // O1:O0: the value previously stored in dest
741 //
742 // This only works on V9, on V8 we don't generate any
743 // code and just return NULL.
744 //
745 // Overwrites: G1,G2,G3
746 //
747 address generate_atomic_cmpxchg_long() {
748 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
749 address start = __ pc();
751 if (!VM_Version::supports_cx8())
752 return NULL;
753 __ sllx(O0, 32, O0);
754 __ srl(O1, 0, O1);
755 __ or3(O0,O1,O0); // O0 holds 64-bit value from exchange_value
756 __ sllx(O3, 32, O3);
757 __ srl(O4, 0, O4);
758 __ or3(O3,O4,O3); // O3 holds 64-bit value from compare_value
759 __ casx(O2, O3, O0);
760 __ srl(O0, 0, O1); // unpacked return value in O1:O0
761 __ retl(false);
762 __ delayed()->srlx(O0, 32, O0);
764 return start;
765 }
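// Editor's sketch (illustrative only, not part of this changeset): on the 32-bit
// ABI each jlong argument arrives as a register pair with the high word in the
// lower-numbered register, so the stub packs the pairs into single 64-bit values
// around the casx and splits the old value on return. Ignoring atomicity (which
// casx supplies), the data movement is:
//
//   julong exchange = ((julong)O0 << 32) | (juint)O1;  // exchange_value (O0/O1)
//   julong compare  = ((julong)O3 << 32) | (juint)O4;  // compare_value  (O3/O4)
//   julong old      = *dest;                           // dest is in O2
//   if (old == compare) *dest = exchange;              // done as one casx
//   O0 = (juint)(old >> 32);  O1 = (juint)old;         // unpacked return value in O1:O0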
768 // Support for jint Atomic::add(jint add_value, volatile jint* dest).
769 //
770 // Arguments :
771 //
772 // add_value: O0 (e.g., +1 or -1)
773 // dest: O1
774 //
775 // Results:
776 //
777 // O0: the new value stored in dest
778 //
779 // Overwrites (v9): O3
780 // Overwrites (v8): O3,O4,O5
781 //
782 address generate_atomic_add() {
783 StubCodeMark mark(this, "StubRoutines", "atomic_add");
784 address start = __ pc();
785 __ BIND(_atomic_add_stub);
787 if (VM_Version::v9_instructions_work()) {
788 Label retry;
789 __ BIND(retry);
791 __ lduw(O1, 0, O2);
792 __ add(O0, O2, O3);
793 __ cas(O1, O2, O3);
794 __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
795 __ retl(false);
796 __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
797 } else {
798 const Register& lock_reg = O2;
799 const Register& lock_ptr_reg = O3;
800 const Register& value_reg = O4;
801 const Register& yield_reg = O5;
803 Label retry;
804 Label dontyield;
806 generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
807 // got lock, do the increment
808 __ ld(O1, 0, value_reg);
809 __ add(O0, value_reg, value_reg);
810 __ st(value_reg, O1, 0);
812 // %%% only for RMO and PSO
813 __ membar(Assembler::StoreStore);
815 generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
817 __ retl(false);
818 __ delayed()->mov(value_reg, O0);
819 }
821 return start;
822 }
823 Label _atomic_add_stub; // called from other stubs
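// Editor's sketch (illustrative only, not part of this changeset): the V9 path
// of generate_atomic_add() above is the classic load/add/cas retry loop. With
// cas(dest, old, val) standing for the atomic compare-and-swap emitted above
// (it stores val only if *dest still equals old, and returns the old memory value):
//
//   for (;;) {
//     jint old = *dest;                  // lduw
//     jint val = add_value + old;        // add
//     if (cas(dest, old, val) == old)
//       return add_value + old;          // delay slot: add(O0, O2, O0)
//   }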
826 //------------------------------------------------------------------------------------------------------------------------
827 // The following routine generates a subroutine to throw an asynchronous
828 // UnknownError when an unsafe access gets a fault that could not be
829 // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.)
830 //
831 // Arguments :
832 //
833 // trapping PC: O7
834 //
835 // Results:
836 // posts an asynchronous exception, skips the trapping instruction
837 //
839 address generate_handler_for_unsafe_access() {
840 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
841 address start = __ pc();
843 const int preserve_register_words = (64 * 2);
844 Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);
846 Register Lthread = L7_thread_cache;
847 int i;
849 __ save_frame(0);
850 __ mov(G1, L1);
851 __ mov(G2, L2);
852 __ mov(G3, L3);
853 __ mov(G4, L4);
854 __ mov(G5, L5);
855 for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
856 __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
857 }
859 address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
860 BLOCK_COMMENT("call handle_unsafe_access");
861 __ call(entry_point, relocInfo::runtime_call_type);
862 __ delayed()->nop();
864 __ mov(L1, G1);
865 __ mov(L2, G2);
866 __ mov(L3, G3);
867 __ mov(L4, G4);
868 __ mov(L5, G5);
869 for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
870 __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
871 }
873 __ verify_thread();
875 __ jmp(O0, 0);
876 __ delayed()->restore();
878 return start;
879 }
882 // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
883 // Arguments :
884 //
885 // ret : O0, returned
886 // icc/xcc: set as O0 (depending on wordSize)
887 // sub : O1, argument, not changed
888 // super: O2, argument, not changed
889 // raddr: O7, blown by call
890 address generate_partial_subtype_check() {
891 __ align(CodeEntryAlignment);
892 StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
893 address start = __ pc();
894 Label miss;
896 #if defined(COMPILER2) && !defined(_LP64)
897 // Do not use a 'save' because it blows the 64-bit O registers.
898 __ add(SP,-4*wordSize,SP); // Make space for 4 temps (stack must be 2 words aligned)
899 __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
900 __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
901 __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
902 __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
903 Register Rret = O0;
904 Register Rsub = O1;
905 Register Rsuper = O2;
906 #else
907 __ save_frame(0);
908 Register Rret = I0;
909 Register Rsub = I1;
910 Register Rsuper = I2;
911 #endif
913 Register L0_ary_len = L0;
914 Register L1_ary_ptr = L1;
915 Register L2_super = L2;
916 Register L3_index = L3;
918 __ check_klass_subtype_slow_path(Rsub, Rsuper,
919 L0, L1, L2, L3,
920 NULL, &miss);
922 // Match falls through here.
923 __ addcc(G0,0,Rret); // set Z flags, Z result
925 #if defined(COMPILER2) && !defined(_LP64)
926 __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
927 __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
928 __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
929 __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
930 __ retl(); // Result in Rret is zero; flags set to Z
931 __ delayed()->add(SP,4*wordSize,SP);
932 #else
933 __ ret(); // Result in Rret is zero; flags set to Z
934 __ delayed()->restore();
935 #endif
937 __ BIND(miss);
938 __ addcc(G0,1,Rret); // set NZ flags, NZ result
940 #if defined(COMPILER2) && !defined(_LP64)
941 __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
942 __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
943 __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
944 __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
945 __ retl(); // Result in Rret is != 0; flags set to NZ
946 __ delayed()->add(SP,4*wordSize,SP);
947 #else
948 __ ret(); // Result in Rret is != 0; flags set to NZ
949 __ delayed()->restore();
950 #endif
952 return start;
953 }
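// Editor's sketch (illustrative only, not part of this changeset): the slow path
// emitted by check_klass_subtype_slow_path() above amounts to a linear scan of
// the subklass's secondary supers, caching a hit (accessor names elided):
//
//   for (each Klass s in sub's secondary-supers array) {
//     if (s == super) { record super in sub's secondary-super cache; return 0; }  // match: Rret == 0, Z set
//   }
//   return 1;                                                                     // miss: Rret != 0, NZ set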
956 // Called from MacroAssembler::verify_oop
957 //
958 address generate_verify_oop_subroutine() {
959 StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
961 address start = __ pc();
963 __ verify_oop_subroutine();
965 return start;
966 }
969 //
970 // Verify that a register contains a clean 32-bit positive value
971 // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
972 //
973 // Input:
974 // Rint - 32-bits value
975 // Rtmp - scratch
976 //
977 void assert_clean_int(Register Rint, Register Rtmp) {
978 #if defined(ASSERT) && defined(_LP64)
979 __ signx(Rint, Rtmp);
980 __ cmp(Rint, Rtmp);
981 __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
982 #endif
983 }
985 //
986 // Generate overlap test for array copy stubs
987 //
988 // Input:
989 // O0 - array1
990 // O1 - array2
991 // O2 - element count
992 //
993 // Kills temps: O3, O4
994 //
995 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
996 assert(no_overlap_target != NULL, "must be generated");
997 array_overlap_test(no_overlap_target, NULL, log2_elem_size);
998 }
999 void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
1000 array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
1001 }
1002 void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
1003 const Register from = O0;
1004 const Register to = O1;
1005 const Register count = O2;
1006 const Register to_from = O3; // to - from
1007 const Register byte_count = O4; // count << log2_elem_size
1009 __ subcc(to, from, to_from);
1010 __ sll_ptr(count, log2_elem_size, byte_count);
1011 if (NOLp == NULL)
1012 __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
1013 else
1014 __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
1015 __ delayed()->cmp(to_from, byte_count);
1016 if (NOLp == NULL)
1017 __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
1018 else
1019 __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
1020 __ delayed()->nop();
1021 }
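// Editor's sketch (illustrative only, not part of this changeset): the two
// unsigned branches above implement the usual disjointness test. In C++ terms:
//
//   uintptr_t byte_count = (uintptr_t)count << log2_elem_size;
//   bool no_overlap = ((uintptr_t)to <= (uintptr_t)from)                    // forward copy is safe
//                  || ((uintptr_t)to - (uintptr_t)from >= byte_count);      // regions do not overlap
//   // if no_overlap, branch to the disjoint (forward) stub; otherwise fall
//   // through to the conjoint (backward) copy code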
1023 //
1024 // Generate pre-write barrier for array.
1025 //
1026 // Input:
1027 // addr - register containing starting address
1028 // count - register containing element count
1029 // tmp - scratch register
1030 //
1031 // The input registers are overwritten.
1032 //
1033 void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
1034 BarrierSet* bs = Universe::heap()->barrier_set();
1035 switch (bs->kind()) {
1036 case BarrierSet::G1SATBCT:
1037 case BarrierSet::G1SATBCTLogging:
1038 // With G1, don't generate the call if we statically know that the target is uninitialized
1039 if (!dest_uninitialized) {
1040 __ save_frame(0);
1041 // Save the necessary global regs... will be used after.
1042 if (addr->is_global()) {
1043 __ mov(addr, L0);
1044 }
1045 if (count->is_global()) {
1046 __ mov(count, L1);
1047 }
1048 __ mov(addr->after_save(), O0);
1049 // Get the count into O1
1050 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
1051 __ delayed()->mov(count->after_save(), O1);
1052 if (addr->is_global()) {
1053 __ mov(L0, addr);
1054 }
1055 if (count->is_global()) {
1056 __ mov(L1, count);
1057 }
1058 __ restore();
1059 }
1060 break;
1061 case BarrierSet::CardTableModRef:
1062 case BarrierSet::CardTableExtension:
1063 case BarrierSet::ModRef:
1064 break;
1065 default:
1066 ShouldNotReachHere();
1067 }
1068 }
1069 //
1070 // Generate post-write barrier for array.
1071 //
1072 // Input:
1073 // addr - register containing starting address
1074 // count - register containing element count
1075 // tmp - scratch register
1076 //
1077 // The input registers are overwritten.
1078 //
1079 void gen_write_ref_array_post_barrier(Register addr, Register count,
1080 Register tmp) {
1081 BarrierSet* bs = Universe::heap()->barrier_set();
1083 switch (bs->kind()) {
1084 case BarrierSet::G1SATBCT:
1085 case BarrierSet::G1SATBCTLogging:
1086 {
1087 // Get some new fresh output registers.
1088 __ save_frame(0);
1089 __ mov(addr->after_save(), O0);
1090 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
1091 __ delayed()->mov(count->after_save(), O1);
1092 __ restore();
1093 }
1094 break;
1095 case BarrierSet::CardTableModRef:
1096 case BarrierSet::CardTableExtension:
1097 {
1098 CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1099 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1100 assert_different_registers(addr, count, tmp);
1102 Label L_loop;
1104 __ sll_ptr(count, LogBytesPerHeapOop, count);
1105 __ sub(count, BytesPerHeapOop, count);
1106 __ add(count, addr, count);
1107 // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
1108 __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
1109 __ srl_ptr(count, CardTableModRefBS::card_shift, count);
1110 __ sub(count, addr, count);
1111 AddressLiteral rs(ct->byte_map_base);
1112 __ set(rs, tmp);
1113 __ BIND(L_loop);
1114 __ stb(G0, tmp, addr);
1115 __ subcc(count, 1, count);
1116 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1117 __ delayed()->add(addr, 1, addr);
1118 }
1119 break;
1120 case BarrierSet::ModRef:
1121 break;
1122 default:
1123 ShouldNotReachHere();
1124 }
1125 }
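// Editor's sketch (illustrative only, not part of this changeset): the card
// table case above dirties every card spanned by the stored oops. In C++ terms
// (0 is the dirty value; byte_map_base is pre-biased so it can be indexed by
// address >> card_shift directly):
//
//   uintptr_t first = (uintptr_t)addr >> CardTableModRefBS::card_shift;
//   uintptr_t last  = ((uintptr_t)addr + count * BytesPerHeapOop - BytesPerHeapOop)
//                     >> CardTableModRefBS::card_shift;         // card of the last element
//   for (uintptr_t i = first; i <= last; i++) {
//     ct->byte_map_base[i] = 0;                                 // mark card dirty
//   }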
1127 //
1128 // Generate main code for disjoint arraycopy
1129 //
1130 typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
1131 Label& L_loop, bool use_prefetch, bool use_bis);
1133 void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
1134 int iter_size, CopyLoopFunc copy_loop_func) {
1135 Label L_copy;
1137 assert(log2_elem_size <= 3, "the following code should be changed");
1138 int count_dec = 16>>log2_elem_size;
1140 int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
1141 assert(prefetch_dist < 4096, "invalid value");
1142 prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
1143 int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
1145 if (UseBlockCopy) {
1146 Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
1148 // 64 bytes tail + bytes copied in one loop iteration
1149 int tail_size = 64 + iter_size;
1150 int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
1151 // Use BIS copy only for big arrays since it requires a membar.
1152 __ set(block_copy_count, O4);
1153 __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
1154 // This code is for disjoint source and destination:
1155 // to <= from || to >= from+count
1156 // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
1157 __ sub(from, to, O4);
1158 __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate.
1159 __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
1161 __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
1162 // BIS should not be used to copy tail (64 bytes+iter_size)
1163 // to avoid zeroing of following values.
1164 __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
1166 if (prefetch_count > 0) { // rounded up to one iteration count
1167 // Do prefetching only if copy size is bigger
1168 // than prefetch distance.
1169 __ set(prefetch_count, O4);
1170 __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
1171 __ sub(count, prefetch_count, count);
1173 (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
1174 __ add(count, prefetch_count, count); // restore count
1176 } // prefetch_count > 0
1178 (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
1179 __ add(count, (tail_size>>log2_elem_size), count); // restore count
1181 __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
1182 // BIS needs membar.
1183 __ membar(Assembler::StoreLoad);
1184 // Copy tail
1185 __ ba_short(L_copy);
1187 __ BIND(L_skip_block_copy);
1188 } // UseBlockCopy
1190 if (prefetch_count > 0) { // rounded up to one iteration count
1191 // Do prefetching only if copy size is bigger
1192 // than prefetch distance.
1193 __ set(prefetch_count, O4);
1194 __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
1195 __ sub(count, prefetch_count, count);
1197 Label L_copy_prefetch;
1198 (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
1199 __ add(count, prefetch_count, count); // restore count
1201 } // prefetch_count > 0
1203 (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
1204 }
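// Editor's sketch (illustrative only, not part of this changeset): the control
// structure disjoint_copy_core() lays down, in C-like pseudocode. copy_loop()
// stands for the loop emitted through copy_loop_func; the thresholds are the
// ones computed above.
//
//   if (UseBlockCopy && count >= block_copy_count
//       && destination is not within tail_size bytes below the source) {
//     set ASI_ST_BLKINIT_PRIMARY;                 // BIS: stores allocate cache lines without reading them
//     count -= tail_size >> log2_elem_size;       // keep a 64-byte + iter_size tail for the plain loop
//     if (count >= prefetch_count) { count -= prefetch_count; copy_loop(prefetch, bis); count += prefetch_count; }
//     copy_loop(no prefetch, bis);                // last prefetch_dist bytes, no prefetch past the end
//     count += tail_size >> log2_elem_size;
//     set ASI_PRIMARY_NOFAULT; membar(StoreLoad); // BIS stores must be fenced before ordinary stores
//   }
//   if (count >= prefetch_count) { count -= prefetch_count; copy_loop(prefetch, no bis); count += prefetch_count; }
//   copy_loop(no prefetch, no bis);               // remaining elements and the BIS tail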
1208 //
1209 // Helper methods for copy_16_bytes_forward_with_shift()
1210 //
1211 void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
1212 Label& L_loop, bool use_prefetch, bool use_bis) {
1214 const Register left_shift = G1; // left shift bit counter
1215 const Register right_shift = G5; // right shift bit counter
1217 __ align(OptoLoopAlignment);
1218 __ BIND(L_loop);
1219 if (use_prefetch) {
1220 if (ArraycopySrcPrefetchDistance > 0) {
1221 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1222 }
1223 if (ArraycopyDstPrefetchDistance > 0) {
1224 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1225 }
1226 }
1227 __ ldx(from, 0, O4);
1228 __ ldx(from, 8, G4);
1229 __ inc(to, 16);
1230 __ inc(from, 16);
1231 __ deccc(count, count_dec); // Can we do next iteration after this one?
1232 __ srlx(O4, right_shift, G3);
1233 __ bset(G3, O3);
1234 __ sllx(O4, left_shift, O4);
1235 __ srlx(G4, right_shift, G3);
1236 __ bset(G3, O4);
1237 if (use_bis) {
1238 __ stxa(O3, to, -16);
1239 __ stxa(O4, to, -8);
1240 } else {
1241 __ stx(O3, to, -16);
1242 __ stx(O4, to, -8);
1243 }
1244 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1245 __ delayed()->sllx(G4, left_shift, O3);
1246 }
1248 // Copy big chunks forward with shift
1249 //
1250 // Inputs:
1251 // from - source array address
1252 // to - destination array aligned to 8-bytes
1253 // count - elements count to copy >= the count equivalent to 16 bytes
1254 // count_dec - elements count's decrement equivalent to 16 bytes
1255 // L_copy_bytes - copy exit label
1256 //
1257 void copy_16_bytes_forward_with_shift(Register from, Register to,
1258 Register count, int log2_elem_size, Label& L_copy_bytes) {
1259 Label L_aligned_copy, L_copy_last_bytes;
1260 assert(log2_elem_size <= 3, "the following code should be changed");
1261 int count_dec = 16>>log2_elem_size;
1263 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1264 __ andcc(from, 7, G1); // misaligned bytes
1265 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1266 __ delayed()->nop();
1268 const Register left_shift = G1; // left shift bit counter
1269 const Register right_shift = G5; // right shift bit counter
1271 __ sll(G1, LogBitsPerByte, left_shift);
1272 __ mov(64, right_shift);
1273 __ sub(right_shift, left_shift, right_shift);
1275 //
1276 // Load 2 aligned 8-bytes chunks and use one from previous iteration
1277 // to form 2 aligned 8-bytes chunks to store.
1278 //
1279 __ dec(count, count_dec); // Pre-decrement 'count'
1280 __ andn(from, 7, from); // Align address
1281 __ ldx(from, 0, O3);
1282 __ inc(from, 8);
1283 __ sllx(O3, left_shift, O3);
1285 disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop);
1287 __ inccc(count, count_dec>>1 ); // + 8 bytes
1288 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1289 __ delayed()->inc(count, count_dec>>1); // restore 'count'
1291 // copy 8 bytes, part of them already loaded in O3
1292 __ ldx(from, 0, O4);
1293 __ inc(to, 8);
1294 __ inc(from, 8);
1295 __ srlx(O4, right_shift, G3);
1296 __ bset(O3, G3);
1297 __ stx(G3, to, -8);
1299 __ BIND(L_copy_last_bytes);
1300 __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1301 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1302 __ delayed()->sub(from, right_shift, from); // restore address
1304 __ BIND(L_aligned_copy);
1305 }
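// Editor's sketch (illustrative only, not part of this changeset): when 'to' is
// 8-byte aligned but 'from' is not, the code above rounds 'from' down to an
// aligned address and merges each pair of adjacent aligned words into one
// aligned store. Per destination word, with s = (from & 7) * 8 (SPARC is
// big-endian, so the earlier word supplies the high bits):
//
//   julong prev = aligned_src[i];                        // loaded on the previous iteration
//   julong next = aligned_src[i + 1];
//   aligned_dst[i] = (prev << s) | (next >> (64 - s));   // left_shift = s, right_shift = 64 - s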
1307 // Copy big chunks backward with shift
1308 //
1309 // Inputs:
1310 // end_from - source array end address
1311 // end_to - destination array end address aligned to 8-bytes
1312 // count - elements count to copy >= the count equivalent to 16 bytes
1313 // count_dec - elements count's decrement equivalent to 16 bytes
1314 // L_aligned_copy - aligned copy exit label
1315 // L_copy_bytes - copy exit label
1316 //
1317 void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1318 Register count, int count_dec,
1319 Label& L_aligned_copy, Label& L_copy_bytes) {
1320 Label L_loop, L_copy_last_bytes;
1322 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1323 __ andcc(end_from, 7, G1); // misaligned bytes
1324 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1325 __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1327 const Register left_shift = G1; // left shift bit counter
1328 const Register right_shift = G5; // right shift bit counter
1330 __ sll(G1, LogBitsPerByte, left_shift);
1331 __ mov(64, right_shift);
1332 __ sub(right_shift, left_shift, right_shift);
1334 //
1335 // Load 2 aligned 8-bytes chunks and use one from previous iteration
1336 // to form 2 aligned 8-bytes chunks to store.
1337 //
1338 __ andn(end_from, 7, end_from); // Align address
1339 __ ldx(end_from, 0, O3);
1340 __ align(OptoLoopAlignment);
1341 __ BIND(L_loop);
1342 __ ldx(end_from, -8, O4);
1343 __ deccc(count, count_dec); // Can we do next iteration after this one?
1344 __ ldx(end_from, -16, G4);
1345 __ dec(end_to, 16);
1346 __ dec(end_from, 16);
1347 __ srlx(O3, right_shift, O3);
1348 __ sllx(O4, left_shift, G3);
1349 __ bset(G3, O3);
1350 __ stx(O3, end_to, 8);
1351 __ srlx(O4, right_shift, O4);
1352 __ sllx(G4, left_shift, G3);
1353 __ bset(G3, O4);
1354 __ stx(O4, end_to, 0);
1355 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1356 __ delayed()->mov(G4, O3);
1358 __ inccc(count, count_dec>>1 ); // + 8 bytes
1359 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1360 __ delayed()->inc(count, count_dec>>1); // restore 'count'
1362 // copy 8 bytes, part of them already loaded in O3
1363 __ ldx(end_from, -8, O4);
1364 __ dec(end_to, 8);
1365 __ dec(end_from, 8);
1366 __ srlx(O3, right_shift, O3);
1367 __ sllx(O4, left_shift, G3);
1368 __ bset(O3, G3);
1369 __ stx(G3, end_to, 0);
1371 __ BIND(L_copy_last_bytes);
1372 __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes
1373 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1374 __ delayed()->add(end_from, left_shift, end_from); // restore address
1375 }
1377 //
1378 // Generate stub for disjoint byte copy. If "aligned" is true, the
1379 // "from" and "to" addresses are assumed to be heapword aligned.
1380 //
1381 // Arguments for generated stub:
1382 // from: O0
1383 // to: O1
1384 // count: O2 treated as signed
1385 //
1386 address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1387 __ align(CodeEntryAlignment);
1388 StubCodeMark mark(this, "StubRoutines", name);
1389 address start = __ pc();
1391 Label L_skip_alignment, L_align;
1392 Label L_copy_byte, L_copy_byte_loop, L_exit;
1394 const Register from = O0; // source array address
1395 const Register to = O1; // destination array address
1396 const Register count = O2; // elements count
1397 const Register offset = O5; // offset from start of arrays
1398 // O3, O4, G3, G4 are used as temp registers
1400 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1402 if (entry != NULL) {
1403 *entry = __ pc();
1404 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1405 BLOCK_COMMENT("Entry:");
1406 }
1408 // for short arrays, just do single element copy
1409 __ cmp(count, 23); // 16 + 7
1410 __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1411 __ delayed()->mov(G0, offset);
1413 if (aligned) {
1414 // 'aligned' == true when it is known statically during compilation
1415 // of this arraycopy call site that both 'from' and 'to' addresses
1416 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1417 //
1418 // Aligned arrays have 4 bytes alignment in 32-bits VM
1419 // and 8 bytes - in 64-bits VM. So we do it only for 32-bits VM
1420 //
1421 #ifndef _LP64
1422 // copy a 4-bytes word if necessary to align 'to' to 8 bytes
1423 __ andcc(to, 7, G0);
1424 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
1425 __ delayed()->ld(from, 0, O3);
1426 __ inc(from, 4);
1427 __ inc(to, 4);
1428 __ dec(count, 4);
1429 __ st(O3, to, -4);
1430 __ BIND(L_skip_alignment);
1431 #endif
1432 } else {
1433 // copy bytes to align 'to' on 8 byte boundary
1434 __ andcc(to, 7, G1); // misaligned bytes
1435 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1436 __ delayed()->neg(G1);
1437 __ inc(G1, 8); // bytes need to copy to next 8-bytes alignment
1438 __ sub(count, G1, count);
1439 __ BIND(L_align);
1440 __ ldub(from, 0, O3);
1441 __ deccc(G1);
1442 __ inc(from);
1443 __ stb(O3, to, 0);
1444 __ br(Assembler::notZero, false, Assembler::pt, L_align);
1445 __ delayed()->inc(to);
1446 __ BIND(L_skip_alignment);
1447 }
1448 #ifdef _LP64
1449 if (!aligned)
1450 #endif
1451 {
1452 // Copy with shift 16 bytes per iteration if arrays do not have
1453 // the same alignment mod 8, otherwise fall through to the next
1454 // code for aligned copy.
1455 // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1456 // Also jump over aligned copy after the copy with shift completed.
1458 copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
1459 }
1461 // Both array are 8 bytes aligned, copy 16 bytes at a time
1462 __ and3(count, 7, G4); // Save count
1463 __ srl(count, 3, count);
1464 generate_disjoint_long_copy_core(aligned);
1465 __ mov(G4, count); // Restore count
1467 // copy tailing bytes
1468 __ BIND(L_copy_byte);
1469 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1470 __ align(OptoLoopAlignment);
1471 __ BIND(L_copy_byte_loop);
1472 __ ldub(from, offset, O3);
1473 __ deccc(count);
1474 __ stb(O3, to, offset);
1475 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1476 __ delayed()->inc(offset);
1478 __ BIND(L_exit);
1479 // O3, O4 are used as temp registers
1480 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1481 __ retl();
1482 __ delayed()->mov(G0, O0); // return 0
1483 return start;
1484 }
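// Editor's sketch (illustrative only, not part of this changeset): the overall
// shape of the disjoint byte-copy stub above, in C-like pseudocode:
//
//   if (count < 23) goto byte_loop;                  // too short for the 16-byte machinery
//   head = (8 - ((uintptr_t)to & 7)) & 7;            // bytes needed to 8-byte align 'to'
//   copy head bytes one at a time; count -= head;
//   if (source and destination alignments differ mod 8)
//     copy_16_bytes_forward_with_shift(...);         // merge-and-shift loop
//   else
//     generate_disjoint_long_copy_core(...);         // both 8-byte aligned: 16 bytes per iteration
//   byte_loop: copy any remaining bytes one at a time;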
1486 //
1487 // Generate stub for conjoint byte copy. If "aligned" is true, the
1488 // "from" and "to" addresses are assumed to be heapword aligned.
1489 //
1490 // Arguments for generated stub:
1491 // from: O0
1492 // to: O1
1493 // count: O2 treated as signed
1494 //
1495 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1496 address *entry, const char *name) {
1497 // Do reverse copy.
1499 __ align(CodeEntryAlignment);
1500 StubCodeMark mark(this, "StubRoutines", name);
1501 address start = __ pc();
1503 Label L_skip_alignment, L_align, L_aligned_copy;
1504 Label L_copy_byte, L_copy_byte_loop, L_exit;
1506 const Register from = O0; // source array address
1507 const Register to = O1; // destination array address
1508 const Register count = O2; // elements count
1509 const Register end_from = from; // source array end address
1510 const Register end_to = to; // destination array end address
1512 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1514 if (entry != NULL) {
1515 *entry = __ pc();
1516 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1517 BLOCK_COMMENT("Entry:");
1518 }
1520 array_overlap_test(nooverlap_target, 0);
1522 __ add(to, count, end_to); // offset after last copied element
1524 // for short arrays, just do single element copy
1525 __ cmp(count, 23); // 16 + 7
1526 __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1527 __ delayed()->add(from, count, end_from);
1529 {
1530 // Align the ends of the arrays since they may not be aligned even
1531 // when the arrays themselves are aligned.
1533 // copy bytes to align 'end_to' on 8 byte boundary
1534 __ andcc(end_to, 7, G1); // misaligned bytes
1535 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1536 __ delayed()->nop();
1537 __ sub(count, G1, count);
1538 __ BIND(L_align);
1539 __ dec(end_from);
1540 __ dec(end_to);
1541 __ ldub(end_from, 0, O3);
1542 __ deccc(G1);
1543 __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1544 __ delayed()->stb(O3, end_to, 0);
1545 __ BIND(L_skip_alignment);
1546 }
1547 #ifdef _LP64
1548 if (aligned) {
1549 // Both arrays are aligned to 8-bytes in 64-bits VM.
1550 // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1551 // in unaligned case.
1552 __ dec(count, 16);
1553 } else
1554 #endif
1555 {
1556 // Copy with shift 16 bytes per iteration if arrays do not have
1557 // the same alignment mod 8, otherwise jump to the next
1558 // code for aligned copy (and subtracting 16 from 'count' before the jump).
1559 // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1560 // Also jump over aligned copy after the copy with shift completed.
1562 copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1563 L_aligned_copy, L_copy_byte);
1564 }
1565 // copy 4 elements (16 bytes) at a time
1566 __ align(OptoLoopAlignment);
1567 __ BIND(L_aligned_copy);
1568 __ dec(end_from, 16);
1569 __ ldx(end_from, 8, O3);
1570 __ ldx(end_from, 0, O4);
1571 __ dec(end_to, 16);
1572 __ deccc(count, 16);
1573 __ stx(O3, end_to, 8);
1574 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1575 __ delayed()->stx(O4, end_to, 0);
1576 __ inc(count, 16);
1578 // copy 1 element (1 byte) at a time
1579 __ BIND(L_copy_byte);
1580 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1581 __ align(OptoLoopAlignment);
1582 __ BIND(L_copy_byte_loop);
1583 __ dec(end_from);
1584 __ dec(end_to);
1585 __ ldub(end_from, 0, O4);
1586 __ deccc(count);
1587 __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1588 __ delayed()->stb(O4, end_to, 0);
1590 __ BIND(L_exit);
1591 // O3, O4 are used as temp registers
1592 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1593 __ retl();
1594 __ delayed()->mov(G0, O0); // return 0
1595 return start;
1596 }
1598 //
1599 // Generate stub for disjoint short copy. If "aligned" is true, the
1600 // "from" and "to" addresses are assumed to be heapword aligned.
1601 //
1602 // Arguments for generated stub:
1603 // from: O0
1604 // to: O1
1605 // count: O2 treated as signed
1606 //
1607 address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1608 __ align(CodeEntryAlignment);
1609 StubCodeMark mark(this, "StubRoutines", name);
1610 address start = __ pc();
1612 Label L_skip_alignment, L_skip_alignment2;
1613 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1615 const Register from = O0; // source array address
1616 const Register to = O1; // destination array address
1617 const Register count = O2; // elements count
1618 const Register offset = O5; // offset from start of arrays
1619 // O3, O4, G3, G4 are used as temp registers
1621 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1623 if (entry != NULL) {
1624 *entry = __ pc();
1625 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1626 BLOCK_COMMENT("Entry:");
1627 }
1629 // for short arrays, just do single element copy
1630 __ cmp(count, 11); // 8 + 3 (22 bytes)
1631 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1632 __ delayed()->mov(G0, offset);
1634 if (aligned) {
1635 // 'aligned' == true when it is known statically during compilation
1636 // of this arraycopy call site that both 'from' and 'to' addresses
1637 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1638 //
1639 // Aligned arrays have 4 bytes alignment in 32-bits VM
1640 // and 8 bytes - in 64-bits VM.
1641 //
1642 #ifndef _LP64
1643 // copy a 2-elements word if necessary to align 'to' to 8 bytes
1644 __ andcc(to, 7, G0);
1645 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1646 __ delayed()->ld(from, 0, O3);
1647 __ inc(from, 4);
1648 __ inc(to, 4);
1649 __ dec(count, 2);
1650 __ st(O3, to, -4);
1651 __ BIND(L_skip_alignment);
1652 #endif
1653 } else {
1654 // copy 1 element if necessary to align 'to' on a 4-byte boundary
1655 __ andcc(to, 3, G0);
1656 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1657 __ delayed()->lduh(from, 0, O3);
1658 __ inc(from, 2);
1659 __ inc(to, 2);
1660 __ dec(count);
1661 __ sth(O3, to, -2);
1662 __ BIND(L_skip_alignment);
1664 // copy 2 elements to align 'to' on an 8 byte boundary
1665 __ andcc(to, 7, G0);
1666 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1667 __ delayed()->lduh(from, 0, O3);
1668 __ dec(count, 2);
1669 __ lduh(from, 2, O4);
1670 __ inc(from, 4);
1671 __ inc(to, 4);
1672 __ sth(O3, to, -4);
1673 __ sth(O4, to, -2);
1674 __ BIND(L_skip_alignment2);
1675 }
1676 #ifdef _LP64
1677 if (!aligned)
1678 #endif
1679 {
1680 // Copy with shift 16 bytes per iteration if arrays do not have
1681 // the same alignment mod 8, otherwise fall through to the next
1682 // code for aligned copy.
1683 // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1684 // Also jump over the aligned copy after the copy with shift is completed.
1686 copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1687 }
1689 // Both arrays are 8-byte aligned; copy 16 bytes at a time
1690 __ and3(count, 3, G4); // Save
1691 __ srl(count, 2, count);
1692 generate_disjoint_long_copy_core(aligned);
1693 __ mov(G4, count); // restore
1695 // copy 1 element at a time
1696 __ BIND(L_copy_2_bytes);
1697 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1698 __ align(OptoLoopAlignment);
1699 __ BIND(L_copy_2_bytes_loop);
1700 __ lduh(from, offset, O3);
1701 __ deccc(count);
1702 __ sth(O3, to, offset);
1703 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1704 __ delayed()->inc(offset, 2);
1706 __ BIND(L_exit);
1707 // O3, O4 are used as temp registers
1708 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1709 __ retl();
1710 __ delayed()->mov(G0, O0); // return 0
1711 return start;
1712 }
1714 //
1715 // Generate stub for array fill (byte, short or int elements). If "aligned" is true, the
1716 // "to" address is assumed to be heapword aligned.
1717 //
1718 // Arguments for generated stub:
1719 // to: O0
1720 // value: O1
1721 // count: O2 treated as signed
1722 //
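// In outline, the generated fill stub does the following (sketch only; the
// exact per-type tail handling is in the code below):
//   replicate 'value' across 64 bits;
//   store odd leading bytes/halfwords/words until 'to' is 8-byte aligned;
//   while (at least 32 bytes remain) issue four 8-byte stores;
//   finish with 8-, 4-, 2- and 1-byte tails; fills shorter than 8 bytes
//   take a separate per-element path (L_fill_elements).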
1723 address generate_fill(BasicType t, bool aligned, const char* name) {
1724 __ align(CodeEntryAlignment);
1725 StubCodeMark mark(this, "StubRoutines", name);
1726 address start = __ pc();
1728 const Register to = O0; // destination array address
1729 const Register value = O1; // fill value
1730 const Register count = O2; // elements count
1731 // O3 is used as a temp register
1733 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1735 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1736 Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1738 int shift = -1;
1739 switch (t) {
1740 case T_BYTE:
1741 shift = 2;
1742 break;
1743 case T_SHORT:
1744 shift = 1;
1745 break;
1746 case T_INT:
1747 shift = 0;
1748 break;
1749 default: ShouldNotReachHere();
1750 }
1752 BLOCK_COMMENT("Entry:");
1754 if (t == T_BYTE) {
1755 // Zero-extend the byte value and replicate it into a 16-bit pattern
1756 __ and3(value, 0xff, value);
1757 __ sllx(value, 8, O3);
1758 __ or3(value, O3, value);
1759 }
1760 if (t == T_SHORT) {
1761 // Zero extend value
1762 __ sllx(value, 48, value);
1763 __ srlx(value, 48, value);
1764 }
1765 if (t == T_BYTE || t == T_SHORT) {
1766 __ sllx(value, 16, O3);
1767 __ or3(value, O3, value);
1768 }
1770 __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1771 __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1772 __ delayed()->andcc(count, 1, G0);
1774 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1775 // align the destination address on a 4-byte boundary
1776 if (t == T_BYTE) {
1777 // One byte misalignment happens only for byte arrays
1778 __ andcc(to, 1, G0);
1779 __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1780 __ delayed()->nop();
1781 __ stb(value, to, 0);
1782 __ inc(to, 1);
1783 __ dec(count, 1);
1784 __ BIND(L_skip_align1);
1785 }
1786 // Two bytes misalignment happens only for byte and short (char) arrays
1787 __ andcc(to, 2, G0);
1788 __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1789 __ delayed()->nop();
1790 __ sth(value, to, 0);
1791 __ inc(to, 2);
1792 __ dec(count, 1 << (shift - 1));
1793 __ BIND(L_skip_align2);
1794 }
1795 #ifdef _LP64
1796 if (!aligned) {
1797 #endif
1798 // align to 8 bytes; we know we are 4-byte aligned to start
1799 __ andcc(to, 7, G0);
1800 __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1801 __ delayed()->nop();
1802 __ stw(value, to, 0);
1803 __ inc(to, 4);
1804 __ dec(count, 1 << shift);
1805 __ BIND(L_fill_32_bytes);
1806 #ifdef _LP64
1807 }
1808 #endif
1810 if (t == T_INT) {
1811 // Zero extend value
1812 __ srl(value, 0, value);
1813 }
1814 if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1815 __ sllx(value, 32, O3);
1816 __ or3(value, O3, value);
1817 }
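// At this point 'value' holds the fill pattern replicated across all 64 bits;
// e.g. a T_BYTE fill value of 0xAB has become 0xABABABABABABABAB after the
// 8->16->32 bit doubling steps above and the 32->64 bit step here.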
1819 Label L_check_fill_8_bytes;
1820 // Fill 32-byte chunks
1821 __ subcc(count, 8 << shift, count);
1822 __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1823 __ delayed()->nop();
1825 Label L_fill_32_bytes_loop, L_fill_4_bytes;
1826 __ align(16);
1827 __ BIND(L_fill_32_bytes_loop);
1829 __ stx(value, to, 0);
1830 __ stx(value, to, 8);
1831 __ stx(value, to, 16);
1832 __ stx(value, to, 24);
1834 __ subcc(count, 8 << shift, count);
1835 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1836 __ delayed()->add(to, 32, to);
1838 __ BIND(L_check_fill_8_bytes);
1839 __ addcc(count, 8 << shift, count);
1840 __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1841 __ delayed()->subcc(count, 1 << (shift + 1), count);
1842 __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1843 __ delayed()->andcc(count, 1<<shift, G0);
1845 //
1846 // length is too short, just fill 8 bytes at a time
1847 //
1848 Label L_fill_8_bytes_loop;
1849 __ BIND(L_fill_8_bytes_loop);
1850 __ stx(value, to, 0);
1851 __ subcc(count, 1 << (shift + 1), count);
1852 __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1853 __ delayed()->add(to, 8, to);
1855 // fill trailing 4 bytes
1856 __ andcc(count, 1<<shift, G0); // in delay slot of branches
1857 if (t == T_INT) {
1858 __ BIND(L_fill_elements);
1859 }
1860 __ BIND(L_fill_4_bytes);
1861 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1862 if (t == T_BYTE || t == T_SHORT) {
1863 __ delayed()->andcc(count, 1<<(shift-1), G0);
1864 } else {
1865 __ delayed()->nop();
1866 }
1867 __ stw(value, to, 0);
1868 if (t == T_BYTE || t == T_SHORT) {
1869 __ inc(to, 4);
1870 // fill trailing 2 bytes
1871 __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1872 __ BIND(L_fill_2_bytes);
1873 __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1874 __ delayed()->andcc(count, 1, count);
1875 __ sth(value, to, 0);
1876 if (t == T_BYTE) {
1877 __ inc(to, 2);
1878 // fill trailing byte
1879 __ andcc(count, 1, count); // in delay slot of branches
1880 __ BIND(L_fill_byte);
1881 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1882 __ delayed()->nop();
1883 __ stb(value, to, 0);
1884 } else {
1885 __ BIND(L_fill_byte);
1886 }
1887 } else {
1888 __ BIND(L_fill_2_bytes);
1889 }
1890 __ BIND(L_exit);
1891 __ retl();
1892 __ delayed()->nop();
1894 // Handle fills of less than 8 bytes. Int is handled elsewhere.
1895 if (t == T_BYTE) {
1896 __ BIND(L_fill_elements);
1897 Label L_fill_2, L_fill_4;
1898 // in delay slot __ andcc(count, 1, G0);
1899 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1900 __ delayed()->andcc(count, 2, G0);
1901 __ stb(value, to, 0);
1902 __ inc(to, 1);
1903 __ BIND(L_fill_2);
1904 __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1905 __ delayed()->andcc(count, 4, G0);
1906 __ stb(value, to, 0);
1907 __ stb(value, to, 1);
1908 __ inc(to, 2);
1909 __ BIND(L_fill_4);
1910 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1911 __ delayed()->nop();
1912 __ stb(value, to, 0);
1913 __ stb(value, to, 1);
1914 __ stb(value, to, 2);
1915 __ retl();
1916 __ delayed()->stb(value, to, 3);
1917 }
1919 if (t == T_SHORT) {
1920 Label L_fill_2;
1921 __ BIND(L_fill_elements);
1922 // in delay slot __ andcc(count, 1, G0);
1923 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1924 __ delayed()->andcc(count, 2, G0);
1925 __ sth(value, to, 0);
1926 __ inc(to, 2);
1927 __ BIND(L_fill_2);
1928 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1929 __ delayed()->nop();
1930 __ sth(value, to, 0);
1931 __ retl();
1932 __ delayed()->sth(value, to, 2);
1933 }
1934 return start;
1935 }
1937 //
1938 // Generate stub for conjoint short copy. If "aligned" is true, the
1939 // "from" and "to" addresses are assumed to be heapword aligned.
1940 //
1941 // Arguments for generated stub:
1942 // from: O0
1943 // to: O1
1944 // count: O2 treated as signed
1945 //
1946 address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1947 address *entry, const char *name) {
1948 // Do reverse copy.
1950 __ align(CodeEntryAlignment);
1951 StubCodeMark mark(this, "StubRoutines", name);
1952 address start = __ pc();
1954 Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1955 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1957 const Register from = O0; // source array address
1958 const Register to = O1; // destination array address
1959 const Register count = O2; // elements count
1960 const Register end_from = from; // source array end address
1961 const Register end_to = to; // destination array end address
1963 const Register byte_count = O3; // bytes count to copy
1965 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1967 if (entry != NULL) {
1968 *entry = __ pc();
1969 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1970 BLOCK_COMMENT("Entry:");
1971 }
1973 array_overlap_test(nooverlap_target, 1);
1975 __ sllx(count, LogBytesPerShort, byte_count);
1976 __ add(to, byte_count, end_to); // offset after last copied element
1978 // for short arrays, just do single element copy
1979 __ cmp(count, 11); // 8 + 3 (22 bytes)
1980 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1981 __ delayed()->add(from, byte_count, end_from);
1983 {
1984 // Align the ends of the arrays since they could be unaligned even
1985 // when the arrays themselves are aligned.
1987 // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
1988 __ andcc(end_to, 3, G0);
1989 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1990 __ delayed()->lduh(end_from, -2, O3);
1991 __ dec(end_from, 2);
1992 __ dec(end_to, 2);
1993 __ dec(count);
1994 __ sth(O3, end_to, 0);
1995 __ BIND(L_skip_alignment);
1997 // copy 2 elements to align 'end_to' on an 8 byte boundary
1998 __ andcc(end_to, 7, G0);
1999 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
2000 __ delayed()->lduh(end_from, -2, O3);
2001 __ dec(count, 2);
2002 __ lduh(end_from, -4, O4);
2003 __ dec(end_from, 4);
2004 __ dec(end_to, 4);
2005 __ sth(O3, end_to, 2);
2006 __ sth(O4, end_to, 0);
2007 __ BIND(L_skip_alignment2);
2008 }
2009 #ifdef _LP64
2010 if (aligned) {
2011 // Both arrays are aligned to 8-bytes in 64-bits VM.
2012 // The 'count' is decremented in copy_16_bytes_backward_with_shift()
2013 // in unaligned case.
2014 __ dec(count, 8);
2015 } else
2016 #endif
2017 {
2018 // Copy with shift 16 bytes per iteration if arrays do not have
2019 // the same alignment mod 8, otherwise jump to the next
2020 // code for aligned copy (subtracting 8 from 'count' before the jump).
2021 // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
2022 // Also jump over the aligned copy after the copy with shift is completed.
2024 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
2025 L_aligned_copy, L_copy_2_bytes);
2026 }
2027 // copy 8 elements (16 bytes) at a time
2028 __ align(OptoLoopAlignment);
2029 __ BIND(L_aligned_copy);
2030 __ dec(end_from, 16);
2031 __ ldx(end_from, 8, O3);
2032 __ ldx(end_from, 0, O4);
2033 __ dec(end_to, 16);
2034 __ deccc(count, 8);
2035 __ stx(O3, end_to, 8);
2036 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2037 __ delayed()->stx(O4, end_to, 0);
2038 __ inc(count, 8);
2040 // copy 1 element (2 bytes) at a time
2041 __ BIND(L_copy_2_bytes);
2042 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2043 __ BIND(L_copy_2_bytes_loop);
2044 __ dec(end_from, 2);
2045 __ dec(end_to, 2);
2046 __ lduh(end_from, 0, O4);
2047 __ deccc(count);
2048 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
2049 __ delayed()->sth(O4, end_to, 0);
2051 __ BIND(L_exit);
2052 // O3, O4 are used as temp registers
2053 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
2054 __ retl();
2055 __ delayed()->mov(G0, O0); // return 0
2056 return start;
2057 }
2059 //
2060 // Helper methods for generate_disjoint_int_copy_core()
2061 //
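// ArraycopySrcPrefetchDistance and ArraycopyDstPrefetchDistance are the VM
// flags giving the prefetch-ahead distance in bytes; a value of 0 (checked
// below) disables the corresponding prefetch. When 'use_bis' is set the
// stores are emitted as block-initializing stores (stxa) instead of stx.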
2062 void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
2063 Label& L_loop, bool use_prefetch, bool use_bis) {
2065 __ align(OptoLoopAlignment);
2066 __ BIND(L_loop);
2067 if (use_prefetch) {
2068 if (ArraycopySrcPrefetchDistance > 0) {
2069 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
2070 }
2071 if (ArraycopyDstPrefetchDistance > 0) {
2072 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
2073 }
2074 }
2075 __ ldx(from, 4, O4);
2076 __ ldx(from, 12, G4);
2077 __ inc(to, 16);
2078 __ inc(from, 16);
2079 __ deccc(count, 4); // Can we do next iteration after this one?
2081 __ srlx(O4, 32, G3);
2082 __ bset(G3, O3);
2083 __ sllx(O4, 32, O4);
2084 __ srlx(G4, 32, G3);
2085 __ bset(G3, O4);
2086 if (use_bis) {
2087 __ stxa(O3, to, -16);
2088 __ stxa(O4, to, -8);
2089 } else {
2090 __ stx(O3, to, -16);
2091 __ stx(O4, to, -8);
2092 }
2093 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2094 __ delayed()->sllx(G4, 32, O3);
2096 }
2098 //
2099 // Generate core code for disjoint int copy (and oop copy on 32-bit).
2100 // If "aligned" is true, the "from" and "to" addresses are assumed
2101 // to be heapword aligned.
2102 //
2103 // Arguments:
2104 // from: O0
2105 // to: O1
2106 // count: O2 treated as signed
2107 //
2108 void generate_disjoint_int_copy_core(bool aligned) {
2110 Label L_skip_alignment, L_aligned_copy;
2111 Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2113 const Register from = O0; // source array address
2114 const Register to = O1; // destination array address
2115 const Register count = O2; // elements count
2116 const Register offset = O5; // offset from start of arrays
2117 // O3, O4, G3, G4 are used as temp registers
2119 // 'aligned' == true when it is known statically during compilation
2120 // of this arraycopy call site that both 'from' and 'to' addresses
2121 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
2122 //
2123 // Aligned arrays have 4-byte alignment in the 32-bit VM
2124 // and 8-byte alignment in the 64-bit VM.
2125 //
2126 #ifdef _LP64
2127 if (!aligned)
2128 #endif
2129 {
2130 // The next check could be put under 'ifndef' since the code in
2131 // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
2133 // for short arrays, just do single element copy
2134 __ cmp(count, 5); // 4 + 1 (20 bytes)
2135 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2136 __ delayed()->mov(G0, offset);
2138 // copy 1 element to align 'to' on an 8 byte boundary
2139 __ andcc(to, 7, G0);
2140 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2141 __ delayed()->ld(from, 0, O3);
2142 __ inc(from, 4);
2143 __ inc(to, 4);
2144 __ dec(count);
2145 __ st(O3, to, -4);
2146 __ BIND(L_skip_alignment);
2148 // if arrays have same alignment mod 8, do 4 elements copy
2149 __ andcc(from, 7, G0);
2150 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2151 __ delayed()->ld(from, 0, O3);
2153 //
2154 // Load 2 aligned 8-bytes chunks and use one from previous iteration
2155 // to form 2 aligned 8-bytes chunks to store.
2156 //
2157 // copy_16_bytes_forward_with_shift() is not used here since this
2158 // code is more optimal.
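// In pseudocode the shifted copy is roughly (sketch only; exact offsets and
// registers are in copy_16_bytes_loop() above):
//   carry = low half of the first source word, moved to the high half;
//   loop: w0, w1 = two aligned 8-byte loads from the source;
//         store8(carry | hi(w0));  store8(lo(w0) | hi(w1));
//         carry = lo(w1) moved to the high half;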
2160 // copy with shift 4 elements (16 bytes) at a time
2161 __ dec(count, 4); // The cmp at the beginning guarantees count >= 4
2162 __ sllx(O3, 32, O3);
2164 disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop);
2166 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2167 __ delayed()->inc(count, 4); // restore 'count'
2169 __ BIND(L_aligned_copy);
2170 } // !aligned
2172 // copy 4 elements (16 bytes) at a time
2173 __ and3(count, 1, G4); // Save
2174 __ srl(count, 1, count);
2175 generate_disjoint_long_copy_core(aligned);
2176 __ mov(G4, count); // Restore
2178 // copy 1 element at a time
2179 __ BIND(L_copy_4_bytes);
2180 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2181 __ BIND(L_copy_4_bytes_loop);
2182 __ ld(from, offset, O3);
2183 __ deccc(count);
2184 __ st(O3, to, offset);
2185 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2186 __ delayed()->inc(offset, 4);
2187 __ BIND(L_exit);
2188 }
2190 //
2191 // Generate stub for disjoint int copy. If "aligned" is true, the
2192 // "from" and "to" addresses are assumed to be heapword aligned.
2193 //
2194 // Arguments for generated stub:
2195 // from: O0
2196 // to: O1
2197 // count: O2 treated as signed
2198 //
2199 address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
2200 __ align(CodeEntryAlignment);
2201 StubCodeMark mark(this, "StubRoutines", name);
2202 address start = __ pc();
2204 const Register count = O2;
2205 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2207 if (entry != NULL) {
2208 *entry = __ pc();
2209 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2210 BLOCK_COMMENT("Entry:");
2211 }
2213 generate_disjoint_int_copy_core(aligned);
2215 // O3, O4 are used as temp registers
2216 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2217 __ retl();
2218 __ delayed()->mov(G0, O0); // return 0
2219 return start;
2220 }
2222 //
2223 // Generate core code for conjoint int copy (and oop copy on 32-bit).
2224 // If "aligned" is true, the "from" and "to" addresses are assumed
2225 // to be heapword aligned.
2226 //
2227 // Arguments:
2228 // from: O0
2229 // to: O1
2230 // count: O2 treated as signed
2231 //
2232 void generate_conjoint_int_copy_core(bool aligned) {
2233 // Do reverse copy.
2235 Label L_skip_alignment, L_aligned_copy;
2236 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2238 const Register from = O0; // source array address
2239 const Register to = O1; // destination array address
2240 const Register count = O2; // elements count
2241 const Register end_from = from; // source array end address
2242 const Register end_to = to; // destination array end address
2243 // O3, O4, O5, G3 are used as temp registers
2245 const Register byte_count = O3; // bytes count to copy
2247 __ sllx(count, LogBytesPerInt, byte_count);
2248 __ add(to, byte_count, end_to); // offset after last copied element
2250 __ cmp(count, 5); // for short arrays, just do single element copy
2251 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2252 __ delayed()->add(from, byte_count, end_from);
2254 // copy 1 element to align 'to' on an 8 byte boundary
2255 __ andcc(end_to, 7, G0);
2256 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2257 __ delayed()->nop();
2258 __ dec(count);
2259 __ dec(end_from, 4);
2260 __ dec(end_to, 4);
2261 __ ld(end_from, 0, O4);
2262 __ st(O4, end_to, 0);
2263 __ BIND(L_skip_alignment);
2265 // Check if 'end_from' and 'end_to' have the same alignment.
2266 __ andcc(end_from, 7, G0);
2267 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2268 __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
2270 // copy with shift 4 elements (16 bytes) at a time
2271 //
2272 // Load 2 aligned 8-bytes chunks and use one from previous iteration
2273 // to form 2 aligned 8-bytes chunks to store.
2274 //
2275 __ ldx(end_from, -4, O3);
2276 __ align(OptoLoopAlignment);
2277 __ BIND(L_copy_16_bytes);
2278 __ ldx(end_from, -12, O4);
2279 __ deccc(count, 4);
2280 __ ldx(end_from, -20, O5);
2281 __ dec(end_to, 16);
2282 __ dec(end_from, 16);
2283 __ srlx(O3, 32, O3);
2284 __ sllx(O4, 32, G3);
2285 __ bset(G3, O3);
2286 __ stx(O3, end_to, 8);
2287 __ srlx(O4, 32, O4);
2288 __ sllx(O5, 32, G3);
2289 __ bset(O4, G3);
2290 __ stx(G3, end_to, 0);
2291 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2292 __ delayed()->mov(O5, O3);
2294 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2295 __ delayed()->inc(count, 4);
2297 // copy 4 elements (16 bytes) at a time
2298 __ align(OptoLoopAlignment);
2299 __ BIND(L_aligned_copy);
2300 __ dec(end_from, 16);
2301 __ ldx(end_from, 8, O3);
2302 __ ldx(end_from, 0, O4);
2303 __ dec(end_to, 16);
2304 __ deccc(count, 4);
2305 __ stx(O3, end_to, 8);
2306 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2307 __ delayed()->stx(O4, end_to, 0);
2308 __ inc(count, 4);
2310 // copy 1 element (4 bytes) at a time
2311 __ BIND(L_copy_4_bytes);
2312 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2313 __ BIND(L_copy_4_bytes_loop);
2314 __ dec(end_from, 4);
2315 __ dec(end_to, 4);
2316 __ ld(end_from, 0, O4);
2317 __ deccc(count);
2318 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2319 __ delayed()->st(O4, end_to, 0);
2320 __ BIND(L_exit);
2321 }
2323 //
2324 // Generate stub for conjoint int copy. If "aligned" is true, the
2325 // "from" and "to" addresses are assumed to be heapword aligned.
2326 //
2327 // Arguments for generated stub:
2328 // from: O0
2329 // to: O1
2330 // count: O2 treated as signed
2331 //
2332 address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2333 address *entry, const char *name) {
2334 __ align(CodeEntryAlignment);
2335 StubCodeMark mark(this, "StubRoutines", name);
2336 address start = __ pc();
2338 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2340 if (entry != NULL) {
2341 *entry = __ pc();
2342 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2343 BLOCK_COMMENT("Entry:");
2344 }
2346 array_overlap_test(nooverlap_target, 2);
2348 generate_conjoint_int_copy_core(aligned);
2350 // O3, O4 are used as temp registers
2351 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2352 __ retl();
2353 __ delayed()->mov(G0, O0); // return 0
2354 return start;
2355 }
2357 //
2358 // Helper methods for generate_disjoint_long_copy_core()
2359 //
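// The 'use_bis' variant of the loop below emits block-initializing stores
// (stxa) instead of plain stx; on CPUs supporting BIS this is intended to
// avoid fetching destination cache lines that are about to be completely
// overwritten. 'use_prefetch' issues software prefetches ahead of the copy.
// The variant to generate is chosen at stub-generation time by
// disjoint_copy_core(), which calls this helper.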
2360 void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2361 Label& L_loop, bool use_prefetch, bool use_bis) {
2362 __ align(OptoLoopAlignment);
2363 __ BIND(L_loop);
2364 for (int off = 0; off < 64; off += 16) {
2365 if (use_prefetch && (off & 31) == 0) {
2366 if (ArraycopySrcPrefetchDistance > 0) {
2367 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
2368 }
2369 if (ArraycopyDstPrefetchDistance > 0) {
2370 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
2371 }
2372 }
2373 __ ldx(from, off+0, O4);
2374 __ ldx(from, off+8, O5);
2375 if (use_bis) {
2376 __ stxa(O4, to, off+0);
2377 __ stxa(O5, to, off+8);
2378 } else {
2379 __ stx(O4, to, off+0);
2380 __ stx(O5, to, off+8);
2381 }
2382 }
2383 __ deccc(count, 8);
2384 __ inc(from, 64);
2385 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2386 __ delayed()->inc(to, 64);
2387 }
2389 //
2390 // Generate core code for disjoint long copy (and oop copy on 64-bit).
2391 // "aligned" is ignored, because we must make the stronger
2392 // assumption that both addresses are always 64-bit aligned.
2393 //
2394 // Arguments:
2395 // from: O0
2396 // to: O1
2397 // count: O2 treated as signed
2398 //
2399 // count -= 2;
2400 // if ( count >= 0 ) { // >= 2 elements
2401 // if ( count > 6) { // >= 8 elements
2402 // count -= 6; // original count - 8
2403 // do {
2404 // copy_8_elements;
2405 // count -= 8;
2406 // } while ( count >= 0 );
2407 // count += 6;
2408 // }
2409 // if ( count >= 0 ) { // >= 2 elements
2410 // do {
2411 // copy_2_elements;
2412 // } while ( (count=count-2) >= 0 );
2413 // }
2414 // }
2415 // count += 2;
2416 // if ( count != 0 ) { // 1 element left
2417 // copy_1_element;
2418 // }
2419 //
2420 void generate_disjoint_long_copy_core(bool aligned) {
2421 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2422 const Register from = O0; // source array address
2423 const Register to = O1; // destination array address
2424 const Register count = O2; // elements count
2425 const Register offset0 = O4; // element offset
2426 const Register offset8 = O5; // next element offset
2428 __ deccc(count, 2);
2429 __ mov(G0, offset0); // offset from start of arrays (0)
2430 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2431 __ delayed()->add(offset0, 8, offset8);
2433 // Copy in 64-byte chunks
2435 const Register from64 = O3; // source address
2436 const Register to64 = G3; // destination address
2437 __ subcc(count, 6, O3);
2438 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2439 __ delayed()->mov(to, to64);
2440 // Now we can use O4(offset0), O5(offset8) as temps
2441 __ mov(O3, count);
2442 // count >= 0 (original count - 8)
2443 __ mov(from, from64);
2445 disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop);
2447 // Restore O4(offset0), O5(offset8)
2448 __ sub(from64, from, offset0);
2449 __ inccc(count, 6); // restore count
2450 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2451 __ delayed()->add(offset0, 8, offset8);
2453 // Copy in 16-byte chunks
2454 __ align(OptoLoopAlignment);
2455 __ BIND(L_copy_16_bytes);
2456 __ ldx(from, offset0, O3);
2457 __ ldx(from, offset8, G3);
2458 __ deccc(count, 2);
2459 __ stx(O3, to, offset0);
2460 __ inc(offset0, 16);
2461 __ stx(G3, to, offset8);
2462 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2463 __ delayed()->inc(offset8, 16);
2465 // Copy last 8 bytes
2466 __ BIND(L_copy_8_bytes);
2467 __ inccc(count, 2);
2468 __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2469 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2470 __ ldx(from, offset0, O3);
2471 __ stx(O3, to, offset0);
2472 __ BIND(L_exit);
2473 }
2475 //
2476 // Generate stub for disjoint long copy.
2477 // "aligned" is ignored, because we must make the stronger
2478 // assumption that both addresses are always 64-bit aligned.
2479 //
2480 // Arguments for generated stub:
2481 // from: O0
2482 // to: O1
2483 // count: O2 treated as signed
2484 //
2485 address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2486 __ align(CodeEntryAlignment);
2487 StubCodeMark mark(this, "StubRoutines", name);
2488 address start = __ pc();
2490 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2492 if (entry != NULL) {
2493 *entry = __ pc();
2494 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2495 BLOCK_COMMENT("Entry:");
2496 }
2498 generate_disjoint_long_copy_core(aligned);
2500 // O3, O4 are used as temp registers
2501 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2502 __ retl();
2503 __ delayed()->mov(G0, O0); // return 0
2504 return start;
2505 }
2507 //
2508 // Generate core code for conjoint long copy (and oop copy on 64-bit).
2509 // "aligned" is ignored, because we must make the stronger
2510 // assumption that both addresses are always 64-bit aligned.
2511 //
2512 // Arguments:
2513 // from: O0
2514 // to: O1
2515 // count: O2 treated as signed
2516 //
2517 void generate_conjoint_long_copy_core(bool aligned) {
2518 // Do reverse copy.
2519 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2520 const Register from = O0; // source array address
2521 const Register to = O1; // destination array address
2522 const Register count = O2; // elements count
2523 const Register offset8 = O4; // element offset
2524 const Register offset0 = O5; // previous element offset
2526 __ subcc(count, 1, count);
2527 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2528 __ delayed()->sllx(count, LogBytesPerLong, offset8);
2529 __ sub(offset8, 8, offset0);
2530 __ align(OptoLoopAlignment);
2531 __ BIND(L_copy_16_bytes);
2532 __ ldx(from, offset8, O2);
2533 __ ldx(from, offset0, O3);
2534 __ stx(O2, to, offset8);
2535 __ deccc(offset8, 16); // use offset8 as counter
2536 __ stx(O3, to, offset0);
2537 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2538 __ delayed()->dec(offset0, 16);
2540 __ BIND(L_copy_8_bytes);
2541 __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2542 __ delayed()->nop();
2543 __ ldx(from, 0, O3);
2544 __ stx(O3, to, 0);
2545 __ BIND(L_exit);
2546 }
2548 // Generate stub for conjoint long copy.
2549 // "aligned" is ignored, because we must make the stronger
2550 // assumption that both addresses are always 64-bit aligned.
2551 //
2552 // Arguments for generated stub:
2553 // from: O0
2554 // to: O1
2555 // count: O2 treated as signed
2556 //
2557 address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2558 address *entry, const char *name) {
2559 __ align(CodeEntryAlignment);
2560 StubCodeMark mark(this, "StubRoutines", name);
2561 address start = __ pc();
2563 assert(aligned, "Should always be aligned");
2565 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2567 if (entry != NULL) {
2568 *entry = __ pc();
2569 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2570 BLOCK_COMMENT("Entry:");
2571 }
2573 array_overlap_test(nooverlap_target, 3);
2575 generate_conjoint_long_copy_core(aligned);
2577 // O3, O4 are used as temp registers
2578 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2579 __ retl();
2580 __ delayed()->mov(G0, O0); // return 0
2581 return start;
2582 }
2584 // Generate stub for disjoint oop copy. If "aligned" is true, the
2585 // "from" and "to" addresses are assumed to be heapword aligned.
2586 //
2587 // Arguments for generated stub:
2588 // from: O0
2589 // to: O1
2590 // count: O2 treated as signed
2591 //
2592 address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2593 bool dest_uninitialized = false) {
2595 const Register from = O0; // source array address
2596 const Register to = O1; // destination array address
2597 const Register count = O2; // elements count
2599 __ align(CodeEntryAlignment);
2600 StubCodeMark mark(this, "StubRoutines", name);
2601 address start = __ pc();
2603 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2605 if (entry != NULL) {
2606 *entry = __ pc();
2607 // caller can pass a 64-bit byte count here
2608 BLOCK_COMMENT("Entry:");
2609 }
2611 // save arguments for barrier generation
2612 __ mov(to, G1);
2613 __ mov(count, G5);
2614 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2615 #ifdef _LP64
2616 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2617 if (UseCompressedOops) {
2618 generate_disjoint_int_copy_core(aligned);
2619 } else {
2620 generate_disjoint_long_copy_core(aligned);
2621 }
2622 #else
2623 generate_disjoint_int_copy_core(aligned);
2624 #endif
2625 // O0 is used as temp register
2626 gen_write_ref_array_post_barrier(G1, G5, O0);
2628 // O3, O4 are used as temp registers
2629 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2630 __ retl();
2631 __ delayed()->mov(G0, O0); // return 0
2632 return start;
2633 }
2635 // Generate stub for conjoint oop copy. If "aligned" is true, the
2636 // "from" and "to" addresses are assumed to be heapword aligned.
2637 //
2638 // Arguments for generated stub:
2639 // from: O0
2640 // to: O1
2641 // count: O2 treated as signed
2642 //
2643 address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2644 address *entry, const char *name,
2645 bool dest_uninitialized = false) {
2647 const Register from = O0; // source array address
2648 const Register to = O1; // destination array address
2649 const Register count = O2; // elements count
2651 __ align(CodeEntryAlignment);
2652 StubCodeMark mark(this, "StubRoutines", name);
2653 address start = __ pc();
2655 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2657 if (entry != NULL) {
2658 *entry = __ pc();
2659 // caller can pass a 64-bit byte count here
2660 BLOCK_COMMENT("Entry:");
2661 }
2663 array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2665 // save arguments for barrier generation
2666 __ mov(to, G1);
2667 __ mov(count, G5);
2668 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2670 #ifdef _LP64
2671 if (UseCompressedOops) {
2672 generate_conjoint_int_copy_core(aligned);
2673 } else {
2674 generate_conjoint_long_copy_core(aligned);
2675 }
2676 #else
2677 generate_conjoint_int_copy_core(aligned);
2678 #endif
2680 // O0 is used as temp register
2681 gen_write_ref_array_post_barrier(G1, G5, O0);
2683 // O3, O4 are used as temp registers
2684 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2685 __ retl();
2686 __ delayed()->mov(G0, O0); // return 0
2687 return start;
2688 }
2691 // Helper for generating a dynamic type check.
2692 // Smashes only the given temp registers.
2693 void generate_type_check(Register sub_klass,
2694 Register super_check_offset,
2695 Register super_klass,
2696 Register temp,
2697 Label& L_success) {
2698 assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2700 BLOCK_COMMENT("type_check:");
2702 Label L_miss, L_pop_to_miss;
2704 assert_clean_int(super_check_offset, temp);
2706 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2707 &L_success, &L_miss, NULL,
2708 super_check_offset);
2710 BLOCK_COMMENT("type_check_slow_path:");
2711 __ save_frame(0);
2712 __ check_klass_subtype_slow_path(sub_klass->after_save(),
2713 super_klass->after_save(),
2714 L0, L1, L2, L4,
2715 NULL, &L_pop_to_miss);
2716 __ ba(L_success);
2717 __ delayed()->restore();
2719 __ bind(L_pop_to_miss);
2720 __ restore();
2722 // Fall through on failure!
2723 __ BIND(L_miss);
2724 }
2727 // Generate stub for checked oop copy.
2728 //
2729 // Arguments for generated stub:
2730 // from: O0
2731 // to: O1
2732 // count: O2 treated as signed
2733 // ckoff: O3 (super_check_offset)
2734 // ckval: O4 (super_klass)
2735 // ret: O0 zero for success; (-1^K) where K is partial transfer count
2736 //
2737 address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2739 const Register O0_from = O0; // source array address
2740 const Register O1_to = O1; // destination array address
2741 const Register O2_count = O2; // elements count
2742 const Register O3_ckoff = O3; // super_check_offset
2743 const Register O4_ckval = O4; // super_klass
2745 const Register O5_offset = O5; // loop var, with stride wordSize
2746 const Register G1_remain = G1; // loop var, with stride -1
2747 const Register G3_oop = G3; // actual oop copied
2748 const Register G4_klass = G4; // oop._klass
2749 const Register G5_super = G5; // oop._klass._primary_supers[ckval]
2751 __ align(CodeEntryAlignment);
2752 StubCodeMark mark(this, "StubRoutines", name);
2753 address start = __ pc();
2755 #ifdef ASSERT
2756 // We sometimes save a frame (see generate_type_check below).
2757 // If this will cause trouble, let's fail now instead of later.
2758 __ save_frame(0);
2759 __ restore();
2760 #endif
2762 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int.
2764 #ifdef ASSERT
2765 // caller guarantees that the arrays really are different
2766 // otherwise, we would have to make conjoint checks
2767 { Label L;
2768 __ mov(O3, G1); // spill: overlap test smashes O3
2769 __ mov(O4, G4); // spill: overlap test smashes O4
2770 array_overlap_test(L, LogBytesPerHeapOop);
2771 __ stop("checkcast_copy within a single array");
2772 __ bind(L);
2773 __ mov(G1, O3);
2774 __ mov(G4, O4);
2775 }
2776 #endif //ASSERT
2778 if (entry != NULL) {
2779 *entry = __ pc();
2780 // caller can pass a 64-bit byte count here (from generic stub)
2781 BLOCK_COMMENT("Entry:");
2782 }
2783 gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
2785 Label load_element, store_element, do_card_marks, fail, done;
2786 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it
2787 __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2788 __ delayed()->mov(G0, O5_offset); // offset from start of arrays
2790 // Empty array: Nothing to do.
2791 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2792 __ retl();
2793 __ delayed()->set(0, O0); // return 0 on (trivial) success
2795 // ======== begin loop ========
2796 // (Loop is rotated; its entry is load_element.)
2797 // Loop variables:
2798 // (O5 = 0; ; O5 += heapOopSize) --- offset from src, dest arrays
2799 // (G1 = len; G1 != 0; G1--) --- number of oops *remaining*
2800 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super
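// In outline the rotated loop is (sketch only):
//   for (; G1_remain != 0; G1_remain--, O5_offset += heapOopSize) {
//     oop o = src[O5_offset];
//     if (o != NULL && !subtype_check(o->klass(), ckval)) break; // partial copy
//     dst[O5_offset] = o;                                        // store_heap_oop
//   }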
2801 __ align(OptoLoopAlignment);
2803 __ BIND(store_element);
2804 __ deccc(G1_remain); // decrement the count
2805 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2806 __ inc(O5_offset, heapOopSize); // step to next offset
2807 __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
2808 __ delayed()->set(0, O0); // return 0 on success
2810 // ======== loop entry is here ========
2811 __ BIND(load_element);
2812 __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop
2813 __ br_null_short(G3_oop, Assembler::pt, store_element);
2815 __ load_klass(G3_oop, G4_klass); // query the object klass
2817 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2818 // branch to this on success:
2819 store_element);
2820 // ======== end loop ========
2822 // It was a real error; we must depend on the caller to finish the job.
2823 // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2824 // Emit GC store barriers for the oops we have copied (O2 minus G1),
2825 // and report their number to the caller.
2826 __ BIND(fail);
2827 __ subcc(O2_count, G1_remain, O2_count);
2828 __ brx(Assembler::zero, false, Assembler::pt, done);
2829 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller
2831 __ BIND(do_card_marks);
2832 gen_write_ref_array_post_barrier(O1_to, O2_count, O3); // store check on O1[0..O2]
2834 __ BIND(done);
2835 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2836 __ retl();
2837 __ delayed()->nop(); // return value in O0
2839 return start;
2840 }
2843 // Generate 'unsafe' array copy stub
2844 // Though just as safe as the other stubs, it takes an unscaled
2845 // size_t argument instead of an element count.
2846 //
2847 // Arguments for generated stub:
2848 // from: O0
2849 // to: O1
2850 // count: O2 byte count, treated as ssize_t, can be zero
2851 //
2852 // Examines the alignment of the operands and dispatches
2853 // to a long, int, short, or byte copy loop.
2854 //
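// In pseudocode the dispatch below is roughly:
//   bits = (uintptr_t)from | (uintptr_t)to | count;
//   if ((bits & 7) == 0) { count >>= 3; goto long_copy;  }
//   if ((bits & 3) == 0) { count >>= 2; goto int_copy;   }
//   if ((bits & 1) == 0) { count >>= 1; goto short_copy; }
//   goto byte_copy;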
2855 address generate_unsafe_copy(const char* name,
2856 address byte_copy_entry,
2857 address short_copy_entry,
2858 address int_copy_entry,
2859 address long_copy_entry) {
2861 const Register O0_from = O0; // source array address
2862 const Register O1_to = O1; // destination array address
2863 const Register O2_count = O2; // elements count
2865 const Register G1_bits = G1; // test copy of low bits
2867 __ align(CodeEntryAlignment);
2868 StubCodeMark mark(this, "StubRoutines", name);
2869 address start = __ pc();
2871 // bump this on entry, not on exit:
2872 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2874 __ or3(O0_from, O1_to, G1_bits);
2875 __ or3(O2_count, G1_bits, G1_bits);
2877 __ btst(BytesPerLong-1, G1_bits);
2878 __ br(Assembler::zero, true, Assembler::pt,
2879 long_copy_entry, relocInfo::runtime_call_type);
2880 // scale the count on the way out:
2881 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2883 __ btst(BytesPerInt-1, G1_bits);
2884 __ br(Assembler::zero, true, Assembler::pt,
2885 int_copy_entry, relocInfo::runtime_call_type);
2886 // scale the count on the way out:
2887 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2889 __ btst(BytesPerShort-1, G1_bits);
2890 __ br(Assembler::zero, true, Assembler::pt,
2891 short_copy_entry, relocInfo::runtime_call_type);
2892 // scale the count on the way out:
2893 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2895 __ br(Assembler::always, false, Assembler::pt,
2896 byte_copy_entry, relocInfo::runtime_call_type);
2897 __ delayed()->nop();
2899 return start;
2900 }
2903 // Perform range checks on the proposed arraycopy.
2904 // Kills the two temps, but nothing else.
2905 // Also, clean the sign bits of src_pos and dst_pos.
2906 void arraycopy_range_checks(Register src, // source array oop (O0)
2907 Register src_pos, // source position (O1)
2908 Register dst, // destination array oop (O2)
2909 Register dst_pos, // destination position (O3)
2910 Register length, // length of copy (O4)
2911 Register temp1, Register temp2,
2912 Label& L_failed) {
2913 BLOCK_COMMENT("arraycopy_range_checks:");
2915 // if (src_pos + length > arrayOop(src)->length() ) FAIL;
2917 const Register array_length = temp1; // scratch
2918 const Register end_pos = temp2; // scratch
2920 // Note: This next instruction may be in the delay slot of a branch:
2921 __ add(length, src_pos, end_pos); // src_pos + length
2922 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2923 __ cmp(end_pos, array_length);
2924 __ br(Assembler::greater, false, Assembler::pn, L_failed);
2926 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2927 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2928 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2929 __ cmp(end_pos, array_length);
2930 __ br(Assembler::greater, false, Assembler::pn, L_failed);
2932 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2933 // Move with sign extension can be used since they are positive.
2934 __ delayed()->signx(src_pos, src_pos);
2935 __ signx(dst_pos, dst_pos);
2937 BLOCK_COMMENT("arraycopy_range_checks done");
2938 }
2941 //
2942 // Generate generic array copy stubs
2943 //
2944 // Input:
2945 // O0 - src oop
2946 // O1 - src_pos
2947 // O2 - dst oop
2948 // O3 - dst_pos
2949 // O4 - element count
2950 //
2951 // Output:
2952 // O0 == 0 - success
2953 // O0 == -1 - need to call System.arraycopy
2954 //
2955 address generate_generic_copy(const char *name,
2956 address entry_jbyte_arraycopy,
2957 address entry_jshort_arraycopy,
2958 address entry_jint_arraycopy,
2959 address entry_oop_arraycopy,
2960 address entry_jlong_arraycopy,
2961 address entry_checkcast_arraycopy) {
2962 Label L_failed, L_objArray;
2964 // Input registers
2965 const Register src = O0; // source array oop
2966 const Register src_pos = O1; // source position
2967 const Register dst = O2; // destination array oop
2968 const Register dst_pos = O3; // destination position
2969 const Register length = O4; // elements count
2971 // registers used as temp
2972 const Register G3_src_klass = G3; // source array klass
2973 const Register G4_dst_klass = G4; // destination array klass
2974 const Register G5_lh = G5; // layout helper
2975 const Register O5_temp = O5;
2977 __ align(CodeEntryAlignment);
2978 StubCodeMark mark(this, "StubRoutines", name);
2979 address start = __ pc();
2981 // bump this on entry, not on exit:
2982 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2984 // In principle, the int arguments could be dirty.
2985 //assert_clean_int(src_pos, G1);
2986 //assert_clean_int(dst_pos, G1);
2987 //assert_clean_int(length, G1);
2989 //-----------------------------------------------------------------------
2990 // Assembler stubs will be used for this call to arraycopy
2991 // if the following conditions are met:
2992 //
2993 // (1) src and dst must not be null.
2994 // (2) src_pos must not be negative.
2995 // (3) dst_pos must not be negative.
2996 // (4) length must not be negative.
2997 // (5) src klass and dst klass should be the same and not NULL.
2998 // (6) src and dst should be arrays.
2999 // (7) src_pos + length must not exceed length of src.
3000 // (8) dst_pos + length must not exceed length of dst.
3001 BLOCK_COMMENT("arraycopy initial argument checks");
3003 // if (src == NULL) return -1;
3004 __ br_null(src, false, Assembler::pn, L_failed);
3006 // if (src_pos < 0) return -1;
3007 __ delayed()->tst(src_pos);
3008 __ br(Assembler::negative, false, Assembler::pn, L_failed);
3009 __ delayed()->nop();
3011 // if (dst == NULL) return -1;
3012 __ br_null(dst, false, Assembler::pn, L_failed);
3014 // if (dst_pos < 0) return -1;
3015 __ delayed()->tst(dst_pos);
3016 __ br(Assembler::negative, false, Assembler::pn, L_failed);
3018 // if (length < 0) return -1;
3019 __ delayed()->tst(length);
3020 __ br(Assembler::negative, false, Assembler::pn, L_failed);
3022 BLOCK_COMMENT("arraycopy argument klass checks");
3023 // get src->klass()
3024 if (UseCompressedOops) {
3025 __ delayed()->nop(); // ??? not good
3026 __ load_klass(src, G3_src_klass);
3027 } else {
3028 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
3029 }
3031 #ifdef ASSERT
3032 // assert(src->klass() != NULL);
3033 BLOCK_COMMENT("assert klasses not null");
3034 { Label L_a, L_b;
3035 __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
3036 __ bind(L_a);
3037 __ stop("broken null klass");
3038 __ bind(L_b);
3039 __ load_klass(dst, G4_dst_klass);
3040 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
3041 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp
3042 BLOCK_COMMENT("assert done");
3043 }
3044 #endif
3046 // Load layout helper
3047 //
3048 //   |array_tag|     | header_size | element_type |     |log2_element_size|
3049 //    32        30    24            16              8     2                 0
3050 //
3051 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3052 //
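// The primitive-array path below decodes the layout helper roughly as:
//   header_size    = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
//   log2_elem_size =  lh & _lh_log2_element_size_mask;
//   elem_addr      = array + header_size + (pos << log2_elem_size);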
3054 int lh_offset = klassOopDesc::header_size() * HeapWordSize +
3055 Klass::layout_helper_offset_in_bytes();
3057 // Load 32-bits signed value. Use br() instruction with it to check icc.
3058 __ lduw(G3_src_klass, lh_offset, G5_lh);
3060 if (UseCompressedOops) {
3061 __ load_klass(dst, G4_dst_klass);
3062 }
3063 // Handle objArrays completely differently...
3064 juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3065 __ set(objArray_lh, O5_temp);
3066 __ cmp(G5_lh, O5_temp);
3067 __ br(Assembler::equal, false, Assembler::pt, L_objArray);
3068 if (UseCompressedOops) {
3069 __ delayed()->nop();
3070 } else {
3071 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
3072 }
3074 // if (src->klass() != dst->klass()) return -1;
3075 __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
3077 // if (!src->is_Array()) return -1;
3078 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
3079 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
3081 // At this point, it is known to be a typeArray (array_tag 0x3).
3082 #ifdef ASSERT
3083 __ delayed()->nop();
3084 { Label L;
3085 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
3086 __ set(lh_prim_tag_in_place, O5_temp);
3087 __ cmp(G5_lh, O5_temp);
3088 __ br(Assembler::greaterEqual, false, Assembler::pt, L);
3089 __ delayed()->nop();
3090 __ stop("must be a primitive array");
3091 __ bind(L);
3092 }
3093 #else
3094 __ delayed(); // match next insn to prev branch
3095 #endif
3097 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3098 O5_temp, G4_dst_klass, L_failed);
3100 // typeArrayKlass
3101 //
3102 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3103 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3104 //
3106 const Register G4_offset = G4_dst_klass; // array offset
3107 const Register G3_elsize = G3_src_klass; // log2 element size
3109 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
3110 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
3111 __ add(src, G4_offset, src); // src array offset
3112 __ add(dst, G4_offset, dst); // dst array offset
3113 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
3115 // next registers should be set before the jump to corresponding stub
3116 const Register from = O0; // source array address
3117 const Register to = O1; // destination array address
3118 const Register count = O2; // elements count
3120 // 'from', 'to', 'count' registers should be set in this order
3121 // since they are the same as 'src', 'src_pos', 'dst'.
3123 BLOCK_COMMENT("scale indexes to element size");
3124 __ sll_ptr(src_pos, G3_elsize, src_pos);
3125 __ sll_ptr(dst_pos, G3_elsize, dst_pos);
3126 __ add(src, src_pos, from); // src_addr
3127 __ add(dst, dst_pos, to); // dst_addr
3129 BLOCK_COMMENT("choose copy loop based on element size");
3130 __ cmp(G3_elsize, 0);
3131 __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
3132 __ delayed()->signx(length, count); // length
3134 __ cmp(G3_elsize, LogBytesPerShort);
3135 __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
3136 __ delayed()->signx(length, count); // length
3138 __ cmp(G3_elsize, LogBytesPerInt);
3139 __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
3140 __ delayed()->signx(length, count); // length
3141 #ifdef ASSERT
3142 { Label L;
3143 __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
3144 __ stop("must be long copy, but elsize is wrong");
3145 __ bind(L);
3146 }
3147 #endif
3148 __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
3149 __ delayed()->signx(length, count); // length
3151 // objArrayKlass
3152 __ BIND(L_objArray);
3153 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
3155 Label L_plain_copy, L_checkcast_copy;
3156 // test array classes for subtyping
3157 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality
3158 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
3159 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
3161 // Identically typed arrays can be copied without element-wise checks.
3162 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3163 O5_temp, G5_lh, L_failed);
3165 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3166 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3167 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3168 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3169 __ add(src, src_pos, from); // src_addr
3170 __ add(dst, dst_pos, to); // dst_addr
3171 __ BIND(L_plain_copy);
3172 __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
3173 __ delayed()->signx(length, count); // length
3175 __ BIND(L_checkcast_copy);
3176 // live at this point: G3_src_klass, G4_dst_klass
3177 {
3178 // Before looking at dst.length, make sure dst is also an objArray.
3179 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
3180 __ cmp(G5_lh, O5_temp);
3181 __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
3183 // It is safe to examine both src.length and dst.length.
3184 __ delayed(); // match next insn to prev branch
3185 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3186 O5_temp, G5_lh, L_failed);
3188 // Marshal the base address arguments now, freeing registers.
3189 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3190 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3191 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3192 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3193 __ add(src, src_pos, from); // src_addr
3194 __ add(dst, dst_pos, to); // dst_addr
3195 __ signx(length, count); // length (reloaded)
3197 Register sco_temp = O3; // this register is free now
3198 assert_different_registers(from, to, count, sco_temp,
3199 G4_dst_klass, G3_src_klass);
3201 // Generate the type check.
3202 int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
3203 Klass::super_check_offset_offset_in_bytes());
3204 __ lduw(G4_dst_klass, sco_offset, sco_temp);
3205 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
3206 O5_temp, L_plain_copy);
3208 // Fetch destination element klass from the objArrayKlass header.
3209 int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
3210 objArrayKlass::element_klass_offset_in_bytes());
3212 // the checkcast_copy loop needs two extra arguments:
3213 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass
3214 // lduw(O4, sco_offset, O3); // sco of elem klass
3216 __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
3217 __ delayed()->lduw(O4, sco_offset, O3);
3218 }
3220 __ BIND(L_failed);
3221 __ retl();
3222 __ delayed()->sub(G0, 1, O0); // return -1
3223 return start;
3224 }
3226 //
3227 // Generate stub for heap zeroing.
3228 // "to" address is aligned to jlong (8 bytes).
3229 //
3230 // Arguments for generated stub:
3231 // to: O0
3232 // count: O1 treated as signed (count of HeapWord)
3233 // count could be 0
3234 //
3235 address generate_zero_aligned_words(const char* name) {
3236 __ align(CodeEntryAlignment);
3237 StubCodeMark mark(this, "StubRoutines", name);
3238 address start = __ pc();
3240 const Register to = O0; // destination address (area to zero)
3241 const Register count = O1; // HeapWords count
3242 const Register temp = O2; // scratch
3244 Label Ldone;
3245 __ sllx(count, LogHeapWordSize, count); // to bytes count
3246 // Use BIS for zeroing
3247 __ bis_zeroing(to, count, temp, Ldone);
3248 __ bind(Ldone);
3249 __ retl();
3250 __ delayed()->nop();
3251 return start;
3252 }
3254 void generate_arraycopy_stubs() {
3255 address entry;
3256 address entry_jbyte_arraycopy;
3257 address entry_jshort_arraycopy;
3258 address entry_jint_arraycopy;
3259 address entry_oop_arraycopy;
3260 address entry_jlong_arraycopy;
3261 address entry_checkcast_arraycopy;
3263 //*** jbyte
3264 // Always need aligned and unaligned versions
3265 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
3266 "jbyte_disjoint_arraycopy");
3267 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry,
3268 &entry_jbyte_arraycopy,
3269 "jbyte_arraycopy");
3270 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3271 "arrayof_jbyte_disjoint_arraycopy");
3272 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL,
3273 "arrayof_jbyte_arraycopy");
3275 //*** jshort
3276 // Always need aligned and unaligned versions
3277 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3278 "jshort_disjoint_arraycopy");
3279 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry,
3280 &entry_jshort_arraycopy,
3281 "jshort_arraycopy");
3282 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3283 "arrayof_jshort_disjoint_arraycopy");
3284 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL,
3285 "arrayof_jshort_arraycopy");
3287 //*** jint
3288 // Aligned versions
3289 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3290 "arrayof_jint_disjoint_arraycopy");
3291 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3292 "arrayof_jint_arraycopy");
3293 #ifdef _LP64
3294 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
3295 // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3296 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
3297 "jint_disjoint_arraycopy");
3298 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry,
3299 &entry_jint_arraycopy,
3300 "jint_arraycopy");
3301 #else
3302 // In 32 bit jints are always HeapWordSize aligned, so always use the aligned version
3303 // (in fact in 32bit we always have a pre-loop part even in the aligned version,
3304 // because it uses 64-bit loads/stores, so the aligned flag is actually ignored).
3305 StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy;
3306 StubRoutines::_jint_arraycopy = StubRoutines::_arrayof_jint_arraycopy;
3307 #endif
3310 //*** jlong
3311 // It is always aligned
3312 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3313 "arrayof_jlong_disjoint_arraycopy");
3314 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3315 "arrayof_jlong_arraycopy");
3316 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3317 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
3320 //*** oops
3321 // Aligned versions
3322 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry,
3323 "arrayof_oop_disjoint_arraycopy");
3324 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3325 "arrayof_oop_arraycopy");
3326 // Aligned versions without pre-barriers
3327 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3328 "arrayof_oop_disjoint_arraycopy_uninit",
3329 /*dest_uninitialized*/true);
3330 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL,
3331 "arrayof_oop_arraycopy_uninit",
3332 /*dest_uninitialized*/true);
3333 #ifdef _LP64
3334 if (UseCompressedOops) {
3335       // With compressed oops we need unaligned versions; notice that we overwrite entry_oop_arraycopy.
3336 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry,
3337 "oop_disjoint_arraycopy");
3338 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3339 "oop_arraycopy");
3340 // Unaligned versions without pre-barriers
3341 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry,
3342 "oop_disjoint_arraycopy_uninit",
3343 /*dest_uninitialized*/true);
3344 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL,
3345 "oop_arraycopy_uninit",
3346 /*dest_uninitialized*/true);
3347 } else
3348 #endif
3349 {
3350       // oop arraycopy is always aligned on 32-bit, and on 64-bit without compressed oops
3351 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3352 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
3353 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3354 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
3355 }
3357 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3358 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3359 /*dest_uninitialized*/true);
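    // The unsafe and generic copy stubs below dispatch at run time to the
    // type-specific entry points captured in the entry_* locals above.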
3361 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
3362 entry_jbyte_arraycopy,
3363 entry_jshort_arraycopy,
3364 entry_jint_arraycopy,
3365 entry_jlong_arraycopy);
3366 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3367 entry_jbyte_arraycopy,
3368 entry_jshort_arraycopy,
3369 entry_jint_arraycopy,
3370 entry_oop_arraycopy,
3371 entry_jlong_arraycopy,
3372 entry_checkcast_arraycopy);
3374 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3375 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3376 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3377 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3378 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3379 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
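    // Generated only when block zeroing with BIS is enabled (see UseBlockZeroing).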
3381 if (UseBlockZeroing) {
3382 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3383 }
3384 }
3386 void generate_initial() {
3387     // Generates the initial stubs and initializes the entry points
3389 //------------------------------------------------------------------------------------------------------------------------
3390 // entry points that exist in all platforms
3391     // Note: This is code that could be shared among different platforms; however, the benefit seems to be smaller than
3392     // the disadvantage of having a much more complicated generator structure. See also the comment in stubRoutines.hpp.
3393 StubRoutines::_forward_exception_entry = generate_forward_exception();
3395 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
3396 StubRoutines::_catch_exception_entry = generate_catch_exception();
3398 //------------------------------------------------------------------------------------------------------------------------
3399 // entry points that are platform specific
3400 StubRoutines::Sparc::_test_stop_entry = generate_test_stop();
3402 StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine();
3403 StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
3405 #if !defined(COMPILER2) && !defined(_LP64)
3406 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
3407 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
3408 StubRoutines::_atomic_add_entry = generate_atomic_add();
3409 StubRoutines::_atomic_xchg_ptr_entry = StubRoutines::_atomic_xchg_entry;
3410 StubRoutines::_atomic_cmpxchg_ptr_entry = StubRoutines::_atomic_cmpxchg_entry;
3411 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
3412 StubRoutines::_atomic_add_ptr_entry = StubRoutines::_atomic_add_entry;
3413 #endif // !COMPILER2 && !_LP64
3415 // Build this early so it's available for the interpreter. The
3416 // stub expects the required and actual type to already be in O1
3417 // and O2 respectively.
3418 StubRoutines::_throw_WrongMethodTypeException_entry =
3419 generate_throw_exception("WrongMethodTypeException throw_exception",
3420 CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
3421 false, G5_method_type, G3_method_handle);
3422 }
3425 void generate_all() {
3426 // Generates all stubs and initializes the entry points
3428 // Generate partial_subtype_check first here since its code depends on
3429 // UseZeroBaseCompressedOops which is defined after heap initialization.
3430 StubRoutines::Sparc::_partial_subtype_check = generate_partial_subtype_check();
3431 // These entry points require SharedInfo::stack0 to be set up in non-core builds
3432 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
3433 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
3434 StubRoutines::_throw_ArithmeticException_entry = generate_throw_exception("ArithmeticException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException), true);
3435 StubRoutines::_throw_NullPointerException_entry = generate_throw_exception("NullPointerException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
3436 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
3437 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
3439 StubRoutines::_handler_for_unsafe_access_entry =
3440 generate_handler_for_unsafe_access();
3442 // support for verify_oop (must happen after universe_init)
3443 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();
3445 // arraycopy stubs used by compilers
3446 generate_arraycopy_stubs();
3448 // Don't initialize the platform math functions since sparc
3449 // doesn't have intrinsics for these operations.
3450 }
3453 public:
3454 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3455 // replace the standard masm with a special one:
3456 _masm = new MacroAssembler(code);
3458 _stub_count = !all ? 0x100 : 0x200;
3459 if (all) {
3460 generate_all();
3461 } else {
3462 generate_initial();
3463 }
3465 // make sure this stub is available for all local calls
3466 if (_atomic_add_stub.is_unbound()) {
3467 // generate a second time, if necessary
3468 (void) generate_atomic_add();
3469 }
3470 }
3473 private:
3474 int _stub_count;
3475 void stub_prolog(StubCodeDesc* cdesc) {
3476 # ifdef ASSERT
3477 // put extra information in the stub code, to make it more readable
3478 #ifdef _LP64
3479 // Write the high part of the address
3480 // [RGV] Check if there is a dependency on the size of this prolog
3481 __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
3482 #endif
3483 __ emit_data((intptr_t)cdesc, relocInfo::none);
3484 __ emit_data(++_stub_count, relocInfo::none);
3485 # endif
3486 align(true);
3487 }
3489 void align(bool at_header = false) {
3490 // %%%%% move this constant somewhere else
3491 // UltraSPARC cache line size is 8 instructions:
3492 const unsigned int icache_line_size = 32;
3493 const unsigned int icache_half_line_size = 16;
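    // At a stub header we pad a whole cache line with data words (never
    // executed); elsewhere we pad only to a half line, using nops so that
    // execution can fall through the padding.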
3495 if (at_header) {
3496 while ((intptr_t)(__ pc()) % icache_line_size != 0) {
3497 __ emit_data(0, relocInfo::none);
3498 }
3499 } else {
3500 while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
3501 __ nop();
3502 }
3503 }
3504 }
3506 }; // end class declaration
3508 void StubGenerator_generate(CodeBuffer* code, bool all) {
3509 StubGenerator g(code, all);
3510 }