Tue, 27 Nov 2012 14:20:21 +0100
8003935: Simplify the needed includes for using Thread::current()
Reviewed-by: dholmes, rbackman, coleenp
/*
 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "assembler_sparc.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_sparc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Note:  The register L7 is used as L7_thread_cache, and may not be used
//        any other way within this module.

static const Register& Lstub_temp = L2;

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc  = thread->saved_exception_pc();
  address npc = thread->saved_exception_npc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine: return garbage from the load

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}
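
// For context: control reaches handle_unsafe_access() via the
// handler_for_unsafe_access stub generated below. A rough sketch of the
// platform signal-handler side (illustrative names, not the actual
// os_*_sparc.cpp code):
//
//   // in the SIGBUS handler, once the fault is attributed to an
//   // Unsafe memory access:
//   thread->set_saved_exception_pc(pc);    // faulting instruction
//   thread->set_saved_exception_npc(npc);  // instruction to resume at
//   redirect_to(uc, StubRoutines::handler_for_unsafe_access());  // hypothetical helper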

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(a,b,c) (0)
#else
#define inc_counter_np(counter, t1, t2) \
  BLOCK_COMMENT("inc_counter " #counter); \
  __ inc_counter(&counter, t1, t2);
#endif

  //----------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C

  address generate_call_stub(address& return_pc) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // Incoming arguments:
    //
    // o0         : call wrapper address
    // o1         : result (address)
    // o2         : result type
    // o3         : method
    // o4         : (interpreter) entry point
    // o5         : parameters (address)
    // [sp + 0x5c]: parameter size (in words)
    // [sp + 0x60]: thread
    //
    //  +---------------+ <--- sp + 0
    //  |               |
    //  . reg save area .
    //  |               |
    //  +---------------+ <--- sp + 0x40
    //  |               |
    //  . extra 7 slots .
    //  |               |
    //  +---------------+ <--- sp + 0x5c
    //  |  param. size  |
    //  +---------------+ <--- sp + 0x60
    //  |    thread     |
    //  +---------------+
    //  |               |

    // note: if the link argument position changes, adjust
    //       the code in frame::entry_frame_call_wrapper()

    const Argument link           = Argument(0, false); // used only for GC
    const Argument result         = Argument(1, false);
    const Argument result_type    = Argument(2, false);
    const Argument method         = Argument(3, false);
    const Argument entry_point    = Argument(4, false);
    const Argument parameters     = Argument(5, false);
    const Argument parameter_size = Argument(6, false);
    const Argument thread         = Argument(7, false);

    // setup thread register
    __ ld_ptr(thread.as_address(), G2_thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    { const Register t = G3_scratch;
      Label L;
      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
      __ br_null_short(t, Assembler::pt, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // create activation frame & allocate space for parameters
    { const Register t = G3_scratch;
      __ ld_ptr(parameter_size.as_address(), t);            // get parameter size (in words)
      __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words)
      __ round_to(t, WordsPerLong);                         // make sure it is multiple of 2 (in words)
      __ sll(t, Interpreter::logStackElementSize, t);       // compute number of bytes
      __ neg(t);                                            // negate so it can be used with save
      __ save(SP, t, SP);                                   // setup new frame
    }

    //  +---------------+ <--- sp + 0
    //  |               |
    //  . reg save area .
    //  |               |
    //  +---------------+ <--- sp + 0x40
    //  |               |
    //  . extra 7 slots .
    //  |               |
    //  +---------------+ <--- sp + 0x5c
    //  |  empty slot   |      (only if parameter size is even)
    //  +---------------+
    //  |               |
    //  .  parameters   .
    //  |               |
    //  +---------------+ <--- fp + 0
    //  |               |
    //  . reg save area .
    //  |               |
    //  +---------------+ <--- fp + 0x40
    //  |               |
    //  . extra 7 slots .
    //  |               |
    //  +---------------+ <--- fp + 0x5c
    //  |  param. size  |
    //  +---------------+ <--- fp + 0x60
    //  |    thread     |
    //  +---------------+
    //  |               |

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    { const Register src = parameters.as_in().as_register();
      const Register dst = Lentry_args;
      const Register tmp = G3_scratch;
      const Register cnt = G4_scratch;

      // test if any parameters & setup of Lentry_args
      Label exit;
      __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter
      __ add( FP, STACK_BIAS, dst );
      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
      __ delayed()->sub(dst, BytesPerWord, dst);           // setup Lentry_args

      // copy parameters if any
      Label loop;
      __ BIND(loop);
      // Store parameter value
      __ ld_ptr(src, 0, tmp);
      __ add(src, BytesPerWord, src);
      __ st_ptr(tmp, dst, 0);
      __ deccc(cnt);
      __ br(Assembler::greater, false, Assembler::pt, loop);
      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);

      // done
      __ BIND(exit);
    }

    // setup parameters, method & call Java function
#ifdef ASSERT
    // layout_activation_impl checks its notion of saved SP against
    // this register, so if this changes update it as well.
    const Register saved_SP = Lscratch;
    __ mov(SP, saved_SP); // keep track of SP before call
#endif

    // setup parameters
    const Register t = G3_scratch;
    __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
    __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
    __ sub(FP, t, Gargs);                              // setup parameter pointer
#ifdef _LP64
    __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
#endif
    __ mov(SP, O5_savedSP);

    // do the call
    //
    // the following registers must be set up:
    //
    // G2_thread
    // G5_method
    // Gargs
    BLOCK_COMMENT("call Java function");
    __ jmpl(entry_point.as_in().as_register(), G0, O7);
    __ delayed()->mov(method.as_in().as_register(), G5_method); // setup method

    BLOCK_COMMENT("call_stub_return_address:");
    return_pc = __ pc();

    // The callee, if it wasn't interpreted, can return with SP changed so
    // we can no longer assert anything about the change of SP here.

    // store result depending on type
    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
    //  is treated as T_INT)
    { const Register addr = result.as_in().as_register();
      const Register type = result_type.as_in().as_register();
      Label is_long, is_float, is_double, is_object, exit;
      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
      __ delayed()->nop();

      // store int result
      __ st(O0, addr, G0);

      __ BIND(exit);
      __ ret();
      __ delayed()->restore();

      __ BIND(is_object);
      __ ba(exit);
      __ delayed()->st_ptr(O0, addr, G0);

      __ BIND(is_float);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);

      __ BIND(is_double);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);

      __ BIND(is_long);
#ifdef _LP64
      __ ba(exit);
      __ delayed()->st_long(O0, addr, G0); // store entire long
#else
#if defined(COMPILER2)
      // All return values are where we want them, except for Longs.  C2 returns
      // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
      // Since the interpreter will return longs in G1 and O0/O1 in the 32-bit
      // build we simply always use G1.
      // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
      // do this here. Unfortunately if we did a rethrow we'd see a MachEpilog node
      // first which would move g1 -> O0/O1 and destroy the exception we were throwing.

      __ ba(exit);
      __ delayed()->stx(G1, addr, G0); // store entire long
#else
      __ st(O1, addr, BytesPerInt);
      __ ba(exit);
      __ delayed()->st(O0, addr, G0);
#endif /* COMPILER2 */
#endif /* _LP64 */
    }
    return start;
  }
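
  // For reference, the C++-side signature of this stub is the CallStub
  // function pointer type declared in stubRoutines.hpp; roughly (sketch,
  // see stubRoutines.hpp for the authoritative declaration):
  //
  //   typedef void (*CallStub)(
  //     address   link,               // call wrapper address       (o0)
  //     intptr_t* result,             // where to store the result  (o1)
  //     BasicType result_type,        //                             (o2)
  //     Method*   method,             //                             (o3)
  //     address   entry_point,        // (interpreter) entry point  (o4)
  //     intptr_t* parameters,         //                             (o5)
  //     int       size_of_parameters, // in words        [sp + 0x5c]
  //     TRAPS                         // thread           [sp + 0x60]
  //   );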

  //----------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java code.
  // The exception is caught and transformed into a pending exception stored in
  // JavaThread that can be tested from within the VM.
  //
  // Oexception: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");

    address start = __ pc();
    // verify that thread corresponds
    __ verify_thread();

    const Register& temp_reg = Gtemp;
    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());

    // set pending exception
    __ verify_oop(Oexception);
    __ st_ptr(Oexception, pending_exception_addr);
    __ set((intptr_t)__FILE__, temp_reg);
    __ st_ptr(temp_reg, exception_file_offset_addr);
    __ set((intptr_t)__LINE__, temp_reg);
    __ st(temp_reg, exception_line_offset_addr);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");

    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
    __ jump_to(stub_ret, temp_reg);
    __ delayed()->nop();

    return start;
  }

  //----------------------------------------------------------------------------------------------------
  // Continuation point for runtime calls returning with a pending exception.
  // The pending exception check happened in the runtime or native call stub.
  // The pending exception in Thread is converted into a Java-level exception.
  //
  // Contract with Java-level exception handler: O0 = exception
  //                                             O1 = throwing pc

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward_exception");
    address start = __ pc();

    // Upon entry, O7 has the return address returning into Java
    // (interpreted or compiled) code; i.e. the return address
    // becomes the throwing pc.

    const Register& handler_reg = Gtemp;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());

#ifdef ASSERT
    // make sure that this code is only executed if there is a pending exception
    { Label L;
      __ ld_ptr(exception_addr, Gtemp);
      __ br_notnull_short(Gtemp, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into handler_reg
    __ get_thread();
    __ ld_ptr(exception_addr, Oexception);
    __ verify_oop(Oexception);
    __ save_frame(0);             // compensates for compiler weakness
    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
    __ mov(O0, handler_reg);
    __ restore();                 // compensates for compiler weakness

    __ ld_ptr(exception_addr, Oexception);
    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ br_notnull_short(Oexception, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    // jump to exception handler
    __ jmp(handler_reg, 0);
    // clear pending exception
    __ delayed()->st_ptr(G0, exception_addr);

    return start;
  }

  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame. Only callee-saved registers are preserved
  // (through the normal register window / RegisterMap handling).
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller-saved registers were assumed volatile in the compiler.

  // Note that we generate only this stub into a RuntimeStub, because it needs to be
  // properly traversed and ignored during GC, so we change the meaning of the "__"
  // macro within this method.
#undef __
#define __ masm->

  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {
#ifdef ASSERT
    int insts_size = VerifyThread ? 1 * K : 600;
#else
    int insts_size = VerifyThread ? 1 * K : 256;
#endif /* ASSERT */
    int locs_size  = 32;

    CodeBuffer      code(name, insts_size, locs_size);
    MacroAssembler* masm = new MacroAssembler(&code);

    __ verify_thread();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of thread-local storage
    __ assert_not_delayed();

    // Note that we always push a frame because on the SPARC
    // architecture, for all of our implicit exception kinds at call
    // sites, the implicit exception is taken before the callee frame
    // is pushed.
    __ save_frame(0);

    int frame_complete = __ offset();

    // Note that we always have a runtime stub frame on the top of stack by this point
    Register last_java_sp = SP;
    // 64-bit last_java_sp is biased!
    __ set_last_Java_frame(last_java_sp, G0);
    if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early
    __ save_thread(noreg);
    if (arg1 != noreg) {
      assert(arg2 != O1, "clobbered");
      __ mov(arg1, O1);
    }
    if (arg2 != noreg) {
      __ mov(arg2, O2);
    }
    // do the call
    BLOCK_COMMENT("call runtime_entry");
    __ call(runtime_entry, relocInfo::runtime_call_type);
    if (!VerifyThread) {
      __ delayed()->mov(G2_thread, O0); // pass thread as first argument
    } else {
      __ delayed()->nop();              // (thread already passed)
    }
    __ restore_thread(noreg);
    __ reset_last_Java_frame();

    // check for pending exceptions. use Gtemp as scratch register.
#ifdef ASSERT
    Label L;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());
    Register scratch_reg = Gtemp;
    __ ld_ptr(exception_addr, scratch_reg);
    __ br_notnull_short(scratch_reg, Assembler::pt, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    BLOCK_COMMENT("call forward_exception_entry");
    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
    // we use O7 linkage so that forward_exception_entry has the issuing PC
    __ delayed()->restore();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
    return stub->entry_point();
  }
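
  // A typical use of this generator, as done when the stubs are initialized
  // (sketch; the actual set of throw stubs is defined elsewhere):
  //
  //   StubRoutines::_throw_StackOverflowError_entry =
  //       generate_throw_exception("StackOverflowError throw_exception",
  //                                CAST_FROM_FN_PTR(address,
  //                                    SharedRuntime::throw_StackOverflowError));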

#undef __
#define __ _masm->

  // Generate a routine that sets all the registers so we
  // can tell if the stop routine prints them correctly.
  address generate_test_stop() {
    StubCodeMark mark(this, "StubRoutines", "test_stop");
    address start = __ pc();

    int i;

    __ save_frame(0);

    static jfloat zero = 0.0, one = 1.0;

    // put addr in L0, then load through L0 to F0
    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1

    // use add to put 2..18 in F2..F18
    for ( i = 2;  i <= 18;  ++i ) {
      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
    }

    // Now put double 2 in F16, double 18 in F18
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2,  F16 );
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );

    // use add to put 20..30 in the even doubles F20..F30
    for (i = 20; i < 32; i += 2) {
      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
    }

    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
    for ( i = 0; i < 8; ++i ) {
      if (i < 6) {
        __ set(     i, as_iRegister(i));
        __ set(16 + i, as_oRegister(i));
        __ set(24 + i, as_gRegister(i));
      }
      __ set( 8 + i, as_lRegister(i));
    }

    __ stop("testing stop");

    __ ret();
    __ delayed()->restore();

    return start;
  }

  address generate_stop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
    address start = __ pc();

    __ stop_subroutine();

    return start;
  }

  address generate_flush_callers_register_windows() {
    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
    address start = __ pc();

    __ flush_windows();
    __ retl(false);
    __ delayed()->add( FP, STACK_BIAS, O0 );
    // The returned value must be a stack pointer whose register save area
    // is flushed, and will stay flushed while the caller executes.

    return start;
  }

  // Helper functions for v8 atomic operations.
  //
  void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
    if (mark_oop_reg == noreg) {
      address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
      __ set((intptr_t)lock_ptr, lock_ptr_reg);
    } else {
      assert(scratch_reg != noreg, "just checking");
      address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
      __ set((intptr_t)lock_ptr, lock_ptr_reg);
      __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
      __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
    }
  }

  void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {

    get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
    __ set(StubRoutines::Sparc::locked, lock_reg);
    // Initialize yield counter
    __ mov(G0, yield_reg);

    __ BIND(retry);
    __ cmp_and_br_short(yield_reg, V8AtomicOperationUnderLockSpinCount, Assembler::less, Assembler::pt, dontyield);

    // This code can only be called from inside the VM, this
    // stub is only invoked from Atomic::add().  We do not
    // want to use call_VM, because _last_java_sp and such
    // must already be set.
    //
    // Save the regs and make space for a C call
    __ save(SP, -96, SP);
    __ save_all_globals_into_locals();
    BLOCK_COMMENT("call os::naked_sleep");
    __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
    __ delayed()->nop();
    __ restore_globals_from_locals();
    __ restore();
    // reset the counter
    __ mov(G0, yield_reg);

    __ BIND(dontyield);

    // try to get lock
    __ swap(lock_ptr_reg, 0, lock_reg);

    // did we get the lock?
    __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
    __ br(Assembler::notEqual, true, Assembler::pn, retry);
    __ delayed()->add(yield_reg, 1, yield_reg);

    // yes, got lock. do the operation here.
  }

  void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
    __ st(lock_reg, lock_ptr_reg, 0);  // unlock
  }
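
  // The prologue/epilogue pair above amounts to a test-and-set spin lock with
  // a bounded spin before sleeping. A rough C++ sketch of the same protocol
  // ('swap_atomically' stands in for the SWAP instruction and is not a real
  // HotSpot call):
  //
  //   int yield = 0;
  //   for (;;) {
  //     if (yield >= V8AtomicOperationUnderLockSpinCount) {
  //       os::naked_sleep();                                // back off
  //       yield = 0;
  //     }
  //     int old = swap_atomically(lock_ptr, StubRoutines::Sparc::locked);
  //     if (old == StubRoutines::Sparc::unlocked) break;    // lock acquired
  //     yield++;
  //   }
  //   // ... guarded operation ...
  //   *lock_ptr = StubRoutines::Sparc::unlocked;            // epilogue: release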

  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
  //
  // Arguments:
  //
  //      exchange_value: O0
  //      dest:           O1
  //
  // Results:
  //
  //      O0: the value previously stored in dest
  //
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    if (UseCASForSwap) {
      // Use CAS instead of swap, just in case the MP hardware
      // prefers to work with just one kind of synch. instruction.
      Label retry;
      __ BIND(retry);
      __ mov(O0, O3);       // scratch copy of exchange value
      __ ld(O1, 0, O2);     // observe the previous value
      // try to replace O2 with O3
      __ cas_under_lock(O1, O2, O3,
          (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(), false);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);

      __ retl(false);
      __ delayed()->mov(O2, O0);  // report previous value to caller

    } else {
      if (VM_Version::v9_instructions_work()) {
        __ retl(false);
        __ delayed()->swap(O1, 0, O0);
      } else {
        const Register& lock_reg     = O2;
        const Register& lock_ptr_reg = O3;
        const Register& yield_reg    = O4;

        Label retry;
        Label dontyield;

        generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
        // got the lock, do the swap
        __ swap(O1, 0, O0);

        generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
        __ retl(false);
        __ delayed()->nop();
      }
    }

    return start;
  }
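
  // The UseCASForSwap path above is the classic exchange-built-from-CAS loop;
  // in C++ terms (sketch):
  //
  //   jint atomic_xchg_via_cas(jint exchange_value, volatile jint* dest) {
  //     jint old;
  //     do {
  //       old = *dest;                 // observe the previous value
  //     } while (Atomic::cmpxchg(exchange_value, dest, old) != old);
  //     return old;                    // previous value, as the stub returns in O0
  //   }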

  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
  //
  // Arguments:
  //
  //      exchange_value: O0
  //      dest:           O1
  //      compare_value:  O2
  //
  // Results:
  //
  //      O0: the value previously stored in dest
  //
  // Overwrites (v8): O3,O4,O5
  //
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    // cmpxchg(dest, compare_value, exchange_value)
    __ cas_under_lock(O1, O2, O0,
        (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(), false);
    __ retl(false);
    __ delayed()->nop();

    return start;
  }

  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
  //
  // Arguments:
  //
  //      exchange_value: O1:O0
  //      dest:           O2
  //      compare_value:  O4:O3
  //
  // Results:
  //
  //      O1:O0: the value previously stored in dest
  //
  // This only works on V9, on V8 we don't generate any
  // code and just return NULL.
  //
  // Overwrites: G1,G2,G3
  //
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    if (!VM_Version::supports_cx8())
      return NULL;
    __ sllx(O0, 32, O0);
    __ srl(O1, 0, O1);
    __ or3(O0, O1, O0);   // O0 holds 64-bit value from exchange_value
    __ sllx(O3, 32, O3);
    __ srl(O4, 0, O4);
    __ or3(O3, O4, O3);   // O3 holds 64-bit value from compare_value
    __ casx(O2, O3, O0);
    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
    __ retl(false);
    __ delayed()->srlx(O0, 32, O0);

    return start;
  }
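
  // The shifts above just (un)pack 64-bit operands that arrive in 32-bit
  // register halves; in C++ terms (sketch):
  //
  //   jlong packed  = ((jlong)hi << 32) | ((jlong)lo & 0xFFFFFFFFL); // sllx/srl/or3
  //   jint  lo_half = (jint)packed;                   // srl  O0,  0, O1
  //   jint  hi_half = (jint)((julong)packed >> 32);   // srlx O0, 32, O0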

  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
  //
  // Arguments:
  //
  //      add_value: O0   (e.g., +1 or -1)
  //      dest:      O1
  //
  // Results:
  //
  //      O0: the new value stored in dest
  //
  // Overwrites (v9): O3
  // Overwrites (v8): O3,O4,O5
  //
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();
    __ BIND(_atomic_add_stub);

    if (VM_Version::v9_instructions_work()) {
      Label retry;
      __ BIND(retry);

      __ lduw(O1, 0, O2);
      __ add(O0, O2, O3);
      __ cas(O1, O2, O3);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
      __ retl(false);
      __ delayed()->add(O0, O2, O0); // note that cas made O2 == O3
    } else {
      const Register& lock_reg     = O2;
      const Register& lock_ptr_reg = O3;
      const Register& value_reg    = O4;
      const Register& yield_reg    = O5;

      Label retry;
      Label dontyield;

      generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
      // got lock, do the increment
      __ ld(O1, 0, value_reg);
      __ add(O0, value_reg, value_reg);
      __ st(value_reg, O1, 0);

      // %%% only for RMO and PSO
      __ membar(Assembler::StoreStore);

      generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);

      __ retl(false);
      __ delayed()->mov(value_reg, O0);
    }

    return start;
  }
  Label _atomic_add_stub;  // called from other stubs
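
  // The v9 path above is fetch-and-add built from CAS; in C++ terms (sketch):
  //
  //   jint atomic_add_via_cas(jint add_value, volatile jint* dest) {
  //     for (;;) {
  //       jint old = *dest;                                   // lduw
  //       if (Atomic::cmpxchg(old + add_value, dest, old) == old) {
  //         return old + add_value;                           // new value (O0)
  //       }
  //     }
  //   }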

  //------------------------------------------------------------------------------------------------------------------------
  // The following routine generates a subroutine to throw an asynchronous
  // UnknownError when an unsafe access gets a fault that could not be
  // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
  //
  // Arguments:
  //
  //      trapping PC: O7
  //
  // Results:
  //     posts an asynchronous exception, skips the trapping instruction
  //
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();

    const int preserve_register_words = (64 * 2);
    Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);

    Register Lthread = L7_thread_cache;
    int i;

    __ save_frame(0);
    __ mov(G1, L1);
    __ mov(G2, L2);
    __ mov(G3, L3);
    __ mov(G4, L4);
    __ mov(G5, L5);
    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
      __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
    }

    address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
    BLOCK_COMMENT("call handle_unsafe_access");
    __ call(entry_point, relocInfo::runtime_call_type);
    __ delayed()->nop();

    __ mov(L1, G1);
    __ mov(L2, G2);
    __ mov(L3, G3);
    __ mov(L4, G4);
    __ mov(L5, G5);
    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
      __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
    }

    __ verify_thread();

    __ jmp(O0, 0);
    __ delayed()->restore();

    return start;
  }

  // Support for uint StubRoutines::Sparc::partial_subtype_check( Klass sub, Klass super );
  // Arguments:
  //
  //      ret  : O0, returned
  //      icc/xcc: set as O0 (depending on wordSize)
  //      sub  : O1, argument, not changed
  //      super: O2, argument, not changed
  //      raddr: O7, blown by call
  address generate_partial_subtype_check() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
    address start = __ pc();
    Label miss;

#if defined(COMPILER2) && !defined(_LP64)
    // Do not use a 'save' because it blows the 64-bit O registers.
    __ add(SP,-4*wordSize,SP);  // Make space for 4 temps (stack must be 2 words aligned)
    __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
    __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
    __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
    __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
    Register Rret   = O0;
    Register Rsub   = O1;
    Register Rsuper = O2;
#else
    __ save_frame(0);
    Register Rret   = I0;
    Register Rsub   = I1;
    Register Rsuper = I2;
#endif

    Register L0_ary_len = L0;
    Register L1_ary_ptr = L1;
    Register L2_super   = L2;
    Register L3_index   = L3;

    __ check_klass_subtype_slow_path(Rsub, Rsuper,
                                     L0, L1, L2, L3,
                                     NULL, &miss);

    // Match falls through here.
    __ addcc(G0,0,Rret);        // set Z flags, Z result

#if defined(COMPILER2) && !defined(_LP64)
    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
    __ retl();                  // Result in Rret is zero; flags set to Z
    __ delayed()->add(SP,4*wordSize,SP);
#else
    __ ret();                   // Result in Rret is zero; flags set to Z
    __ delayed()->restore();
#endif

    __ BIND(miss);
    __ addcc(G0,1,Rret);        // set NZ flags, NZ result

#if defined(COMPILER2) && !defined(_LP64)
    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
    __ retl();                  // Result in Rret is != 0; flags set to NZ
    __ delayed()->add(SP,4*wordSize,SP);
#else
    __ ret();                   // Result in Rret is != 0; flags set to NZ
    __ delayed()->restore();
#endif

    return start;
  }

  // Called from MacroAssembler::verify_oop
  //
  address generate_verify_oop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");

    address start = __ pc();

    __ verify_oop_subroutine();

    return start;
  }

  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#if defined(ASSERT) && defined(_LP64)
    __ signx(Rint, Rtmp);
    __ cmp(Rint, Rtmp);
    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
#endif
  }
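
  // "Clean" means the register sign-extends to itself; in C++ terms the
  // check above is equivalent to this sketch:
  //
  //   assert((jlong)(jint)x == (jlong)x, "upper 32 bits must be a sign extension");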

  //
  // Generate overlap test for array copy stubs
  //
  //  Input:
  //    O0    -  array1
  //    O1    -  array2
  //    O2    -  element count
  //
  //  Kills temps:  O3, O4
  //
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
  }
  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
    const Register from       = O0;
    const Register to         = O1;
    const Register count      = O2;
    const Register to_from    = O3; // to - from
    const Register byte_count = O4; // count << log2_elem_size

    __ subcc(to, from, to_from);
    __ sll_ptr(count, log2_elem_size, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->cmp(to_from, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->nop();
  }
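
  // The two unsigned branches above implement the standard overlap predicate
  // off a single subtraction; in C++ terms (sketch):
  //
  //   size_t byte_count = (size_t)count << log2_elem_size;
  //   // a forward (disjoint) copy is safe iff the destination does not fall
  //   // inside [from, from + byte_count):
  //   if (to <= from || (size_t)(to - from) >= byte_count)
  //     goto no_overlap;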

  //
  // Generate pre-write barrier for array.
  //
  // Input:
  //   addr     - register containing starting address
  //   count    - register containing element count
  //   tmp      - scratch register
  //
  // The input registers are overwritten.
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
        if (!dest_uninitialized) {
          __ save_frame(0);
          // Save the necessary global regs... will be used after.
          if (addr->is_global()) {
            __ mov(addr, L0);
          }
          if (count->is_global()) {
            __ mov(count, L1);
          }
          __ mov(addr->after_save(), O0);
          // Get the count into O1
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
          __ delayed()->mov(count->after_save(), O1);
          if (addr->is_global()) {
            __ mov(L0, addr);
          }
          if (count->is_global()) {
            __ mov(L1, count);
          }
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }

  //
  // Generate post-write barrier for array.
  //
  // Input:
  //   addr     - register containing starting address
  //   count    - register containing element count
  //   tmp      - scratch register
  //
  // The input registers are overwritten.
  //
  void gen_write_ref_array_post_barrier(Register addr, Register count,
                                        Register tmp) {
    BarrierSet* bs = Universe::heap()->barrier_set();

    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        {
          // Get some new fresh output registers.
          __ save_frame(0);
          __ mov(addr->after_save(), O0);
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
          __ delayed()->mov(count->after_save(), O1);
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
          assert_different_registers(addr, count, tmp);

          Label L_loop;

          __ sll_ptr(count, LogBytesPerHeapOop, count);
          __ sub(count, BytesPerHeapOop, count);
          __ add(count, addr, count);
          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
          __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
          __ srl_ptr(count, CardTableModRefBS::card_shift, count);
          __ sub(count, addr, count);
          AddressLiteral rs(ct->byte_map_base);
          __ set(rs, tmp);
          __ BIND(L_loop);
          __ stb(G0, tmp, addr);
          __ subcc(count, 1, count);
          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
          __ delayed()->add(addr, 1, addr);
        }
        break;
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
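
  // The card-table case above dirties every card spanned by
  // [addr, addr + count*BytesPerHeapOop); the loop is equivalent to this
  // C++ sketch (dirty == 0, which is why the stub stores G0):
  //
  //   jbyte* base  = ct->byte_map_base;
  //   size_t first = (size_t)addr >> CardTableModRefBS::card_shift;
  //   size_t last  = ((size_t)addr + count*BytesPerHeapOop - 1)
  //                                >> CardTableModRefBS::card_shift;
  //   for (size_t card = first; card <= last; card++) {
  //     base[card] = 0;
  //   }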

  //
  // Generate main code for disjoint arraycopy
  //
  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
                                              Label& L_loop, bool use_prefetch, bool use_bis);

  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
                          int iter_size, CopyLoopFunc copy_loop_func) {
    Label L_copy;

    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
    assert(prefetch_dist < 4096, "invalid value");
    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count

    if (UseBlockCopy) {
      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;

      // 64 bytes tail + bytes copied in one loop iteration
      int tail_size = 64 + iter_size;
      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
      // Use BIS copy only for big arrays since it requires membar.
      __ set(block_copy_count, O4);
      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
      // This code is for disjoint source and destination:
      //   to <= from || to >= from+count
      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
      __ sub(from, to, O4);
      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for imm.
      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);

      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
      // BIS should not be used to copy tail (64 bytes+iter_size)
      // to avoid zeroing of following values.
      __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0

      if (prefetch_count > 0) { // rounded up to one iteration count
        // Do prefetching only if copy size is bigger
        // than prefetch distance.
        __ set(prefetch_count, O4);
        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
        __ sub(count, prefetch_count, count);

        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
        __ add(count, prefetch_count, count); // restore count

      } // prefetch_count > 0

      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
      __ add(count, (tail_size>>log2_elem_size), count); // restore count

      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
      // BIS needs membar.
      __ membar(Assembler::StoreLoad);
      // Copy tail
      __ ba_short(L_copy);

      __ BIND(L_skip_block_copy);
    } // UseBlockCopy

    if (prefetch_count > 0) { // rounded up to one iteration count
      // Do prefetching only if copy size is bigger
      // than prefetch distance.
      __ set(prefetch_count, O4);
      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
      __ sub(count, prefetch_count, count);

      Label L_copy_prefetch;
      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
      __ add(count, prefetch_count, count); // restore count

    } // prefetch_count > 0

    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
  }
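
  // Note the round-up idiom above: for a power-of-two iter_size,
  // (x + (iter_size-1)) & -iter_size rounds x up to the next multiple of
  // iter_size. For example, with iter_size == 16:
  //
  //   (33 + 15) & -16  ==  48 & ~15  ==  48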

  //
  // Helper methods for copy_16_bytes_forward_with_shift()
  //
  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
                                Label& L_loop, bool use_prefetch, bool use_bis) {

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    if (use_prefetch) {
      if (ArraycopySrcPrefetchDistance > 0) {
        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
      }
      if (ArraycopyDstPrefetchDistance > 0) {
        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
      }
    }
    __ ldx(from, 0, O4);
    __ ldx(from, 8, G4);
    __ inc(to, 16);
    __ inc(from, 16);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ srlx(O4, right_shift, G3);
    __ bset(G3, O3);
    __ sllx(O4, left_shift,  O4);
    __ srlx(G4, right_shift, G3);
    __ bset(G3, O4);
    if (use_bis) {
      __ stxa(O3, to, -16);
      __ stxa(O4, to, -8);
    } else {
      __ stx(O3, to, -16);
      __ stx(O4, to, -8);
    }
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->sllx(G4, left_shift,  O3);
  }
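
  // Each iteration merges three aligned 8-byte words (one carried in O3 from
  // the previous iteration) into two aligned 8-byte stores. In C terms
  // (sketch; ls + rs == 64 are the bit shift counts):
  //
  //   uint64_t a = *src++;               // aligned loads (ldx)
  //   uint64_t b = *src++;
  //   *dst++ = carry     | (a >> rs);    // first merged word
  //   *dst++ = (a << ls) | (b >> rs);    // second merged word
  //   carry  = b << ls;                  // carried into the next iteration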

  // Copy big chunks forward with shift
  //
  // Inputs:
  //   from      - source array address
  //   to        - destination array address aligned to 8-bytes
  //   count     - elements count to copy >= the count equivalent to 16 bytes
  //   count_dec - elements count's decrement equivalent to 16 bytes
  //   L_copy_bytes - copy exit label
  //
  void copy_16_bytes_forward_with_shift(Register from, Register to,
                     Register count, int log2_elem_size, Label& L_copy_bytes) {
    Label L_aligned_copy, L_copy_last_bytes;
    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->nop();

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-bytes chunks and use one from previous iteration
    // to form 2 aligned 8-bytes chunks to store.
    //
    __ dec(count, count_dec);   // Pre-decrement 'count'
    __ andn(from, 7, from);     // Align address
    __ ldx(from, 0, O3);
    __ inc(from, 8);
    __ sllx(O3, left_shift,  O3);

    disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(from, 0, O4);
    __ inc(to, 8);
    __ inc(from, 8);
    __ srlx(O4, right_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, to, -8);

    __ BIND(L_copy_last_bytes);
    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->sub(from, right_shift, from);       // restore address

    __ BIND(L_aligned_copy);
  }

  // Copy big chunks backward with shift
  //
  // Inputs:
  //   end_from  - source array end address
  //   end_to    - destination array end address aligned to 8-bytes
  //   count     - elements count to copy >= the count equivalent to 16 bytes
  //   count_dec - elements count's decrement equivalent to 16 bytes
  //   L_aligned_copy - aligned copy exit label
  //   L_copy_bytes   - copy exit label
  //
  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
                     Register count, int count_dec,
                     Label& L_aligned_copy, Label& L_copy_bytes) {
    Label L_loop, L_copy_last_bytes;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(end_from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-bytes chunks and use one from previous iteration
    // to form 2 aligned 8-bytes chunks to store.
    //
    __ andn(end_from, 7, end_from);     // Align address
    __ ldx(end_from, 0, O3);
    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    __ ldx(end_from, -8, O4);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ ldx(end_from, -16, G4);
    __ dec(end_to, 16);
    __ dec(end_from, 16);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift,  G3);
    __ bset(G3, O3);
    __ stx(O3, end_to, 8);
    __ srlx(O4, right_shift, O4);
    __ sllx(G4, left_shift,  G3);
    __ bset(G3, O4);
    __ stx(O4, end_to, 0);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->mov(G4, O3);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(end_from, -8, O4);
    __ dec(end_to, 8);
    __ dec(end_from, 8);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift,  G3);
    __ bset(O3, G3);
    __ stx(G3, end_to, 0);

    __ BIND(L_copy_last_bytes);
    __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->add(end_from, left_shift, end_from); // restore address
  }

  //
  //  Generate stub for disjoint byte copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register offset    = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4 bytes alignment in 32-bits VM
      // and 8 bytes - in 64-bits VM. So we do it only for 32-bits VM
      //
#ifndef _LP64
      // copy a 4-bytes word if necessary to align 'to' to 8 bytes
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
      __ delayed()->ld(from, 0, O3);
      __ inc(from, 4);
      __ inc(to, 4);
      __ dec(count, 4);
      __ st(O3, to, -4);
      __ BIND(L_skip_alignment);
#endif
    } else {
      // copy bytes to align 'to' on 8 byte boundary
      __ andcc(to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->neg(G1);
      __ inc(G1, 8);       // bytes need to copy to next 8-bytes alignment
      __ sub(count, G1, count);
      __ BIND(L_align);
      __ ldub(from, 0, O3);
      __ deccc(G1);
      __ inc(from);
      __ stb(O3, to, 0);
      __ br(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->inc(to);
      __ BIND(L_skip_alignment);
    }
#ifdef _LP64
    if (!aligned)
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
    }

    // Both arrays are 8 bytes aligned, copy 16 bytes at a time
    __ and3(count, 7, G4); // Save count
    __ srl(count, 3, count);
    generate_disjoint_long_copy_core(aligned);
    __ mov(G4, count);     // Restore count

    // copy trailing bytes
    __ BIND(L_copy_byte);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
    __ ldub(from, offset, O3);
    __ deccc(count);
    __ stb(O3, to, offset);
    __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->inc(offset);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }

  //
  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address *entry, const char *name) {
    // Do reverse copy.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align, L_aligned_copy;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register end_from  = from; // source array end address
    const Register end_to    = to;   // destination array end address

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, 0);

    __ add(to, count, end_to);       // offset after last copied element

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->add(from, count, end_from);

    {
      // Align end of arrays since they could be unaligned even
      // when the arrays themselves are aligned.

      // copy bytes to align 'end_to' on 8 byte boundary
      __ andcc(end_to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->nop();
      __ sub(count, G1, count);
      __ BIND(L_align);
      __ dec(end_from);
      __ dec(end_to);
      __ ldub(end_from, 0, O3);
      __ deccc(G1);
      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->stb(O3, end_to, 0);
      __ BIND(L_skip_alignment);
    }
#ifdef _LP64
    if (aligned) {
      // Both arrays are aligned to 8-bytes in 64-bits VM.
      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
      // in unaligned case.
      __ dec(count, 16);
    } else
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (subtracting 16 from 'count' before the jump).
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
                                        L_aligned_copy, L_copy_byte);
    }
    // copy 4 elements (16 bytes) at a time
    __ align(OptoLoopAlignment);
    __ BIND(L_aligned_copy);
    __ dec(end_from, 16);
    __ ldx(end_from, 8, O3);
    __ ldx(end_from, 0, O4);
    __ dec(end_to, 16);
    __ deccc(count, 16);
    __ stx(O3, end_to, 8);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
    __ delayed()->stx(O4, end_to, 0);
    __ inc(count, 16);

    // copy 1 element (1 byte) at a time
    __ BIND(L_copy_byte);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
    __ dec(end_from);
    __ dec(end_to);
    __ ldub(end_from, 0, O4);
    __ deccc(count);
    __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->stb(O4, end_to, 0);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }
1588 //
1589 // Generate stub for disjoint short copy. If "aligned" is true, the
1590 // "from" and "to" addresses are assumed to be heapword aligned.
1591 //
1592 // Arguments for generated stub:
1593 // from: O0
1594 // to: O1
1595 // count: O2 treated as signed
1596 //
1597 address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1598 __ align(CodeEntryAlignment);
1599 StubCodeMark mark(this, "StubRoutines", name);
1600 address start = __ pc();
1602 Label L_skip_alignment, L_skip_alignment2;
1603 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1605 const Register from = O0; // source array address
1606 const Register to = O1; // destination array address
1607 const Register count = O2; // elements count
1608 const Register offset = O5; // offset from start of arrays
1609 // O3, O4, G3, G4 are used as temp registers
1611 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1613 if (entry != NULL) {
1614 *entry = __ pc();
1615 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1616 BLOCK_COMMENT("Entry:");
1617 }
1619 // for short arrays, just do single element copy
1620 __ cmp(count, 11); // 8 + 3 (22 bytes)
1621 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1622 __ delayed()->mov(G0, offset);
1624 if (aligned) {
1625 // 'aligned' == true when it is known statically during compilation
1626 // of this arraycopy call site that both 'from' and 'to' addresses
1627 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1628 //
1629 // Aligned arrays have 4 bytes alignment in 32-bits VM
1630 // and 8 bytes - in 64-bits VM.
1631 //
1632 #ifndef _LP64
1633 // copy a 2-elements word if necessary to align 'to' to 8 bytes
1634 __ andcc(to, 7, G0);
1635 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1636 __ delayed()->ld(from, 0, O3);
1637 __ inc(from, 4);
1638 __ inc(to, 4);
1639 __ dec(count, 2);
1640 __ st(O3, to, -4);
1641 __ BIND(L_skip_alignment);
1642 #endif
1643 } else {
1644 // copy 1 element if necessary to align 'to' on an 4 bytes
1645 __ andcc(to, 3, G0);
1646 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1647 __ delayed()->lduh(from, 0, O3);
1648 __ inc(from, 2);
1649 __ inc(to, 2);
1650 __ dec(count);
1651 __ sth(O3, to, -2);
1652 __ BIND(L_skip_alignment);
1654 // copy 2 elements to align 'to' on an 8 byte boundary
1655 __ andcc(to, 7, G0);
1656 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1657 __ delayed()->lduh(from, 0, O3);
1658 __ dec(count, 2);
1659 __ lduh(from, 2, O4);
1660 __ inc(from, 4);
1661 __ inc(to, 4);
1662 __ sth(O3, to, -4);
1663 __ sth(O4, to, -2);
1664 __ BIND(L_skip_alignment2);
1665 }
1666 #ifdef _LP64
1667 if (!aligned)
1668 #endif
1669 {
1670 // Copy with shift 16 bytes per iteration if arrays do not have
1671 // the same alignment mod 8, otherwise fall through to the next
1672 // code for aligned copy.
1673 // The compare above (count >= 11) guarantes 'count' >= 16 bytes.
1674 // Also jump over aligned copy after the copy with shift completed.
1676 copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1677 }
1679 // Both arrays are 8-byte aligned; copy 16 bytes at a time
1680 __ and3(count, 3, G4); // Save
1681 __ srl(count, 2, count);
1682 generate_disjoint_long_copy_core(aligned);
1683 __ mov(G4, count); // restore
1685 // copy 1 element at a time
1686 __ BIND(L_copy_2_bytes);
1687 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1688 __ align(OptoLoopAlignment);
1689 __ BIND(L_copy_2_bytes_loop);
1690 __ lduh(from, offset, O3);
1691 __ deccc(count);
1692 __ sth(O3, to, offset);
1693 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1694 __ delayed()->inc(offset, 2);
1696 __ BIND(L_exit);
1697 // O3, O4 are used as temp registers
1698 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1699 __ retl();
1700 __ delayed()->mov(G0, O0); // return 0
1701 return start;
1702 }
1704 //
1705 // Generate stub for array fill of basic type 't'. If "aligned" is true,
1706 // the "to" address is assumed to be heapword aligned.
1707 //
1708 // Arguments for generated stub:
1709 // to: O0
1710 // value: O1
1711 // count: O2 treated as signed
1712 //
1713 address generate_fill(BasicType t, bool aligned, const char* name) {
1714 __ align(CodeEntryAlignment);
1715 StubCodeMark mark(this, "StubRoutines", name);
1716 address start = __ pc();
1718 const Register to = O0; // destination array address
1719 const Register value = O1; // fill value
1720 const Register count = O2; // elements count
1721 // O3 is used as a temp register
1723 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1725 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1726 Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1728 int shift = -1;
1729 switch (t) {
1730 case T_BYTE:
1731 shift = 2;
1732 break;
1733 case T_SHORT:
1734 shift = 1;
1735 break;
1736 case T_INT:
1737 shift = 0;
1738 break;
1739 default: ShouldNotReachHere();
1740 }
1742 BLOCK_COMMENT("Entry:");
1744 if (t == T_BYTE) {
1745 // Zero extend value and replicate it into the low 16 bits
1746 __ and3(value, 0xff, value);
1747 __ sllx(value, 8, O3);
1748 __ or3(value, O3, value);
1749 }
1750 if (t == T_SHORT) {
1751 // Zero extend value
1752 __ sllx(value, 48, value);
1753 __ srlx(value, 48, value);
1754 }
1755 if (t == T_BYTE || t == T_SHORT) {
1756 __ sllx(value, 16, O3);
1757 __ or3(value, O3, value);
1758 }
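// In C terms, the widening above replicates the fill value so that one
// 64-bit store writes several elements (sketch, T_BYTE case):
//
//   v &= 0xff;  v |= v << 8;  v |= v << 16;  // 4 copies in the low word
//   // later, v |= v << 32 extends this to 8 copies for 8-byte stores
//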
1760 __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) are filled by element
1761 __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1762 __ delayed()->andcc(count, 1, G0);
1764 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1765 // align destination address at a 4-byte boundary
1766 if (t == T_BYTE) {
1767 // One-byte misalignment happens only for byte arrays
1768 __ andcc(to, 1, G0);
1769 __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1770 __ delayed()->nop();
1771 __ stb(value, to, 0);
1772 __ inc(to, 1);
1773 __ dec(count, 1);
1774 __ BIND(L_skip_align1);
1775 }
1776 // Two-byte misalignment happens only for byte and short (char) arrays
1777 __ andcc(to, 2, G0);
1778 __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1779 __ delayed()->nop();
1780 __ sth(value, to, 0);
1781 __ inc(to, 2);
1782 __ dec(count, 1 << (shift - 1));
1783 __ BIND(L_skip_align2);
1784 }
1785 #ifdef _LP64
1786 if (!aligned) {
1787 #endif
1788 // align to 8 bytes; we know we are 4-byte aligned to start
1789 __ andcc(to, 7, G0);
1790 __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1791 __ delayed()->nop();
1792 __ stw(value, to, 0);
1793 __ inc(to, 4);
1794 __ dec(count, 1 << shift);
1795 __ BIND(L_fill_32_bytes);
1796 #ifdef _LP64
1797 }
1798 #endif
1800 if (t == T_INT) {
1801 // Zero extend value
1802 __ srl(value, 0, value);
1803 }
1804 if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1805 __ sllx(value, 32, O3);
1806 __ or3(value, O3, value);
1807 }
1809 Label L_check_fill_8_bytes;
1810 // Fill 32-byte chunks
1811 __ subcc(count, 8 << shift, count);
1812 __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1813 __ delayed()->nop();
1815 Label L_fill_32_bytes_loop, L_fill_4_bytes;
1816 __ align(16);
1817 __ BIND(L_fill_32_bytes_loop);
1819 __ stx(value, to, 0);
1820 __ stx(value, to, 8);
1821 __ stx(value, to, 16);
1822 __ stx(value, to, 24);
1824 __ subcc(count, 8 << shift, count);
1825 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1826 __ delayed()->add(to, 32, to);
1828 __ BIND(L_check_fill_8_bytes);
1829 __ addcc(count, 8 << shift, count);
1830 __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1831 __ delayed()->subcc(count, 1 << (shift + 1), count);
1832 __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1833 __ delayed()->andcc(count, 1<<shift, G0);
1835 //
1836 // length is too short, just fill 8 bytes at a time
1837 //
1838 Label L_fill_8_bytes_loop;
1839 __ BIND(L_fill_8_bytes_loop);
1840 __ stx(value, to, 0);
1841 __ subcc(count, 1 << (shift + 1), count);
1842 __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1843 __ delayed()->add(to, 8, to);
1845 // fill trailing 4 bytes
1846 __ andcc(count, 1<<shift, G0); // in delay slot of branches
1847 if (t == T_INT) {
1848 __ BIND(L_fill_elements);
1849 }
1850 __ BIND(L_fill_4_bytes);
1851 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1852 if (t == T_BYTE || t == T_SHORT) {
1853 __ delayed()->andcc(count, 1<<(shift-1), G0);
1854 } else {
1855 __ delayed()->nop();
1856 }
1857 __ stw(value, to, 0);
1858 if (t == T_BYTE || t == T_SHORT) {
1859 __ inc(to, 4);
1860 // fill trailing 2 bytes
1861 __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1862 __ BIND(L_fill_2_bytes);
1863 __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1864 __ delayed()->andcc(count, 1, count);
1865 __ sth(value, to, 0);
1866 if (t == T_BYTE) {
1867 __ inc(to, 2);
1868 // fill trailing byte
1869 __ andcc(count, 1, count); // in delay slot of branches
1870 __ BIND(L_fill_byte);
1871 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1872 __ delayed()->nop();
1873 __ stb(value, to, 0);
1874 } else {
1875 __ BIND(L_fill_byte);
1876 }
1877 } else {
1878 __ BIND(L_fill_2_bytes);
1879 }
1880 __ BIND(L_exit);
1881 __ retl();
1882 __ delayed()->nop();
1884 // Handle fills of less than 8 bytes. Int is handled elsewhere.
1885 if (t == T_BYTE) {
1886 __ BIND(L_fill_elements);
1887 Label L_fill_2, L_fill_4;
1888 // in delay slot __ andcc(count, 1, G0);
1889 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1890 __ delayed()->andcc(count, 2, G0);
1891 __ stb(value, to, 0);
1892 __ inc(to, 1);
1893 __ BIND(L_fill_2);
1894 __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1895 __ delayed()->andcc(count, 4, G0);
1896 __ stb(value, to, 0);
1897 __ stb(value, to, 1);
1898 __ inc(to, 2);
1899 __ BIND(L_fill_4);
1900 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1901 __ delayed()->nop();
1902 __ stb(value, to, 0);
1903 __ stb(value, to, 1);
1904 __ stb(value, to, 2);
1905 __ retl();
1906 __ delayed()->stb(value, to, 3);
1907 }
1909 if (t == T_SHORT) {
1910 Label L_fill_2;
1911 __ BIND(L_fill_elements);
1912 // in delay slot __ andcc(count, 1, G0);
1913 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1914 __ delayed()->andcc(count, 2, G0);
1915 __ sth(value, to, 0);
1916 __ inc(to, 2);
1917 __ BIND(L_fill_2);
1918 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1919 __ delayed()->nop();
1920 __ sth(value, to, 0);
1921 __ retl();
1922 __ delayed()->sth(value, to, 2);
1923 }
1924 return start;
1925 }
1927 //
1928 // Generate stub for conjoint short copy. If "aligned" is true, the
1929 // "from" and "to" addresses are assumed to be heapword aligned.
1930 //
1931 // Arguments for generated stub:
1932 // from: O0
1933 // to: O1
1934 // count: O2 treated as signed
1935 //
1936 address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1937 address *entry, const char *name) {
1938 // Do reverse copy.
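// A backward copy is safe when the regions overlap with 'to' above
// 'from'. Rough C sketch (illustrative only):
//
//   jshort* end_from = from + count;
//   jshort* end_to   = to   + count;
//   while (count-- > 0) *--end_to = *--end_from;
//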
1940 __ align(CodeEntryAlignment);
1941 StubCodeMark mark(this, "StubRoutines", name);
1942 address start = __ pc();
1944 Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1945 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1947 const Register from = O0; // source array address
1948 const Register to = O1; // destination array address
1949 const Register count = O2; // elements count
1950 const Register end_from = from; // source array end address
1951 const Register end_to = to; // destination array end address
1953 const Register byte_count = O3; // bytes count to copy
1955 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1957 if (entry != NULL) {
1958 *entry = __ pc();
1959 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1960 BLOCK_COMMENT("Entry:");
1961 }
1963 array_overlap_test(nooverlap_target, 1);
1965 __ sllx(count, LogBytesPerShort, byte_count);
1966 __ add(to, byte_count, end_to); // offset after last copied element
1968 // for short arrays, just do single element copy
1969 __ cmp(count, 11); // 8 + 3 (22 bytes)
1970 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1971 __ delayed()->add(from, byte_count, end_from);
1973 {
1974 // Align the ends of the arrays, since they may be unaligned even
1975 // when the arrays themselves are aligned.
1977 // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
1978 __ andcc(end_to, 3, G0);
1979 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1980 __ delayed()->lduh(end_from, -2, O3);
1981 __ dec(end_from, 2);
1982 __ dec(end_to, 2);
1983 __ dec(count);
1984 __ sth(O3, end_to, 0);
1985 __ BIND(L_skip_alignment);
1987 // copy 2 elements to align 'end_to' on an 8-byte boundary
1988 __ andcc(end_to, 7, G0);
1989 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1990 __ delayed()->lduh(end_from, -2, O3);
1991 __ dec(count, 2);
1992 __ lduh(end_from, -4, O4);
1993 __ dec(end_from, 4);
1994 __ dec(end_to, 4);
1995 __ sth(O3, end_to, 2);
1996 __ sth(O4, end_to, 0);
1997 __ BIND(L_skip_alignment2);
1998 }
1999 #ifdef _LP64
2000 if (aligned) {
2001 // Both arrays are aligned to 8 bytes in the 64-bit VM.
2002 // The 'count' is decremented in copy_16_bytes_backward_with_shift()
2003 // in unaligned case.
2004 __ dec(count, 8);
2005 } else
2006 #endif
2007 {
2008 // Copy with shift 16 bytes per iteration if arrays do not have
2009 // the same alignment mod 8, otherwise jump to the next
2010 // code for aligned copy (subtracting 8 from 'count' before the jump).
2011 // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
2012 // Also jump over the aligned copy after the copy with shift completes.
2014 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
2015 L_aligned_copy, L_copy_2_bytes);
2016 }
2017 // copy 4 elements (16 bytes) at a time
2018 __ align(OptoLoopAlignment);
2019 __ BIND(L_aligned_copy);
2020 __ dec(end_from, 16);
2021 __ ldx(end_from, 8, O3);
2022 __ ldx(end_from, 0, O4);
2023 __ dec(end_to, 16);
2024 __ deccc(count, 8);
2025 __ stx(O3, end_to, 8);
2026 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2027 __ delayed()->stx(O4, end_to, 0);
2028 __ inc(count, 8);
2030 // copy 1 element (2 bytes) at a time
2031 __ BIND(L_copy_2_bytes);
2032 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2033 __ BIND(L_copy_2_bytes_loop);
2034 __ dec(end_from, 2);
2035 __ dec(end_to, 2);
2036 __ lduh(end_from, 0, O4);
2037 __ deccc(count);
2038 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
2039 __ delayed()->sth(O4, end_to, 0);
2041 __ BIND(L_exit);
2042 // O3, O4 are used as temp registers
2043 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
2044 __ retl();
2045 __ delayed()->mov(G0, O0); // return 0
2046 return start;
2047 }
2049 //
2050 // Helper methods for generate_disjoint_int_copy_core()
2051 //
2052 void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
2053 Label& L_loop, bool use_prefetch, bool use_bis) {
2055 __ align(OptoLoopAlignment);
2056 __ BIND(L_loop);
2057 if (use_prefetch) {
2058 if (ArraycopySrcPrefetchDistance > 0) {
2059 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
2060 }
2061 if (ArraycopyDstPrefetchDistance > 0) {
2062 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
2063 }
2064 }
2065 __ ldx(from, 4, O4);
2066 __ ldx(from, 12, G4);
2067 __ inc(to, 16);
2068 __ inc(from, 16);
2069 __ deccc(count, 4); // Can we do next iteration after this one?
2071 __ srlx(O4, 32, G3);
2072 __ bset(G3, O3);
2073 __ sllx(O4, 32, O4);
2074 __ srlx(G4, 32, G3);
2075 __ bset(G3, O4);
2076 if (use_bis) {
2077 __ stxa(O3, to, -16);
2078 __ stxa(O4, to, -8);
2079 } else {
2080 __ stx(O3, to, -16);
2081 __ stx(O4, to, -8);
2082 }
2083 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2084 __ delayed()->sllx(G4, 32, O3);
2086 }
2088 //
2089 // Generate core code for disjoint int copy (and oop copy on 32-bit).
2090 // If "aligned" is true, the "from" and "to" addresses are assumed
2091 // to be heapword aligned.
2092 //
2093 // Arguments:
2094 // from: O0
2095 // to: O1
2096 // count: O2 treated as signed
2097 //
2098 void generate_disjoint_int_copy_core(bool aligned) {
2100 Label L_skip_alignment, L_aligned_copy;
2101 Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2103 const Register from = O0; // source array address
2104 const Register to = O1; // destination array address
2105 const Register count = O2; // elements count
2106 const Register offset = O5; // offset from start of arrays
2107 // O3, O4, G3, G4 are used as temp registers
2109 // 'aligned' == true when it is known statically during compilation
2110 // of this arraycopy call site that both 'from' and 'to' addresses
2111 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
2112 //
2113 // Aligned arrays have 4 bytes alignment in 32-bits VM
2114 // and 8 bytes - in 64-bits VM.
2115 //
2116 #ifdef _LP64
2117 if (!aligned)
2118 #endif
2119 {
2120 // The next check could be put under 'ifndef' since the code in
2121 // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
2123 // for short arrays, just do single element copy
2124 __ cmp(count, 5); // 4 + 1 (20 bytes)
2125 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2126 __ delayed()->mov(G0, offset);
2128 // copy 1 element to align 'to' on an 8 byte boundary
2129 __ andcc(to, 7, G0);
2130 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2131 __ delayed()->ld(from, 0, O3);
2132 __ inc(from, 4);
2133 __ inc(to, 4);
2134 __ dec(count);
2135 __ st(O3, to, -4);
2136 __ BIND(L_skip_alignment);
2138 // if arrays have the same alignment mod 8, do a 4-element copy
2139 __ andcc(from, 7, G0);
2140 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2141 __ delayed()->ld(from, 0, O3);
2143 //
2144 // Load 2 aligned 8-byte chunks and use one from the previous iteration
2145 // to form 2 aligned 8-byte chunks to store.
2146 //
2147 // copy_16_bytes_forward_with_shift() is not used here since this
2148 // code is more optimal.
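// Rough C sketch of the splice performed below (names illustrative):
// with 'from' 4-byte but not 8-byte aligned relative to 'to', each store
// combines halves of two aligned 8-byte loads:
//
//   uint64_t prev = (uint64_t)first_word << 32;
//   while (n >= 4) { uint64_t cur = *src64++;
//                    *dst64++ = prev | (cur >> 32);
//                    prev = cur << 32;  n -= 2; }
//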
2150 // copy with shift 4 elements (16 bytes) at a time
2151 __ dec(count, 4); // The cmp at the beginning guarantees count >= 4
2152 __ sllx(O3, 32, O3);
2154 disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop);
2156 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2157 __ delayed()->inc(count, 4); // restore 'count'
2159 __ BIND(L_aligned_copy);
2160 } // !aligned
2162 // copy 4 elements (16 bytes) at a time
2163 __ and3(count, 1, G4); // Save
2164 __ srl(count, 1, count);
2165 generate_disjoint_long_copy_core(aligned);
2166 __ mov(G4, count); // Restore
2168 // copy 1 element at a time
2169 __ BIND(L_copy_4_bytes);
2170 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2171 __ BIND(L_copy_4_bytes_loop);
2172 __ ld(from, offset, O3);
2173 __ deccc(count);
2174 __ st(O3, to, offset);
2175 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2176 __ delayed()->inc(offset, 4);
2177 __ BIND(L_exit);
2178 }
2180 //
2181 // Generate stub for disjoint int copy. If "aligned" is true, the
2182 // "from" and "to" addresses are assumed to be heapword aligned.
2183 //
2184 // Arguments for generated stub:
2185 // from: O0
2186 // to: O1
2187 // count: O2 treated as signed
2188 //
2189 address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
2190 __ align(CodeEntryAlignment);
2191 StubCodeMark mark(this, "StubRoutines", name);
2192 address start = __ pc();
2194 const Register count = O2;
2195 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2197 if (entry != NULL) {
2198 *entry = __ pc();
2199 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2200 BLOCK_COMMENT("Entry:");
2201 }
2203 generate_disjoint_int_copy_core(aligned);
2205 // O3, O4 are used as temp registers
2206 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2207 __ retl();
2208 __ delayed()->mov(G0, O0); // return 0
2209 return start;
2210 }
2212 //
2213 // Generate core code for conjoint int copy (and oop copy on 32-bit).
2214 // If "aligned" is true, the "from" and "to" addresses are assumed
2215 // to be heapword aligned.
2216 //
2217 // Arguments:
2218 // from: O0
2219 // to: O1
2220 // count: O2 treated as signed
2221 //
2222 void generate_conjoint_int_copy_core(bool aligned) {
2223 // Do reverse copy.
2225 Label L_skip_alignment, L_aligned_copy;
2226 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2228 const Register from = O0; // source array address
2229 const Register to = O1; // destination array address
2230 const Register count = O2; // elements count
2231 const Register end_from = from; // source array end address
2232 const Register end_to = to; // destination array end address
2233 // O3, O4, O5, G3 are used as temp registers
2235 const Register byte_count = O3; // bytes count to copy
2237 __ sllx(count, LogBytesPerInt, byte_count);
2238 __ add(to, byte_count, end_to); // offset after last copied element
2240 __ cmp(count, 5); // for short arrays, just do single element copy
2241 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2242 __ delayed()->add(from, byte_count, end_from);
2244 // copy 1 element to align 'to' on an 8 byte boundary
2245 __ andcc(end_to, 7, G0);
2246 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2247 __ delayed()->nop();
2248 __ dec(count);
2249 __ dec(end_from, 4);
2250 __ dec(end_to, 4);
2251 __ ld(end_from, 0, O4);
2252 __ st(O4, end_to, 0);
2253 __ BIND(L_skip_alignment);
2255 // Check if 'end_from' and 'end_to' have the same alignment.
2256 __ andcc(end_from, 7, G0);
2257 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2258 __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
2260 // copy with shift 4 elements (16 bytes) at a time
2261 //
2262 // Load 2 aligned 8-byte chunks and use one from the previous iteration
2263 // to form 2 aligned 8-byte chunks to store.
2264 //
2265 __ ldx(end_from, -4, O3);
2266 __ align(OptoLoopAlignment);
2267 __ BIND(L_copy_16_bytes);
2268 __ ldx(end_from, -12, O4);
2269 __ deccc(count, 4);
2270 __ ldx(end_from, -20, O5);
2271 __ dec(end_to, 16);
2272 __ dec(end_from, 16);
2273 __ srlx(O3, 32, O3);
2274 __ sllx(O4, 32, G3);
2275 __ bset(G3, O3);
2276 __ stx(O3, end_to, 8);
2277 __ srlx(O4, 32, O4);
2278 __ sllx(O5, 32, G3);
2279 __ bset(O4, G3);
2280 __ stx(G3, end_to, 0);
2281 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2282 __ delayed()->mov(O5, O3);
2284 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2285 __ delayed()->inc(count, 4);
2287 // copy 4 elements (16 bytes) at a time
2288 __ align(OptoLoopAlignment);
2289 __ BIND(L_aligned_copy);
2290 __ dec(end_from, 16);
2291 __ ldx(end_from, 8, O3);
2292 __ ldx(end_from, 0, O4);
2293 __ dec(end_to, 16);
2294 __ deccc(count, 4);
2295 __ stx(O3, end_to, 8);
2296 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2297 __ delayed()->stx(O4, end_to, 0);
2298 __ inc(count, 4);
2300 // copy 1 element (4 bytes) at a time
2301 __ BIND(L_copy_4_bytes);
2302 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2303 __ BIND(L_copy_4_bytes_loop);
2304 __ dec(end_from, 4);
2305 __ dec(end_to, 4);
2306 __ ld(end_from, 0, O4);
2307 __ deccc(count);
2308 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2309 __ delayed()->st(O4, end_to, 0);
2310 __ BIND(L_exit);
2311 }
2313 //
2314 // Generate stub for conjoint int copy. If "aligned" is true, the
2315 // "from" and "to" addresses are assumed to be heapword aligned.
2316 //
2317 // Arguments for generated stub:
2318 // from: O0
2319 // to: O1
2320 // count: O2 treated as signed
2321 //
2322 address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2323 address *entry, const char *name) {
2324 __ align(CodeEntryAlignment);
2325 StubCodeMark mark(this, "StubRoutines", name);
2326 address start = __ pc();
2328 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2330 if (entry != NULL) {
2331 *entry = __ pc();
2332 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2333 BLOCK_COMMENT("Entry:");
2334 }
2336 array_overlap_test(nooverlap_target, 2);
2338 generate_conjoint_int_copy_core(aligned);
2340 // O3, O4 are used as temp registers
2341 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2342 __ retl();
2343 __ delayed()->mov(G0, O0); // return 0
2344 return start;
2345 }
2347 //
2348 // Helper methods for generate_disjoint_long_copy_core()
2349 //
2350 void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2351 Label& L_loop, bool use_prefetch, bool use_bis) {
2352 __ align(OptoLoopAlignment);
2353 __ BIND(L_loop);
2354 for (int off = 0; off < 64; off += 16) {
2355 if (use_prefetch && (off & 31) == 0) {
2356 if (ArraycopySrcPrefetchDistance > 0) {
2357 __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
2358 }
2359 if (ArraycopyDstPrefetchDistance > 0) {
2360 __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
2361 }
2362 }
2363 __ ldx(from, off+0, O4);
2364 __ ldx(from, off+8, O5);
2365 if (use_bis) {
2366 __ stxa(O4, to, off+0);
2367 __ stxa(O5, to, off+8);
2368 } else {
2369 __ stx(O4, to, off+0);
2370 __ stx(O5, to, off+8);
2371 }
2372 }
2373 __ deccc(count, 8);
2374 __ inc(from, 64);
2375 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2376 __ delayed()->inc(to, 64);
2377 }
2379 //
2380 // Generate core code for disjoint long copy (and oop copy on 64-bit).
2381 // "aligned" is ignored, because we must make the stronger
2382 // assumption that both addresses are always 64-bit aligned.
2383 //
2384 // Arguments:
2385 // from: O0
2386 // to: O1
2387 // count: O2 treated as signed
2388 //
2389 // count -= 2;
2390 // if ( count >= 0 ) { // >= 2 elements
2391 // if ( count >= 6) { // >= 8 elements
2392 // count -= 6; // original count - 8
2393 // do {
2394 // copy_8_elements;
2395 // count -= 8;
2396 // } while ( count >= 0 );
2397 // count += 6;
2398 // }
2399 // if ( count >= 0 ) { // >= 2 elements
2400 // do {
2401 // copy_2_elements;
2402 // } while ( (count=count-2) >= 0 );
2403 // }
2404 // }
2405 // count += 2;
2406 // if ( count != 0 ) { // 1 element left
2407 // copy_1_element;
2408 // }
2409 //
2410 void generate_disjoint_long_copy_core(bool aligned) {
2411 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2412 const Register from = O0; // source array address
2413 const Register to = O1; // destination array address
2414 const Register count = O2; // elements count
2415 const Register offset0 = O4; // element offset
2416 const Register offset8 = O5; // next element offset
2418 __ deccc(count, 2);
2419 __ mov(G0, offset0); // offset from start of arrays (0)
2420 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2421 __ delayed()->add(offset0, 8, offset8);
2423 // Copy in 64-byte chunks
2425 const Register from64 = O3; // source address
2426 const Register to64 = G3; // destination address
2427 __ subcc(count, 6, O3);
2428 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2429 __ delayed()->mov(to, to64);
2430 // Now we can use O4(offset0), O5(offset8) as temps
2431 __ mov(O3, count);
2432 // count >= 0 (original count - 8)
2433 __ mov(from, from64);
2435 disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop);
2437 // Restore O4(offset0), O5(offset8)
2438 __ sub(from64, from, offset0);
2439 __ inccc(count, 6); // restore count
2440 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2441 __ delayed()->add(offset0, 8, offset8);
2443 // Copy in 16-byte chunks
2444 __ align(OptoLoopAlignment);
2445 __ BIND(L_copy_16_bytes);
2446 __ ldx(from, offset0, O3);
2447 __ ldx(from, offset8, G3);
2448 __ deccc(count, 2);
2449 __ stx(O3, to, offset0);
2450 __ inc(offset0, 16);
2451 __ stx(G3, to, offset8);
2452 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2453 __ delayed()->inc(offset8, 16);
2455 // Copy last 8 bytes
2456 __ BIND(L_copy_8_bytes);
2457 __ inccc(count, 2);
2458 __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2459 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2460 __ ldx(from, offset0, O3);
2461 __ stx(O3, to, offset0);
2462 __ BIND(L_exit);
2463 }
2465 //
2466 // Generate stub for disjoint long copy.
2467 // "aligned" is ignored, because we must make the stronger
2468 // assumption that both addresses are always 64-bit aligned.
2469 //
2470 // Arguments for generated stub:
2471 // from: O0
2472 // to: O1
2473 // count: O2 treated as signed
2474 //
2475 address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2476 __ align(CodeEntryAlignment);
2477 StubCodeMark mark(this, "StubRoutines", name);
2478 address start = __ pc();
2480 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2482 if (entry != NULL) {
2483 *entry = __ pc();
2484 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2485 BLOCK_COMMENT("Entry:");
2486 }
2488 generate_disjoint_long_copy_core(aligned);
2490 // O3, O4 are used as temp registers
2491 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2492 __ retl();
2493 __ delayed()->mov(G0, O0); // return 0
2494 return start;
2495 }
2497 //
2498 // Generate core code for conjoint long copy (and oop copy on 64-bit).
2499 // "aligned" is ignored, because we must make the stronger
2500 // assumption that both addresses are always 64-bit aligned.
2501 //
2502 // Arguments:
2503 // from: O0
2504 // to: O1
2505 // count: O2 treated as signed
2506 //
2507 void generate_conjoint_long_copy_core(bool aligned) {
2508 // Do reverse copy.
2509 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2510 const Register from = O0; // source array address
2511 const Register to = O1; // destination array address
2512 const Register count = O2; // elements count
2513 const Register offset8 = O4; // element offset
2514 const Register offset0 = O5; // previous element offset
2516 __ subcc(count, 1, count);
2517 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2518 __ delayed()->sllx(count, LogBytesPerLong, offset8);
2519 __ sub(offset8, 8, offset0);
2520 __ align(OptoLoopAlignment);
2521 __ BIND(L_copy_16_bytes);
2522 __ ldx(from, offset8, O2);
2523 __ ldx(from, offset0, O3);
2524 __ stx(O2, to, offset8);
2525 __ deccc(offset8, 16); // use offset8 as counter
2526 __ stx(O3, to, offset0);
2527 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2528 __ delayed()->dec(offset0, 16);
2530 __ BIND(L_copy_8_bytes);
2531 __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2532 __ delayed()->nop();
2533 __ ldx(from, 0, O3);
2534 __ stx(O3, to, 0);
2535 __ BIND(L_exit);
2536 }
2538 // Generate stub for conjoint long copy.
2539 // "aligned" is ignored, because we must make the stronger
2540 // assumption that both addresses are always 64-bit aligned.
2541 //
2542 // Arguments for generated stub:
2543 // from: O0
2544 // to: O1
2545 // count: O2 treated as signed
2546 //
2547 address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2548 address *entry, const char *name) {
2549 __ align(CodeEntryAlignment);
2550 StubCodeMark mark(this, "StubRoutines", name);
2551 address start = __ pc();
2553 assert(aligned, "Should always be aligned");
2555 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2557 if (entry != NULL) {
2558 *entry = __ pc();
2559 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2560 BLOCK_COMMENT("Entry:");
2561 }
2563 array_overlap_test(nooverlap_target, 3);
2565 generate_conjoint_long_copy_core(aligned);
2567 // O3, O4 are used as temp registers
2568 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2569 __ retl();
2570 __ delayed()->mov(G0, O0); // return 0
2571 return start;
2572 }
2574 // Generate stub for disjoint oop copy. If "aligned" is true, the
2575 // "from" and "to" addresses are assumed to be heapword aligned.
2576 //
2577 // Arguments for generated stub:
2578 // from: O0
2579 // to: O1
2580 // count: O2 treated as signed
2581 //
2582 address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2583 bool dest_uninitialized = false) {
2585 const Register from = O0; // source array address
2586 const Register to = O1; // destination array address
2587 const Register count = O2; // elements count
2589 __ align(CodeEntryAlignment);
2590 StubCodeMark mark(this, "StubRoutines", name);
2591 address start = __ pc();
2593 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2595 if (entry != NULL) {
2596 *entry = __ pc();
2597 // caller can pass a 64-bit byte count here
2598 BLOCK_COMMENT("Entry:");
2599 }
2601 // save arguments for barrier generation
2602 __ mov(to, G1);
2603 __ mov(count, G5);
2604 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2605 #ifdef _LP64
2606 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2607 if (UseCompressedOops) {
2608 generate_disjoint_int_copy_core(aligned);
2609 } else {
2610 generate_disjoint_long_copy_core(aligned);
2611 }
2612 #else
2613 generate_disjoint_int_copy_core(aligned);
2614 #endif
2615 // O0 is used as temp register
2616 gen_write_ref_array_post_barrier(G1, G5, O0);
2618 // O3, O4 are used as temp registers
2619 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2620 __ retl();
2621 __ delayed()->mov(G0, O0); // return 0
2622 return start;
2623 }
2625 // Generate stub for conjoint oop copy. If "aligned" is true, the
2626 // "from" and "to" addresses are assumed to be heapword aligned.
2627 //
2628 // Arguments for generated stub:
2629 // from: O0
2630 // to: O1
2631 // count: O2 treated as signed
2632 //
2633 address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2634 address *entry, const char *name,
2635 bool dest_uninitialized = false) {
2637 const Register from = O0; // source array address
2638 const Register to = O1; // destination array address
2639 const Register count = O2; // elements count
2641 __ align(CodeEntryAlignment);
2642 StubCodeMark mark(this, "StubRoutines", name);
2643 address start = __ pc();
2645 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2647 if (entry != NULL) {
2648 *entry = __ pc();
2649 // caller can pass a 64-bit byte count here
2650 BLOCK_COMMENT("Entry:");
2651 }
2653 array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2655 // save arguments for barrier generation
2656 __ mov(to, G1);
2657 __ mov(count, G5);
2658 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2660 #ifdef _LP64
2661 if (UseCompressedOops) {
2662 generate_conjoint_int_copy_core(aligned);
2663 } else {
2664 generate_conjoint_long_copy_core(aligned);
2665 }
2666 #else
2667 generate_conjoint_int_copy_core(aligned);
2668 #endif
2670 // O0 is used as temp register
2671 gen_write_ref_array_post_barrier(G1, G5, O0);
2673 // O3, O4 are used as temp registers
2674 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2675 __ retl();
2676 __ delayed()->mov(G0, O0); // return 0
2677 return start;
2678 }
2681 // Helper for generating a dynamic type check.
2682 // Smashes only the given temp registers.
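// Roughly, the emitted check is (C-like sketch; the fast path is in
// check_klass_subtype_fast_path, the scan in the slow path):
//
//   if (sub_klass == super_klass) goto L_success;                 // trivial
//   if (*(Klass**)(sub_klass + super_check_offset) == super_klass)
//     goto L_success;                                             // cached
//   // otherwise: scan the secondary supers under a saved frame
//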
2683 void generate_type_check(Register sub_klass,
2684 Register super_check_offset,
2685 Register super_klass,
2686 Register temp,
2687 Label& L_success) {
2688 assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2690 BLOCK_COMMENT("type_check:");
2692 Label L_miss, L_pop_to_miss;
2694 assert_clean_int(super_check_offset, temp);
2696 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2697 &L_success, &L_miss, NULL,
2698 super_check_offset);
2700 BLOCK_COMMENT("type_check_slow_path:");
2701 __ save_frame(0);
2702 __ check_klass_subtype_slow_path(sub_klass->after_save(),
2703 super_klass->after_save(),
2704 L0, L1, L2, L4,
2705 NULL, &L_pop_to_miss);
2706 __ ba(L_success);
2707 __ delayed()->restore();
2709 __ bind(L_pop_to_miss);
2710 __ restore();
2712 // Fall through on failure!
2713 __ BIND(L_miss);
2714 }
2717 // Generate stub for checked oop copy.
2718 //
2719 // Arguments for generated stub:
2720 // from: O0
2721 // to: O1
2722 // count: O2 treated as signed
2723 // ckoff: O3 (super_check_offset)
2724 // ckval: O4 (super_klass)
2725 // ret: O0 zero for success; (-1^K) where K is partial transfer count
2726 //
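// Failure encoding, in C terms (sketch): if the copy stops after K
// elements, the stub returns ~K (that is, -1 ^ K) in O0, so the caller
// can recover K; O0 == 0 means every element was transferred.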
2727 address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2729 const Register O0_from = O0; // source array address
2730 const Register O1_to = O1; // destination array address
2731 const Register O2_count = O2; // elements count
2732 const Register O3_ckoff = O3; // super_check_offset
2733 const Register O4_ckval = O4; // super_klass
2735 const Register O5_offset = O5; // loop var, with stride wordSize
2736 const Register G1_remain = G1; // loop var, with stride -1
2737 const Register G3_oop = G3; // actual oop copied
2738 const Register G4_klass = G4; // oop._klass
2739 const Register G5_super = G5; // oop._klass._primary_supers[ckval]
2741 __ align(CodeEntryAlignment);
2742 StubCodeMark mark(this, "StubRoutines", name);
2743 address start = __ pc();
2745 #ifdef ASSERT
2746 // We sometimes save a frame (see generate_type_check below).
2747 // If this will cause trouble, let's fail now instead of later.
2748 __ save_frame(0);
2749 __ restore();
2750 #endif
2752 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int.
2754 #ifdef ASSERT
2755 // caller guarantees that the arrays really are different
2756 // otherwise, we would have to make conjoint checks
2757 { Label L;
2758 __ mov(O3, G1); // spill: overlap test smashes O3
2759 __ mov(O4, G4); // spill: overlap test smashes O4
2760 array_overlap_test(L, LogBytesPerHeapOop);
2761 __ stop("checkcast_copy within a single array");
2762 __ bind(L);
2763 __ mov(G1, O3);
2764 __ mov(G4, O4);
2765 }
2766 #endif //ASSERT
2768 if (entry != NULL) {
2769 *entry = __ pc();
2770 // caller can pass a 64-bit byte count here (from generic stub)
2771 BLOCK_COMMENT("Entry:");
2772 }
2773 gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
2775 Label load_element, store_element, do_card_marks, fail, done;
2776 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it
2777 __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2778 __ delayed()->mov(G0, O5_offset); // offset from start of arrays
2780 // Empty array: Nothing to do.
2781 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2782 __ retl();
2783 __ delayed()->set(0, O0); // return 0 on (trivial) success
2785 // ======== begin loop ========
2786 // (Loop is rotated; its entry is load_element.)
2787 // Loop variables:
2788 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2789 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2790 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super
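// The rotated loop, as a C sketch (illustrative only; a NULL element is
// stored without a type check):
//
//   for (i = 0, r = count; r != 0; i += heapOopSize, r--) {
//     oop o = *(from + i);
//     if (o != NULL && !is_subtype(klass_of(o), ckval)) break;  // fail
//     *(to + i) = o;                                            // store_element
//   }
//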
2791 __ align(OptoLoopAlignment);
2793 __ BIND(store_element);
2794 __ deccc(G1_remain); // decrement the count
2795 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2796 __ inc(O5_offset, heapOopSize); // step to next offset
2797 __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
2798 __ delayed()->set(0, O0); // return 0 on success
2800 // ======== loop entry is here ========
2801 __ BIND(load_element);
2802 __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop
2803 __ br_null_short(G3_oop, Assembler::pt, store_element);
2805 __ load_klass(G3_oop, G4_klass); // query the object klass
2807 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2808 // branch to this on success:
2809 store_element);
2810 // ======== end loop ========
2812 // It was a real error; we must depend on the caller to finish the job.
2813 // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2814 // Emit GC store barriers for the oops we have copied (O2 minus G1),
2815 // and report their number to the caller.
2816 __ BIND(fail);
2817 __ subcc(O2_count, G1_remain, O2_count);
2818 __ brx(Assembler::zero, false, Assembler::pt, done);
2819 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller
2821 __ BIND(do_card_marks);
2822 gen_write_ref_array_post_barrier(O1_to, O2_count, O3); // store check on O1[0..O2]
2824 __ BIND(done);
2825 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2826 __ retl();
2827 __ delayed()->nop(); // return value in O0
2829 return start;
2830 }
2833 // Generate 'unsafe' array copy stub
2834 // Though just as safe as the other stubs, it takes an unscaled
2835 // size_t argument instead of an element count.
2836 //
2837 // Arguments for generated stub:
2838 // from: O0
2839 // to: O1
2840 // count: O2 byte count, treated as ssize_t, can be zero
2841 //
2842 // Examines the alignment of the operands and dispatches
2843 // to a long, int, short, or byte copy loop.
2844 //
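// Dispatch logic in C terms (sketch): OR together both addresses and the
// byte count, test against decreasing alignments, and scale the count on
// the way out:
//
//   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)count;
//   if      ((bits & 7) == 0) long_copy (from, to, count >> 3);
//   else if ((bits & 3) == 0) int_copy  (from, to, count >> 2);
//   else if ((bits & 1) == 0) short_copy(from, to, count >> 1);
//   else                      byte_copy (from, to, count);
//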
2845 address generate_unsafe_copy(const char* name,
2846 address byte_copy_entry,
2847 address short_copy_entry,
2848 address int_copy_entry,
2849 address long_copy_entry) {
2851 const Register O0_from = O0; // source array address
2852 const Register O1_to = O1; // destination array address
2853 const Register O2_count = O2; // elements count
2855 const Register G1_bits = G1; // test copy of low bits
2857 __ align(CodeEntryAlignment);
2858 StubCodeMark mark(this, "StubRoutines", name);
2859 address start = __ pc();
2861 // bump this on entry, not on exit:
2862 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2864 __ or3(O0_from, O1_to, G1_bits);
2865 __ or3(O2_count, G1_bits, G1_bits);
2867 __ btst(BytesPerLong-1, G1_bits);
2868 __ br(Assembler::zero, true, Assembler::pt,
2869 long_copy_entry, relocInfo::runtime_call_type);
2870 // scale the count on the way out:
2871 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2873 __ btst(BytesPerInt-1, G1_bits);
2874 __ br(Assembler::zero, true, Assembler::pt,
2875 int_copy_entry, relocInfo::runtime_call_type);
2876 // scale the count on the way out:
2877 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2879 __ btst(BytesPerShort-1, G1_bits);
2880 __ br(Assembler::zero, true, Assembler::pt,
2881 short_copy_entry, relocInfo::runtime_call_type);
2882 // scale the count on the way out:
2883 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2885 __ br(Assembler::always, false, Assembler::pt,
2886 byte_copy_entry, relocInfo::runtime_call_type);
2887 __ delayed()->nop();
2889 return start;
2890 }
2893 // Perform range checks on the proposed arraycopy.
2894 // Kills the two temps, but nothing else.
2895 // Also, clean the sign bits of src_pos and dst_pos.
2896 void arraycopy_range_checks(Register src, // source array oop (O0)
2897 Register src_pos, // source position (O1)
2898 Register dst, // destination array oop (O2)
2899 Register dst_pos, // destination position (O3)
2900 Register length, // length of copy (O4)
2901 Register temp1, Register temp2,
2902 Label& L_failed) {
2903 BLOCK_COMMENT("arraycopy_range_checks:");
2905 // if (src_pos + length > arrayOop(src)->length() ) FAIL;
2907 const Register array_length = temp1; // scratch
2908 const Register end_pos = temp2; // scratch
2910 // Note: This next instruction may be in the delay slot of a branch:
2911 __ add(length, src_pos, end_pos); // src_pos + length
2912 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2913 __ cmp(end_pos, array_length);
2914 __ br(Assembler::greater, false, Assembler::pn, L_failed);
2916 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2917 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2918 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2919 __ cmp(end_pos, array_length);
2920 __ br(Assembler::greater, false, Assembler::pn, L_failed);
2922 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2923 // Move with sign extension can be used since they are positive.
2924 __ delayed()->signx(src_pos, src_pos);
2925 __ signx(dst_pos, dst_pos);
2927 BLOCK_COMMENT("arraycopy_range_checks done");
2928 }
2931 //
2932 // Generate generic array copy stubs
2933 //
2934 // Input:
2935 // O0 - src oop
2936 // O1 - src_pos
2937 // O2 - dst oop
2938 // O3 - dst_pos
2939 // O4 - element count
2940 //
2941 // Output:
2942 // O0 == 0 - success
2943 // O0 == -1 - need to call System.arraycopy
2944 //
2945 address generate_generic_copy(const char *name,
2946 address entry_jbyte_arraycopy,
2947 address entry_jshort_arraycopy,
2948 address entry_jint_arraycopy,
2949 address entry_oop_arraycopy,
2950 address entry_jlong_arraycopy,
2951 address entry_checkcast_arraycopy) {
2952 Label L_failed, L_objArray;
2954 // Input registers
2955 const Register src = O0; // source array oop
2956 const Register src_pos = O1; // source position
2957 const Register dst = O2; // destination array oop
2958 const Register dst_pos = O3; // destination position
2959 const Register length = O4; // elements count
2961 // registers used as temp
2962 const Register G3_src_klass = G3; // source array klass
2963 const Register G4_dst_klass = G4; // destination array klass
2964 const Register G5_lh = G5; // layout helper
2965 const Register O5_temp = O5;
2967 __ align(CodeEntryAlignment);
2968 StubCodeMark mark(this, "StubRoutines", name);
2969 address start = __ pc();
2971 // bump this on entry, not on exit:
2972 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2974 // In principle, the int arguments could be dirty.
2975 //assert_clean_int(src_pos, G1);
2976 //assert_clean_int(dst_pos, G1);
2977 //assert_clean_int(length, G1);
2979 //-----------------------------------------------------------------------
2980 // Assembler stubs will be used for this call to arraycopy
2981 // if the following conditions are met:
2982 //
2983 // (1) src and dst must not be null.
2984 // (2) src_pos must not be negative.
2985 // (3) dst_pos must not be negative.
2986 // (4) length must not be negative.
2987 // (5) src klass and dst klass should be the same and not NULL.
2988 // (6) src and dst should be arrays.
2989 // (7) src_pos + length must not exceed length of src.
2990 // (8) dst_pos + length must not exceed length of dst.
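// The same preconditions as a C guard (sketch; conditions (1)-(8) above):
//
//   if (src == NULL || dst == NULL) return -1;
//   if (src_pos < 0 || dst_pos < 0 || length < 0) return -1;
//   if (src->klass() != dst->klass() || !src->is_array()) return -1;
//   if (src_pos + length > src->length()) return -1;
//   if (dst_pos + length > dst->length()) return -1;
//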
2991 BLOCK_COMMENT("arraycopy initial argument checks");
2993 // if (src == NULL) return -1;
2994 __ br_null(src, false, Assembler::pn, L_failed);
2996 // if (src_pos < 0) return -1;
2997 __ delayed()->tst(src_pos);
2998 __ br(Assembler::negative, false, Assembler::pn, L_failed);
2999 __ delayed()->nop();
3001 // if (dst == NULL) return -1;
3002 __ br_null(dst, false, Assembler::pn, L_failed);
3004 // if (dst_pos < 0) return -1;
3005 __ delayed()->tst(dst_pos);
3006 __ br(Assembler::negative, false, Assembler::pn, L_failed);
3008 // if (length < 0) return -1;
3009 __ delayed()->tst(length);
3010 __ br(Assembler::negative, false, Assembler::pn, L_failed);
3012 BLOCK_COMMENT("arraycopy argument klass checks");
3013 // get src->klass()
3014 if (UseCompressedKlassPointers) {
3015 __ delayed()->nop(); // ??? not good
3016 __ load_klass(src, G3_src_klass);
3017 } else {
3018 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
3019 }
3021 #ifdef ASSERT
3022 // assert(src->klass() != NULL);
3023 BLOCK_COMMENT("assert klasses not null");
3024 { Label L_a, L_b;
3025 __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
3026 __ bind(L_a);
3027 __ stop("broken null klass");
3028 __ bind(L_b);
3029 __ load_klass(dst, G4_dst_klass);
3030 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
3031 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp
3032 BLOCK_COMMENT("assert done");
3033 }
3034 #endif
3036 // Load layout helper
3037 //
3038 //   |array_tag|     | header_size | element_type |     |log2_element_size|
3039 //    32        30    24            16              8     2                0
3040 //
3041 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3042 //
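// Decoding the fields in C terms (sketch, using the Klass constants that
// appear below):
//
//   int  header  = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
//   int  l2esize =  lh & Klass::_lh_log2_element_size_mask;
//   bool is_obj  = (lh == Klass::array_layout_helper(T_OBJECT));
//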
3044 int lh_offset = in_bytes(Klass::layout_helper_offset());
3046 // Load a 32-bit signed value. Use the br() instruction with it to check icc.
3047 __ lduw(G3_src_klass, lh_offset, G5_lh);
3049 if (UseCompressedKlassPointers) {
3050 __ load_klass(dst, G4_dst_klass);
3051 }
3052 // Handle objArrays completely differently...
3053 juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3054 __ set(objArray_lh, O5_temp);
3055 __ cmp(G5_lh, O5_temp);
3056 __ br(Assembler::equal, false, Assembler::pt, L_objArray);
3057 if (UseCompressedKlassPointers) {
3058 __ delayed()->nop();
3059 } else {
3060 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
3061 }
3063 // if (src->klass() != dst->klass()) return -1;
3064 __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
3066 // if (!src->is_Array()) return -1;
3067 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
3068 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
3070 // At this point, it is known to be a typeArray (array_tag 0x3).
3071 #ifdef ASSERT
3072 __ delayed()->nop();
3073 { Label L;
3074 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
3075 __ set(lh_prim_tag_in_place, O5_temp);
3076 __ cmp(G5_lh, O5_temp);
3077 __ br(Assembler::greaterEqual, false, Assembler::pt, L);
3078 __ delayed()->nop();
3079 __ stop("must be a primitive array");
3080 __ bind(L);
3081 }
3082 #else
3083 __ delayed(); // match next insn to prev branch
3084 #endif
3086 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3087 O5_temp, G4_dst_klass, L_failed);
3089 // TypeArrayKlass
3090 //
3091 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3092 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3093 //
3095 const Register G4_offset = G4_dst_klass; // array offset
3096 const Register G3_elsize = G3_src_klass; // log2 element size
3098 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
3099 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
3100 __ add(src, G4_offset, src); // src array offset
3101 __ add(dst, G4_offset, dst); // dst array offset
3102 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
3104 // next registers should be set before the jump to corresponding stub
3105 const Register from = O0; // source array address
3106 const Register to = O1; // destination array address
3107 const Register count = O2; // elements count
3109 // 'from', 'to', 'count' registers should be set in this order
3110 // since they are the same as 'src', 'src_pos', 'dst'.
3112 BLOCK_COMMENT("scale indexes to element size");
3113 __ sll_ptr(src_pos, G3_elsize, src_pos);
3114 __ sll_ptr(dst_pos, G3_elsize, dst_pos);
3115 __ add(src, src_pos, from); // src_addr
3116 __ add(dst, dst_pos, to); // dst_addr
3118 BLOCK_COMMENT("choose copy loop based on element size");
3119 __ cmp(G3_elsize, 0);
3120 __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
3121 __ delayed()->signx(length, count); // length
3123 __ cmp(G3_elsize, LogBytesPerShort);
3124 __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
3125 __ delayed()->signx(length, count); // length
3127 __ cmp(G3_elsize, LogBytesPerInt);
3128 __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
3129 __ delayed()->signx(length, count); // length
3130 #ifdef ASSERT
3131 { Label L;
3132 __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
3133 __ stop("must be long copy, but elsize is wrong");
3134 __ bind(L);
3135 }
3136 #endif
3137 __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
3138 __ delayed()->signx(length, count); // length
3140 // ObjArrayKlass
3141 __ BIND(L_objArray);
3142 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
3144 Label L_plain_copy, L_checkcast_copy;
3145 // test array classes for subtyping
3146 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality
3147 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
3148 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
3150 // Identically typed arrays can be copied without element-wise checks.
3151 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3152 O5_temp, G5_lh, L_failed);
3154 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3155 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3156 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3157 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3158 __ add(src, src_pos, from); // src_addr
3159 __ add(dst, dst_pos, to); // dst_addr
3160 __ BIND(L_plain_copy);
3161 __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
3162 __ delayed()->signx(length, count); // length
3164 __ BIND(L_checkcast_copy);
3165 // live at this point: G3_src_klass, G4_dst_klass
3166 {
3167 // Before looking at dst.length, make sure dst is also an objArray.
3168 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
3169 __ cmp(G5_lh, O5_temp);
3170 __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
3172 // It is safe to examine both src.length and dst.length.
3173 __ delayed(); // match next insn to prev branch
3174 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3175 O5_temp, G5_lh, L_failed);
3177 // Marshal the base address arguments now, freeing registers.
3178 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3179 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3180 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3181 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3182 __ add(src, src_pos, from); // src_addr
3183 __ add(dst, dst_pos, to); // dst_addr
3184 __ signx(length, count); // length (reloaded)
3186 Register sco_temp = O3; // this register is free now
3187 assert_different_registers(from, to, count, sco_temp,
3188 G4_dst_klass, G3_src_klass);
3190 // Generate the type check.
3191 int sco_offset = in_bytes(Klass::super_check_offset_offset());
3192 __ lduw(G4_dst_klass, sco_offset, sco_temp);
3193 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
3194 O5_temp, L_plain_copy);
3196 // Fetch destination element klass from the ObjArrayKlass header.
3197 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3199 // the checkcast_copy loop needs two extra arguments:
3200 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass
3201 // lduw(O4, sco_offset, O3); // sco of elem klass
3203 __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
3204 __ delayed()->lduw(O4, sco_offset, O3);
3205 }
3207 __ BIND(L_failed);
3208 __ retl();
3209 __ delayed()->sub(G0, 1, O0); // return -1
3210 return start;
3211 }
3213 //
3214 // Generate stub for heap zeroing.
3215 // "to" address is aligned to jlong (8 bytes).
3216 //
3217 // Arguments for generated stub:
3218 // to: O0
3219 // count: O1 treated as signed (count of HeapWords)
3220 // count could be 0
3221 //
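// Net effect in C terms (sketch): memset(to, 0, count * HeapWordSize),
// implemented with block-initializing stores when available.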
3222 address generate_zero_aligned_words(const char* name) {
3223 __ align(CodeEntryAlignment);
3224 StubCodeMark mark(this, "StubRoutines", name);
3225 address start = __ pc();
3227 const Register to = O0; // destination address
3228 const Register count = O1; // HeapWords count
3229 const Register temp = O2; // scratch
3231 Label Ldone;
3232 __ sllx(count, LogHeapWordSize, count); // to bytes count
3233 // Use BIS for zeroing
3234 __ bis_zeroing(to, count, temp, Ldone);
3235 __ bind(Ldone);
3236 __ retl();
3237 __ delayed()->nop();
3238 return start;
3239 }
3241 void generate_arraycopy_stubs() {
3242 address entry;
3243 address entry_jbyte_arraycopy;
3244 address entry_jshort_arraycopy;
3245 address entry_jint_arraycopy;
3246 address entry_oop_arraycopy;
3247 address entry_jlong_arraycopy;
3248 address entry_checkcast_arraycopy;
3250 //*** jbyte
3251 // Always need aligned and unaligned versions
3252 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
3253 "jbyte_disjoint_arraycopy");
3254 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry,
3255 &entry_jbyte_arraycopy,
3256 "jbyte_arraycopy");
3257 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3258 "arrayof_jbyte_disjoint_arraycopy");
3259 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL,
3260 "arrayof_jbyte_arraycopy");
3262 //*** jshort
3263 // Always need aligned and unaligned versions
3264 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3265 "jshort_disjoint_arraycopy");
3266 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry,
3267 &entry_jshort_arraycopy,
3268 "jshort_arraycopy");
3269 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3270 "arrayof_jshort_disjoint_arraycopy");
3271 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL,
3272 "arrayof_jshort_arraycopy");
3274 //*** jint
3275 // Aligned versions
3276 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3277 "arrayof_jint_disjoint_arraycopy");
3278 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3279 "arrayof_jint_arraycopy");
3280 #ifdef _LP64
3281 // In 64-bit we need both aligned and unaligned versions of jint arraycopy.
3282 // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3283 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
3284 "jint_disjoint_arraycopy");
3285 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry,
3286 &entry_jint_arraycopy,
3287 "jint_arraycopy");
3288 #else
3289 // In 32-bit, jints are always HeapWordSize aligned, so always use the aligned version
3290 // (in fact, in 32-bit we always have a pre-loop part even in the aligned version,
3291 // because it uses 64-bit loads/stores, so the aligned flag is actually ignored).
3292 StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy;
3293 StubRoutines::_jint_arraycopy = StubRoutines::_arrayof_jint_arraycopy;
3294 #endif
3297 //*** jlong
3298 // It is always aligned
3299 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3300 "arrayof_jlong_disjoint_arraycopy");
3301 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3302 "arrayof_jlong_arraycopy");
3303 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3304 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
3307 //*** oops
3308 // Aligned versions
3309 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry,
3310 "arrayof_oop_disjoint_arraycopy");
3311 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3312 "arrayof_oop_arraycopy");
3313 // Aligned versions without pre-barriers
3314 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3315 "arrayof_oop_disjoint_arraycopy_uninit",
3316 /*dest_uninitialized*/true);
3317 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL,
3318 "arrayof_oop_arraycopy_uninit",
3319 /*dest_uninitialized*/true);
3320 #ifdef _LP64
3321 if (UseCompressedOops) {
3322 // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy.
3323 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry,
3324 "oop_disjoint_arraycopy");
3325 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3326 "oop_arraycopy");
3327 // Unaligned versions without pre-barriers
3328 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry,
3329 "oop_disjoint_arraycopy_uninit",
3330 /*dest_uninitialized*/true);
3331 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL,
3332 "oop_arraycopy_uninit",
3333 /*dest_uninitialized*/true);
3334 } else
3335 #endif
3336 {
3337       // oop arraycopy is always aligned on 32-bit, and on 64-bit without compressed oops
3338 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3339 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
3340 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3341 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
3342 }
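    // Background for the UseCompressedOops split above (illustrative): with
    // compressed oops an element is a 4-byte narrowOop while HeapWordSize is
    // 8, so two elements share one heap word and
    //
    //   addr(a[i]) = base + header_size + 4 * i   // 8-byte aligned only for every other i
    //
    // meaning a copy can start in the middle of a heap word; hence the extra
    // unaligned stubs. With full-width oops every element is exactly one heap
    // word, so the arrayof_ versions can be aliased directly.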
3344 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3345 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3346 /*dest_uninitialized*/true);
3348 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
3349 entry_jbyte_arraycopy,
3350 entry_jshort_arraycopy,
3351 entry_jint_arraycopy,
3352 entry_jlong_arraycopy);
3353 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3354 entry_jbyte_arraycopy,
3355 entry_jshort_arraycopy,
3356 entry_jint_arraycopy,
3357 entry_oop_arraycopy,
3358 entry_jlong_arraycopy,
3359 entry_checkcast_arraycopy);
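    // The unsafe and generic stubs copy nothing themselves; they dispatch to
    // the leaf entry points captured above. The unsafe stub picks an element
    // width from the common alignment of the addresses and byte count,
    // conceptually (pseudocode sketch only; the real dispatch is generated
    // assembly):
    //
    //   if (((from | to | count) & 7) == 0) jump entry_jlong_arraycopy;   // all 8-byte aligned
    //   if (((from | to | count) & 3) == 0) jump entry_jint_arraycopy;    // all 4-byte aligned
    //   if (((from | to | count) & 1) == 0) jump entry_jshort_arraycopy;
    //   jump entry_jbyte_arraycopy;
    //
    // while the generic stub inspects the actual array types at runtime and
    // falls back to the checkcast entry when a per-element subtype check is
    // required.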
3361 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3362 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3363 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3364 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3365 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3366 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
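    // Each fill stub stores one value into 'count' consecutive elements of
    // the given type; the arrayof_ variants may assume a HeapWordSize
    // aligned start. The contract, as a plain C sketch:
    //
    //   void jint_fill(jint* to, jint value, int count) {
    //     for (int i = 0; i < count; i++) to[i] = value;
    //   }
    //
    // These back compiler-recognized fill loops (e.g. C2's OptimizeFill).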
3368 if (UseBlockZeroing) {
3369 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3370 }
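    // zero_aligned_words is only generated when UseBlockZeroing is enabled;
    // it is intended for clearing large word-aligned regions (such as fresh
    // object bodies) and can use SPARC block-initializing stores, which avoid
    // reading the cache lines that are about to be overwritten.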
3371 }
3373 void generate_initial() {
3374     // Generates the stubs that must exist before full VM initialization, and initializes their entry points
3376 //------------------------------------------------------------------------------------------------------------------------
3377     // entry points that exist on all platforms
3378     // Note: This code could be shared among different platforms; however, the benefit seems to be smaller than
3379     // the disadvantage of a much more complicated generator structure. See also the comment in stubRoutines.hpp.
3380 StubRoutines::_forward_exception_entry = generate_forward_exception();
3382 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
3383 StubRoutines::_catch_exception_entry = generate_catch_exception();
3385 //------------------------------------------------------------------------------------------------------------------------
3386     // entry points that are platform-specific
3387 StubRoutines::Sparc::_test_stop_entry = generate_test_stop();
3389 StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine();
3390 StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
3392 #if !defined(COMPILER2) && !defined(_LP64)
3393 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
3394 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
3395 StubRoutines::_atomic_add_entry = generate_atomic_add();
3396 StubRoutines::_atomic_xchg_ptr_entry = StubRoutines::_atomic_xchg_entry;
3397 StubRoutines::_atomic_cmpxchg_ptr_entry = StubRoutines::_atomic_cmpxchg_entry;
3398 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
3399 StubRoutines::_atomic_add_ptr_entry = StubRoutines::_atomic_add_entry;
3400 #endif  // !COMPILER2 && !_LP64
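    // The stubs above back the shared Atomic::xchg/cmpxchg/add operations on
    // 32-bit builds without COMPILER2, where presumably no inline
    // implementation is available. The *_ptr entries can simply alias the
    // 32-bit versions because pointers are 32 bits wide here; only cmpxchg
    // needs a distinct 64-bit (long) flavor.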
3402 // Build this early so it's available for the interpreter.
3403 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
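    // (The interpreter is generated before the second batch of stubs, see the
    // two-pass constructor below, so anything it calls directly, like the
    // stack overflow thrower above, must come out of this first pass.)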
3404 }
3407 void generate_all() {
3408     // Generates the remaining stubs and initializes their entry points; runs after full VM initialization
3410     // Generate partial_subtype_check first here since its code depends on
3411     // UseZeroBaseCompressedOops, which is only set after heap initialization.
3412 StubRoutines::Sparc::_partial_subtype_check = generate_partial_subtype_check();
3413 // These entry points require SharedInfo::stack0 to be set up in non-core builds
3414 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
3415 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
3416 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
3418 StubRoutines::_handler_for_unsafe_access_entry =
3419 generate_handler_for_unsafe_access();
3421 // support for verify_oop (must happen after universe_init)
3422 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();
3424 // arraycopy stubs used by compilers
3425 generate_arraycopy_stubs();
3427     // Don't initialize the platform math functions, since SPARC
3428     // has no intrinsics for these operations.
3429 }
3432 public:
3433 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3434 // replace the standard masm with a special one:
3435 _masm = new MacroAssembler(code);
3437 _stub_count = !all ? 0x100 : 0x200;
3438 if (all) {
3439 generate_all();
3440 } else {
3441 generate_initial();
3442 }
3444 // make sure this stub is available for all local calls
3445 if (_atomic_add_stub.is_unbound()) {
3446 // generate a second time, if necessary
3447 (void) generate_atomic_add();
3448 }
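    // _atomic_add_stub is a file-scope Label that is bound when
    // generate_atomic_add() runs; in a pass that skipped that generator the
    // label stays unbound, and any local call emitted against it could never
    // be resolved, hence the unconditional re-check above.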
3449 }
3452 private:
3453 int _stub_count;
3454 void stub_prolog(StubCodeDesc* cdesc) {
3455 #ifdef ASSERT
3456 // put extra information in the stub code, to make it more readable
3457 #ifdef _LP64
3458 // Write the high part of the address
3459 // [RGV] Check if there is a dependency on the size of this prolog
3460 __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
3461 #endif
3462 __ emit_data((intptr_t)cdesc, relocInfo::none);
3463 __ emit_data(++_stub_count, relocInfo::none);
3464 #endif // ASSERT
3465 align(true);
3466 }
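  // In ASSERT builds each stub is therefore preceded, just before its aligned
  // header, by words that a debugger can read back to identify the stub:
  //
  //   [ cdesc >> 32 ]   high half of the StubCodeDesc* (LP64 only)
  //   [ cdesc       ]   StubCodeDesc* (low 32 bits on LP64)
  //   [ stub count  ]   running 1-based index of the stub in this generator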
3468 void align(bool at_header = false) {
3469 // %%%%% move this constant somewhere else
3470 // UltraSPARC cache line size is 8 instructions:
3471 const unsigned int icache_line_size = 32;
3472 const unsigned int icache_half_line_size = 16;
3474 if (at_header) {
3475 while ((intptr_t)(__ pc()) % icache_line_size != 0) {
3476 __ emit_data(0, relocInfo::none);
3477 }
3478 } else {
3479 while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
3480 __ nop();
3481 }
3482 }
3483 }
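  // Alignment arithmetic, for reference: 8 instructions x 4 bytes gives the
  // 32-byte I-cache line, halved to 16 for intra-stub alignment. Two pad
  // fillers are used: zero data words before a stub header (that gap is never
  // executed) and nops elsewhere (harmless if the padding is reached). E.g.
  // from pc == ..0x24, padding to the next header emits 7 zero words to land
  // on ..0x40.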
3485 }; // end class declaration
3487 void StubGenerator_generate(CodeBuffer* code, bool all) {
3488 StubGenerator g(code, all);
3489 }
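// StubGenerator_generate is the platform hook invoked from the shared stub
// initialization in stubRoutines.cpp: once early with all == false for the
// initial stubs the interpreter needs, and once after universe init with
// all == true for the rest.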