Thu, 07 Apr 2011 09:53:20 -0700
7009266: G1: assert(obj->is_oop_or_null(true )) failed: Error
Summary: A referent object that is only weakly reachable at the start of concurrent marking but is re-attached to the strongly reachable object graph during marking may not be marked as live. This can cause the reference object to be processed prematurely and leave dangling pointers to the referent object. Implement a read barrier for the java.lang.ref.Reference::referent field by intrinsifying the Reference.get() method, and intercepting accesses through JNI, reflection, and Unsafe, so that when a non-null referent object is read it is also logged in an SATB buffer.
Reviewed-by: kvn, iveresov, never, tonyp, dholmes
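The barrier's effect, sketched below for illustration (this is not code from the changeset; marking_is_active() and satb_enqueue() are hypothetical stand-ins for the real G1 SATB machinery, while obj_field and java_lang_ref_Reference::referent_offset are existing HotSpot accessors):

    oop load_referent(oop reference) {
      oop referent = reference->obj_field(java_lang_ref_Reference::referent_offset);
      if (referent != NULL && marking_is_active()) {
        satb_enqueue(referent);  // log the referent so SATB marking keeps it live
      }
      return referent;
    }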
1 /*
2 * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
25 #include "precompiled.hpp"
26 #include "asm/assembler.hpp"
27 #include "assembler_sparc.inline.hpp"
28 #include "interpreter/interpreter.hpp"
29 #include "nativeInst_sparc.hpp"
30 #include "oops/instanceOop.hpp"
31 #include "oops/methodOop.hpp"
32 #include "oops/objArrayKlass.hpp"
33 #include "oops/oop.inline.hpp"
34 #include "prims/methodHandles.hpp"
35 #include "runtime/frame.inline.hpp"
36 #include "runtime/handles.inline.hpp"
37 #include "runtime/sharedRuntime.hpp"
38 #include "runtime/stubCodeGenerator.hpp"
39 #include "runtime/stubRoutines.hpp"
40 #include "utilities/top.hpp"
41 #ifdef TARGET_OS_FAMILY_linux
42 # include "thread_linux.inline.hpp"
43 #endif
44 #ifdef TARGET_OS_FAMILY_solaris
45 # include "thread_solaris.inline.hpp"
46 #endif
47 #ifdef COMPILER2
48 #include "opto/runtime.hpp"
49 #endif
51 // Declaration and definition of StubGenerator (no .hpp file).
52 // For a more detailed description of the stub routine structure
53 // see the comment in stubRoutines.hpp.
55 #define __ _masm->
57 #ifdef PRODUCT
58 #define BLOCK_COMMENT(str) /* nothing */
59 #else
60 #define BLOCK_COMMENT(str) __ block_comment(str)
61 #endif
63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
65 // Note: The register L7 is used as L7_thread_cache, and may not be used
66 // any other way within this module.
69 static const Register& Lstub_temp = L2;
71 // -------------------------------------------------------------------------------------------------------------------------
72 // Stub Code definitions
74 static address handle_unsafe_access() {
75 JavaThread* thread = JavaThread::current();
76 address pc = thread->saved_exception_pc();
77 address npc = thread->saved_exception_npc();
78 // pc is the address of the instruction which we must emulate;
79 // doing a no-op is fine: return garbage from the load
81 // request an async exception
82 thread->set_pending_unsafe_access_error();
84 // return address of next instruction to execute
85 return npc;
86 }
88 class StubGenerator: public StubCodeGenerator {
89 private:
91 #ifdef PRODUCT
92 #define inc_counter_np(a,b,c) (0)
93 #else
94 #define inc_counter_np(counter, t1, t2) \
95 BLOCK_COMMENT("inc_counter " #counter); \
96 __ inc_counter(&counter, t1, t2);
97 #endif
99 //----------------------------------------------------------------------------------------------------
100 // Call stubs are used to call Java from C
102 address generate_call_stub(address& return_pc) {
103 StubCodeMark mark(this, "StubRoutines", "call_stub");
104 address start = __ pc();
106 // Incoming arguments:
107 //
108 // o0 : call wrapper address
109 // o1 : result (address)
110 // o2 : result type
111 // o3 : method
112 // o4 : (interpreter) entry point
113 // o5 : parameters (address)
114 // [sp + 0x5c]: parameter size (in words)
115 // [sp + 0x60]: thread
116 //
117 // +---------------+ <--- sp + 0
118 // | |
119 // . reg save area .
120 // | |
121 // +---------------+ <--- sp + 0x40
122 // | |
123 // . extra 7 slots .
124 // | |
125 // +---------------+ <--- sp + 0x5c
126 // | param. size |
127 // +---------------+ <--- sp + 0x60
128 // | thread |
129 // +---------------+
130 // | |
132 // note: if the link argument position changes, adjust
133 // the code in frame::entry_frame_call_wrapper()
135 const Argument link = Argument(0, false); // used only for GC
136 const Argument result = Argument(1, false);
137 const Argument result_type = Argument(2, false);
138 const Argument method = Argument(3, false);
139 const Argument entry_point = Argument(4, false);
140 const Argument parameters = Argument(5, false);
141 const Argument parameter_size = Argument(6, false);
142 const Argument thread = Argument(7, false);
144 // setup thread register
145 __ ld_ptr(thread.as_address(), G2_thread);
146 __ reinit_heapbase();
148 #ifdef ASSERT
149 // make sure we have no pending exceptions
150 { const Register t = G3_scratch;
151 Label L;
152 __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
153 __ br_null(t, false, Assembler::pt, L);
154 __ delayed()->nop();
155 __ stop("StubRoutines::call_stub: entered with pending exception");
156 __ bind(L);
157 }
158 #endif
160 // create activation frame & allocate space for parameters
161 { const Register t = G3_scratch;
162 __ ld_ptr(parameter_size.as_address(), t); // get parameter size (in words)
163 __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words)
164 __ round_to(t, WordsPerLong); // make sure it is multiple of 2 (in words)
165 __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes
166 __ neg(t); // negate so it can be used with save
167 __ save(SP, t, SP); // setup new frame
168 }
170 // +---------------+ <--- sp + 0
171 // | |
172 // . reg save area .
173 // | |
174 // +---------------+ <--- sp + 0x40
175 // | |
176 // . extra 7 slots .
177 // | |
178 // +---------------+ <--- sp + 0x5c
179 // | empty slot | (only if parameter size is even)
180 // +---------------+
181 // | |
182 // . parameters .
183 // | |
184 // +---------------+ <--- fp + 0
185 // | |
186 // . reg save area .
187 // | |
188 // +---------------+ <--- fp + 0x40
189 // | |
190 // . extra 7 slots .
191 // | |
192 // +---------------+ <--- fp + 0x5c
193 // | param. size |
194 // +---------------+ <--- fp + 0x60
195 // | thread |
196 // +---------------+
197 // | |
199 // pass parameters if any
200 BLOCK_COMMENT("pass parameters if any");
201 { const Register src = parameters.as_in().as_register();
202 const Register dst = Lentry_args;
203 const Register tmp = G3_scratch;
204 const Register cnt = G4_scratch;
206 // test if any parameters & setup of Lentry_args
207 Label exit;
208 __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter
209 __ add( FP, STACK_BIAS, dst );
210 __ tst(cnt);
211 __ br(Assembler::zero, false, Assembler::pn, exit);
212 __ delayed()->sub(dst, BytesPerWord, dst); // setup Lentry_args
214 // copy parameters if any
215 Label loop;
216 __ BIND(loop);
217 // Store parameter value
218 __ ld_ptr(src, 0, tmp);
219 __ add(src, BytesPerWord, src);
220 __ st_ptr(tmp, dst, 0);
221 __ deccc(cnt);
222 __ br(Assembler::greater, false, Assembler::pt, loop);
223 __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
225 // done
226 __ BIND(exit);
227 }
229 // setup parameters, method & call Java function
230 #ifdef ASSERT
231 // layout_activation_impl checks its notion of saved SP against
232 // this register, so if this changes update it as well.
233 const Register saved_SP = Lscratch;
234 __ mov(SP, saved_SP); // keep track of SP before call
235 #endif
237 // setup parameters
238 const Register t = G3_scratch;
239 __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
240 __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes
241 __ sub(FP, t, Gargs); // setup parameter pointer
242 #ifdef _LP64
243 __ add( Gargs, STACK_BIAS, Gargs ); // Account for LP64 stack bias
244 #endif
245 __ mov(SP, O5_savedSP);
248 // do the call
249 //
250 // the following registers must be set up:
251 //
252 // G2_thread
253 // G5_method
254 // Gargs
255 BLOCK_COMMENT("call Java function");
256 __ jmpl(entry_point.as_in().as_register(), G0, O7);
257 __ delayed()->mov(method.as_in().as_register(), G5_method); // setup method
259 BLOCK_COMMENT("call_stub_return_address:");
260 return_pc = __ pc();
262 // The callee, if it wasn't interpreted, can return with SP changed so
263 // we can no longer assert that SP is unchanged.
265 // store result depending on type
266 // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
267 // is treated as T_INT)
268 { const Register addr = result .as_in().as_register();
269 const Register type = result_type.as_in().as_register();
270 Label is_long, is_float, is_double, is_object, exit;
271 __ cmp(type, T_OBJECT); __ br(Assembler::equal, false, Assembler::pn, is_object);
272 __ delayed()->cmp(type, T_FLOAT); __ br(Assembler::equal, false, Assembler::pn, is_float);
273 __ delayed()->cmp(type, T_DOUBLE); __ br(Assembler::equal, false, Assembler::pn, is_double);
274 __ delayed()->cmp(type, T_LONG); __ br(Assembler::equal, false, Assembler::pn, is_long);
275 __ delayed()->nop();
277 // store int result
278 __ st(O0, addr, G0);
280 __ BIND(exit);
281 __ ret();
282 __ delayed()->restore();
284 __ BIND(is_object);
285 __ ba(false, exit);
286 __ delayed()->st_ptr(O0, addr, G0);
288 __ BIND(is_float);
289 __ ba(false, exit);
290 __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
292 __ BIND(is_double);
293 __ ba(false, exit);
294 __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
296 __ BIND(is_long);
297 #ifdef _LP64
298 __ ba(false, exit);
299 __ delayed()->st_long(O0, addr, G0); // store entire long
300 #else
301 #if defined(COMPILER2)
302 // All return values are where we want them, except for Longs. C2 returns
303 // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
304 // Since the interpreter will return longs in G1 and O0/O1 in the 32-bit
305 // build, we simply always use G1.
306 // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
307 // do this here. Unfortunately if we did a rethrow we'd see a MachEpilog node
308 // first which would move G1 -> O0/O1 and destroy the exception we were throwing.
310 __ ba(false, exit);
311 __ delayed()->stx(G1, addr, G0); // store entire long
312 #else
313 __ st(O1, addr, BytesPerInt);
314 __ ba(false, exit);
315 __ delayed()->st(O0, addr, G0);
316 #endif /* COMPILER2 */
317 #endif /* _LP64 */
318 }
319 return start;
320 }
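// For context, a sketch of how the VM side reaches the code generated above
// (call_stub_sketch_t is a hypothetical name; the real typedef is CallStub in
// stubRoutines.hpp, invoked from JavaCalls):
typedef void (*call_stub_sketch_t)(address        link,            // O0: call wrapper
                                   intptr_t*      result,          // O1
                                   BasicType      result_type,     // O2
                                   methodOopDesc* method,          // O3
                                   address        entry_point,     // O4
                                   intptr_t*      parameters,      // O5
                                   int            parameter_size,  // [sp + 0x5c], in words
                                   Thread*        thread);         // [sp + 0x60]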
323 //----------------------------------------------------------------------------------------------------
324 // Return point for a Java call if there's an exception thrown in Java code.
325 // The exception is caught and transformed into a pending exception stored in
326 // JavaThread that can be tested from within the VM.
327 //
328 // Oexception: exception oop
330 address generate_catch_exception() {
331 StubCodeMark mark(this, "StubRoutines", "catch_exception");
333 address start = __ pc();
334 // verify that thread corresponds
335 __ verify_thread();
337 const Register& temp_reg = Gtemp;
338 Address pending_exception_addr (G2_thread, Thread::pending_exception_offset());
339 Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset ());
340 Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset ());
342 // set pending exception
343 __ verify_oop(Oexception);
344 __ st_ptr(Oexception, pending_exception_addr);
345 __ set((intptr_t)__FILE__, temp_reg);
346 __ st_ptr(temp_reg, exception_file_offset_addr);
347 __ set((intptr_t)__LINE__, temp_reg);
348 __ st(temp_reg, exception_line_offset_addr);
350 // complete return to VM
351 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
353 AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
354 __ jump_to(stub_ret, temp_reg);
355 __ delayed()->nop();
357 return start;
358 }
361 //----------------------------------------------------------------------------------------------------
362 // Continuation point for runtime calls returning with a pending exception
363 // The pending exception check happened in the runtime or native call stub
364 // The pending exception in Thread is converted into a Java-level exception
365 //
366 // Contract with Java-level exception handler: O0 = exception
367 // O1 = throwing pc
369 address generate_forward_exception() {
370 StubCodeMark mark(this, "StubRoutines", "forward_exception");
371 address start = __ pc();
373 // Upon entry, O7 has the return address returning into Java
374 // (interpreted or compiled) code; i.e. the return address
375 // becomes the throwing pc.
377 const Register& handler_reg = Gtemp;
379 Address exception_addr(G2_thread, Thread::pending_exception_offset());
381 #ifdef ASSERT
382 // make sure that this code is only executed if there is a pending exception
383 { Label L;
384 __ ld_ptr(exception_addr, Gtemp);
385 __ br_notnull(Gtemp, false, Assembler::pt, L);
386 __ delayed()->nop();
387 __ stop("StubRoutines::forward exception: no pending exception (1)");
388 __ bind(L);
389 }
390 #endif
392 // compute exception handler into handler_reg
393 __ get_thread();
394 __ ld_ptr(exception_addr, Oexception);
395 __ verify_oop(Oexception);
396 __ save_frame(0); // compensates for compiler weakness
397 __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
398 BLOCK_COMMENT("call exception_handler_for_return_address");
399 __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
400 __ mov(O0, handler_reg);
401 __ restore(); // compensates for compiler weakness
403 __ ld_ptr(exception_addr, Oexception);
404 __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
406 #ifdef ASSERT
407 // make sure exception is set
408 { Label L;
409 __ br_notnull(Oexception, false, Assembler::pt, L);
410 __ delayed()->nop();
411 __ stop("StubRoutines::forward exception: no pending exception (2)");
412 __ bind(L);
413 }
414 #endif
415 // jump to exception handler
416 __ jmp(handler_reg, 0);
417 // clear pending exception
418 __ delayed()->st_ptr(G0, exception_addr);
420 return start;
421 }
424 //------------------------------------------------------------------------------------------------------------------------
425 // Continuation point for throwing of implicit exceptions that are not handled in
426 // the current activation. Fabricates an exception oop and initiates normal
427 // exception dispatching in this frame. Only callee-saved registers are preserved
428 // (through the normal register window / RegisterMap handling).
429 // If the compiler needs all registers to be preserved between the fault
430 // point and the exception handler then it must assume responsibility for that in
431 // AbstractCompiler::continuation_for_implicit_null_exception or
432 // continuation_for_implicit_division_by_zero_exception. All other implicit
433 // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
434 // either at call sites or otherwise assume that stack unwinding will be initiated,
435 // so caller saved registers were assumed volatile in the compiler.
437 // Note that we generate only this stub into a RuntimeStub, because it needs to be
438 // properly traversed and ignored during GC, so we change the meaning of the "__"
439 // macro within this method.
440 #undef __
441 #define __ masm->
443 address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc) {
444 #ifdef ASSERT
445 int insts_size = VerifyThread ? 1 * K : 600;
446 #else
447 int insts_size = VerifyThread ? 1 * K : 256;
448 #endif /* ASSERT */
449 int locs_size = 32;
451 CodeBuffer code(name, insts_size, locs_size);
452 MacroAssembler* masm = new MacroAssembler(&code);
454 __ verify_thread();
456 // This is an inlined and slightly modified version of call_VM
457 // which has the ability to fetch the return PC out of thread-local storage
458 __ assert_not_delayed();
460 // Note that we always push a frame because on the SPARC
461 // architecture, for all of our implicit exception kinds at call
462 // sites, the implicit exception is taken before the callee frame
463 // is pushed.
464 __ save_frame(0);
466 int frame_complete = __ offset();
468 if (restore_saved_exception_pc) {
469 __ ld_ptr(G2_thread, JavaThread::saved_exception_pc_offset(), I7);
470 __ sub(I7, frame::pc_return_offset, I7);
471 }
473 // Note that we always have a runtime stub frame on the top of stack by this point
474 Register last_java_sp = SP;
475 // 64-bit last_java_sp is biased!
476 __ set_last_Java_frame(last_java_sp, G0);
477 if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early
478 __ save_thread(noreg);
479 // do the call
480 BLOCK_COMMENT("call runtime_entry");
481 __ call(runtime_entry, relocInfo::runtime_call_type);
482 if (!VerifyThread)
483 __ delayed()->mov(G2_thread, O0); // pass thread as first argument
484 else
485 __ delayed()->nop(); // (thread already passed)
486 __ restore_thread(noreg);
487 __ reset_last_Java_frame();
489 // check for pending exceptions. use Gtemp as scratch register.
490 #ifdef ASSERT
491 Label L;
493 Address exception_addr(G2_thread, Thread::pending_exception_offset());
494 Register scratch_reg = Gtemp;
495 __ ld_ptr(exception_addr, scratch_reg);
496 __ br_notnull(scratch_reg, false, Assembler::pt, L);
497 __ delayed()->nop();
498 __ should_not_reach_here();
499 __ bind(L);
500 #endif // ASSERT
501 BLOCK_COMMENT("call forward_exception_entry");
502 __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
503 // we use O7 linkage so that forward_exception_entry has the issuing PC
504 __ delayed()->restore();
506 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
507 return stub->entry_point();
508 }
510 #undef __
511 #define __ _masm->
514 // Generate a routine that sets all the registers so we
515 // can tell if the stop routine prints them correctly.
516 address generate_test_stop() {
517 StubCodeMark mark(this, "StubRoutines", "test_stop");
518 address start = __ pc();
520 int i;
522 __ save_frame(0);
524 static jfloat zero = 0.0, one = 1.0;
526 // put addr in L0, then load through L0 to F0
527 __ set((intptr_t)&zero, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F0);
528 __ set((intptr_t)&one, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
530 // use add to put 2..18 in F2..F18
531 for ( i = 2; i <= 18; ++i ) {
532 __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1), as_FloatRegister(i));
533 }
535 // Now put double 2 in F16, double 18 in F18
536 __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
537 __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
539 // use add to put 20..32 in F20..F32
540 for (i = 20; i < 32; i += 2) {
541 __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2), as_FloatRegister(i));
542 }
544 // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
545 for ( i = 0; i < 8; ++i ) {
546 if (i < 6) {
547 __ set( i, as_iRegister(i));
548 __ set(16 + i, as_oRegister(i));
549 __ set(24 + i, as_gRegister(i));
550 }
551 __ set( 8 + i, as_lRegister(i));
552 }
554 __ stop("testing stop");
557 __ ret();
558 __ delayed()->restore();
560 return start;
561 }
564 address generate_stop_subroutine() {
565 StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
566 address start = __ pc();
568 __ stop_subroutine();
570 return start;
571 }
573 address generate_flush_callers_register_windows() {
574 StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
575 address start = __ pc();
577 __ flush_windows();
578 __ retl(false);
579 __ delayed()->add( FP, STACK_BIAS, O0 );
580 // The returned value must be a stack pointer whose register save area
581 // is flushed, and will stay flushed while the caller executes.
583 return start;
584 }
586 // Helper functions for v8 atomic operations.
587 //
588 void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
589 if (mark_oop_reg == noreg) {
590 address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
591 __ set((intptr_t)lock_ptr, lock_ptr_reg);
592 } else {
593 assert(scratch_reg != noreg, "just checking");
594 address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
595 __ set((intptr_t)lock_ptr, lock_ptr_reg);
596 __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
597 __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
598 }
599 }
601 void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
603 get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
604 __ set(StubRoutines::Sparc::locked, lock_reg);
605 // Initialize yield counter
606 __ mov(G0,yield_reg);
608 __ BIND(retry);
609 __ cmp(yield_reg, V8AtomicOperationUnderLockSpinCount);
610 __ br(Assembler::less, false, Assembler::pt, dontyield);
611 __ delayed()->nop();
613 // This code can only be called from inside the VM, this
614 // stub is only invoked from Atomic::add(). We do not
615 // want to use call_VM, because _last_java_sp and such
616 // must already be set.
617 //
618 // Save the regs and make space for a C call
619 __ save(SP, -96, SP);
620 __ save_all_globals_into_locals();
621 BLOCK_COMMENT("call os::naked_sleep");
622 __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
623 __ delayed()->nop();
624 __ restore_globals_from_locals();
625 __ restore();
626 // reset the counter
627 __ mov(G0,yield_reg);
629 __ BIND(dontyield);
631 // try to get lock
632 __ swap(lock_ptr_reg, 0, lock_reg);
634 // did we get the lock?
635 __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
636 __ br(Assembler::notEqual, true, Assembler::pn, retry);
637 __ delayed()->add(yield_reg,1,yield_reg);
639 // yes, got lock. do the operation here.
640 }
642 void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
643 __ st(lock_reg, lock_ptr_reg, 0); // unlock
644 }
646 // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
647 //
648 // Arguments :
649 //
650 // exchange_value: O0
651 // dest: O1
652 //
653 // Results:
654 //
655 // O0: the value previously stored in dest
656 //
657 address generate_atomic_xchg() {
658 StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
659 address start = __ pc();
661 if (UseCASForSwap) {
662 // Use CAS instead of swap, just in case the MP hardware
663 // prefers to work with just one kind of synch. instruction.
664 Label retry;
665 __ BIND(retry);
666 __ mov(O0, O3); // scratch copy of exchange value
667 __ ld(O1, 0, O2); // observe the previous value
668 // try to replace O2 with O3
669 __ cas_under_lock(O1, O2, O3,
670 (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
671 __ cmp(O2, O3);
672 __ br(Assembler::notEqual, false, Assembler::pn, retry);
673 __ delayed()->nop();
675 __ retl(false);
676 __ delayed()->mov(O2, O0); // report previous value to caller
678 } else {
679 if (VM_Version::v9_instructions_work()) {
680 __ retl(false);
681 __ delayed()->swap(O1, 0, O0);
682 } else {
683 const Register& lock_reg = O2;
684 const Register& lock_ptr_reg = O3;
685 const Register& yield_reg = O4;
687 Label retry;
688 Label dontyield;
690 generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
691 // got the lock, do the swap
692 __ swap(O1, 0, O0);
694 generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
695 __ retl(false);
696 __ delayed()->nop();
697 }
698 }
700 return start;
701 }
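// Semantics of the CAS-based path above, as a C++ sketch (illustration only;
// compare_and_set() is a hypothetical stand-in for the cas_under_lock used
// by the generated code):
static bool compare_and_set(volatile jint* dest, jint compare, jint set);  // hypothetical CAS primitive
static jint atomic_xchg_sketch(jint exchange_value, volatile jint* dest) {
  jint observed;
  do {
    observed = *dest;                                    // like the ld into O2
  } while (!compare_and_set(dest, observed, exchange_value));
  return observed;                                       // previous value, as in O0
}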
704 // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
705 //
706 // Arguments :
707 //
708 // exchange_value: O0
709 // dest: O1
710 // compare_value: O2
711 //
712 // Results:
713 //
714 // O0: the value previously stored in dest
715 //
716 // Overwrites (v8): O3,O4,O5
717 //
718 address generate_atomic_cmpxchg() {
719 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
720 address start = __ pc();
722 // cmpxchg(dest, compare_value, exchange_value)
723 __ cas_under_lock(O1, O2, O0,
724 (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
725 __ retl(false);
726 __ delayed()->nop();
728 return start;
729 }
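// What the cas_under_lock above computes, written out as a C++ sketch (the
// stub performs the compare and the conditional store as one atomic step,
// or under the v8 memory-operation lock):
static jint cmpxchg_sketch(jint exchange_value, volatile jint* dest, jint compare_value) {
  jint old = *dest;
  if (old == compare_value) {
    *dest = exchange_value;   // done indivisibly with the read in the stub
  }
  return old;                 // caller tests old == compare_value for success
}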
731 // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
732 //
733 // Arguments :
734 //
735 // exchange_value: O1:O0
736 // dest: O2
737 // compare_value: O4:O3
738 //
739 // Results:
740 //
741 // O1:O0: the value previously stored in dest
742 //
743 // This only works on V9; on V8 we don't generate any
744 // code and just return NULL.
745 //
746 // Overwrites: G1,G2,G3
747 //
748 address generate_atomic_cmpxchg_long() {
749 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
750 address start = __ pc();
752 if (!VM_Version::supports_cx8())
753 return NULL;
754 __ sllx(O0, 32, O0);
755 __ srl(O1, 0, O1);
756 __ or3(O0,O1,O0); // O0 holds 64-bit value from exchange_value
757 __ sllx(O3, 32, O3);
758 __ srl(O4, 0, O4);
759 __ or3(O3,O4,O3); // O3 holds 64-bit value from compare_value
760 __ casx(O2, O3, O0);
761 __ srl(O0, 0, O1); // unpacked return value in O1:O0
762 __ retl(false);
763 __ delayed()->srlx(O0, 32, O0);
765 return start;
766 }
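// The sllx/srl/or3 sequences above just pack a 32-bit register pair into one
// 64-bit register; as a C++ sketch:
static jlong pack_hi_lo_sketch(jint hi, jint lo) {
  return ((jlong)hi << 32) | ((jlong)lo & 0xFFFFFFFFL);  // srl(reg, 0, reg) zero-extends the low half
}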
769 // Support for jint Atomic::add(jint add_value, volatile jint* dest).
770 //
771 // Arguments :
772 //
773 // add_value: O0 (e.g., +1 or -1)
774 // dest: O1
775 //
776 // Results:
777 //
778 // O0: the new value stored in dest
779 //
780 // Overwrites (v9): O3
781 // Overwrites (v8): O3,O4,O5
782 //
783 address generate_atomic_add() {
784 StubCodeMark mark(this, "StubRoutines", "atomic_add");
785 address start = __ pc();
786 __ BIND(_atomic_add_stub);
788 if (VM_Version::v9_instructions_work()) {
789 Label retry;
790 __ BIND(retry);
792 __ lduw(O1, 0, O2);
793 __ add(O0, O2, O3);
794 __ cas(O1, O2, O3);
795 __ cmp( O2, O3);
796 __ br(Assembler::notEqual, false, Assembler::pn, retry);
797 __ delayed()->nop();
798 __ retl(false);
799 __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
800 } else {
801 const Register& lock_reg = O2;
802 const Register& lock_ptr_reg = O3;
803 const Register& value_reg = O4;
804 const Register& yield_reg = O5;
806 Label retry;
807 Label dontyield;
809 generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
810 // got lock, do the increment
811 __ ld(O1, 0, value_reg);
812 __ add(O0, value_reg, value_reg);
813 __ st(value_reg, O1, 0);
815 // %%% only for RMO and PSO
816 __ membar(Assembler::StoreStore);
818 generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
820 __ retl(false);
821 __ delayed()->mov(value_reg, O0);
822 }
824 return start;
825 }
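// Semantics of the v9 path above, as a C++ sketch (compare_and_set() is the
// same hypothetical CAS stand-in declared with the atomic_xchg sketch):
static jint atomic_add_sketch(jint add_value, volatile jint* dest) {
  jint observed, updated;
  do {
    observed = *dest;                  // lduw into O2
    updated  = observed + add_value;   // add into O3
  } while (!compare_and_set(dest, observed, updated));
  return updated;                      // contract: return the new value
}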
826 Label _atomic_add_stub; // called from other stubs
829 //------------------------------------------------------------------------------------------------------------------------
830 // The following routine generates a subroutine to throw an asynchronous
831 // UnknownError when an unsafe access gets a fault that could not be
832 // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.)
833 //
834 // Arguments :
835 //
836 // trapping PC: O7
837 //
838 // Results:
839 // posts an asynchronous exception, skips the trapping instruction
840 //
842 address generate_handler_for_unsafe_access() {
843 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
844 address start = __ pc();
846 const int preserve_register_words = (64 * 2);
847 Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);
849 Register Lthread = L7_thread_cache;
850 int i;
852 __ save_frame(0);
853 __ mov(G1, L1);
854 __ mov(G2, L2);
855 __ mov(G3, L3);
856 __ mov(G4, L4);
857 __ mov(G5, L5);
858 for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
859 __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
860 }
862 address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
863 BLOCK_COMMENT("call handle_unsafe_access");
864 __ call(entry_point, relocInfo::runtime_call_type);
865 __ delayed()->nop();
867 __ mov(L1, G1);
868 __ mov(L2, G2);
869 __ mov(L3, G3);
870 __ mov(L4, G4);
871 __ mov(L5, G5);
872 for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
873 __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
874 }
876 __ verify_thread();
878 __ jmp(O0, 0);
879 __ delayed()->restore();
881 return start;
882 }
885 // Support for uint StubRoutines::Sparc::partial_subtype_check( Klass sub, Klass super );
886 // Arguments :
887 //
888 // ret : O0, returned
889 // icc/xcc: set as O0 (depending on wordSize)
890 // sub : O1, argument, not changed
891 // super: O2, argument, not changed
892 // raddr: O7, blown by call
893 address generate_partial_subtype_check() {
894 __ align(CodeEntryAlignment);
895 StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
896 address start = __ pc();
897 Label miss;
899 #if defined(COMPILER2) && !defined(_LP64)
900 // Do not use a 'save' because it blows the 64-bit O registers.
901 __ add(SP,-4*wordSize,SP); // Make space for 4 temps (stack must be 2 words aligned)
902 __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
903 __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
904 __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
905 __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
906 Register Rret = O0;
907 Register Rsub = O1;
908 Register Rsuper = O2;
909 #else
910 __ save_frame(0);
911 Register Rret = I0;
912 Register Rsub = I1;
913 Register Rsuper = I2;
914 #endif
916 Register L0_ary_len = L0;
917 Register L1_ary_ptr = L1;
918 Register L2_super = L2;
919 Register L3_index = L3;
921 __ check_klass_subtype_slow_path(Rsub, Rsuper,
922 L0, L1, L2, L3,
923 NULL, &miss);
925 // Match falls through here.
926 __ addcc(G0,0,Rret); // set Z flags, Z result
928 #if defined(COMPILER2) && !defined(_LP64)
929 __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
930 __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
931 __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
932 __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
933 __ retl(); // Result in Rret is zero; flags set to Z
934 __ delayed()->add(SP,4*wordSize,SP);
935 #else
936 __ ret(); // Result in Rret is zero; flags set to Z
937 __ delayed()->restore();
938 #endif
940 __ BIND(miss);
941 __ addcc(G0,1,Rret); // set NZ flags, NZ result
943 #if defined(COMPILER2) && !defined(_LP64)
944 __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
945 __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
946 __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
947 __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
948 __ retl(); // Result in Rret is != 0; flags set to NZ
949 __ delayed()->add(SP,4*wordSize,SP);
950 #else
951 __ ret(); // Result in Rret is != 0; flags set to NZ
952 __ delayed()->restore();
953 #endif
955 return start;
956 }
959 // Called from MacroAssembler::verify_oop
960 //
961 address generate_verify_oop_subroutine() {
962 StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
964 address start = __ pc();
966 __ verify_oop_subroutine();
968 return start;
969 }
972 //
973 // Verify that a register contains a clean 32-bit positive value
974 // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
975 //
976 // Input:
977 // Rint - 32-bits value
978 // Rtmp - scratch
979 //
980 void assert_clean_int(Register Rint, Register Rtmp) {
981 #if defined(ASSERT) && defined(_LP64)
982 __ signx(Rint, Rtmp);
983 __ cmp(Rint, Rtmp);
984 __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
985 #endif
986 }
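// In other words, as a C++ sketch: the trap fires unless sign-extending the
// low 32 bits reproduces the whole register, which for the positive counts
// used here means the high 32 bits are zero.
static bool is_clean_int_sketch(jlong x) {
  return (jlong)(jint)x == x;   // what the signx + cmp + breakpoint_trap verify
}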
988 //
989 // Generate overlap test for array copy stubs
990 //
991 // Input:
992 // O0 - array1
993 // O1 - array2
994 // O2 - element count
995 //
996 // Kills temps: O3, O4
997 //
998 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
999 assert(no_overlap_target != NULL, "must be generated");
1000 array_overlap_test(no_overlap_target, NULL, log2_elem_size);
1001 }
1002 void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
1003 array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
1004 }
1005 void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
1006 const Register from = O0;
1007 const Register to = O1;
1008 const Register count = O2;
1009 const Register to_from = O3; // to - from
1010 const Register byte_count = O4; // count << log2_elem_size
1012 __ subcc(to, from, to_from);
1013 __ sll_ptr(count, log2_elem_size, byte_count);
1014 if (NOLp == NULL)
1015 __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
1016 else
1017 __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
1018 __ delayed()->cmp(to_from, byte_count);
1019 if (NOLp == NULL)
1020 __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
1021 else
1022 __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
1023 __ delayed()->nop();
1024 }
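// The two branches above implement this predicate, sketched in C++: a
// forward (disjoint-style) copy is safe when 'to' does not start inside
// the source range.
static bool no_overlap_sketch(uintptr_t from, uintptr_t to, uintptr_t byte_count) {
  return to <= from                     // lessEqualUnsigned on (to - from)
      || to - from >= byte_count;       // greaterEqualUnsigned vs byte_count
}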
1026 //
1027 // Generate pre-write barrier for array.
1028 //
1029 // Input:
1030 // addr - register containing starting address
1031 // count - register containing element count
1032 // tmp - scratch register
1033 //
1034 // The input registers are overwritten.
1035 //
1036 void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
1037 BarrierSet* bs = Universe::heap()->barrier_set();
1038 switch (bs->kind()) {
1039 case BarrierSet::G1SATBCT:
1040 case BarrierSet::G1SATBCTLogging:
1041 // With G1, don't generate the call if we statically know that the target is uninitialized
1042 if (!dest_uninitialized) {
1043 __ save_frame(0);
1044 // Save the necessary global regs... will be used after.
1045 if (addr->is_global()) {
1046 __ mov(addr, L0);
1047 }
1048 if (count->is_global()) {
1049 __ mov(count, L1);
1050 }
1051 __ mov(addr->after_save(), O0);
1052 // Get the count into O1
1053 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
1054 __ delayed()->mov(count->after_save(), O1);
1055 if (addr->is_global()) {
1056 __ mov(L0, addr);
1057 }
1058 if (count->is_global()) {
1059 __ mov(L1, count);
1060 }
1061 __ restore();
1062 }
1063 break;
1064 case BarrierSet::CardTableModRef:
1065 case BarrierSet::CardTableExtension:
1066 case BarrierSet::ModRef:
1067 break;
1068 default:
1069 ShouldNotReachHere();
1070 }
1071 }
1072 //
1073 // Generate post-write barrier for array.
1074 //
1075 // Input:
1076 // addr - register containing starting address
1077 // count - register containing element count
1078 // tmp - scratch register
1079 //
1080 // The input registers are overwritten.
1081 //
1082 void gen_write_ref_array_post_barrier(Register addr, Register count,
1083 Register tmp) {
1084 BarrierSet* bs = Universe::heap()->barrier_set();
1086 switch (bs->kind()) {
1087 case BarrierSet::G1SATBCT:
1088 case BarrierSet::G1SATBCTLogging:
1089 {
1090 // Get some new fresh output registers.
1091 __ save_frame(0);
1092 __ mov(addr->after_save(), O0);
1093 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
1094 __ delayed()->mov(count->after_save(), O1);
1095 __ restore();
1096 }
1097 break;
1098 case BarrierSet::CardTableModRef:
1099 case BarrierSet::CardTableExtension:
1100 {
1101 CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1102 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1103 assert_different_registers(addr, count, tmp);
1105 Label L_loop;
1107 __ sll_ptr(count, LogBytesPerHeapOop, count);
1108 __ sub(count, BytesPerHeapOop, count);
1109 __ add(count, addr, count);
1110 // Use two shifts to clear out the low order bits! (Cannot optimize them into one shift.)
1111 __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
1112 __ srl_ptr(count, CardTableModRefBS::card_shift, count);
1113 __ sub(count, addr, count);
1114 AddressLiteral rs(ct->byte_map_base);
1115 __ set(rs, tmp);
1116 __ BIND(L_loop);
1117 __ stb(G0, tmp, addr);
1118 __ subcc(count, 1, count);
1119 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1120 __ delayed()->add(addr, 1, addr);
1121 }
1122 break;
1123 case BarrierSet::ModRef:
1124 break;
1125 default:
1126 ShouldNotReachHere();
1127 }
1128 }
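// The card-table arm above is equivalent to this C++ sketch: mark every card
// spanned by the stored-to range as dirty (0, hence the stb of G0):
static void dirty_cards_sketch(jbyte* byte_map_base, uintptr_t addr, uintptr_t byte_count, int card_shift) {
  uintptr_t first_card = addr >> card_shift;
  uintptr_t last_card  = (addr + byte_count - 1) >> card_shift;
  for (uintptr_t card = first_card; card <= last_card; card++) {
    byte_map_base[card] = 0;   // dirty
  }
}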
1131 // Copy big chunks forward with shift
1132 //
1133 // Inputs:
1134 // from - source array
1135 // to - destination array aligned to 8 bytes
1136 // count - element count to copy, at least the equivalent of 16 bytes
1137 // count_dec - decrement of 'count' equivalent to 16 bytes
1138 // L_copy_bytes - copy exit label
1139 //
1140 void copy_16_bytes_forward_with_shift(Register from, Register to,
1141 Register count, int count_dec, Label& L_copy_bytes) {
1142 Label L_loop, L_aligned_copy, L_copy_last_bytes;
1144 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1145 __ andcc(from, 7, G1); // misaligned bytes
1146 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1147 __ delayed()->nop();
1149 const Register left_shift = G1; // left shift bit counter
1150 const Register right_shift = G5; // right shift bit counter
1152 __ sll(G1, LogBitsPerByte, left_shift);
1153 __ mov(64, right_shift);
1154 __ sub(right_shift, left_shift, right_shift);
1156 //
1157 // Load 2 aligned 8-bytes chunks and use one from previous iteration
1158 // to form 2 aligned 8-bytes chunks to store.
1159 //
1160 __ deccc(count, count_dec); // Pre-decrement 'count'
1161 __ andn(from, 7, from); // Align address
1162 __ ldx(from, 0, O3);
1163 __ inc(from, 8);
1164 __ align(OptoLoopAlignment);
1165 __ BIND(L_loop);
1166 __ ldx(from, 0, O4);
1167 __ deccc(count, count_dec); // Can we do next iteration after this one?
1168 __ ldx(from, 8, G4);
1169 __ inc(to, 16);
1170 __ inc(from, 16);
1171 __ sllx(O3, left_shift, O3);
1172 __ srlx(O4, right_shift, G3);
1173 __ bset(G3, O3);
1174 __ stx(O3, to, -16);
1175 __ sllx(O4, left_shift, O4);
1176 __ srlx(G4, right_shift, G3);
1177 __ bset(G3, O4);
1178 __ stx(O4, to, -8);
1179 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1180 __ delayed()->mov(G4, O3);
1182 __ inccc(count, count_dec>>1 ); // + 8 bytes
1183 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1184 __ delayed()->inc(count, count_dec>>1); // restore 'count'
1186 // copy 8 bytes, part of them already loaded in O3
1187 __ ldx(from, 0, O4);
1188 __ inc(to, 8);
1189 __ inc(from, 8);
1190 __ sllx(O3, left_shift, O3);
1191 __ srlx(O4, right_shift, G3);
1192 __ bset(O3, G3);
1193 __ stx(G3, to, -8);
1195 __ BIND(L_copy_last_bytes);
1196 __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1197 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1198 __ delayed()->sub(from, right_shift, from); // restore address
1200 __ BIND(L_aligned_copy);
1201 }
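// Each store in the loop above merges two neighbouring aligned 8-byte words,
// as in this C++ sketch (left_shift is the source misalignment in bits,
// 0 < left_shift < 64, and right_shift == 64 - left_shift):
static julong merge_sketch(julong prev, julong next, int left_shift) {
  return (prev << left_shift) | (next >> (64 - left_shift));
}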
1203 // Copy big chunks backward with shift
1204 //
1205 // Inputs:
1206 // end_from - source array end address
1207 // end_to - destination array end address, aligned to 8 bytes
1208 // count - element count to copy, at least the equivalent of 16 bytes
1209 // count_dec - decrement of 'count' equivalent to 16 bytes
1210 // L_aligned_copy - aligned copy exit label
1211 // L_copy_bytes - copy exit label
1212 //
1213 void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1214 Register count, int count_dec,
1215 Label& L_aligned_copy, Label& L_copy_bytes) {
1216 Label L_loop, L_copy_last_bytes;
1218 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1219 __ andcc(end_from, 7, G1); // misaligned bytes
1220 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1221 __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1223 const Register left_shift = G1; // left shift bit counter
1224 const Register right_shift = G5; // right shift bit counter
1226 __ sll(G1, LogBitsPerByte, left_shift);
1227 __ mov(64, right_shift);
1228 __ sub(right_shift, left_shift, right_shift);
1230 //
1231 // Load 2 aligned 8-bytes chunks and use one from previous iteration
1232 // to form 2 aligned 8-bytes chunks to store.
1233 //
1234 __ andn(end_from, 7, end_from); // Align address
1235 __ ldx(end_from, 0, O3);
1236 __ align(OptoLoopAlignment);
1237 __ BIND(L_loop);
1238 __ ldx(end_from, -8, O4);
1239 __ deccc(count, count_dec); // Can we do next iteration after this one?
1240 __ ldx(end_from, -16, G4);
1241 __ dec(end_to, 16);
1242 __ dec(end_from, 16);
1243 __ srlx(O3, right_shift, O3);
1244 __ sllx(O4, left_shift, G3);
1245 __ bset(G3, O3);
1246 __ stx(O3, end_to, 8);
1247 __ srlx(O4, right_shift, O4);
1248 __ sllx(G4, left_shift, G3);
1249 __ bset(G3, O4);
1250 __ stx(O4, end_to, 0);
1251 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1252 __ delayed()->mov(G4, O3);
1254 __ inccc(count, count_dec>>1 ); // + 8 bytes
1255 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1256 __ delayed()->inc(count, count_dec>>1); // restore 'count'
1258 // copy 8 bytes, part of them already loaded in O3
1259 __ ldx(end_from, -8, O4);
1260 __ dec(end_to, 8);
1261 __ dec(end_from, 8);
1262 __ srlx(O3, right_shift, O3);
1263 __ sllx(O4, left_shift, G3);
1264 __ bset(O3, G3);
1265 __ stx(G3, end_to, 0);
1267 __ BIND(L_copy_last_bytes);
1268 __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes
1269 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1270 __ delayed()->add(end_from, left_shift, end_from); // restore address
1271 }
1273 //
1274 // Generate stub for disjoint byte copy. If "aligned" is true, the
1275 // "from" and "to" addresses are assumed to be heapword aligned.
1276 //
1277 // Arguments for generated stub:
1278 // from: O0
1279 // to: O1
1280 // count: O2 treated as signed
1281 //
1282 address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1283 __ align(CodeEntryAlignment);
1284 StubCodeMark mark(this, "StubRoutines", name);
1285 address start = __ pc();
1287 Label L_skip_alignment, L_align;
1288 Label L_copy_byte, L_copy_byte_loop, L_exit;
1290 const Register from = O0; // source array address
1291 const Register to = O1; // destination array address
1292 const Register count = O2; // elements count
1293 const Register offset = O5; // offset from start of arrays
1294 // O3, O4, G3, G4 are used as temp registers
1296 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1298 if (entry != NULL) {
1299 *entry = __ pc();
1300 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1301 BLOCK_COMMENT("Entry:");
1302 }
1304 // for short arrays, just do single element copy
1305 __ cmp(count, 23); // 16 + 7
1306 __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1307 __ delayed()->mov(G0, offset);
1309 if (aligned) {
1310 // 'aligned' == true when it is known statically during compilation
1311 // of this arraycopy call site that both 'from' and 'to' addresses
1312 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1313 //
1314 // Aligned arrays are 4-byte aligned in the 32-bit VM
1315 // and 8-byte aligned in the 64-bit VM, so we do this only in the 32-bit VM.
1316 //
1317 #ifndef _LP64
1318 // copy a 4-bytes word if necessary to align 'to' to 8 bytes
1319 __ andcc(to, 7, G0);
1320 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
1321 __ delayed()->ld(from, 0, O3);
1322 __ inc(from, 4);
1323 __ inc(to, 4);
1324 __ dec(count, 4);
1325 __ st(O3, to, -4);
1326 __ BIND(L_skip_alignment);
1327 #endif
1328 } else {
1329 // copy bytes to align 'to' on 8 byte boundary
1330 __ andcc(to, 7, G1); // misaligned bytes
1331 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1332 __ delayed()->neg(G1);
1333 __ inc(G1, 8); // bytes need to copy to next 8-bytes alignment
1334 __ sub(count, G1, count);
1335 __ BIND(L_align);
1336 __ ldub(from, 0, O3);
1337 __ deccc(G1);
1338 __ inc(from);
1339 __ stb(O3, to, 0);
1340 __ br(Assembler::notZero, false, Assembler::pt, L_align);
1341 __ delayed()->inc(to);
1342 __ BIND(L_skip_alignment);
1343 }
1344 #ifdef _LP64
1345 if (!aligned)
1346 #endif
1347 {
1348 // Copy with shift 16 bytes per iteration if arrays do not have
1349 // the same alignment mod 8, otherwise fall through to the next
1350 // code for aligned copy.
1351 // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1352 // Also jump over aligned copy after the copy with shift completed.
1354 copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
1355 }
1357 // Both arrays are 8-byte aligned; copy 16 bytes at a time
1358 __ and3(count, 7, G4); // Save count
1359 __ srl(count, 3, count);
1360 generate_disjoint_long_copy_core(aligned);
1361 __ mov(G4, count); // Restore count
1363 // copy trailing bytes
1364 __ BIND(L_copy_byte);
1365 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1366 __ delayed()->nop();
1367 __ align(OptoLoopAlignment);
1368 __ BIND(L_copy_byte_loop);
1369 __ ldub(from, offset, O3);
1370 __ deccc(count);
1371 __ stb(O3, to, offset);
1372 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1373 __ delayed()->inc(offset);
1375 __ BIND(L_exit);
1376 // O3, O4 are used as temp registers
1377 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1378 __ retl();
1379 __ delayed()->mov(G0, O0); // return 0
1380 return start;
1381 }
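// Functional specification of the stub above, as a C++ sketch; the generated
// code reaches the same effect via aligned 8- and 16-byte moves:
static void disjoint_byte_copy_sketch(const jbyte* from, jbyte* to, int count) {
  for (int i = 0; i < count; i++) {
    to[i] = from[i];
  }
}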
1383 //
1384 // Generate stub for conjoint byte copy. If "aligned" is true, the
1385 // "from" and "to" addresses are assumed to be heapword aligned.
1386 //
1387 // Arguments for generated stub:
1388 // from: O0
1389 // to: O1
1390 // count: O2 treated as signed
1391 //
1392 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1393 address *entry, const char *name) {
1394 // Do reverse copy.
1396 __ align(CodeEntryAlignment);
1397 StubCodeMark mark(this, "StubRoutines", name);
1398 address start = __ pc();
1400 Label L_skip_alignment, L_align, L_aligned_copy;
1401 Label L_copy_byte, L_copy_byte_loop, L_exit;
1403 const Register from = O0; // source array address
1404 const Register to = O1; // destination array address
1405 const Register count = O2; // elements count
1406 const Register end_from = from; // source array end address
1407 const Register end_to = to; // destination array end address
1409 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1411 if (entry != NULL) {
1412 *entry = __ pc();
1413 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1414 BLOCK_COMMENT("Entry:");
1415 }
1417 array_overlap_test(nooverlap_target, 0);
1419 __ add(to, count, end_to); // offset after last copied element
1421 // for short arrays, just do single element copy
1422 __ cmp(count, 23); // 16 + 7
1423 __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1424 __ delayed()->add(from, count, end_from);
1426 {
1427 // Align the ends of the arrays since they may not be aligned even
1428 // when the arrays themselves are aligned.
1430 // copy bytes to align 'end_to' on 8 byte boundary
1431 __ andcc(end_to, 7, G1); // misaligned bytes
1432 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1433 __ delayed()->nop();
1434 __ sub(count, G1, count);
1435 __ BIND(L_align);
1436 __ dec(end_from);
1437 __ dec(end_to);
1438 __ ldub(end_from, 0, O3);
1439 __ deccc(G1);
1440 __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1441 __ delayed()->stb(O3, end_to, 0);
1442 __ BIND(L_skip_alignment);
1443 }
1444 #ifdef _LP64
1445 if (aligned) {
1446 // Both arrays are aligned to 8-bytes in 64-bits VM.
1447 // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1448 // in unaligned case.
1449 __ dec(count, 16);
1450 } else
1451 #endif
1452 {
1453 // Copy with shift 16 bytes per iteration if arrays do not have
1454 // the same alignment mod 8, otherwise jump to the next
1455 // code for aligned copy (after subtracting 16 from 'count').
1456 // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1457 // Also jump over aligned copy after the copy with shift completed.
1459 copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1460 L_aligned_copy, L_copy_byte);
1461 }
1462 // copy 4 elements (16 bytes) at a time
1463 __ align(OptoLoopAlignment);
1464 __ BIND(L_aligned_copy);
1465 __ dec(end_from, 16);
1466 __ ldx(end_from, 8, O3);
1467 __ ldx(end_from, 0, O4);
1468 __ dec(end_to, 16);
1469 __ deccc(count, 16);
1470 __ stx(O3, end_to, 8);
1471 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1472 __ delayed()->stx(O4, end_to, 0);
1473 __ inc(count, 16);
1475 // copy 1 element (1 byte) at a time
1476 __ BIND(L_copy_byte);
1477 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1478 __ delayed()->nop();
1479 __ align(OptoLoopAlignment);
1480 __ BIND(L_copy_byte_loop);
1481 __ dec(end_from);
1482 __ dec(end_to);
1483 __ ldub(end_from, 0, O4);
1484 __ deccc(count);
1485 __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1486 __ delayed()->stb(O4, end_to, 0);
1488 __ BIND(L_exit);
1489 // O3, O4 are used as temp registers
1490 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1491 __ retl();
1492 __ delayed()->mov(G0, O0); // return 0
1493 return start;
1494 }
1496 //
1497 // Generate stub for disjoint short copy. If "aligned" is true, the
1498 // "from" and "to" addresses are assumed to be heapword aligned.
1499 //
1500 // Arguments for generated stub:
1501 // from: O0
1502 // to: O1
1503 // count: O2 treated as signed
1504 //
1505 address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1506 __ align(CodeEntryAlignment);
1507 StubCodeMark mark(this, "StubRoutines", name);
1508 address start = __ pc();
1510 Label L_skip_alignment, L_skip_alignment2;
1511 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1513 const Register from = O0; // source array address
1514 const Register to = O1; // destination array address
1515 const Register count = O2; // elements count
1516 const Register offset = O5; // offset from start of arrays
1517 // O3, O4, G3, G4 are used as temp registers
1519 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1521 if (entry != NULL) {
1522 *entry = __ pc();
1523 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1524 BLOCK_COMMENT("Entry:");
1525 }
1527 // for short arrays, just do single element copy
1528 __ cmp(count, 11); // 8 + 3 (22 bytes)
1529 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1530 __ delayed()->mov(G0, offset);
1532 if (aligned) {
1533 // 'aligned' == true when it is known statically during compilation
1534 // of this arraycopy call site that both 'from' and 'to' addresses
1535 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1536 //
1537 // Aligned arrays are 4-byte aligned in the 32-bit VM
1538 // and 8-byte aligned in the 64-bit VM.
1539 //
1540 #ifndef _LP64
1541 // copy a 2-elements word if necessary to align 'to' to 8 bytes
1542 __ andcc(to, 7, G0);
1543 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1544 __ delayed()->ld(from, 0, O3);
1545 __ inc(from, 4);
1546 __ inc(to, 4);
1547 __ dec(count, 2);
1548 __ st(O3, to, -4);
1549 __ BIND(L_skip_alignment);
1550 #endif
1551 } else {
1552 // copy 1 element if necessary to align 'to' on a 4-byte boundary
1553 __ andcc(to, 3, G0);
1554 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1555 __ delayed()->lduh(from, 0, O3);
1556 __ inc(from, 2);
1557 __ inc(to, 2);
1558 __ dec(count);
1559 __ sth(O3, to, -2);
1560 __ BIND(L_skip_alignment);
1562 // copy 2 elements to align 'to' on an 8-byte boundary
1563 __ andcc(to, 7, G0);
1564 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1565 __ delayed()->lduh(from, 0, O3);
1566 __ dec(count, 2);
1567 __ lduh(from, 2, O4);
1568 __ inc(from, 4);
1569 __ inc(to, 4);
1570 __ sth(O3, to, -4);
1571 __ sth(O4, to, -2);
1572 __ BIND(L_skip_alignment2);
1573 }
1574 #ifdef _LP64
1575 if (!aligned)
1576 #endif
1577 {
1578 // Copy with shift 16 bytes per iteration if arrays do not have
1579 // the same alignment mod 8, otherwise fall through to the next
1580 // code for aligned copy.
1581 // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1582 // Also jump over aligned copy after the copy with shift completed.
1584 copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
1585 }
1587 // Both arrays are 8-byte aligned; copy 16 bytes at a time
1588 __ and3(count, 3, G4); // Save
1589 __ srl(count, 2, count);
1590 generate_disjoint_long_copy_core(aligned);
1591 __ mov(G4, count); // restore
1593 // copy 1 element at a time
1594 __ BIND(L_copy_2_bytes);
1595 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1596 __ delayed()->nop();
1597 __ align(OptoLoopAlignment);
1598 __ BIND(L_copy_2_bytes_loop);
1599 __ lduh(from, offset, O3);
1600 __ deccc(count);
1601 __ sth(O3, to, offset);
1602 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1603 __ delayed()->inc(offset, 2);
1605 __ BIND(L_exit);
1606 // O3, O4 are used as temp registers
1607 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1608 __ retl();
1609 __ delayed()->mov(G0, O0); // return 0
1610 return start;
1611 }
1613 //
1614 // Generate stub for array fill. If "aligned" is true, the
1615 // "to" address is assumed to be heapword aligned.
1616 //
1617 // Arguments for generated stub:
1618 // to: O0
1619 // value: O1
1620 // count: O2 treated as signed
1621 //
1622 address generate_fill(BasicType t, bool aligned, const char* name) {
1623 __ align(CodeEntryAlignment);
1624 StubCodeMark mark(this, "StubRoutines", name);
1625 address start = __ pc();
1627 const Register to = O0; // destination array address
1628 const Register value = O1; // fill value
1629 const Register count = O2; // elements count
1630 // O3 is used as a temp register
1632 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1634 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1635 Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1637 int shift = -1;
1638 switch (t) {
1639 case T_BYTE:
1640 shift = 2;
1641 break;
1642 case T_SHORT:
1643 shift = 1;
1644 break;
1645 case T_INT:
1646 shift = 0;
1647 break;
1648 default: ShouldNotReachHere();
1649 }
1651 BLOCK_COMMENT("Entry:");
1653 if (t == T_BYTE) {
1654 // Zero extend value
1655 __ and3(value, 0xff, value);
1656 __ sllx(value, 8, O3);
1657 __ or3(value, O3, value);
1658 }
1659 if (t == T_SHORT) {
1660 // Zero extend value
1661 __ sllx(value, 48, value);
1662 __ srlx(value, 48, value);
1663 }
1664 if (t == T_BYTE || t == T_SHORT) {
1665 __ sllx(value, 16, O3);
1666 __ or3(value, O3, value);
1667 }
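// The masking and or3 steps above broadcast the fill value, as in this C++
// sketch for T_BYTE (a later sllx(value, 32)/or3 pair widens it to 64 bits):
static juint broadcast_byte_sketch(juint value) {
  juint v = value & 0xff;   // and3(value, 0xff, value)
  v |= v << 8;              // now 0x0000ABAB
  v |= v << 16;             // now 0xABABABAB
  return v;
}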
1669 __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1670 __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1671 __ delayed()->andcc(count, 1, G0);
1673 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1674 // align destination address at a 4-byte boundary
1675 if (t == T_BYTE) {
1676 // One byte misalignment happens only for byte arrays
1677 __ andcc(to, 1, G0);
1678 __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1679 __ delayed()->nop();
1680 __ stb(value, to, 0);
1681 __ inc(to, 1);
1682 __ dec(count, 1);
1683 __ BIND(L_skip_align1);
1684 }
1685 // Two bytes misalignment happens only for byte and short (char) arrays
1686 __ andcc(to, 2, G0);
1687 __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1688 __ delayed()->nop();
1689 __ sth(value, to, 0);
1690 __ inc(to, 2);
1691 __ dec(count, 1 << (shift - 1));
1692 __ BIND(L_skip_align2);
1693 }
1694 #ifdef _LP64
1695 if (!aligned) {
1696 #endif
1697 // align to 8 bytes, we know we are 4 byte aligned to start
1698 __ andcc(to, 7, G0);
1699 __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1700 __ delayed()->nop();
1701 __ stw(value, to, 0);
1702 __ inc(to, 4);
1703 __ dec(count, 1 << shift);
1704 __ BIND(L_fill_32_bytes);
1705 #ifdef _LP64
1706 }
1707 #endif
1709 if (t == T_INT) {
1710 // Zero extend value
1711 __ srl(value, 0, value);
1712 }
1713 if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1714 __ sllx(value, 32, O3);
1715 __ or3(value, O3, value);
1716 }
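// At this point 'value' holds the fill pattern replicated across all 64
// bits. A C sketch of the widening above (illustrative, for an 8-bit
// fill value v):
//   v &= 0xff;      // 000000vv
//   v |= v << 8;    // 0000vvvv (16 bits)
//   v |= v << 16;   // vvvvvvvv (32 bits)
//   v |= v << 32;   // pattern in all 64 bits
// so each 8-byte stx below stores eight copies of the byte at once.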
1718 Label L_check_fill_8_bytes;
1719 // Fill 32-byte chunks
1720 __ subcc(count, 8 << shift, count);
1721 __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1722 __ delayed()->nop();
1724 Label L_fill_32_bytes_loop, L_fill_4_bytes;
1725 __ align(16);
1726 __ BIND(L_fill_32_bytes_loop);
1728 __ stx(value, to, 0);
1729 __ stx(value, to, 8);
1730 __ stx(value, to, 16);
1731 __ stx(value, to, 24);
1733 __ subcc(count, 8 << shift, count);
1734 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1735 __ delayed()->add(to, 32, to);
1737 __ BIND(L_check_fill_8_bytes);
1738 __ addcc(count, 8 << shift, count);
1739 __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1740 __ delayed()->subcc(count, 1 << (shift + 1), count);
1741 __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1742 __ delayed()->andcc(count, 1<<shift, G0);
1744 //
1745 // length is too short, just fill 8 bytes at a time
1746 //
1747 Label L_fill_8_bytes_loop;
1748 __ BIND(L_fill_8_bytes_loop);
1749 __ stx(value, to, 0);
1750 __ subcc(count, 1 << (shift + 1), count);
1751 __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1752 __ delayed()->add(to, 8, to);
1754 // fill trailing 4 bytes
1755 __ andcc(count, 1<<shift, G0); // in delay slot of branches
1756 if (t == T_INT) {
1757 __ BIND(L_fill_elements);
1758 }
1759 __ BIND(L_fill_4_bytes);
1760 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1761 if (t == T_BYTE || t == T_SHORT) {
1762 __ delayed()->andcc(count, 1<<(shift-1), G0);
1763 } else {
1764 __ delayed()->nop();
1765 }
1766 __ stw(value, to, 0);
1767 if (t == T_BYTE || t == T_SHORT) {
1768 __ inc(to, 4);
1769 // fill trailing 2 bytes
1770 __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1771 __ BIND(L_fill_2_bytes);
1772 __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1773 __ delayed()->andcc(count, 1, count);
1774 __ sth(value, to, 0);
1775 if (t == T_BYTE) {
1776 __ inc(to, 2);
1777 // fill trailing byte
1778 __ andcc(count, 1, count); // in delay slot of branches
1779 __ BIND(L_fill_byte);
1780 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1781 __ delayed()->nop();
1782 __ stb(value, to, 0);
1783 } else {
1784 __ BIND(L_fill_byte);
1785 }
1786 } else {
1787 __ BIND(L_fill_2_bytes);
1788 }
1789 __ BIND(L_exit);
1790 __ retl();
1791 __ delayed()->nop();
1793 // Handle fills of less than 8 bytes. Int is handled elsewhere.
1794 if (t == T_BYTE) {
1795 __ BIND(L_fill_elements);
1796 Label L_fill_2, L_fill_4;
1797 // in delay slot __ andcc(count, 1, G0);
1798 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1799 __ delayed()->andcc(count, 2, G0);
1800 __ stb(value, to, 0);
1801 __ inc(to, 1);
1802 __ BIND(L_fill_2);
1803 __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1804 __ delayed()->andcc(count, 4, G0);
1805 __ stb(value, to, 0);
1806 __ stb(value, to, 1);
1807 __ inc(to, 2);
1808 __ BIND(L_fill_4);
1809 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1810 __ delayed()->nop();
1811 __ stb(value, to, 0);
1812 __ stb(value, to, 1);
1813 __ stb(value, to, 2);
1814 __ retl();
1815 __ delayed()->stb(value, to, 3);
1816 }
1818 if (t == T_SHORT) {
1819 Label L_fill_2;
1820 __ BIND(L_fill_elements);
1821 // in delay slot __ andcc(count, 1, G0);
1822 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1823 __ delayed()->andcc(count, 2, G0);
1824 __ sth(value, to, 0);
1825 __ inc(to, 2);
1826 __ BIND(L_fill_2);
1827 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1828 __ delayed()->nop();
1829 __ sth(value, to, 0);
1830 __ retl();
1831 __ delayed()->sth(value, to, 2);
1832 }
1833 return start;
1834 }
1836 //
1837 // Generate stub for conjoint short copy. If "aligned" is true, the
1838 // "from" and "to" addresses are assumed to be heapword aligned.
1839 //
1840 // Arguments for generated stub:
1841 // from: O0
1842 // to: O1
1843 // count: O2 treated as signed
1844 //
1845 address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1846 address *entry, const char *name) {
1847 // Do reverse copy.
1849 __ align(CodeEntryAlignment);
1850 StubCodeMark mark(this, "StubRoutines", name);
1851 address start = __ pc();
1853 Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1854 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1856 const Register from = O0; // source array address
1857 const Register to = O1; // destination array address
1858 const Register count = O2; // elements count
1859 const Register end_from = from; // source array end address
1860 const Register end_to = to; // destination array end address
1862 const Register byte_count = O3; // bytes count to copy
1864 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1866 if (entry != NULL) {
1867 *entry = __ pc();
1868 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1869 BLOCK_COMMENT("Entry:");
1870 }
1872 array_overlap_test(nooverlap_target, 1);
1874 __ sllx(count, LogBytesPerShort, byte_count);
1875 __ add(to, byte_count, end_to); // offset after last copied element
1877 // for short arrays, just do single element copy
1878 __ cmp(count, 11); // 8 + 3 (22 bytes)
1879 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1880 __ delayed()->add(from, byte_count, end_from);
1882 {
1883 // Align the ends of the arrays since they could be unaligned even
1884 // when the arrays themselves are aligned.
1886 // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
1887 __ andcc(end_to, 3, G0);
1888 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1889 __ delayed()->lduh(end_from, -2, O3);
1890 __ dec(end_from, 2);
1891 __ dec(end_to, 2);
1892 __ dec(count);
1893 __ sth(O3, end_to, 0);
1894 __ BIND(L_skip_alignment);
1896 // copy 2 elements to align 'end_to' on an 8 byte boundary
1897 __ andcc(end_to, 7, G0);
1898 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1899 __ delayed()->lduh(end_from, -2, O3);
1900 __ dec(count, 2);
1901 __ lduh(end_from, -4, O4);
1902 __ dec(end_from, 4);
1903 __ dec(end_to, 4);
1904 __ sth(O3, end_to, 2);
1905 __ sth(O4, end_to, 0);
1906 __ BIND(L_skip_alignment2);
1907 }
1908 #ifdef _LP64
1909 if (aligned) {
1910 // Both arrays are aligned to 8-bytes in 64-bits VM.
1911 // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1912 // in unaligned case.
1913 __ dec(count, 8);
1914 } else
1915 #endif
1916 {
1917 // Copy with shift 16 bytes per iteration if arrays do not have
1918 // the same alignment mod 8, otherwise jump to the next
1919 // code for aligned copy (subtracting 8 from 'count' before the jump).
1920 // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1921 // Also jump over aligned copy after the copy with shift completed.
1923 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1924 L_aligned_copy, L_copy_2_bytes);
1925 }
1926 // copy 4 elements (16 bytes) at a time
1927 __ align(OptoLoopAlignment);
1928 __ BIND(L_aligned_copy);
1929 __ dec(end_from, 16);
1930 __ ldx(end_from, 8, O3);
1931 __ ldx(end_from, 0, O4);
1932 __ dec(end_to, 16);
1933 __ deccc(count, 8);
1934 __ stx(O3, end_to, 8);
1935 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1936 __ delayed()->stx(O4, end_to, 0);
1937 __ inc(count, 8);
1939 // copy 1 element (2 bytes) at a time
1940 __ BIND(L_copy_2_bytes);
1941 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1942 __ delayed()->nop();
1943 __ BIND(L_copy_2_bytes_loop);
1944 __ dec(end_from, 2);
1945 __ dec(end_to, 2);
1946 __ lduh(end_from, 0, O4);
1947 __ deccc(count);
1948 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1949 __ delayed()->sth(O4, end_to, 0);
1951 __ BIND(L_exit);
1952 // O3, O4 are used as temp registers
1953 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1954 __ retl();
1955 __ delayed()->mov(G0, O0); // return 0
1956 return start;
1957 }
1959 //
1960 // Generate core code for disjoint int copy (and oop copy on 32-bit).
1961 // If "aligned" is true, the "from" and "to" addresses are assumed
1962 // to be heapword aligned.
1963 //
1964 // Arguments:
1965 // from: O0
1966 // to: O1
1967 // count: O2 treated as signed
1968 //
1969 void generate_disjoint_int_copy_core(bool aligned) {
1971 Label L_skip_alignment, L_aligned_copy;
1972 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1974 const Register from = O0; // source array address
1975 const Register to = O1; // destination array address
1976 const Register count = O2; // elements count
1977 const Register offset = O5; // offset from start of arrays
1978 // O3, O4, G3, G4 are used as temp registers
1980 // 'aligned' == true when it is known statically during compilation
1981 // of this arraycopy call site that both 'from' and 'to' addresses
1982 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1983 //
1984 // Aligned arrays have 4-byte alignment in the 32-bit VM
1985 // and 8-byte alignment in the 64-bit VM.
1986 //
1987 #ifdef _LP64
1988 if (!aligned)
1989 #endif
1990 {
1991 // The next check could be put under 'ifndef' since the code in
1992 // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
1994 // for short arrays, just do single element copy
1995 __ cmp(count, 5); // 4 + 1 (20 bytes)
1996 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1997 __ delayed()->mov(G0, offset);
1999 // copy 1 element to align 'to' on an 8 byte boundary
2000 __ andcc(to, 7, G0);
2001 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2002 __ delayed()->ld(from, 0, O3);
2003 __ inc(from, 4);
2004 __ inc(to, 4);
2005 __ dec(count);
2006 __ st(O3, to, -4);
2007 __ BIND(L_skip_alignment);
2009 // if arrays have same alignment mod 8, do 4 elements copy
2010 __ andcc(from, 7, G0);
2011 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2012 __ delayed()->ld(from, 0, O3);
2014 //
2015 // Load 2 aligned 8-byte chunks and use one from the previous iteration
2016 // to form 2 aligned 8-byte chunks to store.
2017 //
2018 // copy_16_bytes_forward_with_shift() is not used here since this
2019 // code is more efficient.
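// A C sketch of one iteration, assuming a big-endian SPARC and a source
// that is 4 bytes off an 8-byte boundary (illustrative only):
//   w = *(uint32_t*)(from);       // carried over in O3 between iterations
//   q = *(uint64_t*)(from + 4);   // aligned 8-byte load
//   *(uint64_t*)(to) = ((uint64_t)w << 32) | (q >> 32);
// i.e. each aligned store pairs the word left over from the previous
// load with the high word of the next one.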
2021 // copy with shift 4 elements (16 bytes) at a time
2022 __ dec(count, 4); // The cmp at the beginning guarantees count >= 4
2024 __ align(OptoLoopAlignment);
2025 __ BIND(L_copy_16_bytes);
2026 __ ldx(from, 4, O4);
2027 __ deccc(count, 4); // Can we do next iteration after this one?
2028 __ ldx(from, 12, G4);
2029 __ inc(to, 16);
2030 __ inc(from, 16);
2031 __ sllx(O3, 32, O3);
2032 __ srlx(O4, 32, G3);
2033 __ bset(G3, O3);
2034 __ stx(O3, to, -16);
2035 __ sllx(O4, 32, O4);
2036 __ srlx(G4, 32, G3);
2037 __ bset(G3, O4);
2038 __ stx(O4, to, -8);
2039 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2040 __ delayed()->mov(G4, O3);
2042 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2043 __ delayed()->inc(count, 4); // restore 'count'
2045 __ BIND(L_aligned_copy);
2046 }
2047 // copy 4 elements (16 bytes) at a time
2048 __ and3(count, 1, G4); // Save
2049 __ srl(count, 1, count);
2050 generate_disjoint_long_copy_core(aligned);
2051 __ mov(G4, count); // Restore
2053 // copy 1 element at a time
2054 __ BIND(L_copy_4_bytes);
2055 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
2056 __ delayed()->nop();
2057 __ BIND(L_copy_4_bytes_loop);
2058 __ ld(from, offset, O3);
2059 __ deccc(count);
2060 __ st(O3, to, offset);
2061 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2062 __ delayed()->inc(offset, 4);
2063 __ BIND(L_exit);
2064 }
2066 //
2067 // Generate stub for disjoint int copy. If "aligned" is true, the
2068 // "from" and "to" addresses are assumed to be heapword aligned.
2069 //
2070 // Arguments for generated stub:
2071 // from: O0
2072 // to: O1
2073 // count: O2 treated as signed
2074 //
2075 address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
2076 __ align(CodeEntryAlignment);
2077 StubCodeMark mark(this, "StubRoutines", name);
2078 address start = __ pc();
2080 const Register count = O2;
2081 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2083 if (entry != NULL) {
2084 *entry = __ pc();
2085 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2086 BLOCK_COMMENT("Entry:");
2087 }
2089 generate_disjoint_int_copy_core(aligned);
2091 // O3, O4 are used as temp registers
2092 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2093 __ retl();
2094 __ delayed()->mov(G0, O0); // return 0
2095 return start;
2096 }
2098 //
2099 // Generate core code for conjoint int copy (and oop copy on 32-bit).
2100 // If "aligned" is true, the "from" and "to" addresses are assumed
2101 // to be heapword aligned.
2102 //
2103 // Arguments:
2104 // from: O0
2105 // to: O1
2106 // count: O2 treated as signed
2107 //
2108 void generate_conjoint_int_copy_core(bool aligned) {
2109 // Do reverse copy.
2111 Label L_skip_alignment, L_aligned_copy;
2112 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2114 const Register from = O0; // source array address
2115 const Register to = O1; // destination array address
2116 const Register count = O2; // elements count
2117 const Register end_from = from; // source array end address
2118 const Register end_to = to; // destination array end address
2119 // O3, O4, O5, G3 are used as temp registers
2121 const Register byte_count = O3; // bytes count to copy
2123 __ sllx(count, LogBytesPerInt, byte_count);
2124 __ add(to, byte_count, end_to); // offset after last copied element
2126 __ cmp(count, 5); // for short arrays, just do single element copy
2127 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2128 __ delayed()->add(from, byte_count, end_from);
2130 // copy 1 element to align 'to' on an 8 byte boundary
2131 __ andcc(end_to, 7, G0);
2132 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2133 __ delayed()->nop();
2134 __ dec(count);
2135 __ dec(end_from, 4);
2136 __ dec(end_to, 4);
2137 __ ld(end_from, 0, O4);
2138 __ st(O4, end_to, 0);
2139 __ BIND(L_skip_alignment);
2141 // Check if 'end_from' and 'end_to' have the same alignment.
2142 __ andcc(end_from, 7, G0);
2143 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2144 __ delayed()->dec(count, 4); // The cmp at the start guarantees cnt >= 4
2146 // copy with shift 4 elements (16 bytes) at a time
2147 //
2148 // Load 2 aligned 8-byte chunks and use one from the previous iteration
2149 // to form 2 aligned 8-byte chunks to store.
2150 //
2151 __ ldx(end_from, -4, O3);
2152 __ align(OptoLoopAlignment);
2153 __ BIND(L_copy_16_bytes);
2154 __ ldx(end_from, -12, O4);
2155 __ deccc(count, 4);
2156 __ ldx(end_from, -20, O5);
2157 __ dec(end_to, 16);
2158 __ dec(end_from, 16);
2159 __ srlx(O3, 32, O3);
2160 __ sllx(O4, 32, G3);
2161 __ bset(G3, O3);
2162 __ stx(O3, end_to, 8);
2163 __ srlx(O4, 32, O4);
2164 __ sllx(O5, 32, G3);
2165 __ bset(O4, G3);
2166 __ stx(G3, end_to, 0);
2167 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2168 __ delayed()->mov(O5, O3);
2170 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2171 __ delayed()->inc(count, 4);
2173 // copy 4 elements (16 bytes) at a time
2174 __ align(OptoLoopAlignment);
2175 __ BIND(L_aligned_copy);
2176 __ dec(end_from, 16);
2177 __ ldx(end_from, 8, O3);
2178 __ ldx(end_from, 0, O4);
2179 __ dec(end_to, 16);
2180 __ deccc(count, 4);
2181 __ stx(O3, end_to, 8);
2182 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2183 __ delayed()->stx(O4, end_to, 0);
2184 __ inc(count, 4);
2186 // copy 1 element (4 bytes) at a time
2187 __ BIND(L_copy_4_bytes);
2188 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
2189 __ delayed()->nop();
2190 __ BIND(L_copy_4_bytes_loop);
2191 __ dec(end_from, 4);
2192 __ dec(end_to, 4);
2193 __ ld(end_from, 0, O4);
2194 __ deccc(count);
2195 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2196 __ delayed()->st(O4, end_to, 0);
2197 __ BIND(L_exit);
2198 }
2200 //
2201 // Generate stub for conjoint int copy. If "aligned" is true, the
2202 // "from" and "to" addresses are assumed to be heapword aligned.
2203 //
2204 // Arguments for generated stub:
2205 // from: O0
2206 // to: O1
2207 // count: O2 treated as signed
2208 //
2209 address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2210 address *entry, const char *name) {
2211 __ align(CodeEntryAlignment);
2212 StubCodeMark mark(this, "StubRoutines", name);
2213 address start = __ pc();
2215 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2217 if (entry != NULL) {
2218 *entry = __ pc();
2219 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2220 BLOCK_COMMENT("Entry:");
2221 }
2223 array_overlap_test(nooverlap_target, 2);
2225 generate_conjoint_int_copy_core(aligned);
2227 // O3, O4 are used as temp registers
2228 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2229 __ retl();
2230 __ delayed()->mov(G0, O0); // return 0
2231 return start;
2232 }
2234 //
2235 // Generate core code for disjoint long copy (and oop copy on 64-bit).
2236 // "aligned" is ignored, because we must make the stronger
2237 // assumption that both addresses are always 64-bit aligned.
2238 //
2239 // Arguments:
2240 // from: O0
2241 // to: O1
2242 // count: O2 treated as signed
2243 //
2244 // count -= 2;
2245 // if ( count >= 0 ) { // >= 2 elements
2246 // if ( count >= 6) { // >= 8 elements
2247 // count -= 6; // original count - 8
2248 // do {
2249 // copy_8_elements;
2250 // count -= 8;
2251 // } while ( count >= 0 );
2252 // count += 6;
2253 // }
2254 // if ( count >= 0 ) { // >= 2 elements
2255 // do {
2256 // copy_2_elements;
2257 // } while ( (count=count-2) >= 0 );
2258 // }
2259 // }
2260 // count += 2;
2261 // if ( count != 0 ) { // 1 element left
2262 // copy_1_element;
2263 // }
2264 //
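// Note that 'count' is biased (by -2 up front and by a further -6 around
// the unrolled loop) so that each back-branch can test the condition
// codes set by the preceding deccc/subcc directly, with no separate
// compare instruction.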
2265 void generate_disjoint_long_copy_core(bool aligned) {
2266 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2267 const Register from = O0; // source array address
2268 const Register to = O1; // destination array address
2269 const Register count = O2; // elements count
2270 const Register offset0 = O4; // element offset
2271 const Register offset8 = O5; // next element offset
2273 __ deccc(count, 2);
2274 __ mov(G0, offset0); // offset from start of arrays (0)
2275 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2276 __ delayed()->add(offset0, 8, offset8);
2278 // Copy in 64-byte chunks
2279 Label L_copy_64_bytes;
2280 const Register from64 = O3; // source address
2281 const Register to64 = G3; // destination address
2282 __ subcc(count, 6, O3);
2283 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2284 __ delayed()->mov(to, to64);
2285 // Now we can use O4(offset0), O5(offset8) as temps
2286 __ mov(O3, count);
2287 __ mov(from, from64);
2289 __ align(OptoLoopAlignment);
2290 __ BIND(L_copy_64_bytes);
2291 for( int off = 0; off < 64; off += 16 ) {
2292 __ ldx(from64, off+0, O4);
2293 __ ldx(from64, off+8, O5);
2294 __ stx(O4, to64, off+0);
2295 __ stx(O5, to64, off+8);
2296 }
2297 __ deccc(count, 8);
2298 __ inc(from64, 64);
2299 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
2300 __ delayed()->inc(to64, 64);
2302 // Restore O4(offset0), O5(offset8)
2303 __ sub(from64, from, offset0);
2304 __ inccc(count, 6);
2305 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2306 __ delayed()->add(offset0, 8, offset8);
2308 // Copy in 16-byte chunks
2309 __ align(OptoLoopAlignment);
2310 __ BIND(L_copy_16_bytes);
2311 __ ldx(from, offset0, O3);
2312 __ ldx(from, offset8, G3);
2313 __ deccc(count, 2);
2314 __ stx(O3, to, offset0);
2315 __ inc(offset0, 16);
2316 __ stx(G3, to, offset8);
2317 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2318 __ delayed()->inc(offset8, 16);
2320 // Copy last 8 bytes
2321 __ BIND(L_copy_8_bytes);
2322 __ inccc(count, 2);
2323 __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2324 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2325 __ ldx(from, offset0, O3);
2326 __ stx(O3, to, offset0);
2327 __ BIND(L_exit);
2328 }
2330 //
2331 // Generate stub for disjoint long copy.
2332 // "aligned" is ignored, because we must make the stronger
2333 // assumption that both addresses are always 64-bit aligned.
2334 //
2335 // Arguments for generated stub:
2336 // from: O0
2337 // to: O1
2338 // count: O2 treated as signed
2339 //
2340 address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2341 __ align(CodeEntryAlignment);
2342 StubCodeMark mark(this, "StubRoutines", name);
2343 address start = __ pc();
2345 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2347 if (entry != NULL) {
2348 *entry = __ pc();
2349 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2350 BLOCK_COMMENT("Entry:");
2351 }
2353 generate_disjoint_long_copy_core(aligned);
2355 // O3, O4 are used as temp registers
2356 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2357 __ retl();
2358 __ delayed()->mov(G0, O0); // return 0
2359 return start;
2360 }
2362 //
2363 // Generate core code for conjoint long copy (and oop copy on 64-bit).
2364 // "aligned" is ignored, because we must make the stronger
2365 // assumption that both addresses are always 64-bit aligned.
2366 //
2367 // Arguments:
2368 // from: O0
2369 // to: O1
2370 // count: O2 treated as signed
2371 //
2372 void generate_conjoint_long_copy_core(bool aligned) {
2373 // Do reverse copy.
2374 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2375 const Register from = O0; // source array address
2376 const Register to = O1; // destination array address
2377 const Register count = O2; // elements count
2378 const Register offset8 = O4; // element offset
2379 const Register offset0 = O5; // previous element offset
2381 __ subcc(count, 1, count);
2382 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2383 __ delayed()->sllx(count, LogBytesPerLong, offset8);
2384 __ sub(offset8, 8, offset0);
2385 __ align(OptoLoopAlignment);
2386 __ BIND(L_copy_16_bytes);
2387 __ ldx(from, offset8, O2);
2388 __ ldx(from, offset0, O3);
2389 __ stx(O2, to, offset8);
2390 __ deccc(offset8, 16); // use offset8 as counter
2391 __ stx(O3, to, offset0);
2392 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2393 __ delayed()->dec(offset0, 16);
2395 __ BIND(L_copy_8_bytes);
2396 __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2397 __ delayed()->nop();
2398 __ ldx(from, 0, O3);
2399 __ stx(O3, to, 0);
2400 __ BIND(L_exit);
2401 }
2403 // Generate stub for conjoint long copy.
2404 // "aligned" is ignored, because we must make the stronger
2405 // assumption that both addresses are always 64-bit aligned.
2406 //
2407 // Arguments for generated stub:
2408 // from: O0
2409 // to: O1
2410 // count: O2 treated as signed
2411 //
2412 address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2413 address *entry, const char *name) {
2414 __ align(CodeEntryAlignment);
2415 StubCodeMark mark(this, "StubRoutines", name);
2416 address start = __ pc();
2418 assert(aligned, "Should always be aligned");
2420 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2422 if (entry != NULL) {
2423 *entry = __ pc();
2424 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2425 BLOCK_COMMENT("Entry:");
2426 }
2428 array_overlap_test(nooverlap_target, 3);
2430 generate_conjoint_long_copy_core(aligned);
2432 // O3, O4 are used as temp registers
2433 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2434 __ retl();
2435 __ delayed()->mov(G0, O0); // return 0
2436 return start;
2437 }
2439 // Generate stub for disjoint oop copy. If "aligned" is true, the
2440 // "from" and "to" addresses are assumed to be heapword aligned.
2441 //
2442 // Arguments for generated stub:
2443 // from: O0
2444 // to: O1
2445 // count: O2 treated as signed
2446 //
2447 address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2448 bool dest_uninitialized = false) {
2450 const Register from = O0; // source array address
2451 const Register to = O1; // destination array address
2452 const Register count = O2; // elements count
2454 __ align(CodeEntryAlignment);
2455 StubCodeMark mark(this, "StubRoutines", name);
2456 address start = __ pc();
2458 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2460 if (entry != NULL) {
2461 *entry = __ pc();
2462 // caller can pass a 64-bit byte count here
2463 BLOCK_COMMENT("Entry:");
2464 }
2466 // save arguments for barrier generation
2467 __ mov(to, G1);
2468 __ mov(count, G5);
2469 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2470 #ifdef _LP64
2471 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2472 if (UseCompressedOops) {
2473 generate_disjoint_int_copy_core(aligned);
2474 } else {
2475 generate_disjoint_long_copy_core(aligned);
2476 }
2477 #else
2478 generate_disjoint_int_copy_core(aligned);
2479 #endif
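// (Compressed oops are 32 bits wide, so the int copy core moves them;
// uncompressed oops are 64 bits and use the long copy core.)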
2480 // O0 is used as temp register
2481 gen_write_ref_array_post_barrier(G1, G5, O0);
2483 // O3, O4 are used as temp registers
2484 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2485 __ retl();
2486 __ delayed()->mov(G0, O0); // return 0
2487 return start;
2488 }
2490 // Generate stub for conjoint oop copy. If "aligned" is true, the
2491 // "from" and "to" addresses are assumed to be heapword aligned.
2492 //
2493 // Arguments for generated stub:
2494 // from: O0
2495 // to: O1
2496 // count: O2 treated as signed
2497 //
2498 address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2499 address *entry, const char *name,
2500 bool dest_uninitialized = false) {
2502 const Register from = O0; // source array address
2503 const Register to = O1; // destination array address
2504 const Register count = O2; // elements count
2506 __ align(CodeEntryAlignment);
2507 StubCodeMark mark(this, "StubRoutines", name);
2508 address start = __ pc();
2510 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2512 if (entry != NULL) {
2513 *entry = __ pc();
2514 // caller can pass a 64-bit byte count here
2515 BLOCK_COMMENT("Entry:");
2516 }
2518 array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2520 // save arguments for barrier generation
2521 __ mov(to, G1);
2522 __ mov(count, G5);
2523 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2525 #ifdef _LP64
2526 if (UseCompressedOops) {
2527 generate_conjoint_int_copy_core(aligned);
2528 } else {
2529 generate_conjoint_long_copy_core(aligned);
2530 }
2531 #else
2532 generate_conjoint_int_copy_core(aligned);
2533 #endif
2535 // O0 is used as temp register
2536 gen_write_ref_array_post_barrier(G1, G5, O0);
2538 // O3, O4 are used as temp registers
2539 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2540 __ retl();
2541 __ delayed()->mov(G0, O0); // return 0
2542 return start;
2543 }
2546 // Helper for generating a dynamic type check.
2547 // Smashes only the given temp registers.
2548 void generate_type_check(Register sub_klass,
2549 Register super_check_offset,
2550 Register super_klass,
2551 Register temp,
2552 Label& L_success) {
2553 assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2555 BLOCK_COMMENT("type_check:");
2557 Label L_miss, L_pop_to_miss;
2559 assert_clean_int(super_check_offset, temp);
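// In outline (a sketch of the control flow, not the emitted code):
// the fast path may prove success (branch to L_success), prove failure
// (branch to L_miss), or be inconclusive and fall through; the slow
// path then scans the secondary supers in a fresh register window
// (save_frame/restore), branching to L_success on a hit and popping
// the frame and falling through to L_miss otherwise.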
2561 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2562 &L_success, &L_miss, NULL,
2563 super_check_offset);
2565 BLOCK_COMMENT("type_check_slow_path:");
2566 __ save_frame(0);
2567 __ check_klass_subtype_slow_path(sub_klass->after_save(),
2568 super_klass->after_save(),
2569 L0, L1, L2, L4,
2570 NULL, &L_pop_to_miss);
2571 __ ba(false, L_success);
2572 __ delayed()->restore();
2574 __ bind(L_pop_to_miss);
2575 __ restore();
2577 // Fall through on failure!
2578 __ BIND(L_miss);
2579 }
2582 // Generate stub for checked oop copy.
2583 //
2584 // Arguments for generated stub:
2585 // from: O0
2586 // to: O1
2587 // count: O2 treated as signed
2588 // ckoff: O3 (super_check_offset)
2589 // ckval: O4 (super_klass)
2590 // ret: O0 zero for success; (-1^K) where K is partial transfer count
2591 //
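// For example (illustrative): if 3 elements are stored before one fails
// the type check, the stub returns -1^3 == ~3 == -4, and the caller
// recovers the partial transfer count as K = ~O0 == 3.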
2592 address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2594 const Register O0_from = O0; // source array address
2595 const Register O1_to = O1; // destination array address
2596 const Register O2_count = O2; // elements count
2597 const Register O3_ckoff = O3; // super_check_offset
2598 const Register O4_ckval = O4; // super_klass
2600 const Register O5_offset = O5; // loop var, with stride wordSize
2601 const Register G1_remain = G1; // loop var, with stride -1
2602 const Register G3_oop = G3; // actual oop copied
2603 const Register G4_klass = G4; // oop._klass
2604 const Register G5_super = G5; // oop._klass._primary_supers[ckval]
2606 __ align(CodeEntryAlignment);
2607 StubCodeMark mark(this, "StubRoutines", name);
2608 address start = __ pc();
2610 #ifdef ASSERT
2611 // We sometimes save a frame (see generate_type_check below).
2612 // If this will cause trouble, let's fail now instead of later.
2613 __ save_frame(0);
2614 __ restore();
2615 #endif
2617 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int.
2619 #ifdef ASSERT
2620 // caller guarantees that the arrays really are different
2621 // otherwise, we would have to make conjoint checks
2622 { Label L;
2623 __ mov(O3, G1); // spill: overlap test smashes O3
2624 __ mov(O4, G4); // spill: overlap test smashes O4
2625 array_overlap_test(L, LogBytesPerHeapOop);
2626 __ stop("checkcast_copy within a single array");
2627 __ bind(L);
2628 __ mov(G1, O3);
2629 __ mov(G4, O4);
2630 }
2631 #endif //ASSERT
2633 if (entry != NULL) {
2634 *entry = __ pc();
2635 // caller can pass a 64-bit byte count here (from generic stub)
2636 BLOCK_COMMENT("Entry:");
2637 }
2638 gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
2640 Label load_element, store_element, do_card_marks, fail, done;
2641 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it
2642 __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2643 __ delayed()->mov(G0, O5_offset); // offset from start of arrays
2645 // Empty array: Nothing to do.
2646 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2647 __ retl();
2648 __ delayed()->set(0, O0); // return 0 on (trivial) success
2650 // ======== begin loop ========
2651 // (Loop is rotated; its entry is load_element.)
2652 // Loop variables:
2653 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2654 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2655 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super
2656 __ align(OptoLoopAlignment);
2658 __ BIND(store_element);
2659 __ deccc(G1_remain); // decrement the count
2660 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2661 __ inc(O5_offset, heapOopSize); // step to next offset
2662 __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
2663 __ delayed()->set(0, O0); // return 0 on success
2665 // ======== loop entry is here ========
2666 __ BIND(load_element);
2667 __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop
2668 __ br_null(G3_oop, true, Assembler::pt, store_element);
2669 __ delayed()->nop();
2671 __ load_klass(G3_oop, G4_klass); // query the object klass
2673 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2674 // branch to this on success:
2675 store_element);
2676 // ======== end loop ========
2678 // It was a real error; we must depend on the caller to finish the job.
2679 // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2680 // Emit GC store barriers for the oops we have copied (O2 minus G1),
2681 // and report their number to the caller.
2682 __ BIND(fail);
2683 __ subcc(O2_count, G1_remain, O2_count);
2684 __ brx(Assembler::zero, false, Assembler::pt, done);
2685 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller
2687 __ BIND(do_card_marks);
2688 gen_write_ref_array_post_barrier(O1_to, O2_count, O3); // store check on O1[0..O2]
2690 __ BIND(done);
2691 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2692 __ retl();
2693 __ delayed()->nop(); // return value in O0
2695 return start;
2696 }
2699 // Generate 'unsafe' array copy stub
2700 // Though just as safe as the other stubs, it takes an unscaled
2701 // size_t argument instead of an element count.
2702 //
2703 // Arguments for generated stub:
2704 // from: O0
2705 // to: O1
2706 // count: O2 byte count, treated as ssize_t, can be zero
2707 //
2708 // Examines the alignment of the operands and dispatches
2709 // to a long, int, short, or byte copy loop.
2710 //
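// Equivalent C logic (a sketch, not the emitted code):
//   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)count;
//   if ((bits & (BytesPerLong  - 1)) == 0) return long_copy (count >> LogBytesPerLong);
//   if ((bits & (BytesPerInt   - 1)) == 0) return int_copy  (count >> LogBytesPerInt);
//   if ((bits & (BytesPerShort - 1)) == 0) return short_copy(count >> LogBytesPerShort);
//   return byte_copy(count);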
2711 address generate_unsafe_copy(const char* name,
2712 address byte_copy_entry,
2713 address short_copy_entry,
2714 address int_copy_entry,
2715 address long_copy_entry) {
2717 const Register O0_from = O0; // source array address
2718 const Register O1_to = O1; // destination array address
2719 const Register O2_count = O2; // elements count
2721 const Register G1_bits = G1; // test copy of low bits
2723 __ align(CodeEntryAlignment);
2724 StubCodeMark mark(this, "StubRoutines", name);
2725 address start = __ pc();
2727 // bump this on entry, not on exit:
2728 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2730 __ or3(O0_from, O1_to, G1_bits);
2731 __ or3(O2_count, G1_bits, G1_bits);
2733 __ btst(BytesPerLong-1, G1_bits);
2734 __ br(Assembler::zero, true, Assembler::pt,
2735 long_copy_entry, relocInfo::runtime_call_type);
2736 // scale the count on the way out:
2737 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2739 __ btst(BytesPerInt-1, G1_bits);
2740 __ br(Assembler::zero, true, Assembler::pt,
2741 int_copy_entry, relocInfo::runtime_call_type);
2742 // scale the count on the way out:
2743 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2745 __ btst(BytesPerShort-1, G1_bits);
2746 __ br(Assembler::zero, true, Assembler::pt,
2747 short_copy_entry, relocInfo::runtime_call_type);
2748 // scale the count on the way out:
2749 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2751 __ br(Assembler::always, false, Assembler::pt,
2752 byte_copy_entry, relocInfo::runtime_call_type);
2753 __ delayed()->nop();
2755 return start;
2756 }
2759 // Perform range checks on the proposed arraycopy.
2760 // Kills the two temps, but nothing else.
2761 // Also, clean the sign bits of src_pos and dst_pos.
2762 void arraycopy_range_checks(Register src, // source array oop (O0)
2763 Register src_pos, // source position (O1)
2764 Register dst, // destination array oop (O2)
2765 Register dst_pos, // destination position (O3)
2766 Register length, // length of copy (O4)
2767 Register temp1, Register temp2,
2768 Label& L_failed) {
2769 BLOCK_COMMENT("arraycopy_range_checks:");
2771 // if (src_pos + length > arrayOop(src)->length() ) FAIL;
2773 const Register array_length = temp1; // scratch
2774 const Register end_pos = temp2; // scratch
2776 // Note: This next instruction may be in the delay slot of a branch:
2777 __ add(length, src_pos, end_pos); // src_pos + length
2778 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2779 __ cmp(end_pos, array_length);
2780 __ br(Assembler::greater, false, Assembler::pn, L_failed);
2782 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2783 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2784 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2785 __ cmp(end_pos, array_length);
2786 __ br(Assembler::greater, false, Assembler::pn, L_failed);
2788 // We have to clean up the high 32 bits of 'src_pos' and 'dst_pos'.
2789 // A move with sign extension can be used since they are known to be positive.
2790 __ delayed()->signx(src_pos, src_pos);
2791 __ signx(dst_pos, dst_pos);
2793 BLOCK_COMMENT("arraycopy_range_checks done");
2794 }
2797 //
2798 // Generate generic array copy stubs
2799 //
2800 // Input:
2801 // O0 - src oop
2802 // O1 - src_pos
2803 // O2 - dst oop
2804 // O3 - dst_pos
2805 // O4 - element count
2806 //
2807 // Output:
2808 // O0 == 0 - success
2809 // O0 == -1 - need to call System.arraycopy
2810 //
2811 address generate_generic_copy(const char *name,
2812 address entry_jbyte_arraycopy,
2813 address entry_jshort_arraycopy,
2814 address entry_jint_arraycopy,
2815 address entry_oop_arraycopy,
2816 address entry_jlong_arraycopy,
2817 address entry_checkcast_arraycopy) {
2818 Label L_failed, L_objArray;
2820 // Input registers
2821 const Register src = O0; // source array oop
2822 const Register src_pos = O1; // source position
2823 const Register dst = O2; // destination array oop
2824 const Register dst_pos = O3; // destination position
2825 const Register length = O4; // elements count
2827 // registers used as temp
2828 const Register G3_src_klass = G3; // source array klass
2829 const Register G4_dst_klass = G4; // destination array klass
2830 const Register G5_lh = G5; // layout handler
2831 const Register O5_temp = O5;
2833 __ align(CodeEntryAlignment);
2834 StubCodeMark mark(this, "StubRoutines", name);
2835 address start = __ pc();
2837 // bump this on entry, not on exit:
2838 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2840 // In principle, the int arguments could be dirty.
2841 //assert_clean_int(src_pos, G1);
2842 //assert_clean_int(dst_pos, G1);
2843 //assert_clean_int(length, G1);
2845 //-----------------------------------------------------------------------
2846 // Assembler stubs will be used for this call to arraycopy
2847 // if the following conditions are met:
2848 //
2849 // (1) src and dst must not be null.
2850 // (2) src_pos must not be negative.
2851 // (3) dst_pos must not be negative.
2852 // (4) length must not be negative.
2853 // (5) src klass and dst klass should be the same and not NULL.
2854 // (6) src and dst should be arrays.
2855 // (7) src_pos + length must not exceed length of src.
2856 // (8) dst_pos + length must not exceed length of dst.
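// In C terms (a sketch): the stub bails out to the runtime (returns -1)
// unless
//   src != NULL && dst != NULL
//   && src_pos >= 0 && dst_pos >= 0 && length >= 0
//   && src->klass() == dst->klass() && that klass is an array klass
//   && src_pos + length <= arrayOop(src)->length()
//   && dst_pos + length <= arrayOop(dst)->length();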
2857 BLOCK_COMMENT("arraycopy initial argument checks");
2859 // if (src == NULL) return -1;
2860 __ br_null(src, false, Assembler::pn, L_failed);
2862 // if (src_pos < 0) return -1;
2863 __ delayed()->tst(src_pos);
2864 __ br(Assembler::negative, false, Assembler::pn, L_failed);
2865 __ delayed()->nop();
2867 // if (dst == NULL) return -1;
2868 __ br_null(dst, false, Assembler::pn, L_failed);
2870 // if (dst_pos < 0) return -1;
2871 __ delayed()->tst(dst_pos);
2872 __ br(Assembler::negative, false, Assembler::pn, L_failed);
2874 // if (length < 0) return -1;
2875 __ delayed()->tst(length);
2876 __ br(Assembler::negative, false, Assembler::pn, L_failed);
2878 BLOCK_COMMENT("arraycopy argument klass checks");
2879 // get src->klass()
2880 if (UseCompressedOops) {
2881 __ delayed()->nop(); // ??? not good
2882 __ load_klass(src, G3_src_klass);
2883 } else {
2884 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
2885 }
2887 #ifdef ASSERT
2888 // assert(src->klass() != NULL);
2889 BLOCK_COMMENT("assert klasses not null");
2890 { Label L_a, L_b;
2891 __ br_notnull(G3_src_klass, false, Assembler::pt, L_b); // it is broken if klass is NULL
2892 __ delayed()->nop();
2893 __ bind(L_a);
2894 __ stop("broken null klass");
2895 __ bind(L_b);
2896 __ load_klass(dst, G4_dst_klass);
2897 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
2898 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp
2899 BLOCK_COMMENT("assert done");
2900 }
2901 #endif
2903 // Load layout helper
2904 //
2905 // |array_tag|     | header_size | element_type |     |log2_element_size|
2906 //  32        30    24            16             8     2                 0
2907 //
2908 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2909 //
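// Decoding sketch (mirrors the field extraction done below):
//   tag      = lh >> Klass::_lh_array_tag_shift;   // 0x3 typeArray, 0x2 objArray
//   hsize    = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
//   log2_esz = lh & Klass::_lh_log2_element_size_mask;
// A klass with lh >= Klass::_lh_neutral_value is not an array at all,
// which is exactly what the is_Array check below rejects.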
2911 int lh_offset = klassOopDesc::header_size() * HeapWordSize +
2912 Klass::layout_helper_offset_in_bytes();
2914 // Load the 32-bit signed value. Use the br() instruction with it to check icc.
2915 __ lduw(G3_src_klass, lh_offset, G5_lh);
2917 if (UseCompressedOops) {
2918 __ load_klass(dst, G4_dst_klass);
2919 }
2920 // Handle objArrays completely differently...
2921 juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2922 __ set(objArray_lh, O5_temp);
2923 __ cmp(G5_lh, O5_temp);
2924 __ br(Assembler::equal, false, Assembler::pt, L_objArray);
2925 if (UseCompressedOops) {
2926 __ delayed()->nop();
2927 } else {
2928 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2929 }
2931 // if (src->klass() != dst->klass()) return -1;
2932 __ cmp(G3_src_klass, G4_dst_klass);
2933 __ brx(Assembler::notEqual, false, Assembler::pn, L_failed);
2934 __ delayed()->nop();
2936 // if (!src->is_Array()) return -1;
2937 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
2938 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
2940 // At this point, it is known to be a typeArray (array_tag 0x3).
2941 #ifdef ASSERT
2942 __ delayed()->nop();
2943 { Label L;
2944 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2945 __ set(lh_prim_tag_in_place, O5_temp);
2946 __ cmp(G5_lh, O5_temp);
2947 __ br(Assembler::greaterEqual, false, Assembler::pt, L);
2948 __ delayed()->nop();
2949 __ stop("must be a primitive array");
2950 __ bind(L);
2951 }
2952 #else
2953 __ delayed(); // match next insn to prev branch
2954 #endif
2956 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2957 O5_temp, G4_dst_klass, L_failed);
2959 // typeArrayKlass
2960 //
2961 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2962 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2963 //
2965 const Register G4_offset = G4_dst_klass; // array offset
2966 const Register G3_elsize = G3_src_klass; // log2 element size
2968 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
2969 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
2970 __ add(src, G4_offset, src); // src array offset
2971 __ add(dst, G4_offset, dst); // dst array offset
2972 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
2974 // next registers should be set before the jump to corresponding stub
2975 const Register from = O0; // source array address
2976 const Register to = O1; // destination array address
2977 const Register count = O2; // elements count
2979 // 'from', 'to', 'count' registers should be set in this order
2980 // since they are the same as 'src', 'src_pos', 'dst'.
2982 BLOCK_COMMENT("scale indexes to element size");
2983 __ sll_ptr(src_pos, G3_elsize, src_pos);
2984 __ sll_ptr(dst_pos, G3_elsize, dst_pos);
2985 __ add(src, src_pos, from); // src_addr
2986 __ add(dst, dst_pos, to); // dst_addr
2988 BLOCK_COMMENT("choose copy loop based on element size");
2989 __ cmp(G3_elsize, 0);
2990 __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
2991 __ delayed()->signx(length, count); // length
2993 __ cmp(G3_elsize, LogBytesPerShort);
2994 __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
2995 __ delayed()->signx(length, count); // length
2997 __ cmp(G3_elsize, LogBytesPerInt);
2998 __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
2999 __ delayed()->signx(length, count); // length
3000 #ifdef ASSERT
3001 { Label L;
3002 __ cmp(G3_elsize, LogBytesPerLong);
3003 __ br(Assembler::equal, false, Assembler::pt, L);
3004 __ delayed()->nop();
3005 __ stop("must be long copy, but elsize is wrong");
3006 __ bind(L);
3007 }
3008 #endif
3009 __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
3010 __ delayed()->signx(length, count); // length
3012 // objArrayKlass
3013 __ BIND(L_objArray);
3014 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
3016 Label L_plain_copy, L_checkcast_copy;
3017 // test array classes for subtyping
3018 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality
3019 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
3020 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
3022 // Identically typed arrays can be copied without element-wise checks.
3023 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3024 O5_temp, G5_lh, L_failed);
3026 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3027 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3028 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3029 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3030 __ add(src, src_pos, from); // src_addr
3031 __ add(dst, dst_pos, to); // dst_addr
3032 __ BIND(L_plain_copy);
3033 __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
3034 __ delayed()->signx(length, count); // length
3036 __ BIND(L_checkcast_copy);
3037 // live at this point: G3_src_klass, G4_dst_klass
3038 {
3039 // Before looking at dst.length, make sure dst is also an objArray.
3040 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
3041 __ cmp(G5_lh, O5_temp);
3042 __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
3044 // It is safe to examine both src.length and dst.length.
3045 __ delayed(); // match next insn to prev branch
3046 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3047 O5_temp, G5_lh, L_failed);
3049 // Marshal the base address arguments now, freeing registers.
3050 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3051 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3052 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3053 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3054 __ add(src, src_pos, from); // src_addr
3055 __ add(dst, dst_pos, to); // dst_addr
3056 __ signx(length, count); // length (reloaded)
3058 Register sco_temp = O3; // this register is free now
3059 assert_different_registers(from, to, count, sco_temp,
3060 G4_dst_klass, G3_src_klass);
3062 // Generate the type check.
3063 int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
3064 Klass::super_check_offset_offset_in_bytes());
3065 __ lduw(G4_dst_klass, sco_offset, sco_temp);
3066 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
3067 O5_temp, L_plain_copy);
3069 // Fetch destination element klass from the objArrayKlass header.
3070 int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
3071 objArrayKlass::element_klass_offset_in_bytes());
3073 // the checkcast_copy loop needs two extra arguments:
3074 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass
3075 // lduw(O4, sco_offset, O3); // sco of elem klass
3077 __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
3078 __ delayed()->lduw(O4, sco_offset, O3);
3079 }
3081 __ BIND(L_failed);
3082 __ retl();
3083 __ delayed()->sub(G0, 1, O0); // return -1
3084 return start;
3085 }
3087 void generate_arraycopy_stubs() {
3088 address entry;
3089 address entry_jbyte_arraycopy;
3090 address entry_jshort_arraycopy;
3091 address entry_jint_arraycopy;
3092 address entry_oop_arraycopy;
3093 address entry_jlong_arraycopy;
3094 address entry_checkcast_arraycopy;
3096 //*** jbyte
3097 // Always need aligned and unaligned versions
3098 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
3099 "jbyte_disjoint_arraycopy");
3100 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry,
3101 &entry_jbyte_arraycopy,
3102 "jbyte_arraycopy");
3103 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3104 "arrayof_jbyte_disjoint_arraycopy");
3105 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL,
3106 "arrayof_jbyte_arraycopy");
3108 //*** jshort
3109 // Always need aligned and unaligned versions
3110 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3111 "jshort_disjoint_arraycopy");
3112 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry,
3113 &entry_jshort_arraycopy,
3114 "jshort_arraycopy");
3115 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3116 "arrayof_jshort_disjoint_arraycopy");
3117 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL,
3118 "arrayof_jshort_arraycopy");
3120 //*** jint
3121 // Aligned versions
3122 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3123 "arrayof_jint_disjoint_arraycopy");
3124 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3125 "arrayof_jint_arraycopy");
3126 #ifdef _LP64
3127 // In the 64-bit VM we need both aligned and unaligned versions of jint arraycopy.
3128 // entry_jint_arraycopy always points to the unaligned version (note that we overwrite it).
3129 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
3130 "jint_disjoint_arraycopy");
3131 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry,
3132 &entry_jint_arraycopy,
3133 "jint_arraycopy");
3134 #else
3135 // In the 32-bit VM jints are always HeapWordSize aligned, so always use the aligned version
3136 // (in fact, in 32-bit we always have a pre-loop part even in the aligned version,
3137 // because it uses 64-bit loads/stores, so the aligned flag is actually ignored).
3138 StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy;
3139 StubRoutines::_jint_arraycopy = StubRoutines::_arrayof_jint_arraycopy;
3140 #endif
3143 //*** jlong
3144 // It is always aligned
3145 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3146 "arrayof_jlong_disjoint_arraycopy");
3147 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3148 "arrayof_jlong_arraycopy");
3149 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3150 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
3153 //*** oops
3154 // Aligned versions
3155 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry,
3156 "arrayof_oop_disjoint_arraycopy");
3157 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3158 "arrayof_oop_arraycopy");
3159 // Aligned versions without pre-barriers
3160 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3161 "arrayof_oop_disjoint_arraycopy_uninit",
3162 /*dest_uninitialized*/true);
3163 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL,
3164 "arrayof_oop_arraycopy_uninit",
3165 /*dest_uninitialized*/true);
3166 #ifdef _LP64
3167 if (UseCompressedOops) {
3168 // With compressed oops we need unaligned versions; note that we overwrite entry_oop_arraycopy.
3169 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry,
3170 "oop_disjoint_arraycopy");
3171 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3172 "oop_arraycopy");
3173 // Unaligned versions without pre-barriers
3174 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry,
3175 "oop_disjoint_arraycopy_uninit",
3176 /*dest_uninitialized*/true);
3177 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL,
3178 "oop_arraycopy_uninit",
3179 /*dest_uninitialized*/true);
3180 } else
3181 #endif
3182 {
3183 // oop arraycopy is always aligned on 32-bit, and on 64-bit without compressed oops
3184 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3185 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
3186 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3187 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
3188 }
3190 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3191 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3192 /*dest_uninitialized*/true);
3194 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
3195 entry_jbyte_arraycopy,
3196 entry_jshort_arraycopy,
3197 entry_jint_arraycopy,
3198 entry_jlong_arraycopy);
3199 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3200 entry_jbyte_arraycopy,
3201 entry_jshort_arraycopy,
3202 entry_jint_arraycopy,
3203 entry_oop_arraycopy,
3204 entry_jlong_arraycopy,
3205 entry_checkcast_arraycopy);
3207 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3208 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3209 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3210 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3211 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3212 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3213 }
3215 void generate_initial() {
3216 // Generates the initial stubs and initializes their entry points
3218 //------------------------------------------------------------------------------------------------------------------------
3219 // entry points that exist in all platforms
3220 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
3221 // the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
3222 StubRoutines::_forward_exception_entry = generate_forward_exception();
3224 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
3225 StubRoutines::_catch_exception_entry = generate_catch_exception();
3227 //------------------------------------------------------------------------------------------------------------------------
3228 // entry points that are platform specific
3229 StubRoutines::Sparc::_test_stop_entry = generate_test_stop();
3231 StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine();
3232 StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
3234 #if !defined(COMPILER2) && !defined(_LP64)
3235 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
3236 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
3237 StubRoutines::_atomic_add_entry = generate_atomic_add();
3238 StubRoutines::_atomic_xchg_ptr_entry = StubRoutines::_atomic_xchg_entry;
3239 StubRoutines::_atomic_cmpxchg_ptr_entry = StubRoutines::_atomic_cmpxchg_entry;
3240 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
3241 StubRoutines::_atomic_add_ptr_entry = StubRoutines::_atomic_add_entry;
3242 #endif // COMPILER2 !=> _LP64
3243 }
3246 void generate_all() {
3247 // Generates all stubs and initializes the entry points
3249 // Generate partial_subtype_check first here since its code depends on
3250 // UseZeroBaseCompressedOops which is defined after heap initialization.
3251 StubRoutines::Sparc::_partial_subtype_check = generate_partial_subtype_check();
3252 // These entry points require SharedInfo::stack0 to be set up in non-core builds
3253 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
3254 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
3255 StubRoutines::_throw_ArithmeticException_entry = generate_throw_exception("ArithmeticException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException), true);
3256 StubRoutines::_throw_NullPointerException_entry = generate_throw_exception("NullPointerException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
3257 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
3258 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
3260 StubRoutines::_handler_for_unsafe_access_entry =
3261 generate_handler_for_unsafe_access();
3263 // support for verify_oop (must happen after universe_init)
3264 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();
3266 // arraycopy stubs used by compilers
3267 generate_arraycopy_stubs();
3269 // Don't initialize the platform math functions since sparc
3270 // doesn't have intrinsics for these operations.
3271 }
3274 public:
3275 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3276 // replace the standard masm with a special one:
3277 _masm = new MacroAssembler(code);
3279 _stub_count = !all ? 0x100 : 0x200;
3280 if (all) {
3281 generate_all();
3282 } else {
3283 generate_initial();
3284 }
3286 // make sure this stub is available for all local calls
3287 if (_atomic_add_stub.is_unbound()) {
3288 // generate a second time, if necessary
3289 (void) generate_atomic_add();
3290 }
3291 }
3294 private:
3295 int _stub_count;
3296 void stub_prolog(StubCodeDesc* cdesc) {
3297 # ifdef ASSERT
3298 // put extra information in the stub code, to make it more readable
3299 #ifdef _LP64
3300 // Write the high part of the address
3301 // [RGV] Check if there is a dependency on the size of this prolog
3302 __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
3303 #endif
3304 __ emit_data((intptr_t)cdesc, relocInfo::none);
3305 __ emit_data(++_stub_count, relocInfo::none);
3306 # endif
3307 align(true);
3308 }
3310 void align(bool at_header = false) {
3311 // %%%%% move this constant somewhere else
3312 // UltraSPARC cache line size is 8 instructions:
3313 const unsigned int icache_line_size = 32;
3314 const unsigned int icache_half_line_size = 16;
3316 if (at_header) {
3317 while ((intptr_t)(__ pc()) % icache_line_size != 0) {
3318 __ emit_data(0, relocInfo::none);
3319 }
3320 } else {
3321 while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
3322 __ nop();
3323 }
3324 }
3325 }
3327 }; // end class declaration
3329 void StubGenerator_generate(CodeBuffer* code, bool all) {
3330 StubGenerator g(code, all);
3331 }