Thu, 06 Mar 2014 10:55:28 -0800
8035647: PPC64: Support for elf v2 abi.
Summary: Support the ELFv2 ABI used by little-endian PowerPC64 on Linux.
Reviewed-by: kvn
Contributed-by: asmundak@google.com
1 /*
2 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
3 * Copyright 2012, 2013 SAP AG. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
26 #include "precompiled.hpp"
27 #include "asm/assembler.hpp"
28 #include "asm/macroAssembler.inline.hpp"
29 #include "interpreter/interpreter.hpp"
30 #include "nativeInst_ppc.hpp"
31 #include "oops/instanceOop.hpp"
32 #include "oops/method.hpp"
33 #include "oops/objArrayKlass.hpp"
34 #include "oops/oop.inline.hpp"
35 #include "prims/methodHandles.hpp"
36 #include "runtime/frame.inline.hpp"
37 #include "runtime/handles.inline.hpp"
38 #include "runtime/sharedRuntime.hpp"
39 #include "runtime/stubCodeGenerator.hpp"
40 #include "runtime/stubRoutines.hpp"
41 #include "utilities/top.hpp"
42 #ifdef TARGET_OS_FAMILY_aix
43 # include "thread_aix.inline.hpp"
44 #endif
45 #ifdef TARGET_OS_FAMILY_linux
46 # include "thread_linux.inline.hpp"
47 #endif
48 #ifdef COMPILER2
49 #include "opto/runtime.hpp"
50 #endif
52 #define __ _masm->
54 #ifdef PRODUCT
55 #define BLOCK_COMMENT(str) // nothing
56 #else
57 #define BLOCK_COMMENT(str) __ block_comment(str)
58 #endif
60 class StubGenerator: public StubCodeGenerator {
61 private:
63 // Call stubs are used to call Java from C
64 //
65 // Arguments:
66 //
67 // R3 - call wrapper address : address
68 // R4 - result : intptr_t*
69 // R5 - result type : BasicType
70 // R6 - method : Method
71 // R7 - frame mgr entry point : address
72 // R8 - parameter block : intptr_t*
73 // R9 - parameter count in words : int
74 // R10 - thread : Thread*
75 //
76 address generate_call_stub(address& return_address) {
77 // Setup a new c frame, copy java arguments, call frame manager or
78 // native_entry, and process result.
80 StubCodeMark mark(this, "StubRoutines", "call_stub");
82 address start = __ function_entry();
84 // some sanity checks
85 assert((sizeof(frame::abi_minframe) % 16) == 0, "unaligned");
86 assert((sizeof(frame::abi_reg_args) % 16) == 0, "unaligned");
87 assert((sizeof(frame::spill_nonvolatiles) % 16) == 0, "unaligned");
88 assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
89 assert((sizeof(frame::entry_frame_locals) % 16) == 0, "unaligned");
91 Register r_arg_call_wrapper_addr = R3;
92 Register r_arg_result_addr = R4;
93 Register r_arg_result_type = R5;
94 Register r_arg_method = R6;
95 Register r_arg_entry = R7;
96 Register r_arg_thread = R10;
98 Register r_temp = R24;
99 Register r_top_of_arguments_addr = R25;
100 Register r_entryframe_fp = R26;
102 {
103 // Stack on entry to call_stub:
104 //
105 // F1 [C_FRAME]
106 // ...
108 Register r_arg_argument_addr = R8;
109 Register r_arg_argument_count = R9;
110 Register r_frame_alignment_in_bytes = R27;
111 Register r_argument_addr = R28;
112 Register r_argumentcopy_addr = R29;
113 Register r_argument_size_in_bytes = R30;
114 Register r_frame_size = R23;
116 Label arguments_copied;
118 // Save LR/CR to caller's C_FRAME.
119 __ save_LR_CR(R0);
121 // Zero extend arg_argument_count.
122 __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
124 // Save non-volatile GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
125 __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
127 // Keep copy of our frame pointer (caller's SP).
128 __ mr(r_entryframe_fp, R1_SP);
130 BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
131 // Push ENTRY_FRAME including arguments:
132 //
133 // F0 [TOP_IJAVA_FRAME_ABI]
134 // alignment (optional)
135 // [outgoing Java arguments]
136 // [ENTRY_FRAME_LOCALS]
137 // F1 [C_FRAME]
138 // ...
140 // calculate frame size
142 // unaligned size of arguments
143 __ sldi(r_argument_size_in_bytes,
144 r_arg_argument_count, Interpreter::logStackElementSize);
145 // arguments alignment (max 1 slot)
146 // FIXME: use round_to() here
147 __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
148 __ sldi(r_frame_alignment_in_bytes,
149 r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
151 // size = unaligned size of arguments + top abi's size
152 __ addi(r_frame_size, r_argument_size_in_bytes,
153 frame::top_ijava_frame_abi_size);
154 // size += arguments alignment
155 __ add(r_frame_size,
156 r_frame_size, r_frame_alignment_in_bytes);
157 // size += size of call_stub locals
158 __ addi(r_frame_size,
159 r_frame_size, frame::entry_frame_locals_size);
161 // push ENTRY_FRAME
162 __ push_frame(r_frame_size, r_temp);
164 // initialize call_stub locals (step 1)
165 __ std(r_arg_call_wrapper_addr,
166 _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
167 __ std(r_arg_result_addr,
168 _entry_frame_locals_neg(result_address), r_entryframe_fp);
169 __ std(r_arg_result_type,
170 _entry_frame_locals_neg(result_type), r_entryframe_fp);
171 // we will save arguments_tos_address later
174 BLOCK_COMMENT("Copy Java arguments");
175 // copy Java arguments
177 // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
178 // FIXME: why not simply use SP+frame::top_ijava_frame_size?
179 __ addi(r_top_of_arguments_addr,
180 R1_SP, frame::top_ijava_frame_abi_size);
181 __ add(r_top_of_arguments_addr,
182 r_top_of_arguments_addr, r_frame_alignment_in_bytes);
184 // any arguments to copy?
185 __ cmpdi(CCR0, r_arg_argument_count, 0);
186 __ beq(CCR0, arguments_copied);
188 // prepare loop and copy arguments in reverse order
189 {
190 // init CTR with arg_argument_count
191 __ mtctr(r_arg_argument_count);
193 // let r_argumentcopy_addr point to last outgoing Java argument
194 __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
196 // let r_argument_addr point to last incoming java argument
197 __ add(r_argument_addr,
198 r_arg_argument_addr, r_argument_size_in_bytes);
199 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
201 // now loop while CTR > 0 and copy arguments
202 {
203 Label next_argument;
204 __ bind(next_argument);
206 __ ld(r_temp, 0, r_argument_addr);
207 // argument_addr--;
208 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
209 __ std(r_temp, 0, r_argumentcopy_addr);
210 // argumentcopy_addr++;
211 __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
213 __ bdnz(next_argument);
214 }
215 }
217 // Arguments copied, continue.
218 __ bind(arguments_copied);
219 }
221 {
222 BLOCK_COMMENT("Call frame manager or native entry.");
223 // Call frame manager or native entry.
224 Register r_new_arg_entry = R14_state;
225 assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
226 r_arg_method, r_arg_thread);
228 __ mr(r_new_arg_entry, r_arg_entry);
230 // Register state on entry to frame manager / native entry:
231 //
232 // tos - intptr_t* sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
233 // R19_method - Method
234 // R16_thread - JavaThread*
236 // Tos must point to last argument - element_size.
237 const Register tos = R17_tos;
238 __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
240 // initialize call_stub locals (step 2)
241 // now save tos as arguments_tos_address
242 __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
244 // load argument registers for call
245 __ mr(R19_method, r_arg_method);
246 __ mr(R16_thread, r_arg_thread);
247 assert(tos != r_arg_method, "trashed r_arg_method");
248 assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
250 // Set R15_prev_state to 0 for simplifying checks in callee.
251 __ li(R15_prev_state, 0);
253 // Stack on entry to frame manager / native entry:
254 //
255 // F0 [TOP_IJAVA_FRAME_ABI]
256 // alignment (optional)
257 // [outgoing Java arguments]
258 // [ENTRY_FRAME_LOCALS]
259 // F1 [C_FRAME]
260 // ...
261 //
263 // global toc register
264 __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
266 // Load narrow oop base.
267 __ reinit_heapbase(R30, R11_scratch1);
269 // Remember the senderSP so the interpreter can pop c2i arguments off of the stack
270 // when called via a c2i.
272 // Pass initial_caller_sp to framemanager.
273 __ mr(R21_tmp1, R1_SP);
275 // Do a light-weight C-call here, r_new_arg_entry holds the address
276 // of the interpreter entry point (frame manager or native entry)
277 // and save runtime-value of LR in return_address.
278 assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
279 "trashed r_new_arg_entry");
280 return_address = __ call_stub(r_new_arg_entry);
281 }
283 {
284 BLOCK_COMMENT("Returned from frame manager or native entry.");
285 // Returned from frame manager or native entry.
286 // Now pop frame, process result, and return to caller.
288 // Stack on exit from frame manager / native entry:
289 //
290 // F0 [ABI]
291 // ...
292 // [ENTRY_FRAME_LOCALS]
293 // F1 [C_FRAME]
294 // ...
295 //
296 // Just pop the topmost frame ...
297 //
299 Label ret_is_object;
300 Label ret_is_long;
301 Label ret_is_float;
302 Label ret_is_double;
304 Register r_entryframe_fp = R30;
305 Register r_lr = R7_ARG5;
306 Register r_cr = R8_ARG6;
308 // Reload some volatile registers which we've spilled before the call
309 // to frame manager / native entry.
310 // Access all locals via frame pointer, because we know nothing about
311 // the topmost frame's size.
312 __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
313 assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
314 __ ld(r_arg_result_addr,
315 _entry_frame_locals_neg(result_address), r_entryframe_fp);
316 __ ld(r_arg_result_type,
317 _entry_frame_locals_neg(result_type), r_entryframe_fp);
318 __ ld(r_cr, _abi(cr), r_entryframe_fp);
319 __ ld(r_lr, _abi(lr), r_entryframe_fp);
321 // pop frame and restore non-volatiles, LR and CR
322 __ mr(R1_SP, r_entryframe_fp);
323 __ mtcr(r_cr);
324 __ mtlr(r_lr);
326 // Store result depending on type. Everything that is not
327 // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
328 __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
329 __ cmpwi(CCR1, r_arg_result_type, T_LONG);
330 __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
331 __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
333 // restore non-volatile registers
334 __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
337 // Stack on exit from call_stub:
338 //
339 // 0 [C_FRAME]
340 // ...
341 //
342 // no call_stub frames left.
344 // All non-volatiles have been restored at this point!!
345 assert(R3_RET == R3, "R3_RET should be R3");
347 __ beq(CCR0, ret_is_object);
348 __ beq(CCR1, ret_is_long);
349 __ beq(CCR5, ret_is_float);
350 __ beq(CCR6, ret_is_double);
352 // default:
353 __ stw(R3_RET, 0, r_arg_result_addr);
354 __ blr(); // return to caller
356 // case T_OBJECT:
357 __ bind(ret_is_object);
358 __ std(R3_RET, 0, r_arg_result_addr);
359 __ blr(); // return to caller
361 // case T_LONG:
362 __ bind(ret_is_long);
363 __ std(R3_RET, 0, r_arg_result_addr);
364 __ blr(); // return to caller
366 // case T_FLOAT:
367 __ bind(ret_is_float);
368 __ stfs(F1_RET, 0, r_arg_result_addr);
369 __ blr(); // return to caller
371 // case T_DOUBLE:
372 __ bind(ret_is_double);
373 __ stfd(F1_RET, 0, r_arg_result_addr);
374 __ blr(); // return to caller
375 }
377 return start;
378 }
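// Editor's sketch (not part of this file): on the VM side, JavaCalls invokes the
// generated stub through the CallStub function-pointer type declared in
// stubRoutines.hpp. The eight arguments land in R3..R10 per the PPC64 ABI,
// matching the register roles documented above generate_call_stub; parameter
// names here are illustrative, and TRAPS expands to a trailing Thread*.
//
//   typedef void (*CallStub)(
//     address   link,               // R3:  call wrapper address
//     intptr_t* result,             // R4:  where to store the result
//     BasicType result_type,        // R5:  type of the result
//     Method*   method,             // R6:  method to invoke
//     address   entry_point,        // R7:  frame manager / native entry
//     intptr_t* parameters,         // R8:  parameter block
//     int       size_of_parameters, // R9:  parameter count in words
//     Thread*   thread              // R10: current thread (TRAPS)
//   );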
380 // Return point for a Java call if there's an exception thrown in
381 // Java code. The exception is caught and transformed into a
382 // pending exception stored in JavaThread that can be tested from
383 // within the VM.
384 //
385 address generate_catch_exception() {
386 StubCodeMark mark(this, "StubRoutines", "catch_exception");
388 address start = __ pc();
390 // Registers alive
391 //
392 // R16_thread
393 // R3_ARG1 - address of pending exception
394 // R4_ARG2 - return address in call stub
396 const Register exception_file = R21_tmp1;
397 const Register exception_line = R22_tmp2;
399 __ load_const(exception_file, (void*)__FILE__);
400 __ load_const(exception_line, (void*)__LINE__);
402 __ std(R3_ARG1, thread_(pending_exception));
403 // store into `char *'
404 __ std(exception_file, thread_(exception_file));
405 // store into `int'
406 __ stw(exception_line, thread_(exception_line));
408 // complete return to VM
409 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
411 __ mtlr(R4_ARG2);
412 // continue in call stub
413 __ blr();
415 return start;
416 }
418 // Continuation point for runtime calls returning with a pending
419 // exception. The pending exception check happened in the runtime
420 // or native call stub. The pending exception in Thread is
421 // converted into a Java-level exception.
422 //
423 address generate_forward_exception() {
424 StubCodeMark mark(this, "StubRoutines", "forward_exception");
425 address start = __ pc();
427 #if !defined(PRODUCT)
428 if (VerifyOops) {
429 // Get pending exception oop.
430 __ ld(R3_ARG1,
431 in_bytes(Thread::pending_exception_offset()),
432 R16_thread);
433 // Make sure that this code is only executed if there is a pending exception.
434 {
435 Label L;
436 __ cmpdi(CCR0, R3_ARG1, 0);
437 __ bne(CCR0, L);
438 __ stop("StubRoutines::forward exception: no pending exception (1)");
439 __ bind(L);
440 }
441 __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
442 }
443 #endif
445 // Save LR/CR and copy exception pc (LR) into R4_ARG2.
446 __ save_LR_CR(R4_ARG2);
447 __ push_frame_reg_args(0, R0);
448 // Find exception handler.
449 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
450 SharedRuntime::exception_handler_for_return_address),
451 R16_thread,
452 R4_ARG2);
453 // Copy handler's address.
454 __ mtctr(R3_RET);
455 __ pop_frame();
456 __ restore_LR_CR(R0);
458 // Set up the arguments for the exception handler:
459 // - R3_ARG1: exception oop
460 // - R4_ARG2: exception pc.
462 // Load pending exception oop.
463 __ ld(R3_ARG1,
464 in_bytes(Thread::pending_exception_offset()),
465 R16_thread);
467 // The exception pc is the return address in the caller.
468 // Must load it into R4_ARG2.
469 __ mflr(R4_ARG2);
471 #ifdef ASSERT
472 // Make sure exception is set.
473 {
474 Label L;
475 __ cmpdi(CCR0, R3_ARG1, 0);
476 __ bne(CCR0, L);
477 __ stop("StubRoutines::forward exception: no pending exception (2)");
478 __ bind(L);
479 }
480 #endif
482 // Clear the pending exception.
483 __ li(R0, 0);
484 __ std(R0,
485 in_bytes(Thread::pending_exception_offset()),
486 R16_thread);
487 // Jump to exception handler.
488 __ bctr();
490 return start;
491 }
493 #undef __
494 #define __ masm->
495 // Continuation point for throwing of implicit exceptions that are
496 // not handled in the current activation. Fabricates an exception
497 // oop and initiates normal exception dispatching in this
498 // frame. Only callee-saved registers are preserved (through the
499 // normal register window / RegisterMap handling). If the compiler
500 // needs all registers to be preserved between the fault point and
501 // the exception handler then it must assume responsibility for that
502 // in AbstractCompiler::continuation_for_implicit_null_exception or
503 // continuation_for_implicit_division_by_zero_exception. All other
504 // implicit exceptions (e.g., NullPointerException or
505 // AbstractMethodError on entry) are either at call sites or
506 // otherwise assume that stack unwinding will be initiated, so
507 // caller saved registers were assumed volatile in the compiler.
508 //
509 // Note that we generate only this stub into a RuntimeStub, because
510 // it needs to be properly traversed and ignored during GC, so we
511 // change the meaning of the "__" macro within this method.
512 //
513 // Note: the routine set_pc_not_at_call_for_caller in
514 // SharedRuntime.cpp requires that this code be generated into a
515 // RuntimeStub.
516 address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
517 Register arg1 = noreg, Register arg2 = noreg) {
518 CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
519 MacroAssembler* masm = new MacroAssembler(&code);
521 OopMapSet* oop_maps = new OopMapSet();
522 int frame_size_in_bytes = frame::abi_reg_args_size;
523 OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
525 StubCodeMark mark(this, "StubRoutines", "throw_exception");
527 address start = __ pc();
529 __ save_LR_CR(R11_scratch1);
531 // Push a frame.
532 __ push_frame_reg_args(0, R11_scratch1);
534 address frame_complete_pc = __ pc();
536 if (restore_saved_exception_pc) {
537 __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
538 }
540 // Note that we always have a runtime stub frame on the top of
541 // stack by this point. Remember the offset of the instruction
542 // whose address will be moved to R11_scratch1.
543 address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
545 __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
547 __ mr(R3_ARG1, R16_thread);
548 if (arg1 != noreg) {
549 __ mr(R4_ARG2, arg1);
550 }
551 if (arg2 != noreg) {
552 __ mr(R5_ARG3, arg2);
553 }
554 #if defined(ABI_ELFv2)
555 __ call_c(runtime_entry, relocInfo::none);
556 #else
557 __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
558 #endif
560 // Set an oopmap for the call site.
561 oop_maps->add_gc_map((int)(gc_map_pc - start), map);
563 __ reset_last_Java_frame();
565 #ifdef ASSERT
566 // Make sure that this code is only executed if there is a pending
567 // exception.
568 {
569 Label L;
570 __ ld(R0,
571 in_bytes(Thread::pending_exception_offset()),
572 R16_thread);
573 __ cmpdi(CCR0, R0, 0);
574 __ bne(CCR0, L);
575 __ stop("StubRoutines::throw_exception: no pending exception");
576 __ bind(L);
577 }
578 #endif
580 // Pop frame.
581 __ pop_frame();
583 __ restore_LR_CR(R11_scratch1);
585 __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
586 __ mtctr(R11_scratch1);
587 __ bctr();
589 // Create runtime stub with OopMap.
590 RuntimeStub* stub =
591 RuntimeStub::new_runtime_stub(name, &code,
592 /*frame_complete=*/ (int)(frame_complete_pc - start),
593 frame_size_in_bytes/wordSize,
594 oop_maps,
595 false);
596 return stub->entry_point();
597 }
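// Editor's note on the ABI_ELFv2 branch above, as a hedged sketch: under the
// big-endian ELFv1 ABI a C function pointer addresses a descriptor rather than
// code, so runtime_entry must be unpacked via FunctionDescriptor*; ELFv2
// dropped that indirection, so the raw address can be called directly.
// Classic ELFv1 descriptor layout (illustrative; cf. FunctionDescriptor in the
// PPC assembler headers):
//
//   struct FunctionDescriptor {
//     void* entry;   // address of the first instruction
//     void* toc;     // TOC base (R2) the callee expects
//     void* env;     // environment pointer (unused by C/C++)
//   };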
598 #undef __
599 #define __ _masm->
601 // Generate G1 pre-write barrier for array.
602 //
603 // Input:
604 // from - register containing src address (only needed for spilling)
605 // to - register containing starting address
606 // count - register containing element count
607 // tmp - scratch register
608 //
609 // Kills:
610 // nothing
611 //
612 void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
613 BarrierSet* const bs = Universe::heap()->barrier_set();
614 switch (bs->kind()) {
615 case BarrierSet::G1SATBCT:
616 case BarrierSet::G1SATBCTLogging:
617 // With G1, don't generate the call if we statically know that the target is uninitialized.
618 if (!dest_uninitialized) {
619 const int spill_slots = 4 * wordSize;
620 const int frame_size = frame::abi_reg_args_size + spill_slots;
621 Label filtered;
623 // Is marking active?
624 if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
625 __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
626 } else {
627 guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
628 __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
629 }
630 __ cmpdi(CCR0, Rtmp1, 0);
631 __ beq(CCR0, filtered);
633 __ save_LR_CR(R0);
634 __ push_frame_reg_args(spill_slots, R0);
635 __ std(from, frame_size - 1 * wordSize, R1_SP);
636 __ std(to, frame_size - 2 * wordSize, R1_SP);
637 __ std(count, frame_size - 3 * wordSize, R1_SP);
639 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
641 __ ld(from, frame_size - 1 * wordSize, R1_SP);
642 __ ld(to, frame_size - 2 * wordSize, R1_SP);
643 __ ld(count, frame_size - 3 * wordSize, R1_SP);
644 __ pop_frame();
645 __ restore_LR_CR(R0);
647 __ bind(filtered);
648 }
649 break;
650 case BarrierSet::CardTableModRef:
651 case BarrierSet::CardTableExtension:
652 case BarrierSet::ModRef:
653 break;
654 default:
655 ShouldNotReachHere();
656 }
657 }
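// What the emitted pre-barrier amounts to, as a hedged C++ sketch (helper
// names are illustrative): when SATB marking is active, the oops about to be
// overwritten in the destination range are handed to the runtime; otherwise
// the inline "filtered" fast path falls through with no call.
//
//   void write_ref_array_pre(oop* dst, size_t count, bool marking_active) {
//     if (!marking_active) return;                  // fast path, no call
//     for (size_t i = 0; i < count; i++) {
//       if (dst[i] != NULL) satb_enqueue(dst[i]);   // log old value for SATB
//     }
//   }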
659 // Generate CMS/G1 post-write barrier for array.
660 //
661 // Input:
662 // addr - register containing starting address
663 // count - register containing element count
664 // tmp - scratch register
665 //
666 // The input registers and R0 are overwritten.
667 //
668 void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) {
669 BarrierSet* const bs = Universe::heap()->barrier_set();
671 switch (bs->kind()) {
672 case BarrierSet::G1SATBCT:
673 case BarrierSet::G1SATBCTLogging:
674 {
675 if (branchToEnd) {
676 __ save_LR_CR(R0);
677 // We need this frame only to spill LR.
678 __ push_frame_reg_args(0, R0);
679 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
680 __ pop_frame();
681 __ restore_LR_CR(R0);
682 } else {
683 // Tail call: fake call from stub caller by branching without linking.
684 address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
685 __ mr_if_needed(R3_ARG1, addr);
686 __ mr_if_needed(R4_ARG2, count);
687 __ load_const(R11, entry_point, R0);
688 __ call_c_and_return_to_caller(R11);
689 }
690 }
691 break;
692 case BarrierSet::CardTableModRef:
693 case BarrierSet::CardTableExtension:
694 {
695 Label Lskip_loop, Lstore_loop;
696 if (UseConcMarkSweepGC) {
697 // TODO PPC port: contribute optimization / requires shared changes
698 __ release();
699 }
701 CardTableModRefBS* const ct = (CardTableModRefBS*)bs;
702 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
703 assert_different_registers(addr, count, tmp);
705 __ sldi(count, count, LogBytesPerHeapOop);
706 __ addi(count, count, -BytesPerHeapOop);
707 __ add(count, addr, count);
708 // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
709 __ srdi(addr, addr, CardTableModRefBS::card_shift);
710 __ srdi(count, count, CardTableModRefBS::card_shift);
711 __ subf(count, addr, count);
712 assert_different_registers(R0, addr, count, tmp);
713 __ load_const(tmp, (address)ct->byte_map_base);
714 __ addic_(count, count, 1);
715 __ beq(CCR0, Lskip_loop);
716 __ li(R0, 0);
717 __ mtctr(count);
718 // Byte store loop
719 __ bind(Lstore_loop);
720 __ stbx(R0, tmp, addr);
721 __ addi(addr, addr, 1);
722 __ bdnz(Lstore_loop);
723 __ bind(Lskip_loop);
725 if (!branchToEnd) __ blr();
726 }
727 break;
728 case BarrierSet::ModRef:
729 if (!branchToEnd) __ blr();
730 break;
731 default:
732 ShouldNotReachHere();
733 }
734 }
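// Hedged C++ rendering of the card-table loop above (names illustrative):
// dirty one card byte per card spanned by the stored oop range. The two srdi
// instructions compute exactly first_card and last_card below.
//
//   void dirty_cards(jbyte* byte_map_base, uintptr_t start, size_t count) {
//     uintptr_t last = start + (count << LogBytesPerHeapOop) - BytesPerHeapOop;
//     uintptr_t first_card = start >> CardTableModRefBS::card_shift;
//     uintptr_t last_card  = last  >> CardTableModRefBS::card_shift;
//     for (uintptr_t c = first_card; c <= last_card; c++) {
//       byte_map_base[c] = 0;   // 0 == dirty
//     }
//   }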
736 // Support for void zero_words_aligned8(HeapWord* to, size_t count)
737 //
738 // Arguments:
739 // to:
740 // count:
741 //
742 // Destroys:
743 //
744 address generate_zero_words_aligned8() {
745 StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
747 // Implemented as in ClearArray.
748 address start = __ function_entry();
750 Register base_ptr_reg = R3_ARG1; // tohw (needs to be 8b aligned)
751 Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
752 Register tmp1_reg = R5_ARG3;
753 Register tmp2_reg = R6_ARG4;
754 Register zero_reg = R7_ARG5;
756 // Procedure for large arrays (uses data cache block zero instruction).
757 Label dwloop, fast, fastloop, restloop, lastdword, done;
758 int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
759 int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
761 // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
762 __ dcbtst(base_ptr_reg); // Indicate write access to first cache line ...
763 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if number of dwords is even.
764 __ srdi_(tmp1_reg, cnt_dwords_reg, 1); // number of double dwords
765 __ load_const_optimized(zero_reg, 0L); // Use as zero register.
767 __ cmpdi(CCR1, tmp2_reg, 0); // cnt_dwords even?
768 __ beq(CCR0, lastdword); // size <= 1
769 __ mtctr(tmp1_reg); // Speculatively preload counter for rest loop (>0).
770 __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
771 __ neg(tmp1_reg, base_ptr_reg); // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
773 __ blt(CCR0, restloop); // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
774 __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
776 __ beq(CCR0, fast); // already 128byte aligned
777 __ mtctr(tmp1_reg); // Set ctr to hit 128byte boundary (0<ctr<cnt).
778 __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
780 // Clear in first cache line dword-by-dword if not already 128byte aligned.
781 __ bind(dwloop);
782 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
783 __ addi(base_ptr_reg, base_ptr_reg, 8);
784 __ bdnz(dwloop);
786 // clear 128byte blocks
787 __ bind(fast);
788 __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
789 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if rest even
791 __ mtctr(tmp1_reg); // load counter
792 __ cmpdi(CCR1, tmp2_reg, 0); // rest even?
793 __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
795 __ bind(fastloop);
796 __ dcbz(base_ptr_reg); // Clear 128byte aligned block.
797 __ addi(base_ptr_reg, base_ptr_reg, cl_size);
798 __ bdnz(fastloop);
800 //__ dcbtst(base_ptr_reg); // Indicate write access to last cache line.
801 __ beq(CCR0, lastdword); // rest<=1
802 __ mtctr(tmp1_reg); // load counter
804 // Clear rest.
805 __ bind(restloop);
806 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
807 __ std(zero_reg, 8, base_ptr_reg); // Clear 8byte aligned block.
808 __ addi(base_ptr_reg, base_ptr_reg, 16);
809 __ bdnz(restloop);
811 __ bind(lastdword);
812 __ beq(CCR1, done);
813 __ std(zero_reg, 0, base_ptr_reg);
814 __ bind(done);
815 __ blr(); // return
817 return start;
818 }
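// The dcbz strategy above, restated as a hedged portable sketch: clear
// dword-by-dword up to a cache-line boundary, wipe whole lines (one dcbz each
// in the generated code), then finish the tail.
//
//   void zero_aligned8(uint64_t* p, size_t dwords, size_t cl_bytes) {
//     size_t cl_dwords = cl_bytes / 8;
//     while (dwords > 0 && ((uintptr_t)p & (cl_bytes - 1)) != 0) {
//       *p++ = 0; dwords--;                              // head: align to line
//     }
//     for (; dwords >= cl_dwords; dwords -= cl_dwords) {
//       for (size_t i = 0; i < cl_dwords; i++) p[i] = 0; // dcbz territory
//       p += cl_dwords;
//     }
//     while (dwords-- > 0) *p++ = 0;                     // tail
//   }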
820 // The following routine generates a subroutine to throw an asynchronous
821 // UnknownError when an unsafe access gets a fault that could not be
822 // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.)
823 //
824 address generate_handler_for_unsafe_access() {
825 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
826 address start = __ function_entry();
827 __ unimplemented("StubRoutines::handler_for_unsafe_access", 93);
828 return start;
829 }
831 #if !defined(PRODUCT)
832 // Wrapper which calls oopDesc::is_oop_or_null()
833 // Only called by MacroAssembler::verify_oop
834 static void verify_oop_helper(const char* message, oop o) {
835 if (!o->is_oop_or_null()) {
836 fatal(message);
837 }
838 ++ StubRoutines::_verify_oop_count;
839 }
840 #endif
842 // Return address of code to be called from code generated by
843 // MacroAssembler::verify_oop.
844 //
845 // Don't generate, rather use C++ code.
846 address generate_verify_oop() {
847 StubCodeMark mark(this, "StubRoutines", "verify_oop");
849 // this is actually a `FunctionDescriptor*'.
850 address start = 0;
852 #if !defined(PRODUCT)
853 start = CAST_FROM_FN_PTR(address, verify_oop_helper);
854 #endif
856 return start;
857 }
859 // Fairer handling of safepoints for native methods.
860 //
861 // Generate code which reads from the polling page. This special handling is needed as the
862 // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
863 // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
864 // to read from the safepoint polling page.
865 address generate_load_from_poll() {
866 StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
867 address start = __ function_entry();
868 __ unimplemented("StubRoutines::load_from_poll", 95); // TODO PPC port
869 return start;
870 }
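// Conceptually, the poll this stub stands in for is just a load from a page
// the VM protects to arm a safepoint; a hedged sketch:
//
//   void safepoint_poll(const volatile int* polling_page) {
//     (void)*polling_page;  // plain load; faults when the page is protected,
//                           // and the signal handler then brings the thread
//                           // to the safepoint
//   }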
872 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
873 //
874 // The code is implemented (ported from SPARC) because we believe it benefits JVM98; however,
875 // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
876 //
877 // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
878 // for turning on loop predication optimization, and hence the behavior of "array range check"
879 // and "loop invariant check" could be influenced, which potentially boosted JVM98.
880 //
881 // Generate stub for disjoint short fill. If "aligned" is true, the
882 // "to" address is assumed to be heapword aligned.
883 //
884 // Arguments for generated stub:
885 // to: R3_ARG1
886 // value: R4_ARG2
887 // count: R5_ARG3 treated as signed
888 //
889 address generate_fill(BasicType t, bool aligned, const char* name) {
890 StubCodeMark mark(this, "StubRoutines", name);
891 address start = __ function_entry();
893 const Register to = R3_ARG1; // destination array address
894 const Register value = R4_ARG2; // fill value
895 const Register count = R5_ARG3; // elements count
896 const Register temp = R6_ARG4; // temp register
898 //assert_clean_int(count, O3); // Make sure 'count' is clean int.
900 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
901 Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
903 int shift = -1;
904 switch (t) {
905 case T_BYTE:
906 shift = 2;
907 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
908 __ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
909 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
910 __ blt(CCR0, L_fill_elements);
911 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
912 break;
913 case T_SHORT:
914 shift = 1;
915 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
916 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
917 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
918 __ blt(CCR0, L_fill_elements);
919 break;
920 case T_INT:
921 shift = 0;
922 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
923 __ blt(CCR0, L_fill_4_bytes);
924 break;
925 default: ShouldNotReachHere();
926 }
928 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
929 // Align source address at 4 bytes address boundary.
930 if (t == T_BYTE) {
931 // One byte misalignment happens only for byte arrays.
932 __ andi_(temp, to, 1);
933 __ beq(CCR0, L_skip_align1);
934 __ stb(value, 0, to);
935 __ addi(to, to, 1);
936 __ addi(count, count, -1);
937 __ bind(L_skip_align1);
938 }
939 // Two bytes misalignment happens only for byte and short (char) arrays.
940 __ andi_(temp, to, 2);
941 __ beq(CCR0, L_skip_align2);
942 __ sth(value, 0, to);
943 __ addi(to, to, 2);
944 __ addi(count, count, -(1 << (shift - 1)));
945 __ bind(L_skip_align2);
946 }
948 if (!aligned) {
949 // Align to 8 bytes, we know we are 4 byte aligned to start.
950 __ andi_(temp, to, 7);
951 __ beq(CCR0, L_fill_32_bytes);
952 __ stw(value, 0, to);
953 __ addi(to, to, 4);
954 __ addi(count, count, -(1 << shift));
955 __ bind(L_fill_32_bytes);
956 }
958 __ li(temp, 8<<shift); // Prepare for 32 byte loop.
959 // Clone bytes int->long as above.
960 __ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
962 Label L_check_fill_8_bytes;
963 // Fill 32-byte chunks.
964 __ subf_(count, temp, count);
965 __ blt(CCR0, L_check_fill_8_bytes);
967 Label L_fill_32_bytes_loop;
968 __ align(32);
969 __ bind(L_fill_32_bytes_loop);
971 __ std(value, 0, to);
972 __ std(value, 8, to);
973 __ subf_(count, temp, count); // Update count.
974 __ std(value, 16, to);
975 __ std(value, 24, to);
977 __ addi(to, to, 32);
978 __ bge(CCR0, L_fill_32_bytes_loop);
980 __ bind(L_check_fill_8_bytes);
981 __ add_(count, temp, count);
982 __ beq(CCR0, L_exit);
983 __ addic_(count, count, -(2 << shift));
984 __ blt(CCR0, L_fill_4_bytes);
986 //
987 // Length is too short, just fill 8 bytes at a time.
988 //
989 Label L_fill_8_bytes_loop;
990 __ bind(L_fill_8_bytes_loop);
991 __ std(value, 0, to);
992 __ addic_(count, count, -(2 << shift));
993 __ addi(to, to, 8);
994 __ bge(CCR0, L_fill_8_bytes_loop);
996 // Fill trailing 4 bytes.
997 __ bind(L_fill_4_bytes);
998 __ andi_(temp, count, 1<<shift);
999 __ beq(CCR0, L_fill_2_bytes);
1001 __ stw(value, 0, to);
1002 if (t == T_BYTE || t == T_SHORT) {
1003 __ addi(to, to, 4);
1004 // Fill trailing 2 bytes.
1005 __ bind(L_fill_2_bytes);
1006 __ andi_(temp, count, 1<<(shift-1));
1007 __ beq(CCR0, L_fill_byte);
1008 __ sth(value, 0, to);
1009 if (t == T_BYTE) {
1010 __ addi(to, to, 2);
1011 // Fill trailing byte.
1012 __ bind(L_fill_byte);
1013 __ andi_(count, count, 1);
1014 __ beq(CCR0, L_exit);
1015 __ stb(value, 0, to);
1016 } else {
1017 __ bind(L_fill_byte);
1018 }
1019 } else {
1020 __ bind(L_fill_2_bytes);
1021 }
1022 __ bind(L_exit);
1023 __ blr();
1025 // Handle copies less than 8 bytes. Int is handled elsewhere.
1026 if (t == T_BYTE) {
1027 __ bind(L_fill_elements);
1028 Label L_fill_2, L_fill_4;
1029 __ andi_(temp, count, 1);
1030 __ beq(CCR0, L_fill_2);
1031 __ stb(value, 0, to);
1032 __ addi(to, to, 1);
1033 __ bind(L_fill_2);
1034 __ andi_(temp, count, 2);
1035 __ beq(CCR0, L_fill_4);
1036 __ stb(value, 0, to);
1037 __ stb(value, 1, to);
1038 __ addi(to, to, 2);
1039 __ bind(L_fill_4);
1040 __ andi_(temp, count, 4);
1041 __ beq(CCR0, L_exit);
1042 __ stb(value, 0, to);
1043 __ stb(value, 1, to);
1044 __ stb(value, 2, to);
1045 __ stb(value, 3, to);
1046 __ blr();
1047 }
1049 if (t == T_SHORT) {
1050 Label L_fill_2;
1051 __ bind(L_fill_elements);
1052 __ andi_(temp, count, 1);
1053 __ beq(CCR0, L_fill_2);
1054 __ sth(value, 0, to);
1055 __ addi(to, to, 2);
1056 __ bind(L_fill_2);
1057 __ andi_(temp, count, 2);
1058 __ beq(CCR0, L_exit);
1059 __ sth(value, 0, to);
1060 __ sth(value, 2, to);
1061 __ blr();
1062 }
1063 return start;
1064 }
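// The rldimi sequences in generate_fill splat the fill value across a
// doubleword; the same widening in portable form (assumes the value arrives
// zero-extended; as noted above, the stores ignore high-order bytes anyway):
//
//   uint64_t splat8(uint8_t v) {
//     uint64_t x = v;
//     x |= x << 8;    //  8 -> 16 bits
//     x |= x << 16;   // 16 -> 32 bits
//     x |= x << 32;   // 32 -> 64 bits
//     return x;
//   }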
1067 // Generate overlap test for array copy stubs.
1068 //
1069 // Input:
1070 // R3_ARG1 - from
1071 // R4_ARG2 - to
1072 // R5_ARG3 - element count
1073 //
1074 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
1075 Register tmp1 = R6_ARG4;
1076 Register tmp2 = R7_ARG5;
1078 Label l_overlap;
1079 #ifdef ASSERT
1080 __ srdi_(tmp2, R5_ARG3, 31);
1081 __ asm_assert_eq("missing zero extend", 0xAFFE);
1082 #endif
1084 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
1085 __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
1086 __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
1087 __ cmpld(CCR1, tmp1, tmp2);
1088 __ crand(/*CCR0 lt*/0, /*CCR1 lt*/4+0, /*CCR0 lt*/0);
1089 __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
1091 // need to copy forwards
1092 if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
1093 __ b(no_overlap_target);
1094 } else {
1095 __ load_const(tmp1, no_overlap_target, tmp2);
1096 __ mtctr(tmp1);
1097 __ bctr();
1098 }
1100 __ bind(l_overlap);
1101 // need to copy backwards
1102 }
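// The crand/blt pair implements the classic memmove overlap test; a hedged
// C++ statement of the same predicate (helper name illustrative):
//
//   bool forward_copy_safe(const void* from, void* to, size_t byte_count) {
//     // Unsigned subtraction wraps when to < from, folding "to before from"
//     // and "distance >= size" into one compare, like the crand of lt bits.
//     return (uintptr_t)to - (uintptr_t)from >= byte_count;
//   }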
1104 // The guideline in the implementations of generate_disjoint_xxx_copy
1105 // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
1106 // single instructions, but to avoid alignment interrupts (see subsequent
1107 // comment). Furthermore, we try to minimize misaligned access, even
1108 // though they cause no alignment interrupt.
1109 //
1110 // In Big-Endian mode, the PowerPC architecture requires implementations to
1111 // handle automatically misaligned integer halfword and word accesses,
1112 // word-aligned integer doubleword accesses, and word-aligned floating-point
1113 // accesses. Other accesses may or may not generate an Alignment interrupt
1114 // depending on the implementation.
1115 // Alignment interrupt handling may require on the order of hundreds of cycles,
1116 // so every effort should be made to avoid misaligned memory values.
1117 //
1118 //
1119 // Generate stub for disjoint byte copy. If "aligned" is true, the
1120 // "from" and "to" addresses are assumed to be heapword aligned.
1121 //
1122 // Arguments for generated stub:
1123 // from: R3_ARG1
1124 // to: R4_ARG2
1125 // count: R5_ARG3 treated as signed
1126 //
1127 address generate_disjoint_byte_copy(bool aligned, const char * name) {
1128 StubCodeMark mark(this, "StubRoutines", name);
1129 address start = __ function_entry();
1131 Register tmp1 = R6_ARG4;
1132 Register tmp2 = R7_ARG5;
1133 Register tmp3 = R8_ARG6;
1134 Register tmp4 = R9_ARG7;
1137 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1138 // Don't try anything fancy if arrays don't have many elements.
1139 __ li(tmp3, 0);
1140 __ cmpwi(CCR0, R5_ARG3, 17);
1141 __ ble(CCR0, l_6); // copy 4 at a time
1143 if (!aligned) {
1144 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1145 __ andi_(tmp1, tmp1, 3);
1146 __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1148 // Copy elements if necessary to align to 4 bytes.
1149 __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1150 __ andi_(tmp1, tmp1, 3);
1151 __ beq(CCR0, l_2);
1153 __ subf(R5_ARG3, tmp1, R5_ARG3);
1154 __ bind(l_9);
1155 __ lbz(tmp2, 0, R3_ARG1);
1156 __ addic_(tmp1, tmp1, -1);
1157 __ stb(tmp2, 0, R4_ARG2);
1158 __ addi(R3_ARG1, R3_ARG1, 1);
1159 __ addi(R4_ARG2, R4_ARG2, 1);
1160 __ bne(CCR0, l_9);
1162 __ bind(l_2);
1163 }
1165 // copy 8 elements at a time
1166 __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1167 __ andi_(tmp1, tmp2, 7);
1168 __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1170 // copy a 2-element word if necessary to align to 8 bytes
1171 __ andi_(R0, R3_ARG1, 7);
1172 __ beq(CCR0, l_7);
1174 __ lwzx(tmp2, R3_ARG1, tmp3);
1175 __ addi(R5_ARG3, R5_ARG3, -4);
1176 __ stwx(tmp2, R4_ARG2, tmp3);
1177 { // FasterArrayCopy
1178 __ addi(R3_ARG1, R3_ARG1, 4);
1179 __ addi(R4_ARG2, R4_ARG2, 4);
1180 }
1181 __ bind(l_7);
1183 { // FasterArrayCopy
1184 __ cmpwi(CCR0, R5_ARG3, 31);
1185 __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
1187 __ srdi(tmp1, R5_ARG3, 5);
1188 __ andi_(R5_ARG3, R5_ARG3, 31);
1189 __ mtctr(tmp1);
1191 __ bind(l_8);
1192 // Use unrolled version for mass copying (copy 32 elements a time)
1193 // Load feeding store gets zero latency on Power6, however not on Power5.
1194 // Therefore, the following sequence is made for the good of both.
1195 __ ld(tmp1, 0, R3_ARG1);
1196 __ ld(tmp2, 8, R3_ARG1);
1197 __ ld(tmp3, 16, R3_ARG1);
1198 __ ld(tmp4, 24, R3_ARG1);
1199 __ std(tmp1, 0, R4_ARG2);
1200 __ std(tmp2, 8, R4_ARG2);
1201 __ std(tmp3, 16, R4_ARG2);
1202 __ std(tmp4, 24, R4_ARG2);
1203 __ addi(R3_ARG1, R3_ARG1, 32);
1204 __ addi(R4_ARG2, R4_ARG2, 32);
1205 __ bdnz(l_8);
1206 }
1208 __ bind(l_6);
1210 // copy 4 elements at a time
1211 __ cmpwi(CCR0, R5_ARG3, 4);
1212 __ blt(CCR0, l_1);
1213 __ srdi(tmp1, R5_ARG3, 2);
1214 __ mtctr(tmp1); // is > 0
1215 __ andi_(R5_ARG3, R5_ARG3, 3);
1217 { // FasterArrayCopy
1218 __ addi(R3_ARG1, R3_ARG1, -4);
1219 __ addi(R4_ARG2, R4_ARG2, -4);
1220 __ bind(l_3);
1221 __ lwzu(tmp2, 4, R3_ARG1);
1222 __ stwu(tmp2, 4, R4_ARG2);
1223 __ bdnz(l_3);
1224 __ addi(R3_ARG1, R3_ARG1, 4);
1225 __ addi(R4_ARG2, R4_ARG2, 4);
1226 }
1228 // do single element copy
1229 __ bind(l_1);
1230 __ cmpwi(CCR0, R5_ARG3, 0);
1231 __ beq(CCR0, l_4);
1233 { // FasterArrayCopy
1234 __ mtctr(R5_ARG3);
1235 __ addi(R3_ARG1, R3_ARG1, -1);
1236 __ addi(R4_ARG2, R4_ARG2, -1);
1238 __ bind(l_5);
1239 __ lbzu(tmp2, 1, R3_ARG1);
1240 __ stbu(tmp2, 1, R4_ARG2);
1241 __ bdnz(l_5);
1242 }
1244 __ bind(l_4);
1245 __ blr();
1247 return start;
1248 }
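// Shape of the alignment dance above in hedged, portable form: peel single
// bytes until the source is 8-byte aligned, bulk-copy 32-byte chunks (the 4x
// ld/std burst), then fall through to the word and single-byte tails. The
// real stub only takes the doubleword path when from and to share alignment
// mod 8; this sketch glosses over that check.
//
//   void byte_copy_forward(const uint8_t* from, uint8_t* to, size_t n) {
//     while (n > 0 && ((uintptr_t)from & 7) != 0) { *to++ = *from++; n--; }
//     for (; n >= 32; n -= 32, from += 32, to += 32) {
//       memcpy(to, from, 32);   // stands in for the 4x ld/std burst
//     }
//     while (n-- > 0) *to++ = *from++;
//   }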
1250 // Generate stub for conjoint byte copy. If "aligned" is true, the
1251 // "from" and "to" addresses are assumed to be heapword aligned.
1252 //
1253 // Arguments for generated stub:
1254 // from: R3_ARG1
1255 // to: R4_ARG2
1256 // count: R5_ARG3 treated as signed
1257 //
1258 address generate_conjoint_byte_copy(bool aligned, const char * name) {
1259 StubCodeMark mark(this, "StubRoutines", name);
1260 address start = __ function_entry();
1262 Register tmp1 = R6_ARG4;
1263 Register tmp2 = R7_ARG5;
1264 Register tmp3 = R8_ARG6;
1266 #if defined(ABI_ELFv2)
1267 address nooverlap_target = aligned ?
1268 StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
1269 StubRoutines::jbyte_disjoint_arraycopy();
1270 #else
1271 address nooverlap_target = aligned ?
1272 ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
1273 ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
1274 #endif
1276 array_overlap_test(nooverlap_target, 0);
1277 // Do reverse copy. We assume the case of actual overlap is rare enough
1278 // that we don't have to optimize it.
1279 Label l_1, l_2;
1281 __ b(l_2);
1282 __ bind(l_1);
1283 __ stbx(tmp1, R4_ARG2, R5_ARG3);
1284 __ bind(l_2);
1285 __ addic_(R5_ARG3, R5_ARG3, -1);
1286 __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1287 __ bge(CCR0, l_1);
1289 __ blr();
1291 return start;
1292 }
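// The three-instruction loop above is a descending byte copy that relies on
// addic_'s CR0 update to terminate; in hedged portable form:
//
//   void byte_copy_backward(const uint8_t* from, uint8_t* to, long count) {
//     while (--count >= 0) to[count] = from[count];   // last element first
//   }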
1294 // Generate stub for disjoint short copy. If "aligned" is true, the
1295 // "from" and "to" addresses are assumed to be heapword aligned.
1296 //
1297 // Arguments for generated stub:
1298 // from: R3_ARG1
1299 // to: R4_ARG2
1300 // elm.count: R5_ARG3 treated as signed
1301 //
1302 // Strategy for aligned==true:
1303 //
1304 // If length <= 9:
1305 // 1. copy 2 elements at a time (l_6)
1306 // 2. copy last element if original element count was odd (l_1)
1307 //
1308 // If length > 9:
1309 // 1. copy 4 elements at a time until less than 4 elements are left (l_7)
1310 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
1311 // 3. copy last element if one was left in step 2. (l_1)
1312 //
1313 //
1314 // Strategy for aligned==false:
1315 //
1316 // If length <= 9: same as aligned==true case, but NOTE: load/stores
1317 // can be unaligned (see comment below)
1318 //
1319 // If length > 9:
1320 // 1. continue with step 6. if the alignment of from and to mod 4
1321 // is different.
1322 // 2. align from and to to 4 bytes by copying 1 element if necessary
1323 // 3. at l_2 from and to are 4 byte aligned; continue with
1324 // 5. if they cannot be aligned to 8 bytes because they have
1325 // got different alignment mod 8.
1326 // 4. at this point we know that both, from and to, have the same
1327 // alignment mod 8, now copy one element if necessary to get
1328 // 8 byte alignment of from and to.
1329 // 5. copy 4 elements at a time until less than 4 elements are
1330 // left; depending on step 3. all load/stores are aligned or
1331 // either all loads or all stores are unaligned.
1332 // 6. copy 2 elements at a time until less than 2 elements are
1333 // left (l_6); arriving here from step 1., there is a chance
1334 // that all accesses are unaligned.
1335 // 7. copy last element if one was left in step 6. (l_1)
1336 //
1337 // There are unaligned data accesses using integer load/store
1338 // instructions in this stub. POWER allows such accesses.
1339 //
1340 // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1341 // Chapter 2: Effect of Operand Placement on Performance) unaligned
1342 // integer load/stores have good performance. Only unaligned
1343 // floating point load/stores can have poor performance.
1344 //
1345 // TODO:
1346 //
1347 // 1. check if aligning the backbranch target of loops is beneficial
1348 //
1349 address generate_disjoint_short_copy(bool aligned, const char * name) {
1350 StubCodeMark mark(this, "StubRoutines", name);
1352 Register tmp1 = R6_ARG4;
1353 Register tmp2 = R7_ARG5;
1354 Register tmp3 = R8_ARG6;
1355 Register tmp4 = R9_ARG7;
1357 address start = __ function_entry();
1359 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
1360 // don't try anything fancy if arrays don't have many elements
1361 __ li(tmp3, 0);
1362 __ cmpwi(CCR0, R5_ARG3, 9);
1363 __ ble(CCR0, l_6); // copy 2 at a time
1365 if (!aligned) {
1366 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1367 __ andi_(tmp1, tmp1, 3);
1368 __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1370 // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1372 // Copy 1 element if necessary to align to 4 bytes.
1373 __ andi_(tmp1, R3_ARG1, 3);
1374 __ beq(CCR0, l_2);
1376 __ lhz(tmp2, 0, R3_ARG1);
1377 __ addi(R3_ARG1, R3_ARG1, 2);
1378 __ sth(tmp2, 0, R4_ARG2);
1379 __ addi(R4_ARG2, R4_ARG2, 2);
1380 __ addi(R5_ARG3, R5_ARG3, -1);
1381 __ bind(l_2);
1383 // At this point the positions of both, from and to, are at least 4 byte aligned.
1385 // Copy 4 elements at a time.
1386 // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1387 __ xorr(tmp2, R3_ARG1, R4_ARG2);
1388 __ andi_(tmp1, tmp2, 7);
1389 __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1391 // Copy a 2-element word if necessary to align to 8 bytes.
1392 __ andi_(R0, R3_ARG1, 7);
1393 __ beq(CCR0, l_7);
1395 __ lwzx(tmp2, R3_ARG1, tmp3);
1396 __ addi(R5_ARG3, R5_ARG3, -2);
1397 __ stwx(tmp2, R4_ARG2, tmp3);
1398 { // FasterArrayCopy
1399 __ addi(R3_ARG1, R3_ARG1, 4);
1400 __ addi(R4_ARG2, R4_ARG2, 4);
1401 }
1402 }
1404 __ bind(l_7);
1406 // Copy 4 elements at a time; either the loads or the stores can
1407 // be unaligned if aligned == false.
1409 { // FasterArrayCopy
1410 __ cmpwi(CCR0, R5_ARG3, 15);
1411 __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
1413 __ srdi(tmp1, R5_ARG3, 4);
1414 __ andi_(R5_ARG3, R5_ARG3, 15);
1415 __ mtctr(tmp1);
1417 __ bind(l_8);
1418 // Use unrolled version for mass copying (copy 16 elements a time).
1419 // Load feeding store gets zero latency on Power6, however not on Power5.
1420 // Therefore, the following sequence is made for the good of both.
1421 __ ld(tmp1, 0, R3_ARG1);
1422 __ ld(tmp2, 8, R3_ARG1);
1423 __ ld(tmp3, 16, R3_ARG1);
1424 __ ld(tmp4, 24, R3_ARG1);
1425 __ std(tmp1, 0, R4_ARG2);
1426 __ std(tmp2, 8, R4_ARG2);
1427 __ std(tmp3, 16, R4_ARG2);
1428 __ std(tmp4, 24, R4_ARG2);
1429 __ addi(R3_ARG1, R3_ARG1, 32);
1430 __ addi(R4_ARG2, R4_ARG2, 32);
1431 __ bdnz(l_8);
1432 }
1433 __ bind(l_6);
1435 // copy 2 elements at a time
1436 { // FasterArrayCopy
1437 __ cmpwi(CCR0, R5_ARG3, 2);
1438 __ blt(CCR0, l_1);
1439 __ srdi(tmp1, R5_ARG3, 1);
1440 __ andi_(R5_ARG3, R5_ARG3, 1);
1442 __ addi(R3_ARG1, R3_ARG1, -4);
1443 __ addi(R4_ARG2, R4_ARG2, -4);
1444 __ mtctr(tmp1);
1446 __ bind(l_3);
1447 __ lwzu(tmp2, 4, R3_ARG1);
1448 __ stwu(tmp2, 4, R4_ARG2);
1449 __ bdnz(l_3);
1451 __ addi(R3_ARG1, R3_ARG1, 4);
1452 __ addi(R4_ARG2, R4_ARG2, 4);
1453 }
1455 // do single element copy
1456 __ bind(l_1);
1457 __ cmpwi(CCR0, R5_ARG3, 0);
1458 __ beq(CCR0, l_4);
1460 { // FasterArrayCopy
1461 __ mtctr(R5_ARG3);
1462 __ addi(R3_ARG1, R3_ARG1, -2);
1463 __ addi(R4_ARG2, R4_ARG2, -2);
1465 __ bind(l_5);
1466 __ lhzu(tmp2, 2, R3_ARG1);
1467 __ sthu(tmp2, 2, R4_ARG2);
1468 __ bdnz(l_5);
1469 }
1470 __ bind(l_4);
1471 __ blr();
1473 return start;
1474 }
1476 // Generate stub for conjoint short copy. If "aligned" is true, the
1477 // "from" and "to" addresses are assumed to be heapword aligned.
1478 //
1479 // Arguments for generated stub:
1480 // from: R3_ARG1
1481 // to: R4_ARG2
1482 // count: R5_ARG3 treated as signed
1483 //
1484 address generate_conjoint_short_copy(bool aligned, const char * name) {
1485 StubCodeMark mark(this, "StubRoutines", name);
1486 address start = __ function_entry();
1488 Register tmp1 = R6_ARG4;
1489 Register tmp2 = R7_ARG5;
1490 Register tmp3 = R8_ARG6;
1492 #if defined(ABI_ELFv2)
1493 address nooverlap_target = aligned ?
1494 StubRoutines::arrayof_jshort_disjoint_arraycopy() :
1495 StubRoutines::jshort_disjoint_arraycopy();
1496 #else
1497 address nooverlap_target = aligned ?
1498 ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
1499 ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
1500 #endif
1502 array_overlap_test(nooverlap_target, 1);
1504 Label l_1, l_2;
1505 __ sldi(tmp1, R5_ARG3, 1);
1506 __ b(l_2);
1507 __ bind(l_1);
1508 __ sthx(tmp2, R4_ARG2, tmp1);
1509 __ bind(l_2);
1510 __ addic_(tmp1, tmp1, -2);
1511 __ lhzx(tmp2, R3_ARG1, tmp1);
1512 __ bge(CCR0, l_1);
1514 __ blr();
1516 return start;
1517 }
1519 // Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned"
1520 // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1521 //
1522 // Arguments:
1523 // from: R3_ARG1
1524 // to: R4_ARG2
1525 // count: R5_ARG3 treated as signed
1526 //
1527 void generate_disjoint_int_copy_core(bool aligned) {
1528 Register tmp1 = R6_ARG4;
1529 Register tmp2 = R7_ARG5;
1530 Register tmp3 = R8_ARG6;
1531 Register tmp4 = R0;
1533 Label l_1, l_2, l_3, l_4, l_5, l_6;
1534 // for short arrays, just do single element copy
1535 __ li(tmp3, 0);
1536 __ cmpwi(CCR0, R5_ARG3, 5);
1537 __ ble(CCR0, l_2);
1539 if (!aligned) {
1540 // check if arrays have same alignment mod 8.
1541 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1542 __ andi_(R0, tmp1, 7);
1543 // Not the same alignment, but ld and std just need to be 4 byte aligned.
1544 __ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
1546 // copy 1 element to align to and from on an 8 byte boundary
1547 __ andi_(R0, R3_ARG1, 7);
1548 __ beq(CCR0, l_4);
1550 __ lwzx(tmp2, R3_ARG1, tmp3);
1551 __ addi(R5_ARG3, R5_ARG3, -1);
1552 __ stwx(tmp2, R4_ARG2, tmp3);
1553 { // FasterArrayCopy
1554 __ addi(R3_ARG1, R3_ARG1, 4);
1555 __ addi(R4_ARG2, R4_ARG2, 4);
1556 }
1557 __ bind(l_4);
1558 }
1560 { // FasterArrayCopy
1561 __ cmpwi(CCR0, R5_ARG3, 7);
1562 __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
1564 __ srdi(tmp1, R5_ARG3, 3);
1565 __ andi_(R5_ARG3, R5_ARG3, 7);
1566 __ mtctr(tmp1);
1568 __ bind(l_6);
1569 // Use unrolled version for mass copying (copy 8 elements a time).
1570 // Load feeding store gets zero latency on power6, however not on power 5.
1571 // Therefore, the following sequence is made for the good of both.
1572 __ ld(tmp1, 0, R3_ARG1);
1573 __ ld(tmp2, 8, R3_ARG1);
1574 __ ld(tmp3, 16, R3_ARG1);
1575 __ ld(tmp4, 24, R3_ARG1);
1576 __ std(tmp1, 0, R4_ARG2);
1577 __ std(tmp2, 8, R4_ARG2);
1578 __ std(tmp3, 16, R4_ARG2);
1579 __ std(tmp4, 24, R4_ARG2);
1580 __ addi(R3_ARG1, R3_ARG1, 32);
1581 __ addi(R4_ARG2, R4_ARG2, 32);
1582 __ bdnz(l_6);
1583 }
1585 // copy 1 element at a time
1586 __ bind(l_2);
1587 __ cmpwi(CCR0, R5_ARG3, 0);
1588 __ beq(CCR0, l_1);
1590 { // FasterArrayCopy
1591 __ mtctr(R5_ARG3);
1592 __ addi(R3_ARG1, R3_ARG1, -4);
1593 __ addi(R4_ARG2, R4_ARG2, -4);
1595 __ bind(l_3);
1596 __ lwzu(tmp2, 4, R3_ARG1);
1597 __ stwu(tmp2, 4, R4_ARG2);
1598 __ bdnz(l_3);
1599 }
1601 __ bind(l_1);
1602 return;
1603 }
1605 // Generate stub for disjoint int copy. If "aligned" is true, the
1606 // "from" and "to" addresses are assumed to be heapword aligned.
1607 //
1608 // Arguments for generated stub:
1609 // from: R3_ARG1
1610 // to: R4_ARG2
1611 // count: R5_ARG3 treated as signed
1612 //
1613 address generate_disjoint_int_copy(bool aligned, const char * name) {
1614 StubCodeMark mark(this, "StubRoutines", name);
1615 address start = __ function_entry();
1616 generate_disjoint_int_copy_core(aligned);
1617 __ blr();
1618 return start;
1619 }
1621 // Generate core code for conjoint int copy (and oop copy on
1622 // 32-bit). If "aligned" is true, the "from" and "to" addresses
1623 // are assumed to be heapword aligned.
1624 //
1625 // Arguments:
1626 // from: R3_ARG1
1627 // to: R4_ARG2
1628 // count: R5_ARG3 treated as signed
1629 //
1630 void generate_conjoint_int_copy_core(bool aligned) {
1631 // Do reverse copy. We assume the case of actual overlap is rare enough
1632 // that we don't have to optimize it.
1634 Label l_1, l_2, l_3, l_4, l_5, l_6;
1636 Register tmp1 = R6_ARG4;
1637 Register tmp2 = R7_ARG5;
1638 Register tmp3 = R8_ARG6;
1639 Register tmp4 = R0;
1641 { // FasterArrayCopy
1642 __ cmpwi(CCR0, R5_ARG3, 0);
1643 __ beq(CCR0, l_6);
1645 __ sldi(R5_ARG3, R5_ARG3, 2);
1646 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1647 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1648 __ srdi(R5_ARG3, R5_ARG3, 2);
1650 __ cmpwi(CCR0, R5_ARG3, 7);
1651 __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
1653 __ srdi(tmp1, R5_ARG3, 3);
1654 __ andi(R5_ARG3, R5_ARG3, 7);
1655 __ mtctr(tmp1);
1657 __ bind(l_4);
1658 // Use unrolled version for mass copying (copy 4 elements a time).
1659 // Load feeding store gets zero latency on Power6, however not on Power5.
1660 // Therefore, the following sequence is made for the good of both.
1661 __ addi(R3_ARG1, R3_ARG1, -32);
1662 __ addi(R4_ARG2, R4_ARG2, -32);
1663 __ ld(tmp4, 24, R3_ARG1);
1664 __ ld(tmp3, 16, R3_ARG1);
1665 __ ld(tmp2, 8, R3_ARG1);
1666 __ ld(tmp1, 0, R3_ARG1);
1667 __ std(tmp4, 24, R4_ARG2);
1668 __ std(tmp3, 16, R4_ARG2);
1669 __ std(tmp2, 8, R4_ARG2);
1670 __ std(tmp1, 0, R4_ARG2);
1671 __ bdnz(l_4);
1673 __ cmpwi(CCR0, R5_ARG3, 0);
1674 __ beq(CCR0, l_6);
1676 __ bind(l_5);
1677 __ mtctr(R5_ARG3);
1678 __ bind(l_3);
1679 __ lwz(R0, -4, R3_ARG1);
1680 __ stw(R0, -4, R4_ARG2);
1681 __ addi(R3_ARG1, R3_ARG1, -4);
1682 __ addi(R4_ARG2, R4_ARG2, -4);
1683 __ bdnz(l_3);
1685 __ bind(l_6);
1686 }
1687 }
1689 // Generate stub for conjoint int copy. If "aligned" is true, the
1690 // "from" and "to" addresses are assumed to be heapword aligned.
1691 //
1692 // Arguments for generated stub:
1693 // from: R3_ARG1
1694 // to: R4_ARG2
1695 // count: R5_ARG3 treated as signed
1696 //
1697 address generate_conjoint_int_copy(bool aligned, const char * name) {
1698 StubCodeMark mark(this, "StubRoutines", name);
1699 address start = __ function_entry();
1701 #if defined(ABI_ELFv2)
1702 address nooverlap_target = aligned ?
1703 StubRoutines::arrayof_jint_disjoint_arraycopy() :
1704 StubRoutines::jint_disjoint_arraycopy();
1705 #else
1706 address nooverlap_target = aligned ?
1707 ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
1708 ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
1709 #endif
1711 array_overlap_test(nooverlap_target, 2);
1713 generate_conjoint_int_copy_core(aligned);
1715 __ blr();
1717 return start;
1718 }
1720 // Generate core code for disjoint long copy (and oop copy on
1721 // 64-bit). If "aligned" is true, the "from" and "to" addresses
1722 // are assumed to be heapword aligned.
1723 //
1724 // Arguments:
1725 // from: R3_ARG1
1726 // to: R4_ARG2
1727 // count: R5_ARG3 treated as signed
1728 //
1729 void generate_disjoint_long_copy_core(bool aligned) {
1730 Register tmp1 = R6_ARG4;
1731 Register tmp2 = R7_ARG5;
1732 Register tmp3 = R8_ARG6;
1733 Register tmp4 = R0;
1735 Label l_1, l_2, l_3, l_4;
1737 { // FasterArrayCopy
1738 __ cmpwi(CCR0, R5_ARG3, 3);
1739 __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
1741 __ srdi(tmp1, R5_ARG3, 2);
1742 __ andi_(R5_ARG3, R5_ARG3, 3);
1743 __ mtctr(tmp1);
1745 __ bind(l_4);
1746 // Use unrolled version for mass copying (copy 4 elements a time).
1747 // Load feeding store gets zero latency on Power6, however not on Power5.
1748 // Therefore, the following sequence is made for the good of both.
1749 __ ld(tmp1, 0, R3_ARG1);
1750 __ ld(tmp2, 8, R3_ARG1);
1751 __ ld(tmp3, 16, R3_ARG1);
1752 __ ld(tmp4, 24, R3_ARG1);
1753 __ std(tmp1, 0, R4_ARG2);
1754 __ std(tmp2, 8, R4_ARG2);
1755 __ std(tmp3, 16, R4_ARG2);
1756 __ std(tmp4, 24, R4_ARG2);
1757 __ addi(R3_ARG1, R3_ARG1, 32);
1758 __ addi(R4_ARG2, R4_ARG2, 32);
1759 __ bdnz(l_4);
1760 }
1762 // copy 1 element at a time
1763 __ bind(l_3);
1764 __ cmpwi(CCR0, R5_ARG3, 0);
1765 __ beq(CCR0, l_1);
1767 { // FasterArrayCopy
1768 __ mtctr(R5_ARG3);
1769 __ addi(R3_ARG1, R3_ARG1, -8);
1770 __ addi(R4_ARG2, R4_ARG2, -8);
1772 __ bind(l_2);
1773 __ ldu(R0, 8, R3_ARG1);
1774 __ stdu(R0, 8, R4_ARG2);
1775 __ bdnz(l_2);
1777 }
1778 __ bind(l_1);
1779 }
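// Illustrative C equivalent of the disjoint long copy core (a sketch, not
// part of the generated stub; the helper name is an assumption): a forward
// copy, unrolled by 4 doublewords, with a 1-at-a-time tail:
//
//   static void disjoint_long_copy_sketch(int64_t* from, int64_t* to, long count) {
//     long tail = count & 3;                  // elements for the scalar tail
//     for (long i = count >> 2; i > 0; i--) { // the CTR loop in the stub
//       to[0] = from[0]; to[1] = from[1];
//       to[2] = from[2]; to[3] = from[3];
//       from += 4; to += 4;
//     }
//     while (tail-- > 0) *to++ = *from++;     // corresponds to the l_2 loop
//   }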
1781 // Generate stub for disjoint long copy. If "aligned" is true, the
1782 // "from" and "to" addresses are assumed to be heapword aligned.
1783 //
1784 // Arguments for generated stub:
1785 // from: R3_ARG1
1786 // to: R4_ARG2
1787 // count: R5_ARG3 treated as signed
1788 //
1789 address generate_disjoint_long_copy(bool aligned, const char * name) {
1790 StubCodeMark mark(this, "StubRoutines", name);
1791 address start = __ function_entry();
1792 generate_disjoint_long_copy_core(aligned);
1793 __ blr();
1795 return start;
1796 }
1798 // Generate core code for conjoint long copy (and oop copy on
1799 // 64-bit). If "aligned" is true, the "from" and "to" addresses
1800 // are assumed to be heapword aligned.
1801 //
1802 // Arguments:
1803 // from: R3_ARG1
1804 // to: R4_ARG2
1805 // count: R5_ARG3 treated as signed
1806 //
1807 void generate_conjoint_long_copy_core(bool aligned) {
1808 Register tmp1 = R6_ARG4;
1809 Register tmp2 = R7_ARG5;
1810 Register tmp3 = R8_ARG6;
1811 Register tmp4 = R0;
1813 Label l_1, l_2, l_3, l_4, l_5;
1815 __ cmpwi(CCR0, R5_ARG3, 0);
1816 __ beq(CCR0, l_1);
1818 { // FasterArrayCopy
1819 __ sldi(R5_ARG3, R5_ARG3, 3);
1820 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1821 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1822 __ srdi(R5_ARG3, R5_ARG3, 3);
1824 __ cmpwi(CCR0, R5_ARG3, 3);
1825 __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
1827 __ srdi(tmp1, R5_ARG3, 2);
1828 __ andi(R5_ARG3, R5_ARG3, 3);
1829 __ mtctr(tmp1);
1831 __ bind(l_4);
1832 // Use unrolled version for mass copying (copy 4 elements at a time).
1833 // Load feeding store gets zero latency on Power6, but not on Power5.
1834 // Therefore, the following sequence performs well on both.
1835 __ addi(R3_ARG1, R3_ARG1, -32);
1836 __ addi(R4_ARG2, R4_ARG2, -32);
1837 __ ld(tmp4, 24, R3_ARG1);
1838 __ ld(tmp3, 16, R3_ARG1);
1839 __ ld(tmp2, 8, R3_ARG1);
1840 __ ld(tmp1, 0, R3_ARG1);
1841 __ std(tmp4, 24, R4_ARG2);
1842 __ std(tmp3, 16, R4_ARG2);
1843 __ std(tmp2, 8, R4_ARG2);
1844 __ std(tmp1, 0, R4_ARG2);
1845 __ bdnz(l_4);
1847 __ cmpwi(CCR0, R5_ARG3, 0);
1848 __ beq(CCR0, l_1);
1850 __ bind(l_5);
1851 __ mtctr(R5_ARG3);
1852 __ bind(l_3);
1853 __ ld(R0, -8, R3_ARG1);
1854 __ std(R0, -8, R4_ARG2);
1855 __ addi(R3_ARG1, R3_ARG1, -8);
1856 __ addi(R4_ARG2, R4_ARG2, -8);
1857 __ bdnz(l_3);
1859 }
1860 __ bind(l_1);
1861 }
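// Note on the core above (illustrative): the count is temporarily scaled to
// a byte count (sldi by 3) only so that both pointers can be bumped past the
// last element with a single add, then scaled back (srdi). In C the same
// bump is plain pointer arithmetic:
//
//   from += count; to += count;   // int64_t*, so this already scales by 8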
1863 // Generate stub for conjoint long copy. If "aligned" is true, the
1864 // "from" and "to" addresses are assumed to be heapword aligned.
1865 //
1866 // Arguments for generated stub:
1867 // from: R3_ARG1
1868 // to: R4_ARG2
1869 // count: R5_ARG3 treated as signed
1870 //
1871 address generate_conjoint_long_copy(bool aligned, const char * name) {
1872 StubCodeMark mark(this, "StubRoutines", name);
1873 address start = __ function_entry();
1875 #if defined(ABI_ELFv2)
1876 address nooverlap_target = aligned ?
1877 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
1878 StubRoutines::jlong_disjoint_arraycopy();
1879 #else
1880 address nooverlap_target = aligned ?
1881 ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
1882 ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
1883 #endif
1885 array_overlap_test(nooverlap_target, 3);
1886 generate_conjoint_long_copy_core(aligned);
1888 __ blr();
1890 return start;
1891 }
1893 // Generate stub for conjoint oop copy. If "aligned" is true, the
1894 // "from" and "to" addresses are assumed to be heapword aligned.
1895 //
1896 // Arguments for generated stub:
1897 // from: R3_ARG1
1898 // to: R4_ARG2
1899 // count: R5_ARG3 treated as signed
1900 // dest_uninitialized: G1 support (the pre-barrier can be skipped if the destination is uninitialized)
1901 //
1902 address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1903 StubCodeMark mark(this, "StubRoutines", name);
1905 address start = __ function_entry();
1907 #if defined(ABI_ELFv2)
1908 address nooverlap_target = aligned ?
1909 StubRoutines::arrayof_oop_disjoint_arraycopy() :
1910 StubRoutines::oop_disjoint_arraycopy();
1911 #else
1912 address nooverlap_target = aligned ?
1913 ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
1914 ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
1915 #endif
1917 gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
1919 // Save arguments for the post barrier; the copy cores destroy them.
1920 __ mr(R9_ARG7, R4_ARG2);
1921 __ mr(R10_ARG8, R5_ARG3);
1923 if (UseCompressedOops) {
1924 array_overlap_test(nooverlap_target, 2);
1925 generate_conjoint_int_copy_core(aligned);
1926 } else {
1927 array_overlap_test(nooverlap_target, 3);
1928 generate_conjoint_long_copy_core(aligned);
1929 }
1931 gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
1932 return start;
1933 }
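// Illustrative note (not from the stub itself): with UseCompressedOops each
// element is a 32-bit narrowOop, otherwise a full 64-bit oop, so the oop
// copy degenerates to the int or long copy core and the overlap test gets
// the matching log2 element size, e.g.:
//
//   int log2_elem_size = UseCompressedOops ? 2 /* 4 bytes */ : 3 /* 8 bytes */;
//   array_overlap_test(nooverlap_target, log2_elem_size);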
1935 // Generate stub for disjoint oop copy. If "aligned" is true, the
1936 // "from" and "to" addresses are assumed to be heapword aligned.
1937 //
1938 // Arguments for generated stub:
1939 // from: R3_ARG1
1940 // to: R4_ARG2
1941 // count: R5_ARG3 treated as signed
1942 // dest_uninitialized: G1 support (the pre-barrier can be skipped if the destination is uninitialized)
1943 //
1944 address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1945 StubCodeMark mark(this, "StubRoutines", name);
1946 address start = __ function_entry();
1948 gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
1950 // Save some arguments; the disjoint copy cores destroy them.
1951 // They are needed for the post barrier.
1952 __ mr(R9_ARG7, R4_ARG2);
1953 __ mr(R10_ARG8, R5_ARG3);
1955 if (UseCompressedOops) {
1956 generate_disjoint_int_copy_core(aligned);
1957 } else {
1958 generate_disjoint_long_copy_core(aligned);
1959 }
1961 gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
1963 return start;
1964 }
1966 void generate_arraycopy_stubs() {
1967 // Note: the disjoint stubs must be generated first, as some of
1968 // the conjoint stubs use them.
1970 // non-aligned disjoint versions
1971 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
1972 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
1973 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
1974 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
1975 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
1976 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
1978 // aligned disjoint versions
1979 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
1980 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
1981 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
1982 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
1983 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
1984 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
1986 // non-aligned conjoint versions
1987 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
1988 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
1989 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, "jint_arraycopy");
1990 StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, "jlong_arraycopy");
1991 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
1992 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
1994 // aligned conjoint versions
1995 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
1996 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
1997 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
1998 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
1999 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
2000 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
2002 // fill routines
2003 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2004 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2005 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2006 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2007 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2008 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2009 }
2011 // Safefetch stubs.
2012 void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
2013 // safefetch signatures:
2014 // int SafeFetch32(int* adr, int errValue);
2015 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2016 //
2017 // arguments:
2018 // R3_ARG1 = adr
2019 // R4_ARG2 = errValue
2020 //
2021 // result:
2022 // R3_RET = *adr or errValue
2024 StubCodeMark mark(this, "StubRoutines", name);
2026 // Entry point, pc or function descriptor.
2027 *entry = __ function_entry();
2029 // Load *adr into R4_ARG2, may fault.
2030 *fault_pc = __ pc();
2031 switch (size) {
2032 case 4:
2033 // int32_t, sign-extended
2034 __ lwa(R4_ARG2, 0, R3_ARG1);
2035 break;
2036 case 8:
2037 // int64_t
2038 __ ld(R4_ARG2, 0, R3_ARG1);
2039 break;
2040 default:
2041 ShouldNotReachHere();
2042 }
2044 // return errValue or *adr
2045 *continuation_pc = __ pc();
2046 __ mr(R3_RET, R4_ARG2);
2047 __ blr();
2048 }
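// Usage sketch (illustrative, not part of this file; suspect_addr is a
// hypothetical pointer): SafeFetch lets the VM probe memory that may be
// unmapped. If the load at *fault_pc traps, the signal handler resumes at
// *continuation_pc with errValue still in R4_ARG2, so the caller simply
// sees the error value:
//
//   int v = SafeFetch32((int*) suspect_addr, -1);
//   if (v == -1) {
//     // either the address was unreadable or it really contained -1
//   }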
2050 // Initialization
2051 void generate_initial() {
2052 // Generates the initial stubs and initializes their entry points
2054 // Entry points that exist in all platforms.
2055 // Note: This code could be shared among different platforms; however, the
2056 // benefit seems to be smaller than the disadvantage of having a
2057 // much more complicated generator structure. See also the comment in
2058 // stubRoutines.hpp.
2060 StubRoutines::_forward_exception_entry = generate_forward_exception();
2061 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
2062 StubRoutines::_catch_exception_entry = generate_catch_exception();
2064 // Build this early so it's available for the interpreter.
2065 StubRoutines::_throw_StackOverflowError_entry =
2066 generate_throw_exception("StackOverflowError throw_exception",
2067 CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
2068 }
2070 void generate_all() {
2071 // Generates all stubs and initializes the entry points
2073 // These entry points require SharedInfo::stack0 to be set up in
2074 // non-core builds
2075 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
2076 // Handle IncompatibleClassChangeError in itable stubs.
2077 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
2078 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2080 StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
2082 // support for verify_oop (must happen after universe_init)
2083 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
2085 // arraycopy stubs used by compilers
2086 generate_arraycopy_stubs();
2088 if (UseAESIntrinsics) {
2089 guarantee(!UseAESIntrinsics, "not yet implemented.");
2090 }
2092 // PPC uses stubs for safefetch.
2093 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
2094 &StubRoutines::_safefetch32_fault_pc,
2095 &StubRoutines::_safefetch32_continuation_pc);
2096 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
2097 &StubRoutines::_safefetchN_fault_pc,
2098 &StubRoutines::_safefetchN_continuation_pc);
2099 }
2101 public:
2102 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2103 // replace the standard masm with a special one:
2104 _masm = new MacroAssembler(code);
2105 if (all) {
2106 generate_all();
2107 } else {
2108 generate_initial();
2109 }
2110 }
2111 };
2113 void StubGenerator_generate(CodeBuffer* code, bool all) {
2114 StubGenerator g(code, all);
2115 }