Sun, 26 May 2019 21:02:55 -0400
8166684: PPC64: implement intrinsic code with vector instructions for Unsafe.copyMemory()
Reviewed-by: simonis, mdoerr
Contributed-by: Michihiro Horie <horie@jp.ibm.com>
1 /*
2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "interpreter/interpreter.hpp"
29 #include "nativeInst_ppc.hpp"
30 #include "oops/instanceOop.hpp"
31 #include "oops/method.hpp"
32 #include "oops/objArrayKlass.hpp"
33 #include "oops/oop.inline.hpp"
34 #include "prims/methodHandles.hpp"
35 #include "runtime/frame.inline.hpp"
36 #include "runtime/handles.inline.hpp"
37 #include "runtime/sharedRuntime.hpp"
38 #include "runtime/stubCodeGenerator.hpp"
39 #include "runtime/stubRoutines.hpp"
40 #include "utilities/top.hpp"
41 #include "runtime/thread.inline.hpp"
43 #define __ _masm->
45 #ifdef PRODUCT
46 #define BLOCK_COMMENT(str) // nothing
47 #else
48 #define BLOCK_COMMENT(str) __ block_comment(str)
49 #endif
51 class StubGenerator: public StubCodeGenerator {
52 private:
54 // Call stubs are used to call Java from C
55 //
56 // Arguments:
57 //
58 // R3 - call wrapper address : address
59 // R4 - result : intptr_t*
60 // R5 - result type : BasicType
61 // R6 - method : Method
62 // R7 - frame mgr entry point : address
63 // R8 - parameter block : intptr_t*
64 // R9 - parameter count in words : int
65 // R10 - thread : Thread*
66 //
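      // For illustration only -- a hedged sketch of the C-side view of this
      // stub. The authoritative typedef lives in stubRoutines.hpp; the form
      // below is an assumption for this sketch, not a quote of that header:
      //
      //   typedef void (*CallStub)(address   call_wrapper_address,
      //                            intptr_t* result,
      //                            BasicType result_type,
      //                            Method*   method,
      //                            address   entry_point,
      //                            intptr_t* parameters,
      //                            int       size_of_parameters,
      //                            Thread*   thread);
      //
      //   // ((CallStub)StubRoutines::call_stub())(wrapper, &result, T_INT,
      //   //                                       m, entry, params, n, thread);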
67 address generate_call_stub(address& return_address) {
68 // Setup a new c frame, copy java arguments, call frame manager or
69 // native_entry, and process result.
71 StubCodeMark mark(this, "StubRoutines", "call_stub");
73 address start = __ function_entry();
75 // some sanity checks
76 assert((sizeof(frame::abi_minframe) % 16) == 0, "unaligned");
77 assert((sizeof(frame::abi_reg_args) % 16) == 0, "unaligned");
78 assert((sizeof(frame::spill_nonvolatiles) % 16) == 0, "unaligned");
79 assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
80 assert((sizeof(frame::entry_frame_locals) % 16) == 0, "unaligned");
82 Register r_arg_call_wrapper_addr = R3;
83 Register r_arg_result_addr = R4;
84 Register r_arg_result_type = R5;
85 Register r_arg_method = R6;
86 Register r_arg_entry = R7;
87 Register r_arg_thread = R10;
89 Register r_temp = R24;
90 Register r_top_of_arguments_addr = R25;
91 Register r_entryframe_fp = R26;
93 {
94 // Stack on entry to call_stub:
95 //
96 // F1 [C_FRAME]
97 // ...
99 Register r_arg_argument_addr = R8;
100 Register r_arg_argument_count = R9;
101 Register r_frame_alignment_in_bytes = R27;
102 Register r_argument_addr = R28;
103 Register r_argumentcopy_addr = R29;
104 Register r_argument_size_in_bytes = R30;
105 Register r_frame_size = R23;
107 Label arguments_copied;
109 // Save LR/CR to caller's C_FRAME.
110 __ save_LR_CR(R0);
112 // Zero extend arg_argument_count.
113 __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
115 // Save non-volatiles GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
116 __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
118 // Keep copy of our frame pointer (caller's SP).
119 __ mr(r_entryframe_fp, R1_SP);
121 BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
122 // Push ENTRY_FRAME including arguments:
123 //
124 // F0 [TOP_IJAVA_FRAME_ABI]
125 // alignment (optional)
126 // [outgoing Java arguments]
127 // [ENTRY_FRAME_LOCALS]
128 // F1 [C_FRAME]
129 // ...
131 // calculate frame size
133 // unaligned size of arguments
134 __ sldi(r_argument_size_in_bytes,
135 r_arg_argument_count, Interpreter::logStackElementSize);
136 // arguments alignment (max 1 slot)
137 // FIXME: use round_to() here
138 __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
139 __ sldi(r_frame_alignment_in_bytes,
140 r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
142 // size = unaligned size of arguments + top abi's size
143 __ addi(r_frame_size, r_argument_size_in_bytes,
144 frame::top_ijava_frame_abi_size);
145 // size += arguments alignment
146 __ add(r_frame_size,
147 r_frame_size, r_frame_alignment_in_bytes);
148 // size += size of call_stub locals
149 __ addi(r_frame_size,
150 r_frame_size, frame::entry_frame_locals_size);
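      // In C terms the computation above is (illustrative sketch only; the
      // frame constants are the ones already used in the code):
      //
      //   size_t frame_size =
      //       ((size_t)arg_count << Interpreter::logStackElementSize) // args
      //     + ((arg_count & 1)   << Interpreter::logStackElementSize) // pad
      //     + frame::top_ijava_frame_abi_size                         // abi
      //     + frame::entry_frame_locals_size;                         // locals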
152 // push ENTRY_FRAME
153 __ push_frame(r_frame_size, r_temp);
155 // initialize call_stub locals (step 1)
156 __ std(r_arg_call_wrapper_addr,
157 _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
158 __ std(r_arg_result_addr,
159 _entry_frame_locals_neg(result_address), r_entryframe_fp);
160 __ std(r_arg_result_type,
161 _entry_frame_locals_neg(result_type), r_entryframe_fp);
162 // we will save arguments_tos_address later
165 BLOCK_COMMENT("Copy Java arguments");
166 // copy Java arguments
168 // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
169 // FIXME: why not simply use SP+frame::top_ijava_frame_size?
170 __ addi(r_top_of_arguments_addr,
171 R1_SP, frame::top_ijava_frame_abi_size);
172 __ add(r_top_of_arguments_addr,
173 r_top_of_arguments_addr, r_frame_alignment_in_bytes);
175 // any arguments to copy?
176 __ cmpdi(CCR0, r_arg_argument_count, 0);
177 __ beq(CCR0, arguments_copied);
179 // prepare loop and copy arguments in reverse order
180 {
181 // init CTR with arg_argument_count
182 __ mtctr(r_arg_argument_count);
 184 // let r_argumentcopy_addr point to the last outgoing Java argument
185 __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
187 // let r_argument_addr point to last incoming java argument
188 __ add(r_argument_addr,
189 r_arg_argument_addr, r_argument_size_in_bytes);
190 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
192 // now loop while CTR > 0 and copy arguments
193 {
194 Label next_argument;
195 __ bind(next_argument);
197 __ ld(r_temp, 0, r_argument_addr);
198 // argument_addr--;
199 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
200 __ std(r_temp, 0, r_argumentcopy_addr);
201 // argumentcopy_addr++;
202 __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
204 __ bdnz(next_argument);
205 }
206 }
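      // The loop above is, in effect (illustrative C sketch, hypothetical
      // variable names):
      //
      //   intptr_t* src = arg_block + arg_count - 1; // last incoming argument
      //   intptr_t* dst = top_of_arguments;          // first outgoing slot
      //   for (int i = 0; i < arg_count; i++) {
      //     *dst++ = *src--;                         // copy in reverse order
      //   }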
208 // Arguments copied, continue.
209 __ bind(arguments_copied);
210 }
212 {
213 BLOCK_COMMENT("Call frame manager or native entry.");
214 // Call frame manager or native entry.
215 Register r_new_arg_entry = R14;
216 assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
217 r_arg_method, r_arg_thread);
219 __ mr(r_new_arg_entry, r_arg_entry);
221 // Register state on entry to frame manager / native entry:
222 //
223 // tos - intptr_t* sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
224 // R19_method - Method
225 // R16_thread - JavaThread*
227 // Tos must point to last argument - element_size.
228 #ifdef CC_INTERP
229 const Register tos = R17_tos;
230 #else
231 const Register tos = R15_esp;
232 #endif
233 __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
235 // initialize call_stub locals (step 2)
236 // now save tos as arguments_tos_address
237 __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
239 // load argument registers for call
240 __ mr(R19_method, r_arg_method);
241 __ mr(R16_thread, r_arg_thread);
242 assert(tos != r_arg_method, "trashed r_arg_method");
243 assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
245 // Set R15_prev_state to 0 for simplifying checks in callee.
246 #ifdef CC_INTERP
247 __ li(R15_prev_state, 0);
248 #else
249 __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);
250 #endif
251 // Stack on entry to frame manager / native entry:
252 //
253 // F0 [TOP_IJAVA_FRAME_ABI]
254 // alignment (optional)
255 // [outgoing Java arguments]
256 // [ENTRY_FRAME_LOCALS]
257 // F1 [C_FRAME]
258 // ...
259 //
261 // global toc register
262 __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
264 // Load narrow oop base.
265 __ reinit_heapbase(R30, R11_scratch1);
 267 // Remember the senderSP so the interpreter can pop c2i arguments off of the stack
268 // when called via a c2i.
270 // Pass initial_caller_sp to framemanager.
271 __ mr(R21_tmp1, R1_SP);
273 // Do a light-weight C-call here, r_new_arg_entry holds the address
274 // of the interpreter entry point (frame manager or native entry)
275 // and save runtime-value of LR in return_address.
276 assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
277 "trashed r_new_arg_entry");
278 return_address = __ call_stub(r_new_arg_entry);
279 }
281 {
282 BLOCK_COMMENT("Returned from frame manager or native entry.");
283 // Returned from frame manager or native entry.
284 // Now pop frame, process result, and return to caller.
286 // Stack on exit from frame manager / native entry:
287 //
288 // F0 [ABI]
289 // ...
290 // [ENTRY_FRAME_LOCALS]
291 // F1 [C_FRAME]
292 // ...
293 //
294 // Just pop the topmost frame ...
295 //
297 Label ret_is_object;
298 Label ret_is_long;
299 Label ret_is_float;
300 Label ret_is_double;
302 Register r_entryframe_fp = R30;
303 Register r_lr = R7_ARG5;
304 Register r_cr = R8_ARG6;
306 // Reload some volatile registers which we've spilled before the call
307 // to frame manager / native entry.
308 // Access all locals via frame pointer, because we know nothing about
309 // the topmost frame's size.
310 __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
311 assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
312 __ ld(r_arg_result_addr,
313 _entry_frame_locals_neg(result_address), r_entryframe_fp);
314 __ ld(r_arg_result_type,
315 _entry_frame_locals_neg(result_type), r_entryframe_fp);
316 __ ld(r_cr, _abi(cr), r_entryframe_fp);
317 __ ld(r_lr, _abi(lr), r_entryframe_fp);
319 // pop frame and restore non-volatiles, LR and CR
320 __ mr(R1_SP, r_entryframe_fp);
321 __ mtcr(r_cr);
322 __ mtlr(r_lr);
324 // Store result depending on type. Everything that is not
325 // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
326 __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
327 __ cmpwi(CCR1, r_arg_result_type, T_LONG);
328 __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
329 __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
331 // restore non-volatile registers
332 __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
335 // Stack on exit from call_stub:
336 //
337 // 0 [C_FRAME]
338 // ...
339 //
340 // no call_stub frames left.
342 // All non-volatiles have been restored at this point!!
343 assert(R3_RET == R3, "R3_RET should be R3");
345 __ beq(CCR0, ret_is_object);
346 __ beq(CCR1, ret_is_long);
347 __ beq(CCR5, ret_is_float);
348 __ beq(CCR6, ret_is_double);
350 // default:
351 __ stw(R3_RET, 0, r_arg_result_addr);
352 __ blr(); // return to caller
354 // case T_OBJECT:
355 __ bind(ret_is_object);
356 __ std(R3_RET, 0, r_arg_result_addr);
357 __ blr(); // return to caller
359 // case T_LONG:
360 __ bind(ret_is_long);
361 __ std(R3_RET, 0, r_arg_result_addr);
362 __ blr(); // return to caller
364 // case T_FLOAT:
365 __ bind(ret_is_float);
366 __ stfs(F1_RET, 0, r_arg_result_addr);
367 __ blr(); // return to caller
369 // case T_DOUBLE:
370 __ bind(ret_is_double);
371 __ stfd(F1_RET, 0, r_arg_result_addr);
372 __ blr(); // return to caller
373 }
375 return start;
376 }
378 // Return point for a Java call if there's an exception thrown in
379 // Java code. The exception is caught and transformed into a
380 // pending exception stored in JavaThread that can be tested from
381 // within the VM.
382 //
383 address generate_catch_exception() {
384 StubCodeMark mark(this, "StubRoutines", "catch_exception");
386 address start = __ pc();
388 // Registers alive
389 //
390 // R16_thread
391 // R3_ARG1 - address of pending exception
392 // R4_ARG2 - return address in call stub
394 const Register exception_file = R21_tmp1;
395 const Register exception_line = R22_tmp2;
397 __ load_const(exception_file, (void*)__FILE__);
398 __ load_const(exception_line, (void*)__LINE__);
400 __ std(R3_ARG1, thread_(pending_exception));
401 // store into `char *'
402 __ std(exception_file, thread_(exception_file));
403 // store into `int'
404 __ stw(exception_line, thread_(exception_line));
406 // complete return to VM
407 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
409 __ mtlr(R4_ARG2);
410 // continue in call stub
411 __ blr();
413 return start;
414 }
416 // Continuation point for runtime calls returning with a pending
417 // exception. The pending exception check happened in the runtime
418 // or native call stub. The pending exception in Thread is
419 // converted into a Java-level exception.
420 //
421 address generate_forward_exception() {
422 StubCodeMark mark(this, "StubRoutines", "forward_exception");
423 address start = __ pc();
425 #if !defined(PRODUCT)
426 if (VerifyOops) {
427 // Get pending exception oop.
428 __ ld(R3_ARG1,
429 in_bytes(Thread::pending_exception_offset()),
430 R16_thread);
431 // Make sure that this code is only executed if there is a pending exception.
432 {
433 Label L;
434 __ cmpdi(CCR0, R3_ARG1, 0);
435 __ bne(CCR0, L);
436 __ stop("StubRoutines::forward exception: no pending exception (1)");
437 __ bind(L);
438 }
439 __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
440 }
441 #endif
443 // Save LR/CR and copy exception pc (LR) into R4_ARG2.
444 __ save_LR_CR(R4_ARG2);
445 __ push_frame_reg_args(0, R0);
446 // Find exception handler.
447 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
448 SharedRuntime::exception_handler_for_return_address),
449 R16_thread,
450 R4_ARG2);
451 // Copy handler's address.
452 __ mtctr(R3_RET);
453 __ pop_frame();
454 __ restore_LR_CR(R0);
456 // Set up the arguments for the exception handler:
457 // - R3_ARG1: exception oop
458 // - R4_ARG2: exception pc.
460 // Load pending exception oop.
461 __ ld(R3_ARG1,
462 in_bytes(Thread::pending_exception_offset()),
463 R16_thread);
465 // The exception pc is the return address in the caller.
466 // Must load it into R4_ARG2.
467 __ mflr(R4_ARG2);
469 #ifdef ASSERT
470 // Make sure exception is set.
471 {
472 Label L;
473 __ cmpdi(CCR0, R3_ARG1, 0);
474 __ bne(CCR0, L);
475 __ stop("StubRoutines::forward exception: no pending exception (2)");
476 __ bind(L);
477 }
478 #endif
480 // Clear the pending exception.
481 __ li(R0, 0);
482 __ std(R0,
483 in_bytes(Thread::pending_exception_offset()),
484 R16_thread);
485 // Jump to exception handler.
486 __ bctr();
488 return start;
489 }
491 #undef __
492 #define __ masm->
493 // Continuation point for throwing of implicit exceptions that are
494 // not handled in the current activation. Fabricates an exception
495 // oop and initiates normal exception dispatching in this
496 // frame. Only callee-saved registers are preserved (through the
497 // normal register window / RegisterMap handling). If the compiler
498 // needs all registers to be preserved between the fault point and
499 // the exception handler then it must assume responsibility for that
500 // in AbstractCompiler::continuation_for_implicit_null_exception or
501 // continuation_for_implicit_division_by_zero_exception. All other
502 // implicit exceptions (e.g., NullPointerException or
503 // AbstractMethodError on entry) are either at call sites or
504 // otherwise assume that stack unwinding will be initiated, so
505 // caller saved registers were assumed volatile in the compiler.
506 //
507 // Note that we generate only this stub into a RuntimeStub, because
508 // it needs to be properly traversed and ignored during GC, so we
509 // change the meaning of the "__" macro within this method.
510 //
511 // Note: the routine set_pc_not_at_call_for_caller in
512 // SharedRuntime.cpp requires that this code be generated into a
513 // RuntimeStub.
514 address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
515 Register arg1 = noreg, Register arg2 = noreg) {
516 CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
517 MacroAssembler* masm = new MacroAssembler(&code);
519 OopMapSet* oop_maps = new OopMapSet();
520 int frame_size_in_bytes = frame::abi_reg_args_size;
521 OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
523 StubCodeMark mark(this, "StubRoutines", "throw_exception");
525 address start = __ pc();
527 __ save_LR_CR(R11_scratch1);
529 // Push a frame.
530 __ push_frame_reg_args(0, R11_scratch1);
532 address frame_complete_pc = __ pc();
534 if (restore_saved_exception_pc) {
535 __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
536 }
538 // Note that we always have a runtime stub frame on the top of
539 // stack by this point. Remember the offset of the instruction
540 // whose address will be moved to R11_scratch1.
541 address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
543 __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
545 __ mr(R3_ARG1, R16_thread);
546 if (arg1 != noreg) {
547 __ mr(R4_ARG2, arg1);
548 }
549 if (arg2 != noreg) {
550 __ mr(R5_ARG3, arg2);
551 }
552 #if defined(ABI_ELFv2)
553 __ call_c(runtime_entry, relocInfo::none);
554 #else
555 __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
556 #endif
558 // Set an oopmap for the call site.
559 oop_maps->add_gc_map((int)(gc_map_pc - start), map);
561 __ reset_last_Java_frame();
563 #ifdef ASSERT
564 // Make sure that this code is only executed if there is a pending
565 // exception.
566 {
567 Label L;
568 __ ld(R0,
569 in_bytes(Thread::pending_exception_offset()),
570 R16_thread);
571 __ cmpdi(CCR0, R0, 0);
572 __ bne(CCR0, L);
573 __ stop("StubRoutines::throw_exception: no pending exception");
574 __ bind(L);
575 }
576 #endif
578 // Pop frame.
579 __ pop_frame();
581 __ restore_LR_CR(R11_scratch1);
583 __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
584 __ mtctr(R11_scratch1);
585 __ bctr();
587 // Create runtime stub with OopMap.
588 RuntimeStub* stub =
589 RuntimeStub::new_runtime_stub(name, &code,
590 /*frame_complete=*/ (int)(frame_complete_pc - start),
591 frame_size_in_bytes/wordSize,
592 oop_maps,
593 false);
594 return stub->entry_point();
595 }
596 #undef __
597 #define __ _masm->
599 // Generate G1 pre-write barrier for array.
600 //
601 // Input:
602 // from - register containing src address (only needed for spilling)
603 // to - register containing starting address
604 // count - register containing element count
605 // tmp - scratch register
606 //
607 // Kills:
608 // nothing
609 //
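  // Illustrative sketch (assumption, not part of the original file) of the
  // G1 filter implemented below; satb_queue_is_active() is a hypothetical
  // helper standing in for the lwz/lbz of the queue's active byte:
  //
  //   if (!dest_uninitialized && satb_queue_is_active(thread)) {
  //     // spill from/to/count across the call, then:
  //     BarrierSet::static_write_ref_array_pre(to, count);
  //     // ... and reload from/to/count afterwards.
  //   }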
610 void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
611 BarrierSet* const bs = Universe::heap()->barrier_set();
612 switch (bs->kind()) {
613 case BarrierSet::G1SATBCT:
614 case BarrierSet::G1SATBCTLogging:
 615 // With G1, don't generate the call if we statically know that the target is uninitialized.
616 if (!dest_uninitialized) {
617 const int spill_slots = 4 * wordSize;
618 const int frame_size = frame::abi_reg_args_size + spill_slots;
619 Label filtered;
621 // Is marking active?
622 if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
623 __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
624 } else {
625 guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
626 __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
627 }
628 __ cmpdi(CCR0, Rtmp1, 0);
629 __ beq(CCR0, filtered);
631 __ save_LR_CR(R0);
632 __ push_frame_reg_args(spill_slots, R0);
633 __ std(from, frame_size - 1 * wordSize, R1_SP);
634 __ std(to, frame_size - 2 * wordSize, R1_SP);
635 __ std(count, frame_size - 3 * wordSize, R1_SP);
637 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
639 __ ld(from, frame_size - 1 * wordSize, R1_SP);
640 __ ld(to, frame_size - 2 * wordSize, R1_SP);
641 __ ld(count, frame_size - 3 * wordSize, R1_SP);
642 __ pop_frame();
643 __ restore_LR_CR(R0);
645 __ bind(filtered);
646 }
647 break;
648 case BarrierSet::CardTableModRef:
649 case BarrierSet::CardTableExtension:
650 case BarrierSet::ModRef:
651 break;
652 default:
653 ShouldNotReachHere();
654 }
655 }
657 // Generate CMS/G1 post-write barrier for array.
658 //
659 // Input:
660 // addr - register containing starting address
661 // count - register containing element count
662 // tmp - scratch register
663 //
664 // The input registers and R0 are overwritten.
665 //
666 void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) {
667 BarrierSet* const bs = Universe::heap()->barrier_set();
669 switch (bs->kind()) {
670 case BarrierSet::G1SATBCT:
671 case BarrierSet::G1SATBCTLogging:
672 {
673 if (branchToEnd) {
674 __ save_LR_CR(R0);
675 // We need this frame only to spill LR.
676 __ push_frame_reg_args(0, R0);
677 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
678 __ pop_frame();
679 __ restore_LR_CR(R0);
680 } else {
681 // Tail call: fake call from stub caller by branching without linking.
682 address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
683 __ mr_if_needed(R3_ARG1, addr);
684 __ mr_if_needed(R4_ARG2, count);
685 __ load_const(R11, entry_point, R0);
686 __ call_c_and_return_to_caller(R11);
687 }
688 }
689 break;
690 case BarrierSet::CardTableModRef:
691 case BarrierSet::CardTableExtension:
692 {
693 Label Lskip_loop, Lstore_loop;
694 if (UseConcMarkSweepGC) {
695 // TODO PPC port: contribute optimization / requires shared changes
696 __ release();
697 }
699 CardTableModRefBS* const ct = (CardTableModRefBS*)bs;
700 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
701 assert_different_registers(addr, count, tmp);
703 __ sldi(count, count, LogBytesPerHeapOop);
704 __ addi(count, count, -BytesPerHeapOop);
705 __ add(count, addr, count);
 706 // Use two shifts to clear out the low-order bits (this cannot be optimized into a single shift).
707 __ srdi(addr, addr, CardTableModRefBS::card_shift);
708 __ srdi(count, count, CardTableModRefBS::card_shift);
709 __ subf(count, addr, count);
710 assert_different_registers(R0, addr, count, tmp);
711 __ load_const(tmp, (address)ct->byte_map_base);
712 __ addic_(count, count, 1);
713 __ beq(CCR0, Lskip_loop);
714 __ li(R0, 0);
715 __ mtctr(count);
716 // Byte store loop
717 __ bind(Lstore_loop);
718 __ stbx(R0, tmp, addr);
719 __ addi(addr, addr, 1);
720 __ bdnz(Lstore_loop);
721 __ bind(Lskip_loop);
723 if (!branchToEnd) __ blr();
724 }
725 break;
726 case BarrierSet::ModRef:
727 if (!branchToEnd) __ blr();
728 break;
729 default:
730 ShouldNotReachHere();
731 }
732 }
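  // Illustrative C sketch (assumption, not part of the original file) of the
  // card-dirtying loop generated for the card-table case above:
  //
  //   void post_barrier(char* addr, size_t count, jbyte* byte_map_base) {
  //     char*  last  = addr + count * BytesPerHeapOop - BytesPerHeapOop;
  //     size_t first_card = (uintptr_t)addr >> CardTableModRefBS::card_shift;
  //     size_t last_card  = (uintptr_t)last >> CardTableModRefBS::card_shift;
  //     for (size_t card = first_card; card <= last_card; card++) {
  //       byte_map_base[card] = 0; // the stbx of zero in the loop above
  //     }
  //   }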
734 // Support for void zero_words_aligned8(HeapWord* to, size_t count)
735 //
736 // Arguments:
 737 //   to:    R3_ARG1, 8-byte aligned base address
 738 //   count: R4_ARG2, number of 8-byte words (dwords) to clear
 739 //
 740 // Destroys:
 741 //   R3_ARG1 through R7_ARG5, CTR, CCR0 and CCR1
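  // Strategy sketch (illustrative only, assuming a 128-byte cache line as
  // queried via VM_Version::get_cache_line_size()): clear dword-by-dword up
  // to the next cache-line boundary, clear whole lines with dcbz, then clear
  // the remainder. Roughly, in C:
  //
  //   void zero_words_aligned8(uint64_t* to, size_t count) {
  //     while (count > 0 && ((uintptr_t)to & 127) != 0) { *to++ = 0; count--; }
  //     while (count >= 16) { /* dcbz(to) */ to += 16; count -= 16; }
  //     while (count > 0) { *to++ = 0; count--; }
  //   }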
742 address generate_zero_words_aligned8() {
743 StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
745 // Implemented as in ClearArray.
746 address start = __ function_entry();
748 Register base_ptr_reg = R3_ARG1; // tohw (needs to be 8b aligned)
749 Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
750 Register tmp1_reg = R5_ARG3;
751 Register tmp2_reg = R6_ARG4;
752 Register zero_reg = R7_ARG5;
754 // Procedure for large arrays (uses data cache block zero instruction).
755 Label dwloop, fast, fastloop, restloop, lastdword, done;
756 int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
757 int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
759 // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
760 __ dcbtst(base_ptr_reg); // Indicate write access to first cache line ...
761 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if number of dwords is even.
762 __ srdi_(tmp1_reg, cnt_dwords_reg, 1); // number of double dwords
763 __ load_const_optimized(zero_reg, 0L); // Use as zero register.
765 __ cmpdi(CCR1, tmp2_reg, 0); // cnt_dwords even?
766 __ beq(CCR0, lastdword); // size <= 1
767 __ mtctr(tmp1_reg); // Speculatively preload counter for rest loop (>0).
768 __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
769 __ neg(tmp1_reg, base_ptr_reg); // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
771 __ blt(CCR0, restloop); // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
772 __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
774 __ beq(CCR0, fast); // already 128byte aligned
775 __ mtctr(tmp1_reg); // Set ctr to hit 128byte boundary (0<ctr<cnt).
776 __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
778 // Clear in first cache line dword-by-dword if not already 128byte aligned.
779 __ bind(dwloop);
780 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
781 __ addi(base_ptr_reg, base_ptr_reg, 8);
782 __ bdnz(dwloop);
784 // clear 128byte blocks
785 __ bind(fast);
786 __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
787 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if rest even
789 __ mtctr(tmp1_reg); // load counter
790 __ cmpdi(CCR1, tmp2_reg, 0); // rest even?
791 __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
793 __ bind(fastloop);
794 __ dcbz(base_ptr_reg); // Clear 128byte aligned block.
795 __ addi(base_ptr_reg, base_ptr_reg, cl_size);
796 __ bdnz(fastloop);
798 //__ dcbtst(base_ptr_reg); // Indicate write access to last cache line.
799 __ beq(CCR0, lastdword); // rest<=1
800 __ mtctr(tmp1_reg); // load counter
802 // Clear rest.
803 __ bind(restloop);
804 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
805 __ std(zero_reg, 8, base_ptr_reg); // Clear 8byte aligned block.
806 __ addi(base_ptr_reg, base_ptr_reg, 16);
807 __ bdnz(restloop);
809 __ bind(lastdword);
810 __ beq(CCR1, done);
811 __ std(zero_reg, 0, base_ptr_reg);
812 __ bind(done);
813 __ blr(); // return
815 return start;
816 }
818 // The following routine generates a subroutine to throw an asynchronous
819 // UnknownError when an unsafe access gets a fault that could not be
820 // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.)
821 //
822 address generate_handler_for_unsafe_access() {
823 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
824 address start = __ function_entry();
825 __ unimplemented("StubRoutines::handler_for_unsafe_access", 93);
826 return start;
827 }
829 #if !defined(PRODUCT)
830 // Wrapper which calls oopDesc::is_oop_or_null()
831 // Only called by MacroAssembler::verify_oop
832 static void verify_oop_helper(const char* message, oop o) {
833 if (!o->is_oop_or_null()) {
834 fatal(message);
835 }
836 ++ StubRoutines::_verify_oop_count;
837 }
838 #endif
840 // Return address of code to be called from code generated by
841 // MacroAssembler::verify_oop.
842 //
843 // Don't generate, rather use C++ code.
844 address generate_verify_oop() {
845 StubCodeMark mark(this, "StubRoutines", "verify_oop");
847 // this is actually a `FunctionDescriptor*'.
848 address start = 0;
850 #if !defined(PRODUCT)
851 start = CAST_FROM_FN_PTR(address, verify_oop_helper);
852 #endif
854 return start;
855 }
857 // Fairer handling of safepoints for native methods.
858 //
859 // Generate code which reads from the polling page. This special handling is needed as the
860 // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
861 // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
862 // to read from the safepoint polling page.
863 address generate_load_from_poll() {
864 StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
865 address start = __ function_entry();
866 __ unimplemented("StubRoutines::verify_oop", 95); // TODO PPC port
867 return start;
868 }
870 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
871 //
 872 // The code is implemented (ported from SPARC) because we believe it benefits JVM98;
 873 // however, tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
874 //
875 // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
876 // for turning on loop predication optimization, and hence the behavior of "array range check"
877 // and "loop invariant check" could be influenced, which potentially boosted JVM98.
878 //
879 // Generate stub for disjoint short fill. If "aligned" is true, the
880 // "to" address is assumed to be heapword aligned.
881 //
882 // Arguments for generated stub:
883 // to: R3_ARG1
884 // value: R4_ARG2
885 // count: R5_ARG3 treated as signed
886 //
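  // Semantically (illustrative sketch only) the generated stub behaves like
  // the loop below, e.g. for T_SHORT:
  //
  //   void fill(jshort* to, jshort value, int count) {
  //     for (int i = 0; i < count; i++) to[i] = value;
  //   }
  //
  // but stores 32 bytes per iteration once 'value' has been replicated
  // across a 64-bit register, as shown below.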
887 address generate_fill(BasicType t, bool aligned, const char* name) {
888 StubCodeMark mark(this, "StubRoutines", name);
889 address start = __ function_entry();
 891 const Register to = R3_ARG1; // destination array address
892 const Register value = R4_ARG2; // fill value
893 const Register count = R5_ARG3; // elements count
894 const Register temp = R6_ARG4; // temp register
896 //assert_clean_int(count, O3); // Make sure 'count' is clean int.
898 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
899 Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
901 int shift = -1;
902 switch (t) {
903 case T_BYTE:
904 shift = 2;
905 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
906 __ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
907 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
908 __ blt(CCR0, L_fill_elements);
909 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
910 break;
911 case T_SHORT:
912 shift = 1;
913 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
914 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
915 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
916 __ blt(CCR0, L_fill_elements);
917 break;
918 case T_INT:
919 shift = 0;
920 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
921 __ blt(CCR0, L_fill_4_bytes);
922 break;
923 default: ShouldNotReachHere();
924 }
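      // The rldimi sequence above replicates the fill value across the
      // register; e.g. for T_BYTE with value = 0xAB (illustrative arithmetic):
      //
      //   0x00000000000000AB -> 0x000000000000ABAB  // 8 bit  -> 16 bit
      //   0x000000000000ABAB -> 0x00000000ABABABAB  // 16 bit -> 32 bit
      //   (later, below)     -> 0xABABABABABABABAB  // 32 bit -> 64 bit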
926 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
927 // Align source address at 4 bytes address boundary.
928 if (t == T_BYTE) {
929 // One byte misalignment happens only for byte arrays.
930 __ andi_(temp, to, 1);
931 __ beq(CCR0, L_skip_align1);
932 __ stb(value, 0, to);
933 __ addi(to, to, 1);
934 __ addi(count, count, -1);
935 __ bind(L_skip_align1);
936 }
937 // Two bytes misalignment happens only for byte and short (char) arrays.
938 __ andi_(temp, to, 2);
939 __ beq(CCR0, L_skip_align2);
940 __ sth(value, 0, to);
941 __ addi(to, to, 2);
942 __ addi(count, count, -(1 << (shift - 1)));
943 __ bind(L_skip_align2);
944 }
946 if (!aligned) {
947 // Align to 8 bytes, we know we are 4 byte aligned to start.
948 __ andi_(temp, to, 7);
949 __ beq(CCR0, L_fill_32_bytes);
950 __ stw(value, 0, to);
951 __ addi(to, to, 4);
952 __ addi(count, count, -(1 << shift));
953 __ bind(L_fill_32_bytes);
954 }
956 __ li(temp, 8<<shift); // Prepare for 32 byte loop.
957 // Clone bytes int->long as above.
958 __ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
960 Label L_check_fill_8_bytes;
961 // Fill 32-byte chunks.
962 __ subf_(count, temp, count);
963 __ blt(CCR0, L_check_fill_8_bytes);
965 Label L_fill_32_bytes_loop;
966 __ align(32);
967 __ bind(L_fill_32_bytes_loop);
969 __ std(value, 0, to);
970 __ std(value, 8, to);
971 __ subf_(count, temp, count); // Update count.
972 __ std(value, 16, to);
973 __ std(value, 24, to);
975 __ addi(to, to, 32);
976 __ bge(CCR0, L_fill_32_bytes_loop);
978 __ bind(L_check_fill_8_bytes);
979 __ add_(count, temp, count);
980 __ beq(CCR0, L_exit);
981 __ addic_(count, count, -(2 << shift));
982 __ blt(CCR0, L_fill_4_bytes);
984 //
985 // Length is too short, just fill 8 bytes at a time.
986 //
987 Label L_fill_8_bytes_loop;
988 __ bind(L_fill_8_bytes_loop);
989 __ std(value, 0, to);
990 __ addic_(count, count, -(2 << shift));
991 __ addi(to, to, 8);
992 __ bge(CCR0, L_fill_8_bytes_loop);
994 // Fill trailing 4 bytes.
995 __ bind(L_fill_4_bytes);
996 __ andi_(temp, count, 1<<shift);
997 __ beq(CCR0, L_fill_2_bytes);
999 __ stw(value, 0, to);
1000 if (t == T_BYTE || t == T_SHORT) {
1001 __ addi(to, to, 4);
1002 // Fill trailing 2 bytes.
1003 __ bind(L_fill_2_bytes);
1004 __ andi_(temp, count, 1<<(shift-1));
1005 __ beq(CCR0, L_fill_byte);
1006 __ sth(value, 0, to);
1007 if (t == T_BYTE) {
1008 __ addi(to, to, 2);
1009 // Fill trailing byte.
1010 __ bind(L_fill_byte);
1011 __ andi_(count, count, 1);
1012 __ beq(CCR0, L_exit);
1013 __ stb(value, 0, to);
1014 } else {
1015 __ bind(L_fill_byte);
1016 }
1017 } else {
1018 __ bind(L_fill_2_bytes);
1019 }
1020 __ bind(L_exit);
1021 __ blr();
 1023 // Handle fills of less than 8 bytes. Int is handled elsewhere.
1024 if (t == T_BYTE) {
1025 __ bind(L_fill_elements);
1026 Label L_fill_2, L_fill_4;
1027 __ andi_(temp, count, 1);
1028 __ beq(CCR0, L_fill_2);
1029 __ stb(value, 0, to);
1030 __ addi(to, to, 1);
1031 __ bind(L_fill_2);
1032 __ andi_(temp, count, 2);
1033 __ beq(CCR0, L_fill_4);
1034 __ stb(value, 0, to);
 1035 __ stb(value, 1, to); // store second byte
1036 __ addi(to, to, 2);
1037 __ bind(L_fill_4);
1038 __ andi_(temp, count, 4);
1039 __ beq(CCR0, L_exit);
1040 __ stb(value, 0, to);
1041 __ stb(value, 1, to);
1042 __ stb(value, 2, to);
1043 __ stb(value, 3, to);
1044 __ blr();
1045 }
1047 if (t == T_SHORT) {
1048 Label L_fill_2;
1049 __ bind(L_fill_elements);
1050 __ andi_(temp, count, 1);
1051 __ beq(CCR0, L_fill_2);
1052 __ sth(value, 0, to);
1053 __ addi(to, to, 2);
1054 __ bind(L_fill_2);
1055 __ andi_(temp, count, 2);
1056 __ beq(CCR0, L_exit);
1057 __ sth(value, 0, to);
1058 __ sth(value, 2, to);
1059 __ blr();
1060 }
1061 return start;
1062 }
1065 // Generate overlap test for array copy stubs.
1066 //
1067 // Input:
1068 // R3_ARG1 - from
1069 // R4_ARG2 - to
1070 // R5_ARG3 - element count
1071 //
1072 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
1073 Register tmp1 = R6_ARG4;
1074 Register tmp2 = R7_ARG5;
1076 Label l_overlap;
1077 #ifdef ASSERT
1078 __ srdi_(tmp2, R5_ARG3, 31);
1079 __ asm_assert_eq("missing zero extend", 0xAFFE);
1080 #endif
1082 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
1083 __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
1084 __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
1085 __ cmpld(CCR1, tmp1, tmp2);
1086 __ crand(/*CCR0 lt*/0, /*CCR1 lt*/4+0, /*CCR0 lt*/0);
1087 __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
1089 // need to copy forwards
1090 if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
1091 __ b(no_overlap_target);
1092 } else {
1093 __ load_const(tmp1, no_overlap_target, tmp2);
1094 __ mtctr(tmp1);
1095 __ bctr();
1096 }
1098 __ bind(l_overlap);
1099 // need to copy backwards
1100 }
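  // Illustrative C sketch (assumption, not part of the original file) of the
  // predicate tested above:
  //
  //   bool must_copy_backwards(char* from, char* to,
  //                            size_t count, int log2_elem_size) {
  //     size_t size = count << log2_elem_size;
  //     // Overlap requiring a backward copy: src before dst and distance
  //     // (unsigned) smaller than the region size.
  //     return (uintptr_t)from < (uintptr_t)to &&
  //            (uintptr_t)(to - from) < size;
  //   }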
1102 // The guideline in the implementations of generate_disjoint_xxx_copy
1103 // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
1104 // single instructions, but to avoid alignment interrupts (see subsequent
 1105 // comment). Furthermore, we try to minimize misaligned accesses, even
1106 // though they cause no alignment interrupt.
1107 //
1108 // In Big-Endian mode, the PowerPC architecture requires implementations to
1109 // handle automatically misaligned integer halfword and word accesses,
1110 // word-aligned integer doubleword accesses, and word-aligned floating-point
1111 // accesses. Other accesses may or may not generate an Alignment interrupt
1112 // depending on the implementation.
1113 // Alignment interrupt handling may require on the order of hundreds of cycles,
1114 // so every effort should be made to avoid misaligned memory values.
1115 //
1116 //
1117 // Generate stub for disjoint byte copy. If "aligned" is true, the
1118 // "from" and "to" addresses are assumed to be heapword aligned.
1119 //
1120 // Arguments for generated stub:
1121 // from: R3_ARG1
1122 // to: R4_ARG2
1123 // count: R5_ARG3 treated as signed
1124 //
1125 address generate_disjoint_byte_copy(bool aligned, const char * name) {
1126 StubCodeMark mark(this, "StubRoutines", name);
1127 address start = __ function_entry();
1129 Register tmp1 = R6_ARG4;
1130 Register tmp2 = R7_ARG5;
1131 Register tmp3 = R8_ARG6;
1132 Register tmp4 = R9_ARG7;
1134 VectorSRegister tmp_vsr1 = VSR1;
1135 VectorSRegister tmp_vsr2 = VSR2;
1137 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
1139 // Don't try anything fancy if arrays don't have many elements.
1140 __ li(tmp3, 0);
1141 __ cmpwi(CCR0, R5_ARG3, 17);
1142 __ ble(CCR0, l_6); // copy 4 at a time
1144 if (!aligned) {
1145 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1146 __ andi_(tmp1, tmp1, 3);
1147 __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1149 // Copy elements if necessary to align to 4 bytes.
1150 __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1151 __ andi_(tmp1, tmp1, 3);
1152 __ beq(CCR0, l_2);
1154 __ subf(R5_ARG3, tmp1, R5_ARG3);
1155 __ bind(l_9);
1156 __ lbz(tmp2, 0, R3_ARG1);
1157 __ addic_(tmp1, tmp1, -1);
1158 __ stb(tmp2, 0, R4_ARG2);
1159 __ addi(R3_ARG1, R3_ARG1, 1);
1160 __ addi(R4_ARG2, R4_ARG2, 1);
1161 __ bne(CCR0, l_9);
1163 __ bind(l_2);
1164 }
1166 // copy 8 elements at a time
1167 __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1168 __ andi_(tmp1, tmp2, 7);
1169 __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1171 // copy a 2-element word if necessary to align to 8 bytes
1172 __ andi_(R0, R3_ARG1, 7);
1173 __ beq(CCR0, l_7);
1175 __ lwzx(tmp2, R3_ARG1, tmp3);
1176 __ addi(R5_ARG3, R5_ARG3, -4);
1177 __ stwx(tmp2, R4_ARG2, tmp3);
1178 { // FasterArrayCopy
1179 __ addi(R3_ARG1, R3_ARG1, 4);
1180 __ addi(R4_ARG2, R4_ARG2, 4);
1181 }
1182 __ bind(l_7);
1184 { // FasterArrayCopy
1185 __ cmpwi(CCR0, R5_ARG3, 31);
1186 __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
1188 __ srdi(tmp1, R5_ARG3, 5);
1189 __ andi_(R5_ARG3, R5_ARG3, 31);
1190 __ mtctr(tmp1);
1192 if (!VM_Version::has_vsx()) {
1194 __ bind(l_8);
1195 // Use unrolled version for mass copying (copy 32 elements a time)
1196 // Load feeding store gets zero latency on Power6, however not on Power5.
1197 // Therefore, the following sequence is made for the good of both.
1198 __ ld(tmp1, 0, R3_ARG1);
1199 __ ld(tmp2, 8, R3_ARG1);
1200 __ ld(tmp3, 16, R3_ARG1);
1201 __ ld(tmp4, 24, R3_ARG1);
1202 __ std(tmp1, 0, R4_ARG2);
1203 __ std(tmp2, 8, R4_ARG2);
1204 __ std(tmp3, 16, R4_ARG2);
1205 __ std(tmp4, 24, R4_ARG2);
1206 __ addi(R3_ARG1, R3_ARG1, 32);
1207 __ addi(R4_ARG2, R4_ARG2, 32);
1208 __ bdnz(l_8);
1210 } else { // Processor supports VSX, so use it to mass copy.
1212 // Prefetch the data into the L2 cache.
1213 __ dcbt(R3_ARG1, 0);
1215 // If supported set DSCR pre-fetch to deepest.
1216 if (VM_Version::has_mfdscr()) {
1217 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1218 __ mtdscr(tmp2);
1219 }
1221 __ li(tmp1, 16);
 1223 // Backbranch target aligned to 32 bytes. 16-byte alignment is not used,
 1224 // as the loop contains < 8 instructions that fit inside a single
 1225 // i-cache sector.
1226 __ align(32);
1228 __ bind(l_10);
1229 // Use loop with VSX load/store instructions to
1230 // copy 32 elements a time.
1231 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1232 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1233 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1234 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1235 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
 1236 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
1237 __ bdnz(l_10); // Dec CTR and loop if not zero.
1239 // Restore DSCR pre-fetch value.
1240 if (VM_Version::has_mfdscr()) {
1241 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1242 __ mtdscr(tmp2);
1243 }
1245 } // VSX
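      // In effect, each iteration of the VSX loop above moves 32 bytes
      // (illustrative sketch; lxvd2x/stxvd2x are the 16-byte vector
      // load/store instructions used above):
      //
      //   memcpy(to,      from,      16); // lxvd2x/stxvd2x at offset 0
      //   memcpy(to + 16, from + 16, 16); // lxvd2x/stxvd2x at offset 16
      //   from += 32; to += 32;           // and bdnz decrements CTR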
1246 } // FasterArrayCopy
1248 __ bind(l_6);
1250 // copy 4 elements at a time
1251 __ cmpwi(CCR0, R5_ARG3, 4);
1252 __ blt(CCR0, l_1);
1253 __ srdi(tmp1, R5_ARG3, 2);
1254 __ mtctr(tmp1); // is > 0
1255 __ andi_(R5_ARG3, R5_ARG3, 3);
1257 { // FasterArrayCopy
1258 __ addi(R3_ARG1, R3_ARG1, -4);
1259 __ addi(R4_ARG2, R4_ARG2, -4);
1260 __ bind(l_3);
1261 __ lwzu(tmp2, 4, R3_ARG1);
1262 __ stwu(tmp2, 4, R4_ARG2);
1263 __ bdnz(l_3);
1264 __ addi(R3_ARG1, R3_ARG1, 4);
1265 __ addi(R4_ARG2, R4_ARG2, 4);
1266 }
1268 // do single element copy
1269 __ bind(l_1);
1270 __ cmpwi(CCR0, R5_ARG3, 0);
1271 __ beq(CCR0, l_4);
1273 { // FasterArrayCopy
1274 __ mtctr(R5_ARG3);
1275 __ addi(R3_ARG1, R3_ARG1, -1);
1276 __ addi(R4_ARG2, R4_ARG2, -1);
1278 __ bind(l_5);
1279 __ lbzu(tmp2, 1, R3_ARG1);
1280 __ stbu(tmp2, 1, R4_ARG2);
1281 __ bdnz(l_5);
1282 }
1284 __ bind(l_4);
1285 __ blr();
1287 return start;
1288 }
1290 // Generate stub for conjoint byte copy. If "aligned" is true, the
1291 // "from" and "to" addresses are assumed to be heapword aligned.
1292 //
1293 // Arguments for generated stub:
1294 // from: R3_ARG1
1295 // to: R4_ARG2
1296 // count: R5_ARG3 treated as signed
1297 //
1298 address generate_conjoint_byte_copy(bool aligned, const char * name) {
1299 StubCodeMark mark(this, "StubRoutines", name);
1300 address start = __ function_entry();
1302 Register tmp1 = R6_ARG4;
1303 Register tmp2 = R7_ARG5;
1304 Register tmp3 = R8_ARG6;
1306 #if defined(ABI_ELFv2)
1307 address nooverlap_target = aligned ?
1308 StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
1309 StubRoutines::jbyte_disjoint_arraycopy();
1310 #else
1311 address nooverlap_target = aligned ?
1312 ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
1313 ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
1314 #endif
1316 array_overlap_test(nooverlap_target, 0);
1317 // Do reverse copy. We assume the case of actual overlap is rare enough
1318 // that we don't have to optimize it.
1319 Label l_1, l_2;
1321 __ b(l_2);
1322 __ bind(l_1);
1323 __ stbx(tmp1, R4_ARG2, R5_ARG3);
1324 __ bind(l_2);
1325 __ addic_(R5_ARG3, R5_ARG3, -1);
1326 __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1327 __ bge(CCR0, l_1);
1329 __ blr();
1331 return start;
1332 }
1334 // Generate stub for disjoint short copy. If "aligned" is true, the
1335 // "from" and "to" addresses are assumed to be heapword aligned.
1336 //
1337 // Arguments for generated stub:
1338 // from: R3_ARG1
1339 // to: R4_ARG2
1340 // elm.count: R5_ARG3 treated as signed
1341 //
1342 // Strategy for aligned==true:
1343 //
1344 // If length <= 9:
1345 // 1. copy 2 elements at a time (l_6)
1346 // 2. copy last element if original element count was odd (l_1)
1347 //
1348 // If length > 9:
1349 // 1. copy 4 elements at a time until less than 4 elements are left (l_7)
1350 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
1351 // 3. copy last element if one was left in step 2. (l_1)
1352 //
1353 //
1354 // Strategy for aligned==false:
1355 //
1356 // If length <= 9: same as aligned==true case, but NOTE: load/stores
1357 // can be unaligned (see comment below)
1358 //
1359 // If length > 9:
1360 // 1. continue with step 6. if the alignment of from and to mod 4
1361 // is different.
1362 // 2. align from and to to 4 bytes by copying 1 element if necessary
1363 // 3. at l_2 from and to are 4 byte aligned; continue with
1364 // 5. if they cannot be aligned to 8 bytes because they have
1365 // got different alignment mod 8.
1366 // 4. at this point we know that both, from and to, have the same
1367 // alignment mod 8, now copy one element if necessary to get
1368 // 8 byte alignment of from and to.
1369 // 5. copy 4 elements at a time until less than 4 elements are
1370 // left; depending on step 3. all load/stores are aligned or
1371 // either all loads or all stores are unaligned.
1372 // 6. copy 2 elements at a time until less than 2 elements are
1373 // left (l_6); arriving here from step 1., there is a chance
1374 // that all accesses are unaligned.
1375 // 7. copy last element if one was left in step 6. (l_1)
1376 //
1377 // There are unaligned data accesses using integer load/store
1378 // instructions in this stub. POWER allows such accesses.
1379 //
1380 // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1381 // Chapter 2: Effect of Operand Placement on Performance) unaligned
1382 // integer load/stores have good performance. Only unaligned
1383 // floating point load/stores can have poor performance.
1384 //
1385 // TODO:
1386 //
1387 // 1. check if aligning the backbranch target of loops is beneficial
1388 //
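  // Condensed C sketch (an illustrative assumption) of the aligned==false
  // strategy described above; copy_one is a hypothetical helper:
  //
  //   void copy_shorts(jshort* from, jshort* to, int count) {
  //     if (count > 9 && (((uintptr_t)from ^ (uintptr_t)to) & 3) == 0) {
  //       if ((uintptr_t)from & 3) copy_one(&from, &to, &count);  // step 2
  //       // steps 3-5: align to 8 bytes when from and to agree mod 8,
  //       // then copy 4 elements (8 bytes) at a time while count >= 4
  //     }
  //     while (count >= 2) { *to++ = *from++; *to++ = *from++; count -= 2; }
  //     if (count) *to = *from;                                   // step 7
  //   }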
1389 address generate_disjoint_short_copy(bool aligned, const char * name) {
1390 StubCodeMark mark(this, "StubRoutines", name);
1392 Register tmp1 = R6_ARG4;
1393 Register tmp2 = R7_ARG5;
1394 Register tmp3 = R8_ARG6;
1395 Register tmp4 = R9_ARG7;
1397 VectorSRegister tmp_vsr1 = VSR1;
1398 VectorSRegister tmp_vsr2 = VSR2;
1400 address start = __ function_entry();
1402 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1404 // don't try anything fancy if arrays don't have many elements
1405 __ li(tmp3, 0);
1406 __ cmpwi(CCR0, R5_ARG3, 9);
1407 __ ble(CCR0, l_6); // copy 2 at a time
1409 if (!aligned) {
1410 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1411 __ andi_(tmp1, tmp1, 3);
1412 __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1414 // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1416 // Copy 1 element if necessary to align to 4 bytes.
1417 __ andi_(tmp1, R3_ARG1, 3);
1418 __ beq(CCR0, l_2);
1420 __ lhz(tmp2, 0, R3_ARG1);
1421 __ addi(R3_ARG1, R3_ARG1, 2);
1422 __ sth(tmp2, 0, R4_ARG2);
1423 __ addi(R4_ARG2, R4_ARG2, 2);
1424 __ addi(R5_ARG3, R5_ARG3, -1);
1425 __ bind(l_2);
1427 // At this point the positions of both, from and to, are at least 4 byte aligned.
1429 // Copy 4 elements at a time.
1430 // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1431 __ xorr(tmp2, R3_ARG1, R4_ARG2);
1432 __ andi_(tmp1, tmp2, 7);
1433 __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1435 // Copy a 2-element word if necessary to align to 8 bytes.
1436 __ andi_(R0, R3_ARG1, 7);
1437 __ beq(CCR0, l_7);
1439 __ lwzx(tmp2, R3_ARG1, tmp3);
1440 __ addi(R5_ARG3, R5_ARG3, -2);
1441 __ stwx(tmp2, R4_ARG2, tmp3);
1442 { // FasterArrayCopy
1443 __ addi(R3_ARG1, R3_ARG1, 4);
1444 __ addi(R4_ARG2, R4_ARG2, 4);
1445 }
1446 }
1448 __ bind(l_7);
1450 // Copy 4 elements at a time; either the loads or the stores can
1451 // be unaligned if aligned == false.
1453 { // FasterArrayCopy
1454 __ cmpwi(CCR0, R5_ARG3, 15);
1455 __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
1457 __ srdi(tmp1, R5_ARG3, 4);
1458 __ andi_(R5_ARG3, R5_ARG3, 15);
1459 __ mtctr(tmp1);
1461 if (!VM_Version::has_vsx()) {
1463 __ bind(l_8);
1464 // Use unrolled version for mass copying (copy 16 elements a time).
1465 // Load feeding store gets zero latency on Power6, however not on Power5.
1466 // Therefore, the following sequence is made for the good of both.
1467 __ ld(tmp1, 0, R3_ARG1);
1468 __ ld(tmp2, 8, R3_ARG1);
1469 __ ld(tmp3, 16, R3_ARG1);
1470 __ ld(tmp4, 24, R3_ARG1);
1471 __ std(tmp1, 0, R4_ARG2);
1472 __ std(tmp2, 8, R4_ARG2);
1473 __ std(tmp3, 16, R4_ARG2);
1474 __ std(tmp4, 24, R4_ARG2);
1475 __ addi(R3_ARG1, R3_ARG1, 32);
1476 __ addi(R4_ARG2, R4_ARG2, 32);
1477 __ bdnz(l_8);
1479 } else { // Processor supports VSX, so use it to mass copy.
1481 // Prefetch src data into L2 cache.
1482 __ dcbt(R3_ARG1, 0);
1484 // If supported set DSCR pre-fetch to deepest.
1485 if (VM_Version::has_mfdscr()) {
1486 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1487 __ mtdscr(tmp2);
1488 }
1489 __ li(tmp1, 16);
 1491 // Backbranch target aligned to 32 bytes. 16-byte alignment is not used,
 1492 // as the loop contains < 8 instructions that fit inside a single
 1493 // i-cache sector.
1494 __ align(32);
1496 __ bind(l_9);
1497 // Use loop with VSX load/store instructions to
1498 // copy 16 elements a time.
1499 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
1500 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
1501 __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
1502 __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1503 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
 1504 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32.
1505 __ bdnz(l_9); // Dec CTR and loop if not zero.
1507 // Restore DSCR pre-fetch value.
1508 if (VM_Version::has_mfdscr()) {
1509 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1510 __ mtdscr(tmp2);
1511 }
1513 }
1514 } // FasterArrayCopy
1515 __ bind(l_6);
1517 // copy 2 elements at a time
1518 { // FasterArrayCopy
1519 __ cmpwi(CCR0, R5_ARG3, 2);
1520 __ blt(CCR0, l_1);
1521 __ srdi(tmp1, R5_ARG3, 1);
1522 __ andi_(R5_ARG3, R5_ARG3, 1);
1524 __ addi(R3_ARG1, R3_ARG1, -4);
1525 __ addi(R4_ARG2, R4_ARG2, -4);
1526 __ mtctr(tmp1);
1528 __ bind(l_3);
1529 __ lwzu(tmp2, 4, R3_ARG1);
1530 __ stwu(tmp2, 4, R4_ARG2);
1531 __ bdnz(l_3);
1533 __ addi(R3_ARG1, R3_ARG1, 4);
1534 __ addi(R4_ARG2, R4_ARG2, 4);
1535 }
1537 // do single element copy
1538 __ bind(l_1);
1539 __ cmpwi(CCR0, R5_ARG3, 0);
1540 __ beq(CCR0, l_4);
1542 { // FasterArrayCopy
1543 __ mtctr(R5_ARG3);
1544 __ addi(R3_ARG1, R3_ARG1, -2);
1545 __ addi(R4_ARG2, R4_ARG2, -2);
1547 __ bind(l_5);
1548 __ lhzu(tmp2, 2, R3_ARG1);
1549 __ sthu(tmp2, 2, R4_ARG2);
1550 __ bdnz(l_5);
1551 }
1552 __ bind(l_4);
1553 __ blr();
1555 return start;
1556 }
1558 // Generate stub for conjoint short copy. If "aligned" is true, the
1559 // "from" and "to" addresses are assumed to be heapword aligned.
1560 //
1561 // Arguments for generated stub:
1562 // from: R3_ARG1
1563 // to: R4_ARG2
1564 // count: R5_ARG3 treated as signed
1565 //
1566 address generate_conjoint_short_copy(bool aligned, const char * name) {
1567 StubCodeMark mark(this, "StubRoutines", name);
1568 address start = __ function_entry();
1570 Register tmp1 = R6_ARG4;
1571 Register tmp2 = R7_ARG5;
1572 Register tmp3 = R8_ARG6;
1574 #if defined(ABI_ELFv2)
1575 address nooverlap_target = aligned ?
1576 StubRoutines::arrayof_jshort_disjoint_arraycopy() :
1577 StubRoutines::jshort_disjoint_arraycopy();
1578 #else
1579 address nooverlap_target = aligned ?
1580 ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
1581 ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
1582 #endif
1584 array_overlap_test(nooverlap_target, 1);
1586 Label l_1, l_2;
1587 __ sldi(tmp1, R5_ARG3, 1);
1588 __ b(l_2);
1589 __ bind(l_1);
1590 __ sthx(tmp2, R4_ARG2, tmp1);
1591 __ bind(l_2);
1592 __ addic_(tmp1, tmp1, -2);
1593 __ lhzx(tmp2, R3_ARG1, tmp1);
1594 __ bge(CCR0, l_1);
1596 __ blr();
1598 return start;
1599 }
1601 // Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned"
1602 // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1603 //
1604 // Arguments:
1605 // from: R3_ARG1
1606 // to: R4_ARG2
1607 // count: R5_ARG3 treated as signed
1608 //
1609 void generate_disjoint_int_copy_core(bool aligned) {
1610 Register tmp1 = R6_ARG4;
1611 Register tmp2 = R7_ARG5;
1612 Register tmp3 = R8_ARG6;
1613 Register tmp4 = R0;
1615 VectorSRegister tmp_vsr1 = VSR1;
1616 VectorSRegister tmp_vsr2 = VSR2;
1618 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1620 // for short arrays, just do single element copy
1621 __ li(tmp3, 0);
1622 __ cmpwi(CCR0, R5_ARG3, 5);
1623 __ ble(CCR0, l_2);
1625 if (!aligned) {
1626 // check if arrays have same alignment mod 8.
1627 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1628 __ andi_(R0, tmp1, 7);
1629 // Not the same alignment, but ld and std just need to be 4 byte aligned.
1630 __ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
1632 // copy 1 element to align to and from on an 8 byte boundary
1633 __ andi_(R0, R3_ARG1, 7);
1634 __ beq(CCR0, l_4);
1636 __ lwzx(tmp2, R3_ARG1, tmp3);
1637 __ addi(R5_ARG3, R5_ARG3, -1);
1638 __ stwx(tmp2, R4_ARG2, tmp3);
1639 { // FasterArrayCopy
1640 __ addi(R3_ARG1, R3_ARG1, 4);
1641 __ addi(R4_ARG2, R4_ARG2, 4);
1642 }
1643 __ bind(l_4);
1644 }
1646 { // FasterArrayCopy
1647 __ cmpwi(CCR0, R5_ARG3, 7);
1648 __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
1650 __ srdi(tmp1, R5_ARG3, 3);
1651 __ andi_(R5_ARG3, R5_ARG3, 7);
1652 __ mtctr(tmp1);
1654 if (!VM_Version::has_vsx()) {
1656 __ bind(l_6);
1657 // Use unrolled version for mass copying (copy 8 elements a time).
1658 // Load feeding store gets zero latency on power6, however not on power 5.
1659 // Therefore, the following sequence is made for the good of both.
1660 __ ld(tmp1, 0, R3_ARG1);
1661 __ ld(tmp2, 8, R3_ARG1);
1662 __ ld(tmp3, 16, R3_ARG1);
1663 __ ld(tmp4, 24, R3_ARG1);
1664 __ std(tmp1, 0, R4_ARG2);
1665 __ std(tmp2, 8, R4_ARG2);
1666 __ std(tmp3, 16, R4_ARG2);
1667 __ std(tmp4, 24, R4_ARG2);
1668 __ addi(R3_ARG1, R3_ARG1, 32);
1669 __ addi(R4_ARG2, R4_ARG2, 32);
1670 __ bdnz(l_6);
1672 } else { // Processor supports VSX, so use it to mass copy.
1674 // Prefetch the data into the L2 cache.
1675 __ dcbt(R3_ARG1, 0);
1677 // If supported set DSCR pre-fetch to deepest.
1678 if (VM_Version::has_mfdscr()) {
1679 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1680 __ mtdscr(tmp2);
1681 }
1683 __ li(tmp1, 16);
 1685 // Backbranch target aligned to 32 bytes. 16-byte alignment is not used,
 1686 // as the loop contains < 8 instructions that fit inside a single
 1687 // i-cache sector.
1688 __ align(32);
1690 __ bind(l_7);
1691 // Use loop with VSX load/store instructions to
1692 // copy 8 elements a time.
1693 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1694 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1695 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1696 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1697 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
 1698 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
1699 __ bdnz(l_7); // Dec CTR and loop if not zero.
1701 // Restore DSCR pre-fetch value.
1702 if (VM_Version::has_mfdscr()) {
1703 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1704 __ mtdscr(tmp2);
1705 }
1707 } // VSX
1708 } // FasterArrayCopy
1710 // copy 1 element at a time
1711 __ bind(l_2);
1712 __ cmpwi(CCR0, R5_ARG3, 0);
1713 __ beq(CCR0, l_1);
1715 { // FasterArrayCopy
1716 __ mtctr(R5_ARG3);
1717 __ addi(R3_ARG1, R3_ARG1, -4);
1718 __ addi(R4_ARG2, R4_ARG2, -4);
1720 __ bind(l_3);
1721 __ lwzu(tmp2, 4, R3_ARG1);
1722 __ stwu(tmp2, 4, R4_ARG2);
1723 __ bdnz(l_3);
1724 }
1726 __ bind(l_1);
1727 return;
1728 }
1730 // Generate stub for disjoint int copy. If "aligned" is true, the
1731 // "from" and "to" addresses are assumed to be heapword aligned.
1732 //
1733 // Arguments for generated stub:
1734 // from: R3_ARG1
1735 // to: R4_ARG2
1736 // count: R5_ARG3 treated as signed
1737 //
1738 address generate_disjoint_int_copy(bool aligned, const char * name) {
1739 StubCodeMark mark(this, "StubRoutines", name);
1740 address start = __ function_entry();
1741 generate_disjoint_int_copy_core(aligned);
1742 __ blr();
1743 return start;
1744 }
1746 // Generate core code for conjoint int copy (and oop copy on
1747 // 32-bit). If "aligned" is true, the "from" and "to" addresses
1748 // are assumed to be heapword aligned.
1749 //
1750 // Arguments:
1751 // from: R3_ARG1
1752 // to: R4_ARG2
1753 // count: R5_ARG3 treated as signed
1754 //
1755 void generate_conjoint_int_copy_core(bool aligned) {
1756 // Do reverse copy. We assume the case of actual overlap is rare enough
1757 // that we don't have to optimize it.
1759 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1761 Register tmp1 = R6_ARG4;
1762 Register tmp2 = R7_ARG5;
1763 Register tmp3 = R8_ARG6;
1764 Register tmp4 = R0;
1766 VectorSRegister tmp_vsr1 = VSR1;
1767 VectorSRegister tmp_vsr2 = VSR2;
1769 { // FasterArrayCopy
1770 __ cmpwi(CCR0, R5_ARG3, 0);
1771 __ beq(CCR0, l_6);
1773 __ sldi(R5_ARG3, R5_ARG3, 2);
1774 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1775 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1776 __ srdi(R5_ARG3, R5_ARG3, 2);
1778 if (!aligned) {
1779 // check if arrays have same alignment mod 8.
1780 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1781 __ andi_(R0, tmp1, 7);
1782 // Not the same alignment mod 8, but ld and std only need 4-byte alignment.
1783 __ bne(CCR0, l_7); // cannot co-align to 8 bytes -> skip the 1-element fixup
1785 // copy 1 element to align 'to' and 'from' on an 8 byte boundary
1786 __ andi_(R0, R3_ARG1, 7);
1787 __ beq(CCR0, l_7);
1789 __ addi(R3_ARG1, R3_ARG1, -4);
1790 __ addi(R4_ARG2, R4_ARG2, -4);
1791 __ addi(R5_ARG3, R5_ARG3, -1);
1792 __ lwzx(tmp2, R3_ARG1);
1793 __ stwx(tmp2, R4_ARG2);
1794 __ bind(l_7);
1795 }
1797 __ cmpwi(CCR0, R5_ARG3, 7);
1798 __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
1800 __ srdi(tmp1, R5_ARG3, 3);
1801 __ andi(R5_ARG3, R5_ARG3, 7);
1802 __ mtctr(tmp1);
1804 if (!VM_Version::has_vsx()) {
1805 __ bind(l_4);
1806 // Use unrolled version for mass copying (copy 8 elements at a time).
1807 // Load feeding store gets zero latency on Power6, however not on Power5.
1808 // Therefore, the following sequence is made for the good of both.
1809 __ addi(R3_ARG1, R3_ARG1, -32);
1810 __ addi(R4_ARG2, R4_ARG2, -32);
1811 __ ld(tmp4, 24, R3_ARG1);
1812 __ ld(tmp3, 16, R3_ARG1);
1813 __ ld(tmp2, 8, R3_ARG1);
1814 __ ld(tmp1, 0, R3_ARG1);
1815 __ std(tmp4, 24, R4_ARG2);
1816 __ std(tmp3, 16, R4_ARG2);
1817 __ std(tmp2, 8, R4_ARG2);
1818 __ std(tmp1, 0, R4_ARG2);
1819 __ bdnz(l_4);
1820 } else { // Processor supports VSX, so use it to mass copy.
1821 // Prefetch the data into the L2 cache.
1822 __ dcbt(R3_ARG1, 0);
1824 // If supported set DSCR pre-fetch to deepest.
1825 if (VM_Version::has_mfdscr()) {
1826 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1827 __ mtdscr(tmp2);
1828 }
1830 __ li(tmp1, 16);
1832 // Align the backbranch target to 32 bytes rather than 16: the loop
1833 // body is fewer than 8 instructions (< 32 bytes), so it then fits
1834 // entirely within a single i-cache sector.
1835 __ align(32);
1837 __ bind(l_4);
1838 // Use loop with VSX load/store instructions to
1839 // copy 8 elements at a time.
1840 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
1841 __ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
1842 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
1843 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1844 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1845 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1846 __ bdnz(l_4);
1848 // Restore DSCR pre-fetch value.
1849 if (VM_Version::has_mfdscr()) {
1850 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1851 __ mtdscr(tmp2);
1852 }
1853 }
1855 __ cmpwi(CCR0, R5_ARG3, 0);
1856 __ beq(CCR0, l_6);
1858 __ bind(l_5);
1859 __ mtctr(R5_ARG3);
1860 __ bind(l_3);
1861 __ lwz(R0, -4, R3_ARG1);
1862 __ stw(R0, -4, R4_ARG2);
1863 __ addi(R3_ARG1, R3_ARG1, -4);
1864 __ addi(R4_ARG2, R4_ARG2, -4);
1865 __ bdnz(l_3);
1867 __ bind(l_6);
1868 }
1869 }
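// Illustrative C sketch of the reverse copy above (not generated code):
// copying from the highest element downwards keeps overlapping regions
// with 'to' > 'from' correct:
//
//   void conjoint_int_copy(const jint* from, jint* to, long count) {
//     from += count; to += count;            // the sldi/add/srdi prologue
//     while (count-- > 0) *--to = *--from;   // unrolled 8x / VSX in l_4
//   }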
1871 // Generate stub for conjoint int copy. If "aligned" is true, the
1872 // "from" and "to" addresses are assumed to be heapword aligned.
1873 //
1874 // Arguments for generated stub:
1875 // from: R3_ARG1
1876 // to: R4_ARG2
1877 // count: R5_ARG3 treated as signed
1878 //
1879 address generate_conjoint_int_copy(bool aligned, const char * name) {
1880 StubCodeMark mark(this, "StubRoutines", name);
1881 address start = __ function_entry();
1883 #if defined(ABI_ELFv2)
1884 address nooverlap_target = aligned ?
1885 StubRoutines::arrayof_jint_disjoint_arraycopy() :
1886 StubRoutines::jint_disjoint_arraycopy();
1887 #else
1888 address nooverlap_target = aligned ?
1889 ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
1890 ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
1891 #endif
1893 array_overlap_test(nooverlap_target, 2);
1895 generate_conjoint_int_copy_core(aligned);
1897 __ blr();
1899 return start;
1900 }
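// Dispatch sketch for the stub above, assuming the usual HotSpot
// array_overlap_test semantics: branch to the disjoint stub whenever a
// plain forward copy cannot clobber unread source elements, roughly
//
//   if ((uintptr_t)to - (uintptr_t)from >= (uintptr_t)count << 2)  // 2 == log2(sizeof(jint))
//     goto nooverlap_target;   // forward copy is safe (also covers to < from)
//   // else fall through to the reverse-copy core
//
// On ELFv1, stub entry points are function descriptors, hence the
// FunctionDescriptor dereference to obtain the raw code address.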
1902 // Generate core code for disjoint long copy (and oop copy on
1903 // 64-bit). If "aligned" is true, the "from" and "to" addresses
1904 // are assumed to be heapword aligned.
1905 //
1906 // Arguments:
1907 // from: R3_ARG1
1908 // to: R4_ARG2
1909 // count: R5_ARG3 treated as signed
1910 //
1911 void generate_disjoint_long_copy_core(bool aligned) {
1912 Register tmp1 = R6_ARG4;
1913 Register tmp2 = R7_ARG5;
1914 Register tmp3 = R8_ARG6;
1915 Register tmp4 = R0;
1917 Label l_1, l_2, l_3, l_4, l_5;
1919 VectorSRegister tmp_vsr1 = VSR1;
1920 VectorSRegister tmp_vsr2 = VSR2;
1922 { // FasterArrayCopy
1923 __ cmpwi(CCR0, R5_ARG3, 3);
1924 __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
1926 __ srdi(tmp1, R5_ARG3, 2);
1927 __ andi_(R5_ARG3, R5_ARG3, 3);
1928 __ mtctr(tmp1);
1930 if (!VM_Version::has_vsx()) {
1931 __ bind(l_4);
1932 // Use unrolled version for mass copying (copy 4 elements at a time).
1933 // Load feeding store gets zero latency on Power6, however not on Power5.
1934 // Therefore, the following sequence is made for the good of both.
1935 __ ld(tmp1, 0, R3_ARG1);
1936 __ ld(tmp2, 8, R3_ARG1);
1937 __ ld(tmp3, 16, R3_ARG1);
1938 __ ld(tmp4, 24, R3_ARG1);
1939 __ std(tmp1, 0, R4_ARG2);
1940 __ std(tmp2, 8, R4_ARG2);
1941 __ std(tmp3, 16, R4_ARG2);
1942 __ std(tmp4, 24, R4_ARG2);
1943 __ addi(R3_ARG1, R3_ARG1, 32);
1944 __ addi(R4_ARG2, R4_ARG2, 32);
1945 __ bdnz(l_4);
1947 } else { // Processor supports VSX, so use it to mass copy.
1949 // Prefetch the data into the L2 cache.
1950 __ dcbt(R3_ARG1, 0);
1952 // If supported set DSCR pre-fetch to deepest.
1953 if (VM_Version::has_mfdscr()) {
1954 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1955 __ mtdscr(tmp2);
1956 }
1958 __ li(tmp1, 16);
1960 // Align the backbranch target to 32 bytes rather than 16: the loop
1961 // body is fewer than 8 instructions (< 32 bytes), so it then fits
1962 // entirely within a single i-cache sector.
1963 __ align(32);
1965 __ bind(l_5);
1966 // Use loop with VSX load/store instructions to
1967 // copy 4 elements at a time.
1968 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1969 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1970 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1971 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1972 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1973 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
1974 __ bdnz(l_5); // Dec CTR and loop if not zero.
1976 // Restore DSCR pre-fetch value.
1977 if (VM_Version::has_mfdscr()) {
1978 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1979 __ mtdscr(tmp2);
1980 }
1982 } // VSX
1983 } // FasterArrayCopy
1985 // copy 1 element at a time
1986 __ bind(l_3);
1987 __ cmpwi(CCR0, R5_ARG3, 0);
1988 __ beq(CCR0, l_1);
1990 { // FasterArrayCopy
1991 __ mtctr(R5_ARG3);
1992 __ addi(R3_ARG1, R3_ARG1, -8);
1993 __ addi(R4_ARG2, R4_ARG2, -8);
1995 __ bind(l_2);
1996 __ ldu(R0, 8, R3_ARG1);
1997 __ stdu(R0, 8, R4_ARG2);
1998 __ bdnz(l_2);
2000 }
2001 __ bind(l_1);
2002 }
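// Sketch of the long copy above (illustrative only): each l_4/l_5
// iteration moves 32 bytes, i.e. 4 jlongs, as four ld/std pairs or two
// 16-byte lxvd2x/stxvd2x transfers:
//
//   while (chunks-- > 0) { memcpy(to, from, 32); from += 4; to += 4; }
//   while (count-- > 0)  *to++ = *from++;   // residual loop at l_2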
2004 // Generate stub for disjoint long copy. If "aligned" is true, the
2005 // "from" and "to" addresses are assumed to be heapword aligned.
2006 //
2007 // Arguments for generated stub:
2008 // from: R3_ARG1
2009 // to: R4_ARG2
2010 // count: R5_ARG3 treated as signed
2011 //
2012 address generate_disjoint_long_copy(bool aligned, const char * name) {
2013 StubCodeMark mark(this, "StubRoutines", name);
2014 address start = __ function_entry();
2015 generate_disjoint_long_copy_core(aligned);
2016 __ blr();
2018 return start;
2019 }
2021 // Generate core code for conjoint long copy (and oop copy on
2022 // 64-bit). If "aligned" is true, the "from" and "to" addresses
2023 // are assumed to be heapword aligned.
2024 //
2025 // Arguments:
2026 // from: R3_ARG1
2027 // to: R4_ARG2
2028 // count: R5_ARG3 treated as signed
2029 //
2030 void generate_conjoint_long_copy_core(bool aligned) {
2031 Register tmp1 = R6_ARG4;
2032 Register tmp2 = R7_ARG5;
2033 Register tmp3 = R8_ARG6;
2034 Register tmp4 = R0;
2036 VectorSRegister tmp_vsr1 = VSR1;
2037 VectorSRegister tmp_vsr2 = VSR2;
2039 Label l_1, l_2, l_3, l_4, l_5;
2041 __ cmpwi(CCR0, R5_ARG3, 0);
2042 __ beq(CCR0, l_1);
2044 { // FasterArrayCopy
2045 __ sldi(R5_ARG3, R5_ARG3, 3);
2046 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
2047 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
2048 __ srdi(R5_ARG3, R5_ARG3, 3);
2050 __ cmpwi(CCR0, R5_ARG3, 3);
2051 __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
2053 __ srdi(tmp1, R5_ARG3, 2);
2054 __ andi(R5_ARG3, R5_ARG3, 3);
2055 __ mtctr(tmp1);
2057 if (!VM_Version::has_vsx()) {
2058 __ bind(l_4);
2059 // Use unrolled version for mass copying (copy 4 elements at a time).
2060 // Load feeding store gets zero latency on Power6, however not on Power5.
2061 // Therefore, the following sequence is made for the good of both.
2062 __ addi(R3_ARG1, R3_ARG1, -32);
2063 __ addi(R4_ARG2, R4_ARG2, -32);
2064 __ ld(tmp4, 24, R3_ARG1);
2065 __ ld(tmp3, 16, R3_ARG1);
2066 __ ld(tmp2, 8, R3_ARG1);
2067 __ ld(tmp1, 0, R3_ARG1);
2068 __ std(tmp4, 24, R4_ARG2);
2069 __ std(tmp3, 16, R4_ARG2);
2070 __ std(tmp2, 8, R4_ARG2);
2071 __ std(tmp1, 0, R4_ARG2);
2072 __ bdnz(l_4);
2073 } else { // Processor supports VSX, so use it to mass copy.
2074 // Prefetch the data into the L2 cache.
2075 __ dcbt(R3_ARG1, 0);
2077 // If supported set DSCR pre-fetch to deepest.
2078 if (VM_Version::has_mfdscr()) {
2079 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
2080 __ mtdscr(tmp2);
2081 }
2083 __ li(tmp1, 16);
2085 // Align the backbranch target to 32 bytes rather than 16: the loop
2086 // body is fewer than 8 instructions (< 32 bytes), so it then fits
2087 // entirely within a single i-cache sector.
2088 __ align(32);
2090 __ bind(l_4);
2091 // Use loop with VSX load/store instructions to
2092 // copy 4 elements at a time.
2093 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
2094 __ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
2095 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
2096 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
2097 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
2098 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
2099 __ bdnz(l_4);
2101 // Restore DSCR pre-fetch value.
2102 if (VM_Version::has_mfdscr()) {
2103 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
2104 __ mtdscr(tmp2);
2105 }
2106 }
2108 __ cmpwi(CCR0, R5_ARG3, 0);
2109 __ beq(CCR0, l_1);
2111 __ bind(l_5);
2112 __ mtctr(R5_ARG3);
2113 __ bind(l_3);
2114 __ ld(R0, -8, R3_ARG1);
2115 __ std(R0, -8, R4_ARG2);
2116 __ addi(R3_ARG1, R3_ARG1, -8);
2117 __ addi(R4_ARG2, R4_ARG2, -8);
2118 __ bdnz(l_3);
2120 }
2121 __ bind(l_1);
2122 }
2124 // Generate stub for conjoint long copy. If "aligned" is true, the
2125 // "from" and "to" addresses are assumed to be heapword aligned.
2126 //
2127 // Arguments for generated stub:
2128 // from: R3_ARG1
2129 // to: R4_ARG2
2130 // count: R5_ARG3 treated as signed
2131 //
2132 address generate_conjoint_long_copy(bool aligned, const char * name) {
2133 StubCodeMark mark(this, "StubRoutines", name);
2134 address start = __ function_entry();
2136 #if defined(ABI_ELFv2)
2137 address nooverlap_target = aligned ?
2138 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
2139 StubRoutines::jlong_disjoint_arraycopy();
2140 #else
2141 address nooverlap_target = aligned ?
2142 ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
2143 ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
2144 #endif
2146 array_overlap_test(nooverlap_target, 3);
2147 generate_conjoint_long_copy_core(aligned);
2149 __ blr();
2151 return start;
2152 }
2154 // Generate stub for conjoint oop copy. If "aligned" is true, the
2155 // "from" and "to" addresses are assumed to be heapword aligned.
2156 //
2157 // Arguments for generated stub:
2158 // from: R3_ARG1
2159 // to: R4_ARG2
2160 // count: R5_ARG3 treated as signed
2161 // dest_uninitialized: G1 support
2162 //
2163 address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2164 StubCodeMark mark(this, "StubRoutines", name);
2166 address start = __ function_entry();
2168 #if defined(ABI_ELFv2)
2169 address nooverlap_target = aligned ?
2170 StubRoutines::arrayof_oop_disjoint_arraycopy() :
2171 StubRoutines::oop_disjoint_arraycopy();
2172 #else
2173 address nooverlap_target = aligned ?
2174 ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
2175 ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
2176 #endif
2178 gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
2180 // Save arguments.
2181 __ mr(R9_ARG7, R4_ARG2);
2182 __ mr(R10_ARG8, R5_ARG3);
2184 if (UseCompressedOops) {
2185 array_overlap_test(nooverlap_target, 2);
2186 generate_conjoint_int_copy_core(aligned);
2187 } else {
2188 array_overlap_test(nooverlap_target, 3);
2189 generate_conjoint_long_copy_core(aligned);
2190 }
2192 gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
2193 return start;
2194 }
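// Element-size sketch for the oop dispatch above: with compressed oops a
// heap reference is a 4-byte narrowOop, so the int cores are reused
// (log2 element size 2 in the overlap test); otherwise references are
// 8 bytes and the long cores apply (log2 element size 3):
//
//   size_t elem_size = UseCompressedOops ? sizeof(narrowOop) : sizeof(oop);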
2196 // Generate stub for disjoint oop copy. If "aligned" is true, the
2197 // "from" and "to" addresses are assumed to be heapword aligned.
2198 //
2199 // Arguments for generated stub:
2200 // from: R3_ARG1
2201 // to: R4_ARG2
2202 // count: R5_ARG3 treated as signed
2203 // dest_uninitialized: G1 support
2204 //
2205 address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2206 StubCodeMark mark(this, "StubRoutines", name);
2207 address start = __ function_entry();
2209 gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
2211 // Save some arguments; the copy cores destroy them but they are
2212 // still needed for the post barrier.
2213 __ mr(R9_ARG7, R4_ARG2);
2214 __ mr(R10_ARG8, R5_ARG3);
2216 if (UseCompressedOops) {
2217 generate_disjoint_int_copy_core(aligned);
2218 } else {
2219 generate_disjoint_long_copy_core(aligned);
2220 }
2222 gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
2224 return start;
2225 }
2227 // Arguments for generated stub (little endian only):
2228 // R3_ARG1 - source byte array address
2229 // R4_ARG2 - destination byte array address
2230 // R5_ARG3 - round key array
2231 address generate_aescrypt_encryptBlock() {
2232 assert(UseAES, "need AES instruction support");
2233 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2235 address start = __ function_entry();
2237 Label L_doLast;
2239 Register from = R3_ARG1; // source array address
2240 Register to = R4_ARG2; // destination array address
2241 Register key = R5_ARG3; // round key array
2243 Register keylen = R8;
2244 Register temp = R9;
2245 Register keypos = R10;
2246 Register hex = R11;
2247 Register fifteen = R12;
2249 VectorRegister vRet = VR0;
2251 VectorRegister vKey1 = VR1;
2252 VectorRegister vKey2 = VR2;
2253 VectorRegister vKey3 = VR3;
2254 VectorRegister vKey4 = VR4;
2256 VectorRegister fromPerm = VR5;
2257 VectorRegister keyPerm = VR6;
2258 VectorRegister toPerm = VR7;
2259 VectorRegister fSplt = VR8;
2261 VectorRegister vTmp1 = VR9;
2262 VectorRegister vTmp2 = VR10;
2263 VectorRegister vTmp3 = VR11;
2264 VectorRegister vTmp4 = VR12;
2266 VectorRegister vLow = VR13;
2267 VectorRegister vHigh = VR14;
2269 __ li (hex, 16);
2270 __ li (fifteen, 15);
2271 __ vspltisb (fSplt, 0x0f);
2273 // load unaligned from[0-15] to vRet
2274 __ lvx (vRet, from);
2275 __ lvx (vTmp1, fifteen, from);
2276 __ lvsl (fromPerm, from);
2277 __ vxor (fromPerm, fromPerm, fSplt);
2278 __ vperm (vRet, vRet, vTmp1, fromPerm);
2280 // load keylen (44 or 52 or 60)
2281 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2283 // Compute the permute vector used to load the (unaligned) round keys.
2284 __ lvsr (keyPerm, key);
2285 __ vxor (vTmp2, vTmp2, vTmp2);
2286 __ vspltisb (vTmp2, -16);
2287 __ vrld (keyPerm, keyPerm, vTmp2);
2288 __ vrld (keyPerm, keyPerm, vTmp2);
2289 __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
2291 // load the 1st round key to vKey1
2292 __ li (keypos, 0);
2293 __ lvx (vKey1, keypos, key);
2294 __ addi (keypos, keypos, 16);
2295 __ lvx (vTmp1, keypos, key);
2296 __ vperm (vKey1, vTmp1, vKey1, keyPerm);
2298 // 1st round
2299 __ vxor (vRet, vRet, vKey1);
2301 // load the 2nd round key to vKey1
2302 __ addi (keypos, keypos, 16);
2303 __ lvx (vTmp2, keypos, key);
2304 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
2306 // load the 3rd round key to vKey2
2307 __ addi (keypos, keypos, 16);
2308 __ lvx (vTmp1, keypos, key);
2309 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
2311 // load the 4th round key to vKey3
2312 __ addi (keypos, keypos, 16);
2313 __ lvx (vTmp2, keypos, key);
2314 __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
2316 // load the 5th round key to vKey4
2317 __ addi (keypos, keypos, 16);
2318 __ lvx (vTmp1, keypos, key);
2319 __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
2321 // 2nd - 5th rounds
2322 __ vcipher (vRet, vRet, vKey1);
2323 __ vcipher (vRet, vRet, vKey2);
2324 __ vcipher (vRet, vRet, vKey3);
2325 __ vcipher (vRet, vRet, vKey4);
2327 // load the 6th round key to vKey1
2328 __ addi (keypos, keypos, 16);
2329 __ lvx (vTmp2, keypos, key);
2330 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
2332 // load the 7th round key to vKey2
2333 __ addi (keypos, keypos, 16);
2334 __ lvx (vTmp1, keypos, key);
2335 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
2337 // load the 8th round key to vKey3
2338 __ addi (keypos, keypos, 16);
2339 __ lvx (vTmp2, keypos, key);
2340 __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
2342 // load the 9th round key to vKey4
2343 __ addi (keypos, keypos, 16);
2344 __ lvx (vTmp1, keypos, key);
2345 __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
2347 // 6th - 9th rounds
2348 __ vcipher (vRet, vRet, vKey1);
2349 __ vcipher (vRet, vRet, vKey2);
2350 __ vcipher (vRet, vRet, vKey3);
2351 __ vcipher (vRet, vRet, vKey4);
2353 // load the 10th round key to vKey1
2354 __ addi (keypos, keypos, 16);
2355 __ lvx (vTmp2, keypos, key);
2356 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
2358 // load the 11th round key to vKey2
2359 __ addi (keypos, keypos, 16);
2360 __ lvx (vTmp1, keypos, key);
2361 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
2363 // if all round keys are loaded, skip next 4 rounds
2364 __ cmpwi (CCR0, keylen, 44);
2365 __ beq (CCR0, L_doLast);
2367 // 10th - 11th rounds
2368 __ vcipher (vRet, vRet, vKey1);
2369 __ vcipher (vRet, vRet, vKey2);
2371 // load the 12th round key to vKey1
2372 __ addi (keypos, keypos, 16);
2373 __ lvx (vTmp2, keypos, key);
2374 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
2376 // load the 13th round key to vKey2
2377 __ addi (keypos, keypos, 16);
2378 __ lvx (vTmp1, keypos, key);
2379 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
2381 // if all round keys are loaded, skip next 2 rounds
2382 __ cmpwi (CCR0, keylen, 52);
2383 __ beq (CCR0, L_doLast);
2385 // 12th - 13th rounds
2386 __ vcipher (vRet, vRet, vKey1);
2387 __ vcipher (vRet, vRet, vKey2);
2389 // load the 14th round key to vKey1
2390 __ addi (keypos, keypos, 16);
2391 __ lvx (vTmp2, keypos, key);
2392 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
2394 // load the 15th round key to vKey2
2395 __ addi (keypos, keypos, 16);
2396 __ lvx (vTmp1, keypos, key);
2397 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
2399 __ bind(L_doLast);
2401 // last two rounds
2402 __ vcipher (vRet, vRet, vKey1);
2403 __ vcipherlast (vRet, vRet, vKey2);
2405 __ neg (temp, to);
2406 __ lvsr (toPerm, temp);
2407 __ vspltisb (vTmp2, -1);
2408 __ vxor (vTmp1, vTmp1, vTmp1);
2409 __ vperm (vTmp2, vTmp2, vTmp1, toPerm);
2410 __ vxor (toPerm, toPerm, fSplt);
2411 __ lvx (vTmp1, to);
2412 __ vperm (vRet, vRet, vRet, toPerm);
2413 __ vsel (vTmp1, vTmp1, vRet, vTmp2);
2414 __ lvx (vTmp4, fifteen, to);
2415 __ stvx (vTmp1, to);
2416 __ vsel (vRet, vRet, vTmp4, vTmp2);
2417 __ stvx (vRet, fifteen, to);
2419 __ blr();
2420 return start;
2421 }
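// AES round-structure sketch (illustrative, not generated code; rk[]
// stands for the expanded round keys loaded via lvx/vperm above). The
// key length in ints selects the round count: 44 -> 10 rounds (AES-128),
// 52 -> 12 (AES-192), 60 -> 14 (AES-256):
//
//   state ^= rk[0];                          // vxor with the 1st round key
//   for (int r = 1; r < rounds; r++)
//     state = vcipher(state, rk[r]);         // middle rounds
//   state = vcipherlast(state, rk[rounds]);  // final round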
2423 // Arguments for generated stub (little endian only):
2424 // R3_ARG1 - source byte array address
2425 // R4_ARG2 - destination byte array address
2426 // R5_ARG3 - K (key) in little endian int array
2427 address generate_aescrypt_decryptBlock() {
2428 assert(UseAES, "need AES instruction support");
2429 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2431 address start = __ function_entry();
2433 Label L_doLast;
2434 Label L_do44;
2435 Label L_do52;
2436 Label L_do60;
2438 Register from = R3_ARG1; // source array address
2439 Register to = R4_ARG2; // destination array address
2440 Register key = R5_ARG3; // round key array
2442 Register keylen = R8;
2443 Register temp = R9;
2444 Register keypos = R10;
2445 Register hex = R11;
2446 Register fifteen = R12;
2448 VectorRegister vRet = VR0;
2450 VectorRegister vKey1 = VR1;
2451 VectorRegister vKey2 = VR2;
2452 VectorRegister vKey3 = VR3;
2453 VectorRegister vKey4 = VR4;
2454 VectorRegister vKey5 = VR5;
2456 VectorRegister fromPerm = VR6;
2457 VectorRegister keyPerm = VR7;
2458 VectorRegister toPerm = VR8;
2459 VectorRegister fSplt = VR9;
2461 VectorRegister vTmp1 = VR10;
2462 VectorRegister vTmp2 = VR11;
2463 VectorRegister vTmp3 = VR12;
2464 VectorRegister vTmp4 = VR13;
2466 VectorRegister vLow = VR14;
2467 VectorRegister vHigh = VR15;
2469 __ li (hex, 16);
2470 __ li (fifteen, 15);
2471 __ vspltisb (fSplt, 0x0f);
2473 // load unaligned from[0-15] to vRet
2474 __ lvx (vRet, from);
2475 __ lvx (vTmp1, fifteen, from);
2476 __ lvsl (fromPerm, from);
2477 __ vxor (fromPerm, fromPerm, fSplt);
2478 __ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
2480 // load keylen (44 or 52 or 60)
2481 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2483 // Compute the permute vector used to load the (unaligned) round keys.
2484 __ lvsr (keyPerm, key);
2485 __ vxor (vTmp2, vTmp2, vTmp2);
2486 __ vspltisb (vTmp2, -16);
2487 __ vrld (keyPerm, keyPerm, vTmp2);
2488 __ vrld (keyPerm, keyPerm, vTmp2);
2489 __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
2491 __ cmpwi (CCR0, keylen, 44);
2492 __ beq (CCR0, L_do44);
2494 __ cmpwi (CCR0, keylen, 52);
2495 __ beq (CCR0, L_do52);
2497 // load the 15th round key to vKey1
2498 __ li (keypos, 240);
2499 __ lvx (vTmp1, keypos, key);
2500 __ addi (keypos, keypos, -16);
2501 __ lvx (vTmp2, keypos, key);
2502 __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
2504 // load the 14th round key to vKey2
2505 __ addi (keypos, keypos, -16);
2506 __ lvx (vTmp1, keypos, key);
2507 __ vperm (vKey2, vTmp2, vTmp1, keyPerm);
2509 // load the 13th round key to vKey3
2510 __ addi (keypos, keypos, -16);
2511 __ lvx (vTmp2, keypos, key);
2512 __ vperm (vKey3, vTmp1, vTmp2, keyPerm);
2514 // load the 12th round key to vKey4
2515 __ addi (keypos, keypos, -16);
2516 __ lvx (vTmp1, keypos, key);
2517 __ vperm (vKey4, vTmp2, vTmp1, keyPerm);
2519 // load the 11th round key to vKey5
2520 __ addi (keypos, keypos, -16);
2521 __ lvx (vTmp2, keypos, key);
2522 __ vperm (vKey5, vTmp1, vTmp2, keyPerm);
2524 // 1st - 5th rounds
2525 __ vxor (vRet, vRet, vKey1);
2526 __ vncipher (vRet, vRet, vKey2);
2527 __ vncipher (vRet, vRet, vKey3);
2528 __ vncipher (vRet, vRet, vKey4);
2529 __ vncipher (vRet, vRet, vKey5);
2531 __ b (L_doLast);
2533 __ bind (L_do52);
2535 // load the 13th round key to vKey1
2536 __ li (keypos, 208);
2537 __ lvx (vTmp1, keypos, key);
2538 __ addi (keypos, keypos, -16);
2539 __ lvx (vTmp2, keypos, key);
2540 __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
2542 // load the 12th round key to vKey2
2543 __ addi (keypos, keypos, -16);
2544 __ lvx (vTmp1, keypos, key);
2545 __ vperm (vKey2, vTmp2, vTmp1, keyPerm);
2547 // load the 11th round key to vKey3
2548 __ addi (keypos, keypos, -16);
2549 __ lvx (vTmp2, keypos, key);
2550 __ vperm (vKey3, vTmp1, vTmp2, keyPerm);
2552 // 1st - 3rd rounds
2553 __ vxor (vRet, vRet, vKey1);
2554 __ vncipher (vRet, vRet, vKey2);
2555 __ vncipher (vRet, vRet, vKey3);
2557 __ b (L_doLast);
2559 __ bind (L_do44);
2561 // load the 11th round key to vKey1
2562 __ li (keypos, 176);
2563 __ lvx (vTmp1, keypos, key);
2564 __ addi (keypos, keypos, -16);
2565 __ lvx (vTmp2, keypos, key);
2566 __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
2568 // 1st round
2569 __ vxor (vRet, vRet, vKey1);
2571 __ bind (L_doLast);
2573 // load the 10th round key to vKey1
2574 __ addi (keypos, keypos, -16);
2575 __ lvx (vTmp1, keypos, key);
2576 __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
2578 // load the 9th round key to vKey2
2579 __ addi (keypos, keypos, -16);
2580 __ lvx (vTmp2, keypos, key);
2581 __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
2583 // load the 8th round key to vKey3
2584 __ addi (keypos, keypos, -16);
2585 __ lvx (vTmp1, keypos, key);
2586 __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
2588 // load the 7th round key to vKey4
2589 __ addi (keypos, keypos, -16);
2590 __ lvx (vTmp2, keypos, key);
2591 __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
2593 // load the 6th round key to vKey5
2594 __ addi (keypos, keypos, -16);
2595 __ lvx (vTmp1, keypos, key);
2596 __ vperm (vKey5, vTmp2, vTmp1, keyPerm);
2598 // 10th - 6th rounds
2599 __ vncipher (vRet, vRet, vKey1);
2600 __ vncipher (vRet, vRet, vKey2);
2601 __ vncipher (vRet, vRet, vKey3);
2602 __ vncipher (vRet, vRet, vKey4);
2603 __ vncipher (vRet, vRet, vKey5);
2605 // load the 5th round key to vKey1
2606 __ addi (keypos, keypos, -16);
2607 __ lvx (vTmp2, keypos, key);
2608 __ vperm (vKey1, vTmp1, vTmp2, keyPerm);
2610 // load the 4th round key to vKey2
2611 __ addi (keypos, keypos, -16);
2612 __ lvx (vTmp1, keypos, key);
2613 __ vperm (vKey2, vTmp2, vTmp1, keyPerm);
2615 // load the 3rd round key to vKey3
2616 __ addi (keypos, keypos, -16);
2617 __ lvx (vTmp2, keypos, key);
2618 __ vperm (vKey3, vTmp1, vTmp2, keyPerm);
2620 // load the 2nd round key to vKey4
2621 __ addi (keypos, keypos, -16);
2622 __ lvx (vTmp1, keypos, key);
2623 __ vperm (vKey4, vTmp2, vTmp1, keyPerm);
2625 // load the 1st round key to vKey5
2626 __ addi (keypos, keypos, -16);
2627 __ lvx (vTmp2, keypos, key);
2628 __ vperm (vKey5, vTmp1, vTmp2, keyPerm);
2630 // 5th - 1st rounds
2631 __ vncipher (vRet, vRet, vKey1);
2632 __ vncipher (vRet, vRet, vKey2);
2633 __ vncipher (vRet, vRet, vKey3);
2634 __ vncipher (vRet, vRet, vKey4);
2635 __ vncipherlast (vRet, vRet, vKey5);
2637 __ neg (temp, to);
2638 __ lvsr (toPerm, temp);
2639 __ vspltisb (vTmp2, -1);
2640 __ vxor (vTmp1, vTmp1, vTmp1);
2641 __ vperm (vTmp2, vTmp2, vTmp1, toPerm);
2642 __ vxor (toPerm, toPerm, fSplt);
2643 __ lvx (vTmp1, to);
2644 __ vperm (vRet, vRet, vRet, toPerm);
2645 __ vsel (vTmp1, vTmp1, vRet, vTmp2);
2646 __ lvx (vTmp4, fifteen, to);
2647 __ stvx (vTmp1, to);
2648 __ vsel (vRet, vRet, vTmp4, vTmp2);
2649 __ stvx (vRet, fifteen, to);
2651 __ blr();
2652 return start;
2653 }
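// Decryption sketch (illustrative; rk[] as in the encrypt sketch): the
// stub walks the key schedule backwards with the inverse-cipher
// instructions:
//
//   state ^= rk[rounds];                     // vxor with the last round key
//   for (int r = rounds - 1; r > 0; r--)
//     state = vncipher(state, rk[r]);
//   state = vncipherlast(state, rk[0]);      // ends with the 1st round key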
2655 void generate_arraycopy_stubs() {
2656 // Note: the disjoint stubs must be generated first, as some of
2657 // the conjoint stubs use them.
2659 // non-aligned disjoint versions
2660 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
2661 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
2662 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
2663 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
2664 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
2665 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
2667 // aligned disjoint versions
2668 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
2669 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
2670 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
2671 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
2672 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
2673 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
2675 // non-aligned conjoint versions
2676 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
2677 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
2678 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, "jint_arraycopy");
2679 StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, "jlong_arraycopy");
2680 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
2681 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
2683 // aligned conjoint versions
2684 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
2685 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
2686 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
2687 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
2688 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
2689 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
2691 // fill routines
2692 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2693 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2694 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2695 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2696 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2697 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2698 }
2700 // Safefetch stubs.
2701 void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
2702 // safefetch signatures:
2703 // int SafeFetch32(int* adr, int errValue);
2704 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2705 //
2706 // arguments:
2707 // R3_ARG1 = adr
2708 // R4_ARG2 = errValue
2709 //
2710 // result:
2711 // R3_RET = *adr or errValue
2713 StubCodeMark mark(this, "StubRoutines", name);
2715 // Entry point, pc or function descriptor.
2716 *entry = __ function_entry();
2718 // Load *adr into R4_ARG2, may fault.
2719 *fault_pc = __ pc();
2720 switch (size) {
2721 case 4:
2722 // int32_t, sign extended
2723 __ lwa(R4_ARG2, 0, R3_ARG1);
2724 break;
2725 case 8:
2726 // int64_t
2727 __ ld(R4_ARG2, 0, R3_ARG1);
2728 break;
2729 default:
2730 ShouldNotReachHere();
2731 }
2733 // return errValue or *adr
2734 *continuation_pc = __ pc();
2735 __ mr(R3_RET, R4_ARG2);
2736 __ blr();
2737 }
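// Usage sketch: SafeFetch lets the VM probe possibly-unmapped memory
// without crashing. A fault at *fault_pc is redirected by the signal
// handler to *continuation_pc, which returns errValue instead:
//
//   int v = SafeFetch32(addr, -1);   // yields *addr, or -1 if inaccessible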
2739 /**
2740 * Arguments:
2741 *
2742 * Inputs:
2743 * R3_ARG1 - int crc
2744 * R4_ARG2 - byte* buf
2745 * R5_ARG3 - int length (of buffer)
2746 *
2747 * scratch:
2748 * R2, R6-R12
2749 *
2750 * Output:
2751 * R3_RET - int crc result
2752 */
2753 // Compute CRC32 function.
2754 address generate_CRC32_updateBytes(const char* name) {
2755 __ align(CodeEntryAlignment);
2756 StubCodeMark mark(this, "StubRoutines", name);
2757 address start = __ function_entry(); // Remember stub start address (is rtn value).
2759 // arguments to kernel_crc32:
2760 const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
2761 const Register data = R4_ARG2; // source byte array
2762 const Register dataLen = R5_ARG3; // #bytes to process
2764 const Register table = R6; // crc table address
2766 #ifdef VM_LITTLE_ENDIAN
2767 if (VM_Version::has_vpmsumb()) {
2768 const Register constants = R2; // constants address
2769 const Register bconstants = R8; // Barrett constants address
2771 const Register t0 = R9;
2772 const Register t1 = R10;
2773 const Register t2 = R11;
2774 const Register t3 = R12;
2775 const Register t4 = R7;
2777 BLOCK_COMMENT("Stub body {");
2778 assert_different_registers(crc, data, dataLen, table);
2780 StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
2781 StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
2782 StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
2784 __ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4);
2786 BLOCK_COMMENT("return");
2787 __ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
2788 __ blr();
2790 BLOCK_COMMENT("} Stub body");
2791 } else
2792 #endif
2793 {
2794 const Register t0 = R2;
2795 const Register t1 = R7;
2796 const Register t2 = R8;
2797 const Register t3 = R9;
2798 const Register tc0 = R10;
2799 const Register tc1 = R11;
2800 const Register tc2 = R12;
2802 BLOCK_COMMENT("Stub body {");
2803 assert_different_registers(crc, data, dataLen, table);
2805 StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
2807 __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
2809 BLOCK_COMMENT("return");
2810 __ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
2811 __ blr();
2813 BLOCK_COMMENT("} Stub body");
2814 }
2816 return start;
2817 }
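// Reference sketch (assumption: both kernels compute the standard
// reflected CRC-32, polynomial 0xEDB88320, with any pre/post inversion
// of 'crc' done by the callers). A byte-at-a-time equivalent:
//
//   while (dataLen-- > 0)
//     crc = table[(crc ^ *data++) & 0xff] ^ ((unsigned)crc >> 8);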
2819 // Initialization
2820 void generate_initial() {
2821 // Generates all stubs and initializes the entry points
2823 // Entry points that exist in all platforms.
2824 // Note: This is code that could be shared among different platforms - however the
2825 // benefit seems to be smaller than the disadvantage of having a
2826 // much more complicated generator structure. See also comment in
2827 // stubRoutines.hpp.
2829 StubRoutines::_forward_exception_entry = generate_forward_exception();
2830 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
2831 StubRoutines::_catch_exception_entry = generate_catch_exception();
2833 // Build this early so it's available for the interpreter.
2834 StubRoutines::_throw_StackOverflowError_entry =
2835 generate_throw_exception("StackOverflowError throw_exception",
2836 CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
2838 // CRC32 Intrinsics.
2839 if (UseCRC32Intrinsics) {
2840 StubRoutines::_crc_table_adr = (address)StubRoutines::ppc64::_crc_table;
2841 StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
2842 }
2843 }
2845 void generate_all() {
2846 // Generates all stubs and initializes the entry points
2848 // These entry points require SharedInfo::stack0 to be set up in
2849 // non-core builds
2850 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
2851 // Handle IncompatibleClassChangeError in itable stubs.
2852 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
2853 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2855 StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
2857 // support for verify_oop (must happen after universe_init)
2858 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
2860 // arraycopy stubs used by compilers
2861 generate_arraycopy_stubs();
2863 // Safefetch stubs.
2864 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
2865 &StubRoutines::_safefetch32_fault_pc,
2866 &StubRoutines::_safefetch32_continuation_pc);
2867 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
2868 &StubRoutines::_safefetchN_fault_pc,
2869 &StubRoutines::_safefetchN_continuation_pc);
2871 if (UseAESIntrinsics) {
2872 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
2873 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
2874 }
2876 if (UseMontgomeryMultiplyIntrinsic) {
2877 StubRoutines::_montgomeryMultiply
2878 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
2879 }
2880 if (UseMontgomerySquareIntrinsic) {
2881 StubRoutines::_montgomerySquare
2882 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
2883 }
2884 }
2886 public:
2887 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2888 // replace the standard masm with a special one:
2889 _masm = new MacroAssembler(code);
2890 if (all) {
2891 generate_all();
2892 } else {
2893 generate_initial();
2894 }
2895 }
2896 };
2898 void StubGenerator_generate(CodeBuffer* code, bool all) {
2899 StubGenerator g(code, all);
2900 }