Mon, 12 Sep 2016 13:41:43 -0400
Sync before updating the card marks (must be MT-safe on 3A2000).
1 /*
2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.hpp"
28 #include "asm/macroAssembler.inline.hpp"
29 #include "interpreter/interpreter.hpp"
30 #include "nativeInst_mips.hpp"
31 #include "oops/instanceOop.hpp"
32 #include "oops/method.hpp"
33 #include "oops/objArrayKlass.hpp"
34 #include "oops/oop.inline.hpp"
35 #include "prims/methodHandles.hpp"
36 #include "runtime/frame.inline.hpp"
37 #include "runtime/handles.inline.hpp"
38 #include "runtime/sharedRuntime.hpp"
39 #include "runtime/stubCodeGenerator.hpp"
40 #include "runtime/stubRoutines.hpp"
41 #include "runtime/thread.inline.hpp"
42 #include "utilities/top.hpp"
43 #ifdef COMPILER2
44 #include "opto/runtime.hpp"
45 #endif
48 // Declaration and definition of StubGenerator (no .hpp file).
49 // For a more detailed description of the stub routine structure
50 // see the comment in stubRoutines.hpp
52 #define __ _masm->
53 //#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
54 //#define a__ ((Assembler*)_masm)->
56 //#ifdef PRODUCT
57 //#define BLOCK_COMMENT(str) /* nothing */
58 //#else
59 //#define BLOCK_COMMENT(str) __ block_comment(str)
60 //#endif
62 //#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Mask out any pending exceptions.
// NOTE(review): MXCSR is an x86 FP-control concept; no use of this constant
// is visible in this MIPS port — confirm whether it is dead code.
const int MXCSR_MASK = 0xFFC0;
65 // Stub Code definitions
67 static address handle_unsafe_access() {
68 JavaThread* thread = JavaThread::current();
69 address pc = thread->saved_exception_pc();
70 // pc is the instruction which we must emulate
71 // doing a no-op is fine: return garbage from the load
72 // therefore, compute npc
73 //address npc = Assembler::locate_next_instruction(pc);
74 address npc = (address)((unsigned long)pc + sizeof(unsigned long));
76 // request an async exception
77 thread->set_pending_unsafe_access_error();
79 // return address of next instruction to execute
80 return npc;
81 }
83 class StubGenerator: public StubCodeGenerator {
84 private:
86 // ABI mips n64
87 // This fig is not MIPS ABI. It is call Java from C ABI.
88 // Call stubs are used to call Java from C
89 //
90 // [ return_from_Java ]
91 // [ argument word n-1 ] <--- sp
92 // ...
93 // [ argument word 0 ]
94 // ...
95 //-10 [ S6 ]
96 // -9 [ S5 ]
97 // -8 [ S4 ]
98 // -7 [ S3 ]
99 // -6 [ S0 ]
100 // -5 [ TSR(S2) ]
101 // -4 [ LVP(S7) ]
102 // -3 [ BCP(S1) ]
103 // -2 [ saved fp ] <--- fp_after_call
104 // -1 [ return address ]
105 // 0 [ ptr. to call wrapper ] <--- a0 (old sp -->)fp
106 // 1 [ result ] <--- a1
107 // 2 [ result_type ] <--- a2
108 // 3 [ method ] <--- a3
109 // 4 [ entry_point ] <--- a4
110 // 5 [ parameters ] <--- a5
111 // 6 [ parameter_size ] <--- a6
112 // 7 [ thread ] <--- a7
114 //
115 // _LP64: n64 does not save paras in sp.
116 //
117 // [ return_from_Java ]
118 // [ argument word n-1 ] <--- sp
119 // ...
120 // [ argument word 0 ]
121 // ...
122 //-14 [ thread ]
123 //-13 [ result_type ] <--- a2
124 //-12 [ result ] <--- a1
125 //-11 [ ptr. to call wrapper ] <--- a0
126 //-10 [ S6 ]
127 // -9 [ S5 ]
128 // -8 [ S4 ]
129 // -7 [ S3 ]
130 // -6 [ S0 ]
131 // -5 [ TSR(S2) ]
132 // -4 [ LVP(S7) ]
133 // -3 [ BCP(S1) ]
134 // -2 [ saved fp ] <--- fp_after_call
135 // -1 [ return address ]
136 // 0 [ ] <--- old sp
137 /*
138 * 2014/01/16 Fu: Find a right place in the call_stub for GP.
139 * GP will point to the starting point of Interpreter::dispatch_table(itos).
140 * It should be saved/restored before/after Java calls.
141 *
142 */
// Word offsets (relative to the call_stub frame pointer, negative = below FP)
// of the slots used by generate_call_stub(); see the frame diagram above.
enum call_stub_layout {
  RA_off          = -1,            // return address
  FP_off          = -2,            // saved frame pointer
  BCP_off         = -3,            // byte code pointer (S1)
  LVP_off         = -4,            // local variable pointer (S7)
  TSR_off         = -5,            // thread state register (S2)
  S1_off          = -6,
  S3_off          = -7,
  S4_off          = -8,
  S5_off          = -9,
  S6_off          = -10,
  result_off      = -11,           // result pointer passed in A1
  result_type_off = -12,           // result type passed in A2
  thread_off      = -13,           // JavaThread* passed in A7
  total_off       = thread_off - 3, // = -16: total words SP is dropped by
  // NOTE(review): GP_off equals total_off, so the saved GP sits exactly at
  // the post-adjustment SP — confirm this overlap is intentional.
  GP_off          = -16,
};
// Generate the call stub used to invoke Java methods from C code.
// Incoming (C) arguments: A0 = call wrapper, A1 = result pointer,
// A2 = result type, A3 = method, A4 = entry point, A5 = parameters,
// A6 = parameter count, A7 = thread. Saves the callee-saved registers,
// copies the Java parameters onto the expression stack (reversed),
// jumps to the entry point, then stores the result according to its
// type and restores the saved state. 'return_address' is set to the pc
// the Java call returns to.
address generate_call_stub(address& return_address) {
  StubCodeMark mark(this, "StubRoutines", "call_stub");
  address start = __ pc();

  // same as in generate_catch_exception()!

  // stub code
  // Save RA, FP and the callee-saved registers below the incoming SP
  // at the slots defined by call_stub_layout.
  __ sd(RA, SP, RA_off * wordSize);
  __ sd(FP, SP, FP_off * wordSize);
  __ sd(BCP, SP, BCP_off * wordSize);
  __ sd(LVP, SP, LVP_off * wordSize);
  __ sd(GP, SP, GP_off * wordSize);
  __ sd(TSR, SP, TSR_off * wordSize);
  __ sd(S1, SP, S1_off * wordSize);
  __ sd(S3, SP, S3_off * wordSize);
  __ sd(S4, SP, S4_off * wordSize);
  __ sd(S5, SP, S5_off * wordSize);
  __ sd(S6, SP, S6_off * wordSize);

  // GP points to the start of Interpreter::dispatch_table(itos); it must
  // be saved/restored around Java calls (see note above the layout enum).
  __ li48(GP, (long)Interpreter::dispatch_table(itos));

  // I think 14 is the max gap between argument and callee saved register
  __ daddi(FP, SP, (-2) * wordSize);     // FP = frame pointer of this stub frame
  __ daddi(SP, SP, total_off * wordSize); // drop SP past all saved slots
  //FIXME, aoqi. find a suitable place to save A1 & A2.
  /*
  __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
  __ sd(A1, FP, 3 * wordSize);
  __ sd(A2, FP, 4 * wordSize);
  __ sd(A3, FP, 5 * wordSize);
  __ sd(A4, FP, 6 * wordSize);
  __ sd(A5, FP, 7 * wordSize);
  __ sd(A6, FP, 8 * wordSize);
  __ sd(A7, FP, 9 * wordSize);
  */
  // Spill the incoming arguments the stub needs after the Java call.
  __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
  __ sd(A1, FP, result_off * wordSize);
  __ sd(A2, FP, result_type_off * wordSize);
  __ sd(A7, FP, thread_off * wordSize);
#ifdef OPT_THREAD
  //__ get_thread(TREG);
  // Keep the JavaThread* live in TREG instead of reloading it.
  __ move(TREG, A7);

  //__ ld(TREG, FP, thread_off * wordSize);
#endif
  // Re-initialize the compressed-oops heap base register.
  __ reinit_heapbase();

#ifdef ASSERT
  // make sure we have no pending exceptions
  {
    Label L;
    __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
    __ beq(AT, R0, L);
    __ delayed()->nop();
    /* FIXME: I do not know how to realize stop in mips arch, do it in the future */
    __ stop("StubRoutines::call_stub: entered with pending exception");
    __ bind(L);
  }
#endif

  // pass parameters if any
  // A5: parameter array base
  // A6: parameter_size
  // T0: parameter_size_tmp (counts down)
  // T2: destination slot index (counts up)
  // T3: tmp
  Label parameters_done;
  // Skip the copy loop entirely when parameter_size is zero.
  __ beq(A6, R0, parameters_done);
  __ delayed()->nop();

  // Reserve A6 stack elements and realign SP.
  __ dsll(AT, A6, Interpreter::logStackElementSize);
  __ dsub(SP, SP, AT);
  __ move(AT, -StackAlignmentInBytes);
  __ andr(SP, SP , AT);
  // Copy Java parameters in reverse order (receiver last)
  // Note that the argument order is inverted in the process
  Label loop;
  __ move(T0, A6);
  __ move(T2, R0);
  __ bind(loop);

  // load parameter A5[T0-1] and store it to stack slot T2
  __ dsll(T3, T0, LogBytesPerWord);
  __ dadd(T3, T3, A5);
  __ ld(AT, T3, -wordSize);
  __ dsll(T3, T2, LogBytesPerWord);
  __ dadd(T3, T3, SP);
  __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
  __ daddi(T2, T2, 1);
  __ daddi(T0, T0, -1);
  __ bne(T0, R0, loop);
  __ delayed()->nop();
  // advance to next parameter

  // call Java function
  __ bind(parameters_done);

  // receiver in V0, methodOop in Rmethod
  __ move(Rmethod, A3);
  __ move(Rsender, SP); // set sender sp
  __ jalr(A4);          // jump to the method entry point
  __ delayed()->nop();
  return_address = __ pc();

  Label common_return;
  __ bind(common_return);

  // store result depending on type
  // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  __ ld(T0, FP, result_off * wordSize);      // result --> T0
  Label is_long, is_float, is_double, exit;
  __ ld(T2, FP, result_type_off * wordSize); // result_type --> T2
  // Dispatch on result type; the comparisons are folded into delay slots.
  __ daddi(T3, T2, (-1) * T_LONG);
  __ beq(T3, R0, is_long);
  __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
  __ beq(T3, R0, is_float);
  __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
  __ beq(T3, R0, is_double);
  __ delayed()->nop();

  // handle T_INT case
  __ sd(V0, T0, 0 * wordSize);

  __ bind(exit);

  // Pop the stub frame and restore all saved registers.
  __ daddi(SP, FP, 2 * wordSize );
  __ ld(RA, SP, RA_off * wordSize);
  __ ld(FP, SP, FP_off * wordSize);
  __ ld(BCP, SP, BCP_off * wordSize);
  __ ld(LVP, SP, LVP_off * wordSize);
  __ ld(GP, SP, GP_off * wordSize);
  __ ld(TSR, SP, TSR_off * wordSize);

  __ ld(S1, SP, S1_off * wordSize);
  __ ld(S3, SP, S3_off * wordSize);
  __ ld(S4, SP, S4_off * wordSize);
  __ ld(S5, SP, S5_off * wordSize);
  __ ld(S6, SP, S6_off * wordSize);

  // return
  __ jr(RA);
  __ delayed()->nop();

  // handle return types different from T_INT
  __ bind(is_long);
  __ sd(V0, T0, 0 * wordSize);
  //__ sd(V1, T0, 1 * wordSize);
  //__ sd(R0, T0, 1 * wordSize);
  __ b(exit);
  __ delayed()->nop();

  __ bind(is_float);
  __ swc1(F0, T0, 0 * wordSize);
  __ b(exit);
  __ delayed()->nop();

  __ bind(is_double);
  __ sdc1(F0, T0, 0 * wordSize);
  //__ sdc1(F1, T0, 1 * wordSize);
  //__ sd(R0, T0, 1 * wordSize);
  __ b(exit);
  __ delayed()->nop();
  //FIXME, 1.6 mips version add operation of fpu here
  // Entry used when returning from compiled code; rejoins common_return.
  StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
  __ b(common_return);
  __ delayed()->nop();
  return start;
}
338 // Return point for a Java call if there's an exception thrown in
339 // Java code. The exception is caught and transformed into a
340 // pending exception stored in JavaThread that can be tested from
341 // within the VM.
342 //
343 // Note: Usually the parameters are removed by the callee. In case
344 // of an exception crossing an activation frame boundary, that is
345 // not the case if the callee is compiled code => need to setup the
346 // rsp.
347 //
348 // rax: exception oop
// Return point for a Java call when an exception was thrown in Java code:
// records the exception (in V0) as the thread's pending exception, together
// with file/line info, then jumps back to the call stub's return address so
// the VM can test for it.
address generate_catch_exception() {
  StubCodeMark mark(this, "StubRoutines", "catch_exception");
  address start = __ pc();

  Register thread = TREG;

  // Load the JavaThread* the call stub spilled at thread_off, unless
  // TREG is kept live (OPT_THREAD).
#ifndef OPT_THREAD
  __ ld(thread, FP, thread_off * wordSize);
#endif
#ifdef ASSERT
  // verify that threads correspond
  { Label L;
    __ get_thread(T8);
    __ beq(T8, thread, L);
    __ delayed()->nop();
    __ stop("StubRoutines::catch_exception: threads must correspond");
    __ bind(L);
  }
#endif
  // set pending exception (V0 holds the exception oop)
  __ verify_oop(V0);
  __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
  __ li(AT, (long)__FILE__);
  __ sd(AT, thread, in_bytes(Thread::exception_file_offset ()));
  __ li(AT, (long)__LINE__);
  __ sd(AT, thread, in_bytes(Thread::exception_line_offset ()));

  // complete return to VM through the call stub's return address
  assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
  __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
  __ delayed()->nop();

  return start;
}
387 // Continuation point for runtime calls returning with a pending
388 // exception. The pending exception check happened in the runtime
389 // or native call stub. The pending exception in Thread is
390 // converted into a Java-level exception.
391 //
392 // Contract with Java-level exception handlers:
393 // rax: exception
394 // rdx: throwing pc
395 //
396 // NOTE: At entry of this stub, exception-pc must be on stack !!
// Continuation point for runtime calls returning with a pending exception:
// converts the pending exception stored in the thread into a Java-level
// exception and jumps to its handler.
// On exit: V0 = exception oop, T9 = exception handler, V1 = throwing pc.
// NOTE: At entry of this stub, exception-pc must be on stack !!
address generate_forward_exception() {
  StubCodeMark mark(this, "StubRoutines", "forward exception");
  Register thread = TREG;
  address start = __ pc();

  // Upon entry, the sp points to the return address returning into Java
  // (interpreted or compiled) code; i.e., the return address becomes the
  // throwing pc.
  //
  // Arguments pushed before the runtime call are still on the stack but
  // the exception handler will reset the stack pointer -> ignore them.
  // A potential result in registers can be ignored as well.

#ifdef ASSERT
  // make sure this code is only executed if there is a pending exception
#ifndef OPT_THREAD
  __ get_thread(thread);
#endif
  { Label L;
    __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
    __ bne(AT, R0, L);
    __ delayed()->nop();
    __ stop("StubRoutines::forward exception: no pending exception (1)");
    __ bind(L);
  }
#endif

  // compute exception handler into T9
  __ ld(A1, SP, 0); // A1 = throwing pc (return address at the top of stack)
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
  __ move(T9, V0);
  __ pop(V1);       // pop the return address; V1 = throwing pc

#ifndef OPT_THREAD
  __ get_thread(thread);
#endif
  // Load the pending exception into V0 and clear it on the thread.
  __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
  __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));

#ifdef ASSERT
  // make sure exception is set
  { Label L;
    __ bne(V0, R0, L);
    __ delayed()->nop();
    __ stop("StubRoutines::forward exception: no pending exception (2)");
    __ bind(L);
  }
#endif

  // continue at exception handler (return address removed)
  // V0: exception
  // T9: exception handler
  // V1: throwing pc
  __ verify_oop(V0);
  __ jr(T9);
  __ delayed()->nop();

  return start;
}
459 // Support for intptr_t get_previous_fp()
460 //
461 // This routine is used to find the previous frame pointer for the
462 // caller (current_frame_guess). This is used as part of debugging
463 // ps() is seemingly lost trying to find frames.
464 // This code assumes that caller current_frame_guess) has a frame.
465 address generate_get_previous_fp() {
466 StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
467 const Address old_fp (FP, 0);
468 const Address older_fp (V0, 0);
469 address start = __ pc();
470 __ enter();
471 __ lw(V0, old_fp); // callers fp
472 __ lw(V0, older_fp); // the frame for ps()
473 __ leave();
474 __ jr(RA);
475 __ delayed()->nop();
476 return start;
477 }
478 // The following routine generates a subroutine to throw an
479 // asynchronous UnknownError when an unsafe access gets a fault that
480 // could not be reasonably prevented by the programmer. (Example:
481 // SIGBUS/OBJERR.)
// Generates the stub that runs after an unsafe-access fault: saves all
// registers, calls handle_unsafe_access() to compute the continuation pc
// and set the pending unsafe-access error, patches the saved pc slot with
// the returned address, then restores registers and returns.
address generate_handler_for_unsafe_access() {
  StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
  address start = __ pc();
  __ pushad();                                  // push registers
  // Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
  __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
  __ delayed()->nop();
  // Store the continuation pc (returned in V0) into the slot above the
  // pushed registers (cf. the commented-out next_pc above).
  // NOTE(review): sw stores only the low 32 bits of a 64-bit address —
  // confirm whether a 64-bit sd is required here.
  __ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord);
  __ popad();
  __ jr(RA);
  __ delayed()->nop();
  return start;
}
496 // Non-destructive plausibility checks for oops
497 //
498 // Arguments:
499 // all args on stack!
500 //
501 // Stack after saving c_rarg3:
502 // [tos + 0]: saved c_rarg3
503 // [tos + 1]: saved c_rarg2
504 // [tos + 2]: saved r12 (several TemplateTable methods use it)
505 // [tos + 3]: saved flags
506 // [tos + 4]: return address
507 // * [tos + 5]: error message (char*)
508 // * [tos + 6]: object to verify (oop)
509 // * [tos + 7]: saved rax - saved by caller and bashed
510 // * = popped on exit
511 address generate_verify_oop() {
512 StubCodeMark mark(this, "StubRoutines", "verify_oop");
513 address start = __ pc();
514 __ reinit_heapbase();
515 __ verify_oop_subroutine();
516 address end = __ pc();
517 return start;
518 }
520 //
521 // Generate overlap test for array copy stubs
522 //
523 // Input:
524 // A0 - array1
525 // A1 - array2
526 // A2 - element count
527 //
528 // Note: this code can only use %eax, %ecx, and %edx
529 //
531 // use T9 as temp
// Emits the overlap test for arraycopy stubs: branches to no_overlap_target
// when a forward (disjoint) copy of A2 elements from A0 to A1 is safe;
// falls through when the regions overlap. Uses T9 as a temp.
void array_overlap_test(address no_overlap_target, int log2_elem_size) {
  int elem_size = 1 << log2_elem_size;
  Address::ScaleFactor sf = Address::times_1;

  switch (log2_elem_size) {
    case 0: sf = Address::times_1; break;
    case 1: sf = Address::times_2; break;
    case 2: sf = Address::times_4; break;
    case 3: sf = Address::times_8; break;
  }

  // T9 = address of the last source element.
  __ dsll(AT, A2, sf);
  __ dadd(AT, AT, A0);
  __ lea(T9, Address(AT, -elem_size));
  // No overlap if dst <= src ...
  __ dsub(AT, A1, A0);
  __ blez(AT, no_overlap_target);
  __ delayed()->nop();
  // ... or dst > last source element.
  __ dsub(AT, A1, T9);
  __ bgtz(AT, no_overlap_target);
  __ delayed()->nop();

  // If A0 = 0xf... and A1 = 0x0... (the subtractions above wrapped around),
  // then goto no_overlap_target.
  Label L;
  __ bgez(A0, L);
  __ delayed()->nop();
  __ bgtz(A1, no_overlap_target);
  __ delayed()->nop();
  __ bind(L);

}
563 //
564 // Generate store check for array
565 //
566 // Input:
567 // %edi - starting address
568 // %ecx - element count
569 //
570 // The 2 input registers are overwritten
571 //
573 //
574 // Generate store check for array
575 //
576 // Input:
577 // T0 - starting address(edi)
578 // T1 - element count (ecx)
579 //
580 // The 2 input registers are overwritten
581 //
583 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
// Emits the card-table post-barrier for an array of stored oops.
// Input: T0 = starting address, T1 = element count; both are overwritten.
// Dirties (writes 0 to) every card covering the stored range, iterating
// from the last card down to the first.
void array_store_check() {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
  Label l_0;

  // T1 = address of the last stored oop: T0 + count*oop_size - BytesPerHeapOop.
  __ dsll(AT, T1, TIMES_OOP);
  __ dadd(AT, T0, AT);
  __ daddiu(T1, AT, - BytesPerHeapOop);

  // Convert start (T0) and end (T1) addresses into card indexes.
  __ shr(T0, CardTableModRefBS::card_shift);
  __ shr(T1, CardTableModRefBS::card_shift);

  __ dsub(T1, T1, T0); // end --> cards count
  __ bind(l_0);

  // AT = &byte_map_base[start_card + remaining_count]
  __ li48(AT, (long)ct->byte_map_base);
  __ dadd(AT, AT, T0);
  __ dadd(AT, AT, T1);
  // Sync before updating the card mark (must be MT-safe on Loongson 3A2000).
  __ sync();
  __ sb(R0, AT, 0);
  //__ daddi(T1, T1, -4);
  __ daddi(T1, T1, - 1);
  __ bgez(T1, l_0);
  __ delayed()->nop();
}
613 // Arguments:
614 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
615 // ignored
616 // name - stub name string
617 //
618 // Inputs:
619 // c_rarg0 - source array address
620 // c_rarg1 - destination array address
621 // c_rarg2 - element count, treated as ssize_t, can be zero
622 //
623 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
624 // we let the hardware handle it. The one to eight bytes within words,
625 // dwords or qwords that span cache line boundaries will still be loaded
626 // and stored atomically.
627 //
628 // Side Effects:
629 // disjoint_byte_copy_entry is set to the no-overlap entry point
630 // used by generate_conjoint_byte_copy().
631 //
// Generates the disjoint byte-copy stub.
// Inputs: A0 = source, A1 = destination, A2 = element count (ssize_t, may
// be zero). Strategy: if both pointers share alignment mod 4, align the
// source to a word boundary, copy 4 bytes at a time, then copy the 0-3
// byte suffix; otherwise copy byte by byte.
address generate_disjoint_byte_copy(bool aligned, const char *name) {
  StubCodeMark mark(this, "StubRoutines", name);
  __ align(CodeEntryAlignment);
  address start = __ pc();
  Label l_0, l_1, l_2, l_3, l_4, l_5, l_6;

  // T3 = from, T0 = to, T1 = working count, T8 = original count.
  __ push(T3);
  __ push(T0);
  __ push(T1);
  __ push(T8);
  __ move(T3, A0);
  __ move(T0, A1);
  __ move(T1, A2);
  __ move(T8, T1); // original count in T1
  // Fewer than 4 bytes: go straight to the byte-at-a-time tail.
  __ daddi(AT, T1, -3);
  __ blez(AT, l_4);
  __ delayed()->nop();
  if (!aligned) {
    //TODO: copy 8 bytes at one time
    // Only when src and dest have the same alignment mod 4 can we do lw/sw.
    __ andi(AT, T3, 3);
    __ andi(T9, T0, 3);
    __ bne(AT, T9, l_5); // different alignment: byte copy everything
    __ delayed()->nop();

    // align source address at dword address boundary
    __ move(T1, 4);
    __ sub(T1, T1, T3);
    __ andi(T1, T1, 3); // T1 = number of leading bytes to copy (0-3)
    __ beq(T1, R0, l_1);
    __ delayed()->nop();
    __ sub(T8,T8,T1);   // account for the prefix in the remaining count
    __ bind(l_0);
    __ lb(AT, T3, 0);
    __ sb(AT, T0, 0);
    __ addi(T3, T3, 1);
    __ addi(T0, T0, 1);
    __ addi(T1 ,T1, -1);
    __ bne(T1, R0, l_0);
    __ delayed()->nop();
    __ bind(l_1);
    __ move(T1, T8);
  }
  __ shr(T1, 2);       // T1 = number of whole dwords
  __ beq(T1, R0, l_4); // no dwords to move
  __ delayed()->nop();
  // copy aligned dwords
  __ bind(l_2);
  __ align(16);
  __ bind(l_3);
  __ lw(AT, T3, 0);
  __ sw(AT, T0, 0 );
  __ addi(T3, T3, 4);
  __ addi(T0, T0, 4);
  __ addi(T1, T1, -1);
  __ bne(T1, R0, l_3);
  __ delayed()->nop();
  __ bind(l_4);
  __ move(T1, T8);
  __ andi(T1, T1, 3);  // T1 = leftover bytes (count mod 4)
  __ beq(T1, R0, l_6);
  __ delayed()->nop();
  // copy suffix byte by byte
  __ bind(l_5);
  __ lb(AT, T3, 0);
  __ sb(AT, T0, 0);
  __ addi(T3, T3, 1);
  __ addi(T0, T0, 1);
  __ addi(T1, T1, -1);
  __ bne(T1, R0, l_5 );
  __ delayed()->nop();
  __ bind(l_6);
  __ pop(T8);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();
  return start;
}
713 // Arguments:
714 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
715 // ignored
716 // name - stub name string
717 //
718 // Inputs:
719 // A0 - source array address
720 // A1 - destination array address
721 // A2 - element count, treated as ssize_t, can be zero
722 //
723 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
724 // we let the hardware handle it. The one to eight bytes within words,
725 // dwords or qwords that span cache line boundaries will still be loaded
726 // and stored atomically.
727 //
// Generates the conjoint (possibly overlapping) byte-copy stub.
// Inputs: A0 = source, A1 = destination, A2 = element count (ssize_t, may
// be zero). Dispatches to the disjoint stub when the regions do not
// overlap; otherwise copies from high addresses to low, word-at-a-time
// where both ends share alignment mod 4, byte-at-a-time otherwise.
address generate_conjoint_byte_copy(bool aligned, const char *name) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
  Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;

  address nooverlap_target = aligned ?
          StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
          StubRoutines::jbyte_disjoint_arraycopy();

  // Branches to nooverlap_target when a forward copy is safe.
  array_overlap_test(nooverlap_target, 0);

  const Register from      = A0;   // source array address
  const Register to        = A1;   // destination array address
  const Register count     = A2;   // elements count
  const Register end_from  = T3;   // source array end address
  const Register end_to    = T0;   // destination array end address
  const Register end_count = T1;   // remaining byte count

  __ push(end_from);
  __ push(end_to);
  __ push(end_count);
  __ push(T8);

  // copy from high to low
  __ move(end_count, count);
  __ dadd(end_from, from, end_count);
  __ dadd(end_to, to, end_count);

  // If end_from and end_to have different alignment mod 4, an unaligned
  // (byte-by-byte) copy is performed.
  __ andi(AT, end_from, 3);
  __ andi(T8, end_to, 3);
  __ bne(AT, T8, l_copy_byte);
  __ delayed()->nop();

  // First deal with the unaligned data at the top.
  __ bind(l_unaligned);
  __ beq(end_count, R0, l_exit);
  __ delayed()->nop();

  __ andi(AT, end_from, 3);
  __ bne(AT, R0, l_from_unaligned);
  __ delayed()->nop();

  __ andi(AT, end_to, 3);
  __ beq(AT, R0, l_4_bytes_aligned);
  __ delayed()->nop();

  // Copy one byte (from the high end) and retest alignment.
  __ bind(l_from_unaligned);
  __ lb(AT, end_from, -1);
  __ sb(AT, end_to, -1);
  __ daddi(end_from, end_from, -1);
  __ daddi(end_to, end_to, -1);
  __ daddi(end_count, end_count, -1);
  __ b(l_unaligned);
  __ delayed()->nop();

  // now end_to, end_from point to 4-byte aligned high-ends
  // end_count contains byte count that is not copied.
  // copy 4 bytes at a time
  __ bind(l_4_bytes_aligned);

  __ move(T8, end_count);            // remember count for the suffix
  __ daddi(AT, end_count, -3);
  __ blez(AT, l_copy_suffix);        // fewer than 4 bytes left
  __ delayed()->nop();

  //__ andi(T8, T8, 3);
  __ lea(end_from, Address(end_from, -4));
  __ lea(end_to, Address(end_to, -4));

  __ dsrl(end_count, end_count, 2);  // whole-word count
  __ align(16);
  __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes
  __ lw(AT, end_from, 0);
  __ sw(AT, end_to, 0);
  __ addi(end_from, end_from, -4);
  __ addi(end_to, end_to, -4);
  __ addi(end_count, end_count, -1);
  __ bne(end_count, R0, l_copy_4_bytes_loop);
  __ delayed()->nop();

  __ b(l_copy_suffix);
  __ delayed()->nop();
  // copy dwords aligned or not with repeat move
  // l_copy_suffix
  // copy suffix (0-3 bytes)
  __ bind(l_copy_suffix);
  __ andi(T8, T8, 3);
  __ beq(T8, R0, l_exit);
  __ delayed()->nop();
  __ addi(end_from, end_from, 3);    // undo the -4 bias, minus one byte
  __ addi(end_to, end_to, 3);
  __ bind(l_copy_suffix_loop);
  __ lb(AT, end_from, 0);
  __ sb(AT, end_to, 0);
  __ addi(end_from, end_from, -1);
  __ addi(end_to, end_to, -1);
  __ addi(T8, T8, -1);
  __ bne(T8, R0, l_copy_suffix_loop);
  __ delayed()->nop();

  // Byte-at-a-time fallback used when src/dst alignments differ.
  __ bind(l_copy_byte);
  __ beq(end_count, R0, l_exit);
  __ delayed()->nop();
  __ lb(AT, end_from, -1);
  __ sb(AT, end_to, -1);
  __ daddi(end_from, end_from, -1);
  __ daddi(end_to, end_to, -1);
  __ daddi(end_count, end_count, -1);
  __ b(l_copy_byte);
  __ delayed()->nop();

  __ bind(l_exit);
  __ pop(T8);
  __ pop(end_count);
  __ pop(end_to);
  __ pop(end_from);
  __ jr(RA);
  __ delayed()->nop();
  return start;
}
853 // Generate stub for disjoint short copy. If "aligned" is true, the
854 // "from" and "to" addresses are assumed to be heapword aligned.
855 //
856 // Arguments for generated stub:
857 // from: A0
858 // to: A1
859 // elm.count: A2 treated as signed
860 // one element: 2 bytes
861 //
862 // Strategy for aligned==true:
863 //
864 // If length <= 9:
865 // 1. copy 1 elements at a time (l_5)
866 //
867 // If length > 9:
868 // 1. copy 4 elements at a time until less than 4 elements are left (l_7)
869 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
870 // 3. copy last element if one was left in step 2. (l_1)
871 //
872 //
873 // Strategy for aligned==false:
874 //
875 // If length <= 9: same as aligned==true case
876 //
877 // If length > 9:
878 // 1. continue with step 7. if the alignment of from and to mod 4
879 // is different.
880 // 2. align from and to to 4 bytes by copying 1 element if necessary
881 // 3. at l_2 from and to are 4 byte aligned; continue with
882 // 6. if they cannot be aligned to 8 bytes because they have
883 // got different alignment mod 8.
884 // 4. at this point we know that both, from and to, have the same
885 // alignment mod 8, now copy one element if necessary to get
886 // 8 byte alignment of from and to.
887 // 5. copy 4 elements at a time until less than 4 elements are
888 // left; depending on step 3. all load/stores are aligned.
889 // 6. copy 2 elements at a time until less than 2 elements are
890 // left. (l_6)
891 // 7. copy 1 element at a time. (l_5)
892 // 8. copy last element if one was left in step 6. (l_1)
893 //
894 // TODO:
895 //
896 // 1. use loongson 128-bit load/store
897 // 2. use loop unrolling optimization when len is big enough, for example if len > 0x2000:
898 // __ bind(l_x);
899 // __ ld(AT, tmp1, 0);
900 // __ ld(tmp, tmp1, 8);
901 // __ sd(AT, tmp2, 0);
902 // __ sd(tmp, tmp2, 8);
903 // __ ld(AT, tmp1, 16);
904 // __ ld(tmp, tmp1, 24);
905 // __ sd(AT, tmp2, 16);
906 // __ sd(tmp, tmp2, 24);
907 // __ daddi(tmp1, tmp1, 32);
908 // __ daddi(tmp2, tmp2, 32);
909 // __ daddi(tmp3, tmp3, -16);
910 // __ daddi(AT, tmp3, -16);
911 // __ bgez(AT, l_x);
912 // __ delayed()->nop();
913 //
// Generates the disjoint jshort-copy stub; see the strategy comment above.
// Inputs: A0 = from, A1 = to, A2 = element count (signed; one element is
// two bytes). Copies 4 elements (one 8-byte load/store) at a time when
// possible, then 2 elements, then single elements.
address generate_disjoint_short_copy(bool aligned, const char * name) {
  StubCodeMark mark(this, "StubRoutines", name);
  __ align(CodeEntryAlignment);

  Register tmp1 = T0;
  Register tmp2 = T1;
  Register tmp3 = T3;

  address start = __ pc();

  __ push(tmp1);
  __ push(tmp2);
  __ push(tmp3);
  __ move(tmp1, A0); // from
  __ move(tmp2, A1); // to
  __ move(tmp3, A2); // remaining element count

  Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
  Label l_debug;
  // don't try anything fancy if arrays don't have many elements
  __ daddi(AT, tmp3, -9);
  __ blez(AT, l_1);
  __ delayed()->nop();

  if (!aligned) {
    __ xorr(AT, A0, A1);
    __ andi(AT, AT, 1);
    __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
    __ delayed()->nop();

    __ xorr(AT, A0, A1);
    __ andi(AT, AT, 3);
    __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
    __ delayed()->nop();

    // At this point it is guaranteed that both, from and to have the same alignment mod 4.

    // Copy 1 element if necessary to align to 4 bytes.
    __ andi(AT, A0, 3);
    __ beq(AT, R0, l_2);
    __ delayed()->nop();

    __ lhu(AT, tmp1, 0);
    __ daddi(tmp1, tmp1, 2);
    __ sh(AT, tmp2, 0);
    __ daddi(tmp2, tmp2, 2);
    __ daddi(tmp3, tmp3, -1);
    __ bind(l_2);

    // At this point the positions of both, from and to, are at least 4 byte aligned.

    // Copy 4 elements at a time.
    // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
    __ xorr(AT, tmp1, tmp2);
    __ andi(AT, AT, 7);
    __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned
    __ delayed()->nop();

    // Copy a 2-element word if necessary to align to 8 bytes.
    __ andi(AT, tmp1, 7);
    __ beq(AT, R0, l_7);
    __ delayed()->nop();

    __ lw(AT, tmp1, 0);
    __ daddi(tmp3, tmp3, -2);
    __ sw(AT, tmp2, 0);
    { // FasterArrayCopy
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
    }
  }

  __ bind(l_7);

  // Copy 4 elements at a time; either the loads or the stores can
  // be unaligned if aligned == false.

  { // FasterArrayCopy
    __ daddi(AT, tmp3, -15);
    __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain
    __ delayed()->nop();

    __ bind(l_8);
    // For Loongson, there is 128-bit memory access. TODO
    __ ld(AT, tmp1, 0);
    __ sd(AT, tmp2, 0);
    __ daddi(tmp1, tmp1, 8);
    __ daddi(tmp2, tmp2, 8);
    __ daddi(tmp3, tmp3, -4);
    __ daddi(AT, tmp3, -4); // loop while at least 4 elements remain
    __ bgez(AT, l_8);
    __ delayed()->nop();
  }
  __ bind(l_6);

  // copy 2 element at a time
  { // FasterArrayCopy
    __ daddi(AT, tmp3, -1);
    __ blez(AT, l_1);
    __ delayed()->nop();

    __ bind(l_3);
    __ lw(AT, tmp1, 0);
    __ sw(AT, tmp2, 0);
    __ daddi(tmp1, tmp1, 4);
    __ daddi(tmp2, tmp2, 4);
    __ daddi(tmp3, tmp3, -2);
    __ daddi(AT, tmp3, -2);
    __ bgez(AT, l_3);
    __ delayed()->nop();

  }

  // do single element copy (one 16-bit element at a time)
  __ bind(l_1);
  __ beq(R0, tmp3, l_4);
  __ delayed()->nop();

  { // FasterArrayCopy

    __ bind(l_5);
    __ lhu(AT, tmp1, 0);
    __ daddi(tmp3, tmp3, -1);
    __ sh(AT, tmp2, 0);
    __ daddi(tmp1, tmp1, 2);
    __ daddi(tmp2, tmp2, 2);
    __ daddi(AT, tmp3, -1);
    __ bgez(AT, l_5);
    __ delayed()->nop();
  }
  __ bind(l_4);
  __ pop(tmp3);
  __ pop(tmp2);
  __ pop(tmp1);

  __ jr(RA);
  __ delayed()->nop();

  // Reached only from the alignment-mod-2 mismatch check above.
  __ bind(l_debug);
  __ stop("generate_disjoint_short_copy should not reach here");
  return start;
}
1057 // Arguments:
1058 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1059 // ignored
1060 // name - stub name string
1061 //
1062 // Inputs:
1063 // c_rarg0 - source array address
1064 // c_rarg1 - destination array address
1065 // c_rarg2 - element count, treated as ssize_t, can be zero
1066 //
1067 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1068 // let the hardware handle it. The two or four words within dwords
1069 // or qwords that span cache line boundaries will still be loaded
1070 // and stored atomically.
1071 //
// Generate the conjoint (overlap-safe) jshort array copy stub.
//
// Inputs: A0 = source address, A1 = destination address,
//         A2 = element count (jshorts), may be zero.
//
// array_overlap_test() dispatches to the matching disjoint stub when the
// ranges do not overlap; otherwise we copy backwards (high to low) so each
// source element is read before it can be overwritten: 4-byte words first,
// then one trailing jshort when the count is odd.  Every branch below is
// followed by an explicit delay-slot nop.
1072 address generate_conjoint_short_copy(bool aligned, const char *name) {
1073 Label l_1, l_2, l_3, l_4, l_5;
1074 StubCodeMark mark(this, "StubRoutines", name);
1075 __ align(CodeEntryAlignment);
1076 address start = __ pc();
1077 address nooverlap_target = aligned ?
1078 StubRoutines::arrayof_jshort_disjoint_arraycopy() :
1079 StubRoutines::jshort_disjoint_arraycopy();
1081 array_overlap_test(nooverlap_target, 1);
// Save the temporaries this stub clobbers; restored before returning.
1083 __ push(T3);
1084 __ push(T0);
1085 __ push(T1);
1086 __ push(T8);
1088 /*
1089 __ pushl(esi);
1090 __ movl(ecx, Address(esp, 4+12)); // count
1091 __ pushl(edi);
1092 __ movl(esi, Address(esp, 8+ 4)); // from
1093 __ movl(edi, Address(esp, 8+ 8)); // to
1094 */
// T1 = count, T3 = from, T0 = to.
1095 __ move(T1, A2);
1096 __ move(T3, A0);
1097 __ move(T0, A1);
1100 // copy dwords from high to low
1101 // __ leal(esi, Address(esi, ecx, Address::times_2, -4)); // from + count*2 - 4
// T3 = from + count*2 - 4: address of the last 4-byte word to copy.
1102 __ sll(AT, T1, Address::times_2);
1103 __ add(AT, T3, AT);
1104 __ lea(T3, Address( AT, -4));
1105 //__ std();
1106 //__ leal(edi, Address(edi, ecx, Address::times_2, -4)); // to + count*2 - 4
// T0 = to + count*2 - 4, mirroring T3.
1107 __ sll(AT,T1 , Address::times_2);
1108 __ add(AT, T0, AT);
1109 __ lea(T0, Address( AT, -4));
1110 // __ movl(eax, ecx);
// T8 keeps the original element count; its low bit selects the odd tail.
1111 __ move(T8, T1);
1112 __ bind(l_1);
1113 // __ sarl(ecx, 1); // dword count
// T1 = count / 2: number of 4-byte words to move.
1114 __ sra(T1,T1, 1);
1115 //__ jcc(Assembler::equal, l_4); // no dwords to move
1116 __ beq(T1, R0, l_4);
1117 __ delayed()->nop();
1118 /* __ cmpl(ecx, 32);
1119 __ jcc(Assembler::above, l_3); // > 32 dwords
1120 // copy dwords with loop
1121 __ subl(edi, esi);
1122 */ __ align(16);
// Backward word-copy loop: one 4-byte word (two jshorts) per iteration.
1123 __ bind(l_2);
1124 //__ movl(edx, Address(esi));
1125 __ lw(AT, T3, 0);
1126 //__ movl(Address(edi, esi, Address::times_1), edx);
1127 __ sw(AT, T0, 0);
1128 //__ subl(esi, 4);
1129 __ addi(T3, T3, -4);
1130 __ addi(T0, T0, -4);
1131 //__ decl(ecx);
1132 __ addi(T1, T1, -1);
1133 // __ jcc(Assembler::notEqual, l_2);
1134 __ bne(T1, R0, l_2);
1135 __ delayed()->nop();
1136 // __ addl(edi, esi);
1137 // __ jmp(l_4);
1138 __ b(l_4);
1139 __ delayed()->nop();
1140 // copy dwords with repeat move
// NOTE(review): l_3 is never branched to -- the x86 "> 32 dwords" fast
// path above stayed commented out.
1141 __ bind(l_3);
1142 // __ rep_movl();
1143 __ bind(l_4);
1144 // __ andl(eax, 1); // suffix count
1145 __ andi(T8, T8, 1); // suffix count
1146 //__ jcc(Assembler::equal, l_5); // no suffix
1147 __ beq(T8, R0, l_5 );
1148 __ delayed()->nop();
1149 // copy suffix
// Odd count: copy the single remaining jshort (at offset +2 from the
// word positions T3/T0 ended on).
1150 // __ movw(edx, Address(esi, 2));
1151 __ lh(AT, T3, 2);
1152 // __ movw(Address(edi, 2), edx);
1153 __ sh(AT, T0, 2);
1154 __ bind(l_5);
1155 // __ cld();
1156 // __ popl(edi);
1157 // __ popl(esi);
1158 // __ ret(0);
// Restore saved temporaries and return.
1159 __ pop(T8);
1160 __ pop(T1);
1161 __ pop(T0);
1162 __ pop(T3);
1163 __ jr(RA);
1164 __ delayed()->nop();
1165 return start;
1166 }
1168 // Arguments:
1169 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1170 // ignored
1171 // is_oop - true => oop array, so generate store check code
1172 // name - stub name string
1173 //
1174 // Inputs:
1175 // c_rarg0 - source array address
1176 // c_rarg1 - destination array address
1177 // c_rarg2 - element count, treated as ssize_t, can be zero
1178 //
1179 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1180 // the hardware handle it. The two dwords within qwords that span
1181 // cache line boundaries will still be loaded and stored atomically.
1182 //
1183 // Side Effects:
1184 // disjoint_int_copy_entry is set to the no-overlap entry point
1185 // used by generate_conjoint_int_oop_copy().
1186 //
// Generate the disjoint jint (or oop, when is_oop) array copy stub.
//
// Inputs: A0 = src, A1 = dst, A2 = element count, may be zero.
// Copies forward, 4 bytes per element; for oop arrays the store check
// (card marking via array_store_check) runs over the destination after
// the copy.
//
// NOTE(review): the unconditional `b(l_2)` below jumps straight to the
// copy loop, so the `b(l_stchk)` under `if (is_oop)` and the first
// pop/jr epilogue are unreachable -- dead code left over from the x86
// template this was ported from.
// NOTE(review): elements are copied with lw/sw (32 bits) even when
// is_oop -- presumably oops are 32-bit here; verify against the port's
// oop layout.
1187 address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
1188 Label l_2, l_3, l_4, l_stchk;
1189 StubCodeMark mark(this, "StubRoutines", name);
1190 __ align(CodeEntryAlignment);
1191 address start = __ pc();
1192 /*
1193 __ pushl(esi);
1194 __ movl(ecx, Address(esp, 4+12)); // count
1195 __ pushl(edi);
1196 __ movl(esi, Address(esp, 8+ 4)); // from
1197 __ movl(edi, Address(esp, 8+ 8)); // to
1198 */
// Save clobbered temporaries; T1 = count, T3 = from, T0 = to.
1199 __ push(T3);
1200 __ push(T0);
1201 __ push(T1);
1202 __ push(T8);
1203 __ move(T1, A2);
1204 __ move(T3, A0);
1205 __ move(T0, A1);
1207 // __ cmpl(ecx, 32);
1208 // __ jcc(Assembler::belowEqual, l_2); // <= 32 dwords
1209 // __ rep_movl();
1210 __ b(l_2);
1211 __ delayed()->nop();
// Unreachable from here until l_2 (see NOTE above).
1212 if (is_oop) {
1213 // __ jmp(l_stchk);
1214 __ b(l_stchk);
1215 __ delayed()->nop();
1216 }
1217 // __ popl(edi);
1218 // __ popl(esi);
1219 // __ ret(0);
1220 __ pop(T8);
1221 __ pop(T1);
1222 __ pop(T0);
1223 __ pop(T3);
1224 __ jr(RA);
1225 __ delayed()->nop();
1227 __ bind(l_2);
1228 // __ subl(edi, esi);
1229 // __ testl(ecx, ecx);
1230 // __ jcc(Assembler::zero, l_4);
// Zero-length copy: skip straight to the epilogue.
1231 __ beq(T1, R0, l_4);
1232 __ delayed()->nop();
1233 __ align(16);
// Forward copy loop: one 4-byte element per iteration.
1234 __ bind(l_3);
1235 //__ movl(edx, Address(esi));
1236 __ lw(AT, T3, 0);
1237 // __ movl(Address(edi, esi, Address::times_1), edx);
1238 __ sw(AT, T0, 0);
1239 // __ addl(esi, 4);
1240 __ addi(T3, T3, 4);
1241 __ addi(T0, T0, 4);
1242 // __ decl(ecx);
1243 __ addi(T1, T1, -1);
1244 // __ jcc(Assembler::notEqual, l_3);
1245 __ bne(T1, R0, l_3);
1246 __ delayed()->nop();
1247 if (is_oop) {
1248 __ bind(l_stchk);
1249 // __ movl(edi, Address(esp, 8+ 8));
1250 // __ movl(ecx, Address(esp, 8+ 12));
// Reload dst/count from the argument registers (not written by this
// stub) and emit the GC store check for the destination range.
1251 __ move(T0, A1);
1252 __ move(T1, A2);
1253 array_store_check();
1254 }
1255 __ bind(l_4);
1256 // __ popl(edi);
1257 // __ popl(esi);
1258 // __ ret(0);
1259 __ pop(T8);
1260 __ pop(T1);
1261 __ pop(T0);
1262 __ pop(T3);
1263 __ jr(RA);
1264 __ delayed()->nop();
1265 return start;
1266 }
1268 // Arguments:
1269 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1270 // ignored
1271 // is_oop - true => oop array, so generate store check code
1272 // name - stub name string
1273 //
1274 // Inputs:
1275 // c_rarg0 - source array address
1276 // c_rarg1 - destination array address
1277 // c_rarg2 - element count, treated as ssize_t, can be zero
1278 //
1279 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1280 // the hardware handle it. The two dwords within qwords that span
1281 // cache line boundaries will still be loaded and stored atomically.
1282 //
// Generate the conjoint (overlap-safe) jint (or oop, when is_oop) array
// copy stub.
//
// Inputs: A0 = src, A1 = dst, A2 = element count, may be zero.
// Non-overlapping calls are forwarded to the matching disjoint stub;
// otherwise elements are copied backwards (high to low), 4 bytes each,
// then for oop arrays the store check runs over the destination.
//
// NOTE(review): l_3 only marks the commented-out x86 "rep_movl" fast
// path and is never branched to; when is_oop the store-check tail is
// reached through the `b(l_stchk)` branch instead.
1283 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
1284 Label l_2, l_3, l_4, l_stchk;
1285 StubCodeMark mark(this, "StubRoutines", name);
1286 __ align(CodeEntryAlignment);
1287 address start = __ pc();
1288 address nooverlap_target;
1290 if (is_oop) {
1291 nooverlap_target = aligned ?
1292 StubRoutines::arrayof_oop_disjoint_arraycopy() :
1293 StubRoutines::oop_disjoint_arraycopy();
1294 }else {
1295 nooverlap_target = aligned ?
1296 StubRoutines::arrayof_jint_disjoint_arraycopy() :
1297 StubRoutines::jint_disjoint_arraycopy();
1298 }
1300 array_overlap_test(nooverlap_target, 2);
// Save clobbered temporaries; T1 = count, T3 = from, T0 = to.
1302 __ push(T3);
1303 __ push(T0);
1304 __ push(T1);
1305 __ push(T8);
1307 /*
1308 __ pushl(esi);
1309 __ movl(ecx, Address(esp, 4+12)); // count
1310 __ pushl(edi);
1311 __ movl(esi, Address(esp, 8+ 4)); // from
1312 __ movl(edi, Address(esp, 8+ 8)); // to
1313 */
1314 __ move(T1, A2);
1315 __ move(T3, A0);
1316 __ move(T0, A1);
1318 //__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
// T3 = from + count*4 - 4: last source element, for the backward copy.
1319 __ sll(AT, T1, Address::times_4);
1320 __ add(AT, T3, AT);
1321 __ lea(T3 , Address(AT, -4));
1322 //__ std();
1323 //__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
// T0 = to + count*4 - 4, mirroring T3.
1324 __ sll(AT, T1, Address::times_4);
1325 __ add(AT, T0, AT);
1326 __ lea(T0 , Address(AT, -4));
1328 // __ cmpl(ecx, 32);
1329 // __ jcc(Assembler::above, l_3); // > 32 dwords
1330 // __ testl(ecx, ecx);
1331 //__ jcc(Assembler::zero, l_4);
// Zero-length copy: skip straight to the epilogue.
1332 __ beq(T1, R0, l_4);
1333 __ delayed()->nop();
1334 // __ subl(edi, esi);
1335 __ align(16);
// Backward copy loop: one 4-byte element per iteration.
1336 __ bind(l_2);
1337 // __ movl(edx, Address(esi));
1338 __ lw(AT, T3, 0);
1339 // __ movl(Address(esi, edi, Address::times_1), edx);
1340 __ sw(AT, T0, 0);
1341 // __ subl(esi, 4);
1342 __ addi(T3, T3, -4);
1343 __ addi(T0, T0, -4);
1344 // __ decl(ecx);
1345 __ addi(T1, T1, -1);
1346 //__ jcc(Assembler::notEqual, l_2);
1347 __ bne(T1, R0, l_2);
1348 __ delayed()->nop();
1349 if (is_oop) {
1350 // __ jmp(l_stchk);
1351 __ b( l_stchk);
1352 __ delayed()->nop();
1353 }
1354 __ bind(l_4);
1355 // __ cld();
1356 // __ popl(edi);
1357 // __ popl(esi);
1358 // __ ret(0);
1359 __ pop(T8);
1360 __ pop(T1);
1361 __ pop(T0);
1362 __ pop(T3);
1363 __ jr(RA);
1364 __ delayed()->nop();
1365 __ bind(l_3);
1366 // __ rep_movl();
1367 if (is_oop) {
1368 __ bind(l_stchk);
1369 // __ movl(edi, Address(esp, 8+ 8));
// Reload dst/count from the (unclobbered) argument registers and emit
// the GC store check for the destination range.
1370 __ move(T0, A1);
1371 // __ movl(ecx, Address(esp, 8+ 12));
1372 __ move(T1, A2);
1373 array_store_check();
1374 }
1375 // __ cld();
1376 // __ popl(edi);
1377 // __ popl(esi);
1378 // __ ret(0);
1379 __ pop(T8);
1380 __ pop(T1);
1381 __ pop(T0);
1382 __ pop(T3);
1383 __ jr(RA);
1384 __ delayed()->nop();
1385 return start;
1386 }
1388 // Arguments:
1389 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1390 // ignored
1391 // is_oop - true => oop array, so generate store check code
1392 // name - stub name string
1393 //
1394 // Inputs:
1395 // c_rarg0 - source array address
1396 // c_rarg1 - destination array address
1397 // c_rarg2 - element count, treated as ssize_t, can be zero
1398 //
1399 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1400 // the hardware handle it. The two dwords within qwords that span
1401 // cache line boundaries will still be loaded and stored atomically.
1402 //
1403 // Side Effects:
1404 // disjoint_long_copy_entry is set to the no-overlap entry point
1405 // used by generate_conjoint_long_oop_copy().
1406 //
// Generate the disjoint jlong (or oop, when is_oop) array copy stub.
//
// Inputs: A0 = src, A1 = dst, A2 = element count, may be zero.
// Copies forward, 8 bytes per element; for oop arrays the store check
// runs over the destination after the copy.
//
// NOTE(review): as in generate_disjoint_int_oop_copy, the unconditional
// `b(l_2)` makes the `b(l_stchk)` under `if (is_oop)` and the first
// pop/jr epilogue unreachable -- dead code from the x86 template.
1407 address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
1408 Label l_2, l_3, l_4, l_stchk;
1409 StubCodeMark mark(this, "StubRoutines", name);
1410 __ align(CodeEntryAlignment);
1411 address start = __ pc();
// Save clobbered temporaries; T1 = count, T3 = from, T0 = to.
1412 __ push(T3);
1413 __ push(T0);
1414 __ push(T1);
1415 __ push(T8);
1416 __ move(T1, A2);
1417 __ move(T3, A0);
1418 __ move(T0, A1);
1420 // __ cmpl(ecx, 32);
1421 // __ jcc(Assembler::belowEqual, l_2); // <= 32 dwords
1422 // __ rep_movl();
1423 __ b(l_2);
1424 __ delayed()->nop();
// Unreachable from here until l_2 (see NOTE above).
1425 if (is_oop) {
1426 // __ jmp(l_stchk);
1427 __ b(l_stchk);
1428 __ delayed()->nop();
1429 }
1430 // __ popl(edi);
1431 // __ popl(esi);
1432 // __ ret(0);
1433 __ pop(T8);
1434 __ pop(T1);
1435 __ pop(T0);
1436 __ pop(T3);
1437 __ jr(RA);
1438 __ delayed()->nop();
1440 __ bind(l_2);
1441 // __ subl(edi, esi);
1442 // __ testl(ecx, ecx);
1443 // __ jcc(Assembler::zero, l_4);
// Zero-length copy: skip straight to the epilogue.
1444 __ beq(T1, R0, l_4);
1445 __ delayed()->nop();
1446 __ align(16);
// Forward copy loop: one 8-byte element per iteration.
1447 __ bind(l_3);
1448 //__ movl(edx, Address(esi));
1449 __ ld(AT, T3, 0);
1450 // __ movl(Address(edi, esi, Address::times_1), edx);
1451 __ sd(AT, T0, 0);
1452 // __ addl(esi, 4);
1453 __ addi(T3, T3, 8);
1454 __ addi(T0, T0, 8);
1455 // __ decl(ecx);
1456 __ addi(T1, T1, -1);
1457 // __ jcc(Assembler::notEqual, l_3);
1458 __ bne(T1, R0, l_3);
1459 __ delayed()->nop();
1460 if (is_oop) {
1461 __ bind(l_stchk);
1462 // __ movl(edi, Address(esp, 8+ 8));
1463 // __ movl(ecx, Address(esp, 8+ 12));
// Reload dst/count from the argument registers and emit the GC store
// check for the destination range.
1464 __ move(T0, A1);
1465 __ move(T1, A2);
1466 array_store_check();
1467 }
1468 __ bind(l_4);
1469 // __ popl(edi);
1470 // __ popl(esi);
1471 // __ ret(0);
1472 __ pop(T8);
1473 __ pop(T1);
1474 __ pop(T0);
1475 __ pop(T3);
1476 __ jr(RA);
1477 __ delayed()->nop();
1478 return start;
1479 }
1481 // Arguments:
1482 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1483 // ignored
1484 // is_oop - true => oop array, so generate store check code
1485 // name - stub name string
1486 //
1487 // Inputs:
1488 // c_rarg0 - source array address
1489 // c_rarg1 - destination array address
1490 // c_rarg2 - element count, treated as ssize_t, can be zero
1491 //
1492 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1493 // the hardware handle it. The two dwords within qwords that span
1494 // cache line boundaries will still be loaded and stored atomically.
1495 //
// Generate the conjoint (overlap-safe) jlong (or oop, when is_oop)
// array copy stub.
//
// Inputs: A0 = src, A1 = dst, A2 = element count, may be zero.
// Non-overlapping calls are forwarded to the matching disjoint stub;
// otherwise elements are copied backwards (high to low), 8 bytes each,
// then for oop arrays the store check runs over the destination.
//
// NOTE(review): l_3 is never branched to (the x86 "rep_movl" fast path
// stayed commented out); when is_oop the store-check tail is reached
// through the `b(l_stchk)` branch instead.
1496 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
1497 Label l_2, l_3, l_4, l_stchk;
1498 StubCodeMark mark(this, "StubRoutines", name);
1499 __ align(CodeEntryAlignment);
1500 address start = __ pc();
1501 address nooverlap_target;
1503 if (is_oop) {
1504 nooverlap_target = aligned ?
1505 StubRoutines::arrayof_oop_disjoint_arraycopy() :
1506 StubRoutines::oop_disjoint_arraycopy();
1507 }else {
1508 nooverlap_target = aligned ?
1509 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
1510 StubRoutines::jlong_disjoint_arraycopy();
1511 }
1513 array_overlap_test(nooverlap_target, 3);
// Save clobbered temporaries; T1 = count, T3 = from, T0 = to.
1515 __ push(T3);
1516 __ push(T0);
1517 __ push(T1);
1518 __ push(T8);
1520 __ move(T1, A2);
1521 __ move(T3, A0);
1522 __ move(T0, A1);
1524 //__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
// T3 = from + count*8 - 8: last source element, for the backward copy.
1525 __ sll(AT, T1, Address::times_8);
1526 __ add(AT, T3, AT);
1527 __ lea(T3 , Address(AT, -8));
1528 //__ std();
1529 //__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
// T0 = to + count*8 - 8, mirroring T3.
1530 __ sll(AT, T1, Address::times_8);
1531 __ add(AT, T0, AT);
1532 __ lea(T0 , Address(AT, -8));
1534 // __ cmpl(ecx, 32);
1535 // __ jcc(Assembler::above, l_3); // > 32 dwords
1536 // __ testl(ecx, ecx);
1537 //__ jcc(Assembler::zero, l_4);
// Zero-length copy: skip straight to the epilogue.
1538 __ beq(T1, R0, l_4);
1539 __ delayed()->nop();
1540 // __ subl(edi, esi);
1541 __ align(16);
// Backward copy loop: one 8-byte element per iteration.
1542 __ bind(l_2);
1543 // __ movl(edx, Address(esi));
1544 __ ld(AT, T3, 0);
1545 // __ movl(Address(esi, edi, Address::times_1), edx);
1546 __ sd(AT, T0, 0);
1547 // __ subl(esi, 4);
1548 __ addi(T3, T3, -8);
1549 __ addi(T0, T0, -8);
1550 // __ decl(ecx);
1551 __ addi(T1, T1, -1);
1552 //__ jcc(Assembler::notEqual, l_2);
1553 __ bne(T1, R0, l_2);
1554 __ delayed()->nop();
1555 if (is_oop) {
1556 // __ jmp(l_stchk);
1557 __ b( l_stchk);
1558 __ delayed()->nop();
1559 }
1560 __ bind(l_4);
1561 // __ cld();
1562 // __ popl(edi);
1563 // __ popl(esi);
1564 // __ ret(0);
1565 __ pop(T8);
1566 __ pop(T1);
1567 __ pop(T0);
1568 __ pop(T3);
1569 __ jr(RA);
1570 __ delayed()->nop();
1571 __ bind(l_3);
1572 // __ rep_movl();
1573 if (is_oop) {
1574 __ bind(l_stchk);
1575 // __ movl(edi, Address(esp, 8+ 8));
// Reload dst/count from the (unclobbered) argument registers and emit
// the GC store check for the destination range.
1576 __ move(T0, A1);
1577 // __ movl(ecx, Address(esp, 8+ 12));
1578 __ move(T1, A2);
1579 array_store_check();
1580 }
1581 // __ cld();
1582 // __ popl(edi);
1583 // __ popl(esi);
1584 // __ ret(0);
1585 __ pop(T8);
1586 __ pop(T1);
1587 __ pop(T0);
1588 __ pop(T3);
1589 __ jr(RA);
1590 __ delayed()->nop();
1591 return start;
1592 }
1593 #if 0
1594 // Arguments:
1595 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1596 // ignored
1597 // is_oop - true => oop array, so generate store check code
1598 // name - stub name string
1599 //
1600 // Inputs:
1601 // c_rarg0 - source array address
1602 // c_rarg1 - destination array address
1603 // c_rarg2 - element count, treated as ssize_t, can be zero
1604 //
// NOTE: dead code -- this x86_64 reference implementation sits inside
// the `#if 0` above and is kept only as porting guidance for the MIPS
// stubs.  It copies jlong/oop elements backwards in 32-byte chunks with
// an 8-byte tail loop, emitting pre/post GC barriers for oop arrays.
1605 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
1606 __ align(CodeEntryAlignment);
1607 StubCodeMark mark(this, "StubRoutines", name);
1608 address start = __ pc();
1610 Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
1611 const Register from = rdi; // source array address
1612 const Register to = rsi; // destination array address
1613 const Register qword_count = rdx; // elements count
1614 const Register saved_count = rcx;
1616 __ enter(); // required for proper stackwalking of RuntimeStub frame
1617 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1619 address disjoint_copy_entry = NULL;
1620 if (is_oop) {
1621 assert(!UseCompressedOops, "shouldn't be called for compressed oops");
1622 disjoint_copy_entry = disjoint_oop_copy_entry;
1623 oop_copy_entry = __ pc();
1624 array_overlap_test(disjoint_oop_copy_entry, Address::times_8);
1625 } else {
1626 disjoint_copy_entry = disjoint_long_copy_entry;
1627 long_copy_entry = __ pc();
1628 array_overlap_test(disjoint_long_copy_entry, Address::times_8);
1629 }
1630 BLOCK_COMMENT("Entry:");
1631 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1633 array_overlap_test(disjoint_copy_entry, Address::times_8);
1634 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1635 // r9 and r10 may be used to save non-volatile registers
1637 // 'from', 'to' and 'qword_count' are now valid
1639 if (is_oop) {
1640 // Save to and count for store barrier
1641 __ movptr(saved_count, qword_count);
1642 // No registers are destroyed by this call
1643 gen_write_ref_array_pre_barrier(to, saved_count);
1644 }
1646 __ jmp(L_copy_32_bytes);
1648 // Copy trailing qwords
1649 __ BIND(L_copy_8_bytes);
1650 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1651 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1652 __ decrement(qword_count);
1653 __ jcc(Assembler::notZero, L_copy_8_bytes);
1655 if (is_oop) {
1656 __ jmp(L_exit);
1657 } else {
1658 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
1659 restore_arg_regs();
1660 __ xorptr(rax, rax); // return 0
1661 __ leave(); // required for proper stackwalking of RuntimeStub frame
1662 __ ret(0);
1663 }
1665 // Copy in 32-bytes chunks
1666 copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1668 if (is_oop) {
1669 __ BIND(L_exit);
1670 __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
1671 gen_write_ref_array_post_barrier(to, rcx, rax);
1672 inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
1673 } else {
1674 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
1675 }
1676 restore_arg_regs();
1677 __ xorptr(rax, rax); // return 0
1678 __ leave(); // required for proper stackwalking of RuntimeStub frame
1679 __ ret(0);
1681 return start;
1682 }
1685 // Helper for generating a dynamic type check.
1686 // Smashes no registers.
// NOTE: dead code inside `#if 0` -- x86_64 reference version of the
// dynamic subtype check, kept as porting guidance.  Jumps to L_success
// when sub_klass is a subtype of super_klass; otherwise falls through
// to the internal L_miss label at the end.
1687 void generate_type_check(Register sub_klass,
1688 Register super_check_offset,
1689 Register super_klass,
1690 Label& L_success) {
1691 assert_different_registers(sub_klass, super_check_offset, super_klass);
1693 BLOCK_COMMENT("type_check:");
1695 Label L_miss;
1697 // a couple of useful fields in sub_klass:
1698 int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
1699 Klass::secondary_supers_offset_in_bytes());
1700 int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
1701 Klass::secondary_super_cache_offset_in_bytes());
1702 Address secondary_supers_addr(sub_klass, ss_offset);
1703 Address super_cache_addr( sub_klass, sc_offset);
1705 // if the pointers are equal, we are done (e.g., String[] elements)
1706 __ cmpptr(super_klass, sub_klass);
1707 __ jcc(Assembler::equal, L_success);
1709 // check the supertype display:
1710 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
1711 __ cmpptr(super_klass, super_check_addr); // test the super type
1712 __ jcc(Assembler::equal, L_success);
1714 // if it was a primary super, we can just fail immediately
1715 __ cmpl(super_check_offset, sc_offset);
1716 __ jcc(Assembler::notEqual, L_miss);
1718 // Now do a linear scan of the secondary super-klass chain.
1719 // The repne_scan instruction uses fixed registers, which we must spill.
1720 // (We need a couple more temps in any case.)
1721 // This code is rarely used, so simplicity is a virtue here.
1722 inc_counter_np(SharedRuntime::_partial_subtype_ctr);
1723 {
1724 __ push(rax);
1725 __ push(rcx);
1726 __ push(rdi);
1727 assert_different_registers(sub_klass, super_klass, rax, rcx, rdi);
1729 __ movptr(rdi, secondary_supers_addr);
1730 // Load the array length.
1731 __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
1732 // Skip to start of data.
1733 __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1734 // Scan rcx words at [rdi] for occurrence of rax
1735 // Set NZ/Z based on last compare
1736 __ movptr(rax, super_klass);
1737 if (UseCompressedOops) {
1738 // Compare against compressed form. Don't need to uncompress because
1739 // looks like orig rax is restored in popq below.
1740 __ encode_heap_oop(rax);
1741 __ repne_scanl();
1742 } else {
1743 __ repne_scan();
1744 }
1746 // Unspill the temp. registers:
1747 __ pop(rdi);
1748 __ pop(rcx);
1749 __ pop(rax);
1751 __ jcc(Assembler::notEqual, L_miss);
1752 }
1754 // Success. Cache the super we found and proceed in triumph.
1755 __ movptr(super_cache_addr, super_klass); // note: rax is dead
1756 __ jmp(L_success);
1758 // Fall through on failure!
1759 __ BIND(L_miss);
1760 }
1762 //
1763 // Generate checkcasting array copy stub
1764 //
1765 // Input:
1766 // c_rarg0 - source array address
1767 // c_rarg1 - destination array address
1768 // c_rarg2 - element count, treated as ssize_t, can be zero
1769 // c_rarg3 - size_t ckoff (super_check_offset)
1770 // not Win64
1771 // c_rarg4 - oop ckval (super_klass)
1772 // Win64
1773 // rsp+40 - oop ckval (super_klass)
1774 //
1775 // Output:
1776 // rax == 0 - success
1777 // rax == -1^K - failure, where K is partial transfer count
1778 //
// NOTE: dead code inside `#if 0` -- x86_64 reference version of the
// checkcasting (element-by-element type-checked) oop array copy, kept
// as porting guidance.  Returns 0 in rax on full success, or -1^K where
// K is the number of elements copied before a type-check failure.
1779 address generate_checkcast_copy(const char *name) {
1781 Label L_load_element, L_store_element, L_do_card_marks, L_done;
1783 // Input registers (after setup_arg_regs)
1784 const Register from = rdi; // source array address
1785 const Register to = rsi; // destination array address
1786 const Register length = rdx; // elements count
1787 const Register ckoff = rcx; // super_check_offset
1788 const Register ckval = r8; // super_klass
1790 // Registers used as temps (r13, r14 are save-on-entry)
1791 const Register end_from = from; // source array end address
1792 const Register end_to = r13; // destination array end address
1793 const Register count = rdx; // -(count_remaining)
1794 const Register r14_length = r14; // saved copy of length
1795 // End pointers are inclusive, and if length is not zero they point
1796 // to the last unit copied: end_to[0] := end_from[0]
1798 const Register rax_oop = rax; // actual oop copied
1799 const Register r11_klass = r11; // oop._klass
1801 //---------------------------------------------------------------
1802 // Assembler stub will be used for this call to arraycopy
1803 // if the two arrays are subtypes of Object[] but the
1804 // destination array type is not equal to or a supertype
1805 // of the source type. Each element must be separately
1806 // checked.
1808 __ align(CodeEntryAlignment);
1809 StubCodeMark mark(this, "StubRoutines", name);
1810 address start = __ pc();
1812 __ enter(); // required for proper stackwalking of RuntimeStub frame
1814 checkcast_copy_entry = __ pc();
1815 BLOCK_COMMENT("Entry:");
1817 #ifdef ASSERT
1818 // caller guarantees that the arrays really are different
1819 // otherwise, we would have to make conjoint checks
1820 { Label L;
1821 array_overlap_test(L, TIMES_OOP);
1822 __ stop("checkcast_copy within a single array");
1823 __ bind(L);
1824 }
1825 #endif //ASSERT
1827 // allocate spill slots for r13, r14
1828 enum {
1829 saved_r13_offset,
1830 saved_r14_offset,
1831 saved_rbp_offset,
1832 saved_rip_offset,
1833 saved_rarg0_offset
1834 };
1835 __ subptr(rsp, saved_rbp_offset * wordSize);
1836 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
1837 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
1838 setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
1839 // ckoff => rcx, ckval => r8
1840 // r9 and r10 may be used to save non-volatile registers
1841 #ifdef _WIN64
1842 // last argument (#4) is on stack on Win64
1843 const int ckval_offset = saved_rarg0_offset + 4;
1844 __ movptr(ckval, Address(rsp, ckval_offset * wordSize));
1845 #endif
1847 // check that int operands are properly extended to size_t
1848 assert_clean_int(length, rax);
1849 assert_clean_int(ckoff, rax);
1851 #ifdef ASSERT
1852 BLOCK_COMMENT("assert consistent ckoff/ckval");
1853 // The ckoff and ckval must be mutually consistent,
1854 // even though caller generates both.
1855 { Label L;
1856 int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
1857 Klass::super_check_offset_offset_in_bytes());
1858 __ cmpl(ckoff, Address(ckval, sco_offset));
1859 __ jcc(Assembler::equal, L);
1860 __ stop("super_check_offset inconsistent");
1861 __ bind(L);
1862 }
1863 #endif //ASSERT
1865 // Loop-invariant addresses. They are exclusive end pointers.
1866 Address end_from_addr(from, length, TIMES_OOP, 0);
1867 Address end_to_addr(to, length, TIMES_OOP, 0);
1868 // Loop-variant addresses. They assume post-incremented count < 0.
1869 Address from_element_addr(end_from, count, TIMES_OOP, 0);
1870 Address to_element_addr(end_to, count, TIMES_OOP, 0);
1872 gen_write_ref_array_pre_barrier(to, count);
1874 // Copy from low to high addresses, indexed from the end of each array.
1875 __ lea(end_from, end_from_addr);
1876 __ lea(end_to, end_to_addr);
1877 __ movptr(r14_length, length); // save a copy of the length
1878 assert(length == count, ""); // else fix next line:
1879 __ negptr(count); // negate and test the length
1880 __ jcc(Assembler::notZero, L_load_element);
1882 // Empty array: Nothing to do.
1883 __ xorptr(rax, rax); // return 0 on (trivial) success
1884 __ jmp(L_done);
1886 // ======== begin loop ========
1887 // (Loop is rotated; its entry is L_load_element.)
1888 // Loop control:
1889 // for (count = -count; count != 0; count++)
1890 // Base pointers src, dst are biased by 8*(count-1),to last element.
1891 __ align(16);
1893 __ BIND(L_store_element);
1894 __ store_heap_oop(rax_oop, to_element_addr); // store the oop
// Sync before updating the card marks (must be MT-safe on 3A2000).
1895 __ sync();
1896 __ increment(count); // increment the count toward zero
1897 __ jcc(Assembler::zero, L_do_card_marks);
1899 // ======== loop entry is here ========
1900 __ BIND(L_load_element);
1901 __ load_heap_oop(rax_oop, from_element_addr); // load the oop
1902 __ testptr(rax_oop, rax_oop);
1903 __ jcc(Assembler::zero, L_store_element);
1905 __ load_klass(r11_klass, rax_oop);// query the object klass
1906 generate_type_check(r11_klass, ckoff, ckval, L_store_element);
1907 // ======== end loop ========
1909 // It was a real error; we must depend on the caller to finish the job.
1910 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
1911 // Emit GC store barriers for the oops we have copied (r14 + rdx),
1912 // and report their number to the caller.
1913 assert_different_registers(rax, r14_length, count, to, end_to, rcx);
1914 __ lea(end_to, to_element_addr);
1915 gen_write_ref_array_post_barrier(to, end_to, rscratch1);
1916 __ movptr(rax, r14_length); // original oops
1917 __ addptr(rax, count); // K = (original - remaining) oops
1918 __ notptr(rax); // report (-1^K) to caller
1919 __ jmp(L_done);
1921 // Come here on success only.
1922 __ BIND(L_do_card_marks);
1923 __ addptr(end_to, -wordSize); // make an inclusive end pointer
1924 gen_write_ref_array_post_barrier(to, end_to, rscratch1);
1925 __ xorptr(rax, rax); // return 0 on success
1927 // Common exit point (success or failure).
1928 __ BIND(L_done);
1929 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
1930 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
1931 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1932 restore_arg_regs();
1933 __ leave(); // required for proper stackwalking of RuntimeStub frame
1934 __ ret(0);
1936 return start;
1937 }
1939 //
1940 // Generate 'unsafe' array copy stub
1941 // Though just as safe as the other stubs, it takes an unscaled
1942 // size_t argument instead of an element count.
1943 //
1944 // Input:
1945 // c_rarg0 - source array address
1946 // c_rarg1 - destination array address
1947 // c_rarg2 - byte count, treated as ssize_t, can be zero
1948 //
1949 // Examines the alignment of the operands and dispatches
1950 // to a long, int, short, or byte copy loop.
1951 //
// NOTE: dead code inside `#if 0` -- x86_64 reference version of the
// "unsafe" array copy dispatcher, kept as porting guidance.  Takes a
// raw byte count, inspects the combined low bits of from/to/size, and
// tail-calls the widest copy stub (long/int/short/byte) the common
// alignment permits.
1952 address generate_unsafe_copy(const char *name) {
1954 Label L_long_aligned, L_int_aligned, L_short_aligned;
1956 // Input registers (before setup_arg_regs)
1957 const Register from = c_rarg0; // source array address
1958 const Register to = c_rarg1; // destination array address
1959 const Register size = c_rarg2; // byte count (size_t)
1961 // Register used as a temp
1962 const Register bits = rax; // test copy of low bits
1964 __ align(CodeEntryAlignment);
1965 StubCodeMark mark(this, "StubRoutines", name);
1966 address start = __ pc();
1968 __ enter(); // required for proper stackwalking of RuntimeStub frame
1970 // bump this on entry, not on exit:
1971 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1973 __ mov(bits, from);
1974 __ orptr(bits, to);
1975 __ orptr(bits, size);
1977 __ testb(bits, BytesPerLong-1);
1978 __ jccb(Assembler::zero, L_long_aligned);
1980 __ testb(bits, BytesPerInt-1);
1981 __ jccb(Assembler::zero, L_int_aligned);
1983 __ testb(bits, BytesPerShort-1);
1984 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
1986 __ BIND(L_short_aligned);
1987 __ shrptr(size, LogBytesPerShort); // size => short_count
1988 __ jump(RuntimeAddress(short_copy_entry));
1990 __ BIND(L_int_aligned);
1991 __ shrptr(size, LogBytesPerInt); // size => int_count
1992 __ jump(RuntimeAddress(int_copy_entry));
1994 __ BIND(L_long_aligned);
1995 __ shrptr(size, LogBytesPerLong); // size => qword_count
1996 __ jump(RuntimeAddress(long_copy_entry));
1998 return start;
1999 }
2001 // Perform range checks on the proposed arraycopy.
2002 // Kills temp, but nothing else.
2003 // Also, clean the sign bits of src_pos and dst_pos.
// NOTE: dead code inside `#if 0` -- x86_64 reference version, kept as
// porting guidance.  Branches to L_failed when either copy window runs
// past its array's length; on the fall-through path src_pos/dst_pos
// have had their high 32 bits cleaned via sign extension.
2004 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2005 Register src_pos, // source position (c_rarg1)
2006 Register dst, // destination array oo (c_rarg2)
2007 Register dst_pos, // destination position (c_rarg3)
2008 Register length,
2009 Register temp,
2010 Label& L_failed) {
2011 BLOCK_COMMENT("arraycopy_range_checks:");
2013 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2014 __ movl(temp, length);
2015 __ addl(temp, src_pos); // src_pos + length
2016 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2017 __ jcc(Assembler::above, L_failed);
2019 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2020 __ movl(temp, length);
2021 __ addl(temp, dst_pos); // dst_pos + length
2022 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2023 __ jcc(Assembler::above, L_failed);
2025 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2026 // Move with sign extension can be used since they are positive.
2027 __ movslq(src_pos, src_pos);
2028 __ movslq(dst_pos, dst_pos);
2030 BLOCK_COMMENT("arraycopy_range_checks done");
2031 }
//
//  Generate generic array copy stubs
//
//  Input:
//    c_rarg0    -  src oop
//    c_rarg1    -  src_pos (32-bits)
//    c_rarg2    -  dst oop
//    c_rarg3    -  dst_pos (32-bits)
//  not Win64
//    c_rarg4    -  element count (32-bits)
//  Win64
//    rsp+40     -  element count (32-bits)
//
//  Output:
//    rax ==  0  -  success
//    rax == -1^K - failure, where K is partial transfer count
//
//  Fully generic entry used when the caller knows nothing about the
//  array types: validates the arguments, dispatches to the matching
//  primitive-copy stub by element size, and falls back to a
//  checkcasting copy for oop arrays of differing types.
//  NOTE(review): this is x86_64 code retained inside the MIPS port
//  under a preprocessor guard — it is not emitted on MIPS builds.
address generate_generic_copy(const char *name) {

  Label L_failed, L_failed_0, L_objArray;
  Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

  // Input registers
  const Register src     = c_rarg0;  // source array oop
  const Register src_pos = c_rarg1;  // source position
  const Register dst     = c_rarg2;  // destination array oop
  const Register dst_pos = c_rarg3;  // destination position
  // elements count is on stack on Win64
#ifdef _WIN64
#define C_RARG4 Address(rsp, 6 * wordSize)
#else
#define C_RARG4 c_rarg4
#endif

  // Pad so that the L_failed_0 trampoline below ends exactly at a
  // CodeEntryAlignment boundary; the real entry then starts aligned.
  { int modulus = CodeEntryAlignment;
    int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
    int advance = target - (__ offset() % modulus);
    if (advance < 0)  advance += modulus;
    if (advance > 0)  __ nop(advance);
  }
  StubCodeMark mark(this, "StubRoutines", name);

  // Short-hop target to L_failed.  Makes for denser prologue code.
  __ BIND(L_failed_0);
  __ jmp(L_failed);
  assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

  __ align(CodeEntryAlignment);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

  //-----------------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the following conditions are met:
  //
  // (1) src and dst must not be null.
  // (2) src_pos must not be negative.
  // (3) dst_pos must not be negative.
  // (4) length  must not be negative.
  // (5) src klass and dst klass should be the same and not NULL.
  // (6) src and dst should be arrays.
  // (7) src_pos + length must not exceed length of src.
  // (8) dst_pos + length must not exceed length of dst.
  //

  // if (src == NULL) return -1;
  __ testptr(src, src);         // src oop
  size_t j1off = __ offset();
  __ jccb(Assembler::zero, L_failed_0);

  // if (src_pos < 0) return -1;
  __ testl(src_pos, src_pos); // src_pos (32-bits)
  __ jccb(Assembler::negative, L_failed_0);

  // if (dst == NULL) return -1;
  __ testptr(dst, dst);         // dst oop
  __ jccb(Assembler::zero, L_failed_0);

  // if (dst_pos < 0) return -1;
  __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  size_t j4off = __ offset();
  __ jccb(Assembler::negative, L_failed_0);

  // The first four tests are very dense code,
  // but not quite dense enough to put four
  // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predicters
  // do not like jumps so close together.
  // Make sure of this.
  guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");

  // registers used as temp
  const Register r11_length    = r11; // elements count to copy
  const Register r10_src_klass = r10; // array klass
  const Register r9_dst_klass  = r9;  // dest array klass

  // if (length < 0) return -1;
  __ movl(r11_length, C_RARG4);       // length (elements count, 32-bits value)
  __ testl(r11_length, r11_length);
  __ jccb(Assembler::negative, L_failed_0);

  __ load_klass(r10_src_klass, src);
#ifdef ASSERT
  //  assert(src->klass() != NULL);
  BLOCK_COMMENT("assert klasses not null");
  { Label L1, L2;
    __ testptr(r10_src_klass, r10_src_klass);
    __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
    __ bind(L1);
    __ stop("broken null klass");
    __ bind(L2);
    __ load_klass(r9_dst_klass, dst);
    __ cmpq(r9_dst_klass, 0);
    __ jcc(Assembler::equal, L1);     // this would be broken also
    BLOCK_COMMENT("assert done");
  }
#endif

  // Load layout helper (32-bits)
  //
  //  |array_tag|     | header_size | element_type |     |log2_element_size|
  // 32        30    24            16              8     2                 0
  //
  //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  //

  int lh_offset = klassOopDesc::header_size() * HeapWordSize +
                  Klass::layout_helper_offset_in_bytes();

  const Register rax_lh = rax;  // layout helper

  __ movl(rax_lh, Address(r10_src_klass, lh_offset));

  // Handle objArrays completely differently...
  jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  __ cmpl(rax_lh, objArray_lh);
  __ jcc(Assembler::equal, L_objArray);

  //  if (src->klass() != dst->klass()) return -1;
  __ load_klass(r9_dst_klass, dst);
  __ cmpq(r10_src_klass, r9_dst_klass);
  __ jcc(Assembler::notEqual, L_failed);

  //  if (!src->is_Array()) return -1;
  __ cmpl(rax_lh, Klass::_lh_neutral_value);
  __ jcc(Assembler::greaterEqual, L_failed);

  // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
  { Label L;
    __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
    __ jcc(Assembler::greaterEqual, L);
    __ stop("must be a primitive array");
    __ bind(L);
  }
#endif

  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  // typeArrayKlass
  //
  // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  //

  const Register r10_offset = r10;    // array offset
  const Register rax_elsize = rax_lh; // element size

  // Decode the header size from the layout helper to find the start of
  // the element data, then rebase src/dst onto it.
  __ movl(r10_offset, rax_lh);
  __ shrl(r10_offset, Klass::_lh_header_size_shift);
  __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
  __ addptr(src, r10_offset);           // src array offset
  __ addptr(dst, r10_offset);           // dst array offset
  BLOCK_COMMENT("choose copy loop based on element size");
  __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize

  // next registers should be set before the jump to corresponding stub
  const Register from     = c_rarg0;  // source array address
  const Register to       = c_rarg1;  // destination array address
  const Register count    = c_rarg2;  // elements count

  // 'from', 'to', 'count' registers should be set in such order
  // since they are the same as 'src', 'src_pos', 'dst'.

  __ BIND(L_copy_bytes);
  __ cmpl(rax_elsize, 0);
  __ jccb(Assembler::notEqual, L_copy_shorts);
  __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
  __ cmpl(rax_elsize, LogBytesPerShort);
  __ jccb(Assembler::notEqual, L_copy_ints);
  __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
  __ cmpl(rax_elsize, LogBytesPerInt);
  __ jccb(Assembler::notEqual, L_copy_longs);
  __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
  { Label L;
    __ cmpl(rax_elsize, LogBytesPerLong);
    __ jcc(Assembler::equal, L);
    __ stop("must be long copy, but elsize is wrong");
    __ bind(L);
  }
#endif
  __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(long_copy_entry));

  // objArrayKlass
  __ BIND(L_objArray);
  // live at this point:  r10_src_klass, src[_pos], dst[_pos]

  Label L_plain_copy, L_checkcast_copy;
  //  test array classes for subtyping
  __ load_klass(r9_dst_klass, dst);
  __ cmpq(r10_src_klass, r9_dst_klass); // usual case is exact equality
  __ jcc(Assembler::notEqual, L_checkcast_copy);

  // Identically typed arrays can be copied without element-wise checks.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  __ lea(from, Address(src, src_pos, TIMES_OOP,
               arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
               arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  __ movl2ptr(count, r11_length); // length
  __ BIND(L_plain_copy);
  __ jump(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
  // live at this point:  r10_src_klass, !r11_length
  {
    // assert(r11_length == C_RARG4); // will reload from here
    Register r11_dst_klass = r11;
    __ load_klass(r11_dst_klass, dst);

    // Before looking at dst.length, make sure dst is also an objArray.
    __ cmpl(Address(r11_dst_klass, lh_offset), objArray_lh);
    __ jcc(Assembler::notEqual, L_failed);

    // It is safe to examine both src.length and dst.length.
#ifndef _WIN64
    arraycopy_range_checks(src, src_pos, dst, dst_pos, C_RARG4,
                           rax, L_failed);
#else
    __ movl(r11_length, C_RARG4);     // reload
    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           rax, L_failed);
    __ load_klass(r11_dst_klass, dst); // reload
#endif

    // Marshal the base address arguments now, freeing registers.
    __ lea(from, Address(src, src_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ movl(count, C_RARG4);          // length (reloaded)
    Register sco_temp = c_rarg3;      // this register is free now
    assert_different_registers(from, to, count, sco_temp,
                               r11_dst_klass, r10_src_klass);
    assert_clean_int(count, sco_temp);

    // Generate the type check.
    int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
                      Klass::super_check_offset_offset_in_bytes());
    __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);
    generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);

    // Fetch destination element klass from the objArrayKlass header.
    int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
                     objArrayKlass::element_klass_offset_in_bytes());
    __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
    __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);

    // the checkcast_copy loop needs two extra arguments:
    assert(c_rarg3 == sco_temp, "#3 already in place");
    __ movptr(C_RARG4, r11_dst_klass);  // dst.klass.element_klass
    __ jump(RuntimeAddress(checkcast_copy_entry));
  }

  __ BIND(L_failed);
  __ xorptr(rax, rax);
  __ notptr(rax); // return -1
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
2344 #undef length_arg
2345 #endif
2347 //FIXME
2348 address generate_disjoint_long_copy(bool aligned, const char *name) {
2349 Label l_1, l_2;
2350 StubCodeMark mark(this, "StubRoutines", name);
2351 __ align(CodeEntryAlignment);
2352 address start = __ pc();
2354 // __ movl(ecx, Address(esp, 4+8)); // count
2355 // __ movl(eax, Address(esp, 4+0)); // from
2356 // __ movl(edx, Address(esp, 4+4)); // to
2357 __ move(T1, A2);
2358 __ move(T3, A0);
2359 __ move(T0, A1);
2360 __ push(T3);
2361 __ push(T0);
2362 __ push(T1);
2363 //__ subl(edx, eax);
2364 //__ jmp(l_2);
2365 __ b(l_2);
2366 __ delayed()->nop();
2367 __ align(16);
2368 __ bind(l_1);
2369 // if (VM_Version::supports_mmx()) {
2370 // __ movq(mmx0, Address(eax));
2371 // __ movq(Address(eax, edx, Address::times_1), mmx0);
2372 // } else {
2373 // __ fild_d(Address(eax));
2374 __ ld(AT, T3, 0);
2375 // __ fistp_d(Address(eax, edx, Address::times_1));
2376 __ sd (AT, T0, 0);
2377 // }
2378 // __ addl(eax, 8);
2379 __ addi(T3, T3, 8);
2380 __ addi(T0, T0, 8);
2381 __ bind(l_2);
2382 // __ decl(ecx);
2383 __ addi(T1, T1, -1);
2384 // __ jcc(Assembler::greaterEqual, l_1);
2385 __ bgez(T1, l_1);
2386 __ delayed()->nop();
2387 // if (VM_Version::supports_mmx()) {
2388 // __ emms();
2389 // }
2390 // __ ret(0);
2391 __ pop(T1);
2392 __ pop(T0);
2393 __ pop(T3);
2394 __ jr(RA);
2395 __ delayed()->nop();
2396 return start;
2397 }
2400 address generate_conjoint_long_copy(bool aligned, const char *name) {
2401 Label l_1, l_2;
2402 StubCodeMark mark(this, "StubRoutines", name);
2403 __ align(CodeEntryAlignment);
2404 address start = __ pc();
2405 address nooverlap_target = aligned ?
2406 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
2407 StubRoutines::jlong_disjoint_arraycopy();
2408 array_overlap_test(nooverlap_target, 3);
2410 __ push(T3);
2411 __ push(T0);
2412 __ push(T1);
2414 /* __ movl(ecx, Address(esp, 4+8)); // count
2415 __ movl(eax, Address(esp, 4+0)); // from
2416 __ movl(edx, Address(esp, 4+4)); // to
2417 __ jmp(l_2);
2419 */
2420 __ move(T1, A2);
2421 __ move(T3, A0);
2422 __ move(T0, A1);
2423 __ sll(AT, T1, Address::times_8);
2424 __ add(AT, T3, AT);
2425 __ lea(T3 , Address(AT, -8));
2426 __ sll(AT, T1, Address::times_8);
2427 __ add(AT, T0, AT);
2428 __ lea(T0 , Address(AT, -8));
2432 __ b(l_2);
2433 __ delayed()->nop();
2434 __ align(16);
2435 __ bind(l_1);
2436 /* if (VM_Version::supports_mmx()) {
2437 __ movq(mmx0, Address(eax, ecx, Address::times_8));
2438 __ movq(Address(edx, ecx,Address::times_8), mmx0);
2439 } else {
2440 __ fild_d(Address(eax, ecx, Address::times_8));
2441 __ fistp_d(Address(edx, ecx,Address::times_8));
2442 }
2443 */
2444 __ ld(AT, T3, 0);
2445 __ sd (AT, T0, 0);
2446 __ addi(T3, T3, -8);
2447 __ addi(T0, T0,-8);
2448 __ bind(l_2);
2449 // __ decl(ecx);
2450 __ addi(T1, T1, -1);
2451 //__ jcc(Assembler::greaterEqual, l_1);
2452 __ bgez(T1, l_1);
2453 __ delayed()->nop();
2454 // if (VM_Version::supports_mmx()) {
2455 // __ emms();
2456 // }
2457 // __ ret(0);
2458 __ pop(T1);
2459 __ pop(T0);
2460 __ pop(T3);
2461 __ jr(RA);
2462 __ delayed()->nop();
2463 return start;
2464 }
// Generate the primitive and oop arraycopy stubs and publish their
// entry points in StubRoutines.  Ordering matters: each conjoint stub
// branches to the matching disjoint entry (via array_overlap_test)
// when its operands do not overlap, so the disjoint stubs must be
// generated first.
void generate_arraycopy_stubs() {
  // Oop elements are 32-bit when compressed, 64-bit otherwise, so the
  // oop stubs reuse the int/long copy generators (second flag selects
  // oop handling — consistent with the stub names passed here).
  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, "oop_arraycopy");
  } else {
    StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, "oop_arraycopy");
  }

  // Disjoint (non-overlapping) primitive copies.
  StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
  StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");

  // if (VM_Version::supports_mmx())
  //if (false)
  // StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_mmx_copy_aligned("arrayof_jshort_disjoint_arraycopy");
  // else
  StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
  StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(true, false, "arrayof_jint_disjoint_arraycopy");
  //StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(true, true, "arrayof_oop_disjoint_arraycopy");
  StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");

  // Conjoint (possibly overlapping) primitive copies — generated after
  // the disjoint entries they fall back to.
  StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
  StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
  StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
  StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, "jlong_arraycopy");

  StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
  StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
  StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_oop_copy(true, false, "arrayof_jint_arraycopy");
  //StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_int_oop_copy(true, true, "arrayof_oop_arraycopy");
  StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");

  // The "arrayof" oop entries simply alias the plain oop stubs.
  StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
}
2505 //Wang: add a function to implement SafeFetch32 and SafeFetchN
2506 void generate_safefetch(const char* name, int size, address* entry,
2507 address* fault_pc, address* continuation_pc) {
2508 // safefetch signatures:
2509 // int SafeFetch32(int* adr, int errValue);
2510 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2511 //
2512 // arguments:
2513 // A0 = adr
2514 // A1 = errValue
2515 //
2516 // result:
2517 // PPC_RET = *adr or errValue
2519 StubCodeMark mark(this, "StubRoutines", name);
2521 // Entry point, pc or function descriptor.
2522 *entry = __ pc();
2524 // Load *adr into A1, may fault.
2525 *fault_pc = __ pc();
2526 switch (size) {
2527 case 4:
2528 // int32_t
2529 __ lw(A1, A0, 0);
2530 break;
2531 case 8:
2532 // int64_t
2533 __ ld(A1, A0, 0);
2534 break;
2535 default:
2536 ShouldNotReachHere();
2537 }
2539 // return errValue or *adr
2540 *continuation_pc = __ pc();
2541 __ addu(V0,A1,R0);
2542 __ jr(RA);
2543 __ delayed()->nop();
2544 }
2547 #undef __
2548 #define __ masm->
// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
// frame. Since we need to preserve callee-saved values (currently
// only for C2, but done for C1 as well) we need a callee-saved oop
// map and therefore have to make these stubs into RuntimeStubs
// rather than BufferBlobs.  If the compiler needs all registers to
// be preserved between the fault point and the exception handler
// then it must assume responsibility for that in
// AbstractCompiler::continuation_for_implicit_null_exception or
// continuation_for_implicit_division_by_zero_exception. All other
// implicit exceptions (e.g., NullPointerException or
// AbstractMethodError on entry) are either at call sites or
// otherwise assume that stack unwinding will be initiated, so
// caller saved registers were assumed volatile in the compiler.
//
// Builds a RuntimeStub that saves the S-registers, calls
// 'runtime_entry' with the current thread as its only argument, and
// then jumps to the forward_exception entry to dispatch the exception
// the runtime call installed.  When 'restore_saved_exception_pc' is
// true, RA is first reloaded from JavaThread::saved_exception_pc so
// the stub appears to have been called from the faulting instruction.
// Returns the stub's entry point.
address generate_throw_exception(const char* name,
                                 address runtime_entry,
                                 bool restore_saved_exception_pc) {
  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  //#define aoqi_test
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  // Stack slot indices (in words, growing up from last_java_sp).
  enum layout {
    thread_off,    // last_java_sp
    S7_off,        // callee saved register sp + 1
    S6_off,        // callee saved register sp + 2
    S5_off,        // callee saved register sp + 3
    S4_off,        // callee saved register sp + 4
    S3_off,        // callee saved register sp + 5
    S2_off,        // callee saved register sp + 6
    S1_off,        // callee saved register sp + 7
    S0_off,        // callee saved register sp + 8
    FP_off,
    ret_address,
    framesize
  };

  int insts_size = 2048;
  int locs_size  = 32;

  //  CodeBuffer* code = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false,
  //  NULL, NULL, NULL, false, NULL, name, false);
  CodeBuffer code (name , insts_size, locs_size);
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  OopMapSet* oop_maps  = new OopMapSet();
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  MacroAssembler* masm = new MacroAssembler(&code);
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif

  address start = __ pc();
  //__ stop("generate_throw_exception");
  /*
   __ move(AT, (int)&jerome1 );
   __ sw(SP, AT, 0);
   __ move(AT, (int)&jerome2 );
   __ sw(FP, AT, 0);
   __ move(AT, (int)&jerome3 );
   __ sw(RA, AT, 0);
   __ move(AT, (int)&jerome4 );
   __ sw(R0, AT, 0);
   __ move(AT, (int)&jerome5 );
   __ sw(R0, AT, 0);
   __ move(AT, (int)&jerome6 );
   __ sw(R0, AT, 0);
   __ move(AT, (int)&jerome7 );
   __ sw(R0, AT, 0);
   __ move(AT, (int)&jerome10 );
   __ sw(R0, AT, 0);

   __ pushad();

   //__ enter();
   __ call(CAST_FROM_FN_PTR(address, SharedRuntime::print_call_statistics),
           relocInfo::runtime_call_type);
   __ delayed()->nop();

   //__ leave();
   __ popad();
  */

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM
#ifndef OPT_THREAD
  Register java_thread = TREG;
  __ get_thread(java_thread);
#else
  Register java_thread = TREG;
#endif
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  if (restore_saved_exception_pc) {
    // Make it look as if we were called from the faulting PC.
    __ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset()));
  }

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // Allocate the frame and save the callee-saved S-registers.
  __ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
  __ sd(S0, SP, S0_off * wordSize);
  __ sd(S1, SP, S1_off * wordSize);
  __ sd(S2, SP, S2_off * wordSize);
  __ sd(S3, SP, S3_off * wordSize);
  __ sd(S4, SP, S4_off * wordSize);
  __ sd(S5, SP, S5_off * wordSize);
  __ sd(S6, SP, S6_off * wordSize);
  __ sd(S7, SP, S7_off * wordSize);

  int frame_complete = __ pc() - start;
  // push java thread (becomes first argument of C function)
  __ sd(java_thread, SP, thread_off * wordSize);
  if (java_thread!=A0)
    __ move(A0, java_thread);

  // Set up last_Java_sp and last_Java_fp
  __ set_last_Java_frame(java_thread, SP, FP, NULL);
  __ relocate(relocInfo::internal_pc_type);
  {
    // Record the PC the runtime call returns to as last_Java_pc.
    // NOTE(review): the offset assumes li48 plus the call sequence have
    // fixed encodings — confirm against NativeMovConstReg/NativeCall.
    intptr_t save_pc = (intptr_t)__ pc() + NativeMovConstReg::instruction_size + NativeCall::return_address_offset + 4;
    __ li48(AT, save_pc);
  }
  __ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));

  // Call runtime
  __ call(runtime_entry);
  __ delayed()->nop();
  // Generate oop map
  OopMap* map = new OopMap(framesize, 0);
  oop_maps->add_gc_map(__ offset(), map);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
#ifndef OPT_THREAD
  __ get_thread(java_thread);
#endif

  __ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  //  __ reset_last_Java_frame(java_thread, true);
  __ reset_last_Java_frame(java_thread, true, true);

  // Restore callee save registers.  This must be done after resetting the Java frame
  __ ld(S0, SP, S0_off * wordSize);
  __ ld(S1, SP, S1_off * wordSize);
  __ ld(S2, SP, S2_off * wordSize);
  __ ld(S3, SP, S3_off * wordSize);
  __ ld(S4, SP, S4_off * wordSize);
  __ ld(S5, SP, S5_off * wordSize);
  __ ld(S6, SP, S6_off * wordSize);
  __ ld(S7, SP, S7_off * wordSize);

  // discard arguments
  __ addi(SP, SP, (framesize-2) * wordSize); // epilog
  //  __ leave(); // required for proper stackwalking of RuntimeStub frame
  // Manual leave: unwind SP from FP and reload the saved FP.
  __ addi(SP, FP, wordSize);
  __ ld(FP, SP, -1*wordSize);
  // check for pending exceptions
#ifdef ASSERT
  Label L;
  __ lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  __ bne(AT, R0, L);
  __ delayed()->nop();
  __ should_not_reach_here();
  __ bind(L);
#endif //ASSERT
  // Dispatch the exception the runtime call installed.
  __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  __ delayed()->nop();
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code,frame_complete,
                                                    framesize, oop_maps, false);
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  return stub->entry_point();
}
// Initialization
// Generate the minimal stub set needed early in VM startup (invoked
// with all == false); the remaining stubs come from generate_all().
void generate_initial() {
  /*
  // Generates all stubs and initializes the entry points

  // This platform-specific stub is needed by generate_call_stub()
  StubRoutines::mips::_mxcsr_std = generate_fp_mask("mxcsr_std", 0x0000000000001F80);

  // entry points that exist in all platforms Note: This is code
  // that could be shared among different platforms - however the
  // benefit seems to be smaller than the disadvantage of having a
  // much more complicated generator structure. See also comment in
  // stubRoutines.hpp.

  StubRoutines::_forward_exception_entry = generate_forward_exception();

  StubRoutines::_call_stub_entry =
    generate_call_stub(StubRoutines::_call_stub_return_address);

  // is referenced by megamorphic call
  StubRoutines::_catch_exception_entry = generate_catch_exception();

  // atomic calls
  StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
  StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
  StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
  StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  StubRoutines::_atomic_add_entry          = generate_atomic_add();
  StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
  StubRoutines::_fence_entry               = generate_orderaccess_fence();

  StubRoutines::_handler_for_unsafe_access_entry =
    generate_handler_for_unsafe_access();

  // platform dependent
  StubRoutines::mips::_get_previous_fp_entry = generate_get_previous_fp();

  StubRoutines::mips::_verify_mxcsr_entry    = generate_verify_mxcsr();
  */
  // Generates all stubs and initializes the entry points

  //-------------------------------------------------------------
  //-----------------------------------------------------------
  // entry points that exist in all platforms
  // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller
  // than the disadvantage of having a much more complicated generator structure.
  // See also comment in stubRoutines.hpp.
  StubRoutines::_forward_exception_entry = generate_forward_exception();
  StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
  // is referenced by megamorphic call
  StubRoutines::_catch_exception_entry = generate_catch_exception();

  StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();

  // platform dependent
  StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
}
// Generate the remaining stubs (invoked with all == true): the
// implicit-exception throwing stubs, the verify_oop subroutine, the
// arraycopy family, and the SafeFetch stubs.
void generate_all() {
#ifdef aoqi_test
  tty->print_cr("%s:%d", __func__, __LINE__);
#endif
  // Generates all stubs and initializes the entry points

  // These entry points require SharedInfo::stack0 to be set up in
  // non-core builds and need to be relocatable, so they each
  // fabricate a RuntimeStub internally.
  /*
  StubRoutines::_throw_AbstractMethodError_entry =
    generate_throw_exception("AbstractMethodError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::
                                              throw_AbstractMethodError),
                             false);

  StubRoutines::_throw_IncompatibleClassChangeError_entry =
    generate_throw_exception("IncompatibleClassChangeError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::
                                              throw_IncompatibleClassChangeError),
                             false);

  StubRoutines::_throw_ArithmeticException_entry =
    generate_throw_exception("ArithmeticException throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::
                                              throw_ArithmeticException),
                             true);

  StubRoutines::_throw_NullPointerException_entry =
    generate_throw_exception("NullPointerException throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::
                                              throw_NullPointerException),
                             true);

  StubRoutines::_throw_NullPointerException_at_call_entry =
    generate_throw_exception("NullPointerException at call throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::
                                              throw_NullPointerException_at_call),
                             false);

  StubRoutines::_throw_StackOverflowError_entry =
    generate_throw_exception("StackOverflowError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::
                                              throw_StackOverflowError),
                             false);

  // entry points that are platform specific
  StubRoutines::mips::_f2i_fixup = generate_f2i_fixup();
  StubRoutines::mips::_f2l_fixup = generate_f2l_fixup();
  StubRoutines::mips::_d2i_fixup = generate_d2i_fixup();
  StubRoutines::mips::_d2l_fixup = generate_d2l_fixup();

  StubRoutines::mips::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
  StubRoutines::mips::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
  StubRoutines::mips::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
  StubRoutines::mips::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);

  // support for verify_oop (must happen after universe_init)
  StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

  // arraycopy stubs used by compilers
  generate_arraycopy_stubs();
  */
#ifdef aoqi_test
  tty->print_cr("%s:%d", __func__, __LINE__);
#endif
  StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
#ifdef aoqi_test
  tty->print_cr("%s:%d", __func__, __LINE__);
#endif
  // StubRoutines::_throw_ArithmeticException_entry = generate_throw_exception("ArithmeticException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException), true);
#ifdef aoqi_test
  tty->print_cr("%s:%d", __func__, __LINE__);
#endif
  // StubRoutines::_throw_NullPointerException_entry = generate_throw_exception("NullPointerException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
#ifdef aoqi_test
  tty->print_cr("%s:%d", __func__, __LINE__);
#endif
  StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
#ifdef aoqi_test
  tty->print_cr("%s:%d", __func__, __LINE__);
#endif
  StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
#ifdef aoqi_test
  tty->print_cr("%s:%d", __func__, __LINE__);
#endif

  //------------------------------------------------------
  //------------------------------------------------------------------
  // entry points that are platform specific

  // support for verify_oop (must happen after universe_init)
#ifdef aoqi_test
  tty->print_cr("%s:%d", __func__, __LINE__);
#endif
  StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
#ifdef aoqi_test
  tty->print_cr("%s:%d", __func__, __LINE__);
#endif
#ifndef CORE
  // arraycopy stubs used by compilers
  generate_arraycopy_stubs();
#ifdef aoqi_test
  tty->print_cr("%s:%d", __func__, __LINE__);
#endif
#endif

  // Safefetch stubs.
  generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                     &StubRoutines::_safefetch32_fault_pc,
                     &StubRoutines::_safefetch32_continuation_pc);
  generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                     &StubRoutines::_safefetchN_fault_pc,
                     &StubRoutines::_safefetchN_continuation_pc);
}
2920 public:
2921 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2922 if (all) {
2923 generate_all();
2924 } else {
2925 generate_initial();
2926 }
2927 }
2928 }; // end class declaration
2929 /*
2930 address StubGenerator::disjoint_byte_copy_entry = NULL;
2931 address StubGenerator::disjoint_short_copy_entry = NULL;
2932 address StubGenerator::disjoint_int_copy_entry = NULL;
2933 address StubGenerator::disjoint_long_copy_entry = NULL;
2934 address StubGenerator::disjoint_oop_copy_entry = NULL;
2936 address StubGenerator::byte_copy_entry = NULL;
2937 address StubGenerator::short_copy_entry = NULL;
2938 address StubGenerator::int_copy_entry = NULL;
2939 address StubGenerator::long_copy_entry = NULL;
2940 address StubGenerator::oop_copy_entry = NULL;
2942 address StubGenerator::checkcast_copy_entry = NULL;
2943 */
2944 void StubGenerator_generate(CodeBuffer* code, bool all) {
2945 StubGenerator g(code, all);
2946 }