Tue, 20 Sep 2016 11:48:21 +0800
#4537 Rewrite generate_disjoint_byte_copy
Eliminated unaligned accesses and optimized the copy algorithm (same as changeset 114).
The unaligned access count does not increase; the change has passed the SPECjvm2008 test.
Roughly a 20% speed-up on the test program below.
The test program:
public class ByteCopyTest{
public static void main(String args[]){
int count = 100000;
char []A = new char[count];
char []B = new char[count];
for(int i = 0; i < count; i++){
A[i] = (char)(i % 26 + 97);
}
long startTime = System.nanoTime();
System.arraycopy(A, 0, B, 0, count);
long endTime = System.nanoTime();
System.out.println(endTime - startTime);
}
}
1 /*
2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.hpp"
28 #include "asm/macroAssembler.inline.hpp"
29 #include "interpreter/interpreter.hpp"
30 #include "nativeInst_mips.hpp"
31 #include "oops/instanceOop.hpp"
32 #include "oops/method.hpp"
33 #include "oops/objArrayKlass.hpp"
34 #include "oops/oop.inline.hpp"
35 #include "prims/methodHandles.hpp"
36 #include "runtime/frame.inline.hpp"
37 #include "runtime/handles.inline.hpp"
38 #include "runtime/sharedRuntime.hpp"
39 #include "runtime/stubCodeGenerator.hpp"
40 #include "runtime/stubRoutines.hpp"
41 #include "runtime/thread.inline.hpp"
42 #include "utilities/top.hpp"
43 #ifdef COMPILER2
44 #include "opto/runtime.hpp"
45 #endif
48 // Declaration and definition of StubGenerator (no .hpp file).
49 // For a more detailed description of the stub routine structure
50 // see the comment in stubRoutines.hpp
52 #define __ _masm->
53 //#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
54 //#define a__ ((Assembler*)_masm)->
56 //#ifdef PRODUCT
57 //#define BLOCK_COMMENT(str) /* nothing */
58 //#else
59 //#define BLOCK_COMMENT(str) __ block_comment(str)
60 //#endif
62 //#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// NOTE(review): MXCSR is an x86 SSE control/status register; this constant
// appears to be an unused leftover from the x86 port — confirm before removing.
const int MXCSR_MASK = 0xFFC0; // Mask out any pending exceptions
65 // Stub Code definitions
// Continuation helper invoked after a fault inside an unsafe memory access.
// Marks the current thread so that an asynchronous UnknownError is posted
// later, and returns the pc at which execution should resume.
static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc = thread->saved_exception_pc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine: return garbage from the load
  // therefore, compute npc
  //address npc = Assembler::locate_next_instruction(pc);
  // NOTE(review): advances by sizeof(unsigned long) == 8 bytes, i.e. two MIPS
  // instructions (presumably the faulting instruction plus its delay slot) —
  // confirm this matches the signal handler's expectations.
  address npc = (address)((unsigned long)pc + sizeof(unsigned long));

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}
83 class StubGenerator: public StubCodeGenerator {
84 private:
86 // ABI mips n64
87 // This fig is not MIPS ABI. It is call Java from C ABI.
88 // Call stubs are used to call Java from C
89 //
90 // [ return_from_Java ]
91 // [ argument word n-1 ] <--- sp
92 // ...
93 // [ argument word 0 ]
94 // ...
95 //-10 [ S6 ]
96 // -9 [ S5 ]
97 // -8 [ S4 ]
98 // -7 [ S3 ]
99 // -6 [ S0 ]
100 // -5 [ TSR(S2) ]
101 // -4 [ LVP(S7) ]
102 // -3 [ BCP(S1) ]
103 // -2 [ saved fp ] <--- fp_after_call
104 // -1 [ return address ]
105 // 0 [ ptr. to call wrapper ] <--- a0 (old sp -->)fp
106 // 1 [ result ] <--- a1
107 // 2 [ result_type ] <--- a2
108 // 3 [ method ] <--- a3
109 // 4 [ entry_point ] <--- a4
110 // 5 [ parameters ] <--- a5
111 // 6 [ parameter_size ] <--- a6
112 // 7 [ thread ] <--- a7
114 //
115 // _LP64: n64 does not save paras in sp.
116 //
117 // [ return_from_Java ]
118 // [ argument word n-1 ] <--- sp
119 // ...
120 // [ argument word 0 ]
121 // ...
122 //-14 [ thread ]
123 //-13 [ result_type ] <--- a2
124 //-12 [ result ] <--- a1
125 //-11 [ ptr. to call wrapper ] <--- a0
126 //-10 [ S6 ]
127 // -9 [ S5 ]
128 // -8 [ S4 ]
129 // -7 [ S3 ]
130 // -6 [ S0 ]
131 // -5 [ TSR(S2) ]
132 // -4 [ LVP(S7) ]
133 // -3 [ BCP(S1) ]
134 // -2 [ saved fp ] <--- fp_after_call
135 // -1 [ return address ]
136 // 0 [ ] <--- old sp
137 /*
138 * 2014/01/16 Fu: Find a right place in the call_stub for GP.
139 * GP will point to the starting point of Interpreter::dispatch_table(itos).
140 * It should be saved/restored before/after Java calls.
141 *
142 */
  // Layout of the call stub's entry frame, in words relative to the frame
  // pointer established by generate_call_stub (FP = entry SP - 2 words).
  // See the frame diagram above.
  enum call_stub_layout {
    RA_off          = -1,           // return address
    FP_off          = -2,           // saved frame pointer
    BCP_off         = -3,           // callee-saved interpreter registers...
    LVP_off         = -4,
    TSR_off         = -5,
    S1_off          = -6,
    S3_off          = -7,
    S4_off          = -8,
    S5_off          = -9,
    S6_off          = -10,
    result_off      = -11,          // A1: pointer to result slot
    result_type_off = -12,          // A2: BasicType of the result
    thread_off      = -13,          // A7: current JavaThread
    total_off       = thread_off - 3, // frame size in words (== -16)
    // NOTE(review): GP_off coincides with total_off (-16), i.e. GP is stored
    // in the slot the stack pointer is lowered to — confirm intentional.
    GP_off          = -16,
  };
  // Entry stub used to call Java from C (see frame diagram above).
  // Saves the callee-saved registers into the entry frame, copies the Java
  // arguments onto the stack in reverse order, calls the method's entry
  // point, and on return stores the result through the result pointer
  // according to result_type.
  //
  // Incoming C arguments (n64 ABI):
  //   A0 - call wrapper address   A4 - entry point
  //   A1 - result pointer         A5 - parameters
  //   A2 - result type            A6 - parameter count (words)
  //   A3 - method                 A7 - thread
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!

    // stub code
    // save ra and fp, plus all callee-saved registers the interpreter uses
    __ sd(RA, SP, RA_off * wordSize);
    __ sd(FP, SP, FP_off * wordSize);
    __ sd(BCP, SP, BCP_off * wordSize);
    __ sd(LVP, SP, LVP_off * wordSize);
    __ sd(GP, SP, GP_off * wordSize);
    __ sd(TSR, SP, TSR_off * wordSize);
    __ sd(S1, SP, S1_off * wordSize);
    __ sd(S3, SP, S3_off * wordSize);
    __ sd(S4, SP, S4_off * wordSize);
    __ sd(S5, SP, S5_off * wordSize);
    __ sd(S6, SP, S6_off * wordSize);

    // GP caches the interpreter dispatch table (see note above call_stub_layout)
    __ li48(GP, (long)Interpreter::dispatch_table(itos));

    // I think 14 is the max gap between argument and callee saved register
    // Establish the entry frame: FP two words below entry SP, then lower SP
    // past all the save slots.
    __ daddi(FP, SP, (-2) * wordSize);
    __ daddi(SP, SP, total_off * wordSize);
    //FIXME, aoqi. find a suitable place to save A1 & A2.
    /*
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, 3 * wordSize);
    __ sd(A2, FP, 4 * wordSize);
    __ sd(A3, FP, 5 * wordSize);
    __ sd(A4, FP, 6 * wordSize);
    __ sd(A5, FP, 7 * wordSize);
    __ sd(A6, FP, 8 * wordSize);
    __ sd(A7, FP, 9 * wordSize);
    */
    // Spill the arguments the return path needs: wrapper, result pointer,
    // result type, and the thread.
    __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
    __ sd(A1, FP, result_off * wordSize);
    __ sd(A2, FP, result_type_off * wordSize);
    __ sd(A7, FP, thread_off * wordSize);

#ifdef OPT_THREAD
    //__ get_thread(TREG);
    __ move(TREG, A7);

    //__ ld(TREG, FP, thread_off * wordSize);
#endif
    //add for compressedoops
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
      __ beq(AT, R0, L);
      __ delayed()->nop();
      /* FIXME: I do not know how to realize stop in mips arch, do it in the future */
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    // A5: parameter
    // A6: parameter_size
    // T0: parameter_size_tmp(--)
    // T2: offset(++)
    // T3: tmp
    Label parameters_done;
    // judge if the parameter_size equals 0
    __ beq(A6, R0, parameters_done);
    __ delayed()->nop();
    // reserve space for the parameters and re-align the stack
    __ dsll(AT, A6, Interpreter::logStackElementSize);
    __ dsub(SP, SP, AT);
    __ move(AT, -StackAlignmentInBytes);
    __ andr(SP, SP , AT);
    // Copy Java parameters in reverse order (receiver last)
    // Note that the argument order is inverted in the process:
    // source is A5[T0: N-1..0], dest is SP[T2: 0..N-1]
    Label loop;
    __ move(T0, A6);
    __ move(T2, R0);
    __ bind(loop);

    // get parameter: T3 = &A5[T0], load the word below it
    __ dsll(T3, T0, LogBytesPerWord);
    __ dadd(T3, T3, A5);
    __ ld(AT, T3, -wordSize);
    // store it into the expression-stack slot SP[T2]
    __ dsll(T3, T2, LogBytesPerWord);
    __ dadd(T3, T3, SP);
    __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
    __ daddi(T2, T2, 1);
    __ daddi(T0, T0, -1);
    __ bne(T0, R0, loop);
    __ delayed()->nop();
    // advance to next parameter

    // call Java function
    __ bind(parameters_done);

    // receiver in V0, methodOop in Rmethod
    __ move(Rmethod, A3);
    __ move(Rsender, SP); //set sender sp
    __ jalr(A4);
    __ delayed()->nop();
    return_address = __ pc();

    Label common_return;
    __ bind(common_return);

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ ld(T0, FP, result_off * wordSize);   // result --> T0
    Label is_long, is_float, is_double, exit;
    __ ld(T2, FP, result_type_off * wordSize); // result_type --> T2
    // dispatch on result type; the subtractions for the next comparison are
    // hoisted into the branch delay slots
    __ daddi(T3, T2, (-1) * T_LONG);
    __ beq(T3, R0, is_long);
    __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
    __ beq(T3, R0, is_float);
    __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
    __ beq(T3, R0, is_double);
    __ delayed()->nop();

    // handle T_INT case
    __ sd(V0, T0, 0 * wordSize);
    __ bind(exit);

    // restore: tear down the entry frame and reload callee-saved registers
    __ daddi(SP, FP, 2 * wordSize );
    __ ld(RA, SP, RA_off * wordSize);
    __ ld(FP, SP, FP_off * wordSize);
    __ ld(BCP, SP, BCP_off * wordSize);
    __ ld(LVP, SP, LVP_off * wordSize);
    __ ld(GP, SP, GP_off * wordSize);
    __ ld(TSR, SP, TSR_off * wordSize);

    __ ld(S1, SP, S1_off * wordSize);
    __ ld(S3, SP, S3_off * wordSize);
    __ ld(S4, SP, S4_off * wordSize);
    __ ld(S5, SP, S5_off * wordSize);
    __ ld(S6, SP, S6_off * wordSize);

    // return
    __ jr(RA);
    __ delayed()->nop();

    // handle return types different from T_INT
    __ bind(is_long);
    __ sd(V0, T0, 0 * wordSize);
    //__ sd(V1, T0, 1 * wordSize);
    //__ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_float);
    __ swc1(F0, T0, 0 * wordSize);
    __ b(exit);
    __ delayed()->nop();

    __ bind(is_double);
    __ sdc1(F0, T0, 0 * wordSize);
    //__ sdc1(F1, T0, 1 * wordSize);
    //__ sd(R0, T0, 1 * wordSize);
    __ b(exit);
    __ delayed()->nop();
    //FIXME, 1.6 mips version add operation of fpu here
    // Return point for compiled code: jumps back into the common return path.
    StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
    __ b(common_return);
    __ delayed()->nop();
    return start;
  }
338 // Return point for a Java call if there's an exception thrown in
339 // Java code. The exception is caught and transformed into a
340 // pending exception stored in JavaThread that can be tested from
341 // within the VM.
342 //
343 // Note: Usually the parameters are removed by the callee. In case
344 // of an exception crossing an activation frame boundary, that is
345 // not the case if the callee is compiled code => need to setup the
346 // rsp.
347 //
348 // rax: exception oop
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    Register thread = TREG;

    // get thread directly from the entry frame slot written by the call stub
#ifndef OPT_THREAD
    __ ld(thread, FP, thread_off * wordSize);
#endif

#ifdef ASSERT
    // verify that threads correspond
    { Label L;
      __ get_thread(T8);
      __ beq(T8, thread, L);
      __ delayed()->nop();
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    // set pending exception: V0 holds the exception oop on entry
    __ verify_oop(V0);
    __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
    // record where the exception was installed (debugging aid)
    __ li(AT, (long)__FILE__);
    __ sd(AT, thread, in_bytes(Thread::exception_file_offset ()));
    __ li(AT, (long)__LINE__);
    __ sd(AT, thread, in_bytes(Thread::exception_line_offset ()));

    // complete return to VM via the call stub's normal return path
    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
    __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
    __ delayed()->nop();

    return start;
  }
387 // Continuation point for runtime calls returning with a pending
388 // exception. The pending exception check happened in the runtime
389 // or native call stub. The pending exception in Thread is
390 // converted into a Java-level exception.
391 //
392 // Contract with Java-level exception handlers:
393 // rax: exception
394 // rdx: throwing pc
395 //
396 // NOTE: At entry of this stub, exception-pc must be on stack !!
  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    //Register thread = TREG;
    Register thread = TREG;
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into Java
    // (interpreted or compiled) code; i.e., the return address becomes the
    // throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack but
    // the exception handler will reset the stack pointer -> ignore them.
    // A potential result in registers can be ignored as well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    { Label L;
      __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
      __ bne(AT, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into T9
    __ ld(A1, SP, 0);   // A1 = throwing pc (return address on top of stack)
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
    __ move(T9, V0);
    __ pop(V1);         // pop the throwing pc into V1

#ifndef OPT_THREAD
    __ get_thread(thread);
#endif
    // fetch the pending exception into V0 and clear it on the thread
    __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
    __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ bne(V0, R0, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // V0: exception
    // T9: exception handler
    // V1: throwing pc
    __ verify_oop(V0);
    __ jr(T9);
    __ delayed()->nop();

    return start;
  }
459 // Support for intptr_t get_previous_fp()
460 //
461 // This routine is used to find the previous frame pointer for the
462 // caller (current_frame_guess). This is used as part of debugging
463 // ps() is seemingly lost trying to find frames.
464 // This code assumes that caller current_frame_guess) has a frame.
465 address generate_get_previous_fp() {
466 StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
467 const Address old_fp (FP, 0);
468 const Address older_fp (V0, 0);
469 address start = __ pc();
470 __ enter();
471 __ lw(V0, old_fp); // callers fp
472 __ lw(V0, older_fp); // the frame for ps()
473 __ leave();
474 __ jr(RA);
475 __ delayed()->nop();
476 return start;
477 }
478 // The following routine generates a subroutine to throw an
479 // asynchronous UnknownError when an unsafe access gets a fault that
480 // could not be reasonably prevented by the programmer. (Example:
481 // SIGBUS/OBJERR.)
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();
    __ pushad();                              // push registers
    // Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
    // handle_unsafe_access() flags the thread and returns the resume pc in V0
    __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
    __ delayed()->nop();
    // Patch the resume pc into the word just above the pushad save area.
    // NOTE(review): sw stores only the low 32 bits of the 64-bit address in
    // V0 — verify whether sd is required here on MIPS64.
    __ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord);
    __ popad();
    __ jr(RA);
    __ delayed()->nop();
    return start;
  }
496 // Non-destructive plausibility checks for oops
497 //
498 // Arguments:
499 // all args on stack!
500 //
501 // Stack after saving c_rarg3:
502 // [tos + 0]: saved c_rarg3
503 // [tos + 1]: saved c_rarg2
504 // [tos + 2]: saved r12 (several TemplateTable methods use it)
505 // [tos + 3]: saved flags
506 // [tos + 4]: return address
507 // * [tos + 5]: error message (char*)
508 // * [tos + 6]: object to verify (oop)
509 // * [tos + 7]: saved rax - saved by caller and bashed
510 // * = popped on exit
511 address generate_verify_oop() {
512 StubCodeMark mark(this, "StubRoutines", "verify_oop");
513 address start = __ pc();
514 __ reinit_heapbase();
515 __ verify_oop_subroutine();
516 address end = __ pc();
517 return start;
518 }
520 //
521 // Generate overlap test for array copy stubs
522 //
523 // Input:
524 // A0 - array1
525 // A1 - array2
526 // A2 - element count
527 //
528 // Note: this code can only use %eax, %ecx, and %edx
529 //
531 // use T9 as temp
  // Emit the overlap test for a conjoint array-copy stub: branch to
  // no_overlap_target when a simple forward copy is safe (dest <= src, or
  // dest beyond the last source element); otherwise fall through to the
  // backward-copy code that follows.
  //
  // Input:
  //   A0 - source array
  //   A1 - destination array
  //   A2 - element count
  //
  // Clobbers AT and T9.
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    int elem_size = 1 << log2_elem_size;
    Address::ScaleFactor sf = Address::times_1;

    switch (log2_elem_size) {
      case 0: sf = Address::times_1; break;
      case 1: sf = Address::times_2; break;
      case 2: sf = Address::times_4; break;
      case 3: sf = Address::times_8; break;
    }

    // T9 = address of the last source element
    __ dsll(AT, A2, sf);
    __ dadd(AT, AT, A0);
    __ lea(T9, Address(AT, -elem_size));
    // dest <= src: forward copy is safe
    __ dsub(AT, A1, A0);
    __ blez(AT, no_overlap_target);
    __ delayed()->nop();
    // dest above the last source element: ranges cannot overlap
    __ dsub(AT, A1, T9);
    __ bgtz(AT, no_overlap_target);
    __ delayed()->nop();

    // 2016/05/10 aoqi: the signed comparisons above can be fooled when the
    // addresses straddle the sign boundary: if A0 = 0xf... and A1 = 0x0...,
    // go to no_overlap_target as well.
    Label L;
    __ bgez(A0, L);
    __ delayed()->nop();
    __ bgtz(A1, no_overlap_target);
    __ delayed()->nop();
    __ bind(L);
  }
563 //
564 // Generate store check for array
565 //
566 // Input:
567 // %edi - starting address
568 // %ecx - element count
569 //
570 // The 2 input registers are overwritten
571 //
573 //
574 // Generate store check for array
575 //
576 // Input:
577 // T0 - starting address(edi)
578 // T1 - element count (ecx)
579 //
580 // The 2 input registers are overwritten
581 //
583 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  // Emit the card-marking write barrier for a just-copied oop array range.
  //
  // Input:
  //   T0 - starting address
  //   T1 - element count
  //
  // Both input registers are overwritten.  Dirties every card covering
  // [T0, T0 + T1 oops), one card byte per loop iteration.
  void array_store_check() {
    BarrierSet* bs = Universe::heap()->barrier_set();
    assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
    CardTableModRefBS* ct = (CardTableModRefBS*)bs;
    assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
    Label l_0;

    // T1 = address of the last oop stored
    __ dsll(AT, T1, TIMES_OOP);
    __ dadd(AT, T0, AT);
    __ daddiu(T1, AT, - BytesPerHeapOop);

    // convert both addresses to card-table indices
    __ shr(T0, CardTableModRefBS::card_shift);
    __ shr(T1, CardTableModRefBS::card_shift);

    __ dsub(T1, T1, T0); // end --> cards count
    __ bind(l_0);

    // dirty card at byte_map_base + first_card + i, walking i down to 0
    __ li48(AT, (long)ct->byte_map_base);
    __ dadd(AT, AT, T0);
    __ dadd(AT, AT, T1);
    // order the card store after the preceding oop stores
    __ sync();
    __ sb(R0, AT, 0);
    //__ daddi(T1, T1, -4);
    __ daddi(T1, T1, - 1);
    __ bgez(T1, l_0);
    __ delayed()->nop();
  }
613 // Arguments:
614 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
615 // ignored
616 // name - stub name string
617 //
618 // Inputs:
619 // c_rarg0 - source array address
620 // c_rarg1 - destination array address
621 // c_rarg2 - element count, treated as ssize_t, can be zero
622 //
623 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
624 // we let the hardware handle it. The one to eight bytes within words,
625 // dwords or qwords that span cache line boundaries will still be loaded
626 // and stored atomically.
627 //
628 // Side Effects:
629 // disjoint_byte_copy_entry is set to the no-overlap entry point
630 // used by generate_conjoint_byte_copy().
631 //
  address generate_disjoint_byte_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ align(CodeEntryAlignment);

    Register tmp1 = T0;
    Register tmp2 = T1;
    Register tmp3 = T3;

    address start = __ pc();

    __ push(tmp1);
    __ push(tmp2);
    __ push(tmp3);
    __ move(tmp1, A0);   // from
    __ move(tmp2, A1);   // to
    __ move(tmp3, A2);   // count (bytes)

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11;
    Label l_debug;

    // For 9 or fewer bytes the alignment preamble below cannot pay off;
    // go straight to the byte-at-a-time loop.
    __ daddi(AT, tmp3, -9);
    __ blez(AT, l_9);
    __ delayed()->nop();

    if (!aligned) {
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 1);
      __ bne(AT, R0, l_9); // if arrays don't have the same alignment mod 2, do 1 element copy
      __ delayed()->nop();

      __ andi(AT, tmp1, 1);
      __ beq(AT, R0, l_10); // copy 1 element if necessary to align to 2 bytes
      __ delayed()->nop();

      __ lb(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 1);
      __ sb(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 1);
      __ daddi(tmp3, tmp3, -1);
      __ bind(l_10);

      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 3);
      __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 2 elements copy
      __ delayed()->nop();

      // At this point it is guaranteed that both, from and to, have the same alignment mod 4.

      // Copy 2 elements if necessary to align to 4 bytes.
      __ andi(AT, tmp1, 3);
      __ beq(AT, R0, l_2);
      __ delayed()->nop();

      __ lhu(AT, tmp1, 0);
      __ daddi(tmp1, tmp1, 2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(tmp3, tmp3, -2);
      __ bind(l_2);

      // At this point the positions of both, from and to, are at least 4 byte aligned.

      // Copy 8 bytes at a time in the main loop below.
      // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
      __ xorr(AT, tmp1, tmp2);
      __ andi(AT, AT, 7);
      __ bne(AT, R0, l_6); // not same alignment mod 8 -> use the 4-byte loop instead
      __ delayed()->nop();

      // Copy 4 elements if necessary to align to 8 bytes.
      __ andi(AT, tmp1, 7);
      __ beq(AT, R0, l_7);
      __ delayed()->nop();

      __ lw(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -4);
      __ sw(AT, tmp2, 0);
      { // FasterArrayCopy
        __ daddi(tmp1, tmp1, 4);
        __ daddi(tmp2, tmp2, 4);
      }
    }

    __ bind(l_7);

    // Copy 8 bytes (one dword) at a time; both pointers are 8-byte aligned
    // here when the preamble above ran (aligned == false).
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -7);
      __ blez(AT, l_6); // fewer than 8 bytes remain -> 4-byte loop
      __ delayed()->nop();

      __ bind(l_8);
      // For Loongson, there is 128-bit memory access. TODO
      __ ld(AT, tmp1, 0);
      __ sd(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 8);
      __ daddi(tmp2, tmp2, 8);
      __ daddi(tmp3, tmp3, -8);
      __ daddi(AT, tmp3, -8);
      __ bgez(AT, l_8);
      __ delayed()->nop();
    }
    __ bind(l_6);

    // copy 4 bytes at a time
    { // FasterArrayCopy
      __ daddi(AT, tmp3, -3);
      __ blez(AT, l_1);
      __ delayed()->nop();

      __ bind(l_3);
      __ lw(AT, tmp1, 0);
      __ sw(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
      __ daddi(tmp3, tmp3, -4);
      __ daddi(AT, tmp3, -4);
      __ bgez(AT, l_3);
      __ delayed()->nop();
    }

    // do 2 bytes copy (pointers are known to be 2-byte aligned here)
    __ bind(l_1);
    {
      __ daddi(AT, tmp3, -1);
      __ blez(AT, l_9);
      __ delayed()->nop();

      __ bind(l_5);
      __ lhu(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -2);
      __ sh(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 2);
      __ daddi(tmp2, tmp2, 2);
      __ daddi(AT, tmp3, -2);
      __ bgez(AT, l_5);
      __ delayed()->nop();
    }

    // do 1 element copy -- byte
    __ bind(l_9);
    __ beq(R0, tmp3, l_4);   // nothing left -> done
    __ delayed()->nop();

    {
      __ bind(l_11);
      __ lb(AT, tmp1, 0);
      __ daddi(tmp3, tmp3, -1);
      __ sb(AT, tmp2, 0);
      __ daddi(tmp1, tmp1, 1);
      __ daddi(tmp2, tmp2, 1);
      __ daddi(AT, tmp3, -1);
      __ bgez(AT, l_11);
      __ delayed()->nop();
    }

    // restore saved temporaries and return
    __ bind(l_4);
    __ pop(tmp3);
    __ pop(tmp2);
    __ pop(tmp1);

    __ jr(RA);
    __ delayed()->nop();

    return start;
  }
804 // Arguments:
805 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
806 // ignored
807 // name - stub name string
808 //
809 // Inputs:
810 // A0 - source array address
811 // A1 - destination array address
812 // A2 - element count, treated as ssize_t, can be zero
813 //
814 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
815 // we let the hardware handle it. The one to eight bytes within words,
816 // dwords or qwords that span cache line boundaries will still be loaded
817 // and stored atomically.
818 //
819 address generate_conjoint_byte_copy(bool aligned, const char *name) {
820 __ align(CodeEntryAlignment);
821 StubCodeMark mark(this, "StubRoutines", name);
822 address start = __ pc();
824 Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
825 Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;
827 address nooverlap_target = aligned ?
828 StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
829 StubRoutines::jbyte_disjoint_arraycopy();
831 array_overlap_test(nooverlap_target, 0);
833 const Register from = A0; // source array address
834 const Register to = A1; // destination array address
835 const Register count = A2; // elements count
836 const Register end_from = T3; // source array end address
837 const Register end_to = T0; // destination array end address
838 const Register end_count = T1; // destination array end address
840 __ push(end_from);
841 __ push(end_to);
842 __ push(end_count);
843 __ push(T8);
845 // copy from high to low
846 __ move(end_count, count);
847 __ dadd(end_from, from, end_count);
848 __ dadd(end_to, to, end_count);
850 // 2016/05/08 aoqi: If end_from and end_to has differante alignment, unaligned copy is performed.
851 __ andi(AT, end_from, 3);
852 __ andi(T8, end_to, 3);
853 __ bne(AT, T8, l_copy_byte);
854 __ delayed()->nop();
856 // First deal with the unaligned data at the top.
857 __ bind(l_unaligned);
858 __ beq(end_count, R0, l_exit);
859 __ delayed()->nop();
861 __ andi(AT, end_from, 3);
862 __ bne(AT, R0, l_from_unaligned);
863 __ delayed()->nop();
865 __ andi(AT, end_to, 3);
866 __ beq(AT, R0, l_4_bytes_aligned);
867 __ delayed()->nop();
869 __ bind(l_from_unaligned);
870 __ lb(AT, end_from, -1);
871 __ sb(AT, end_to, -1);
872 __ daddi(end_from, end_from, -1);
873 __ daddi(end_to, end_to, -1);
874 __ daddi(end_count, end_count, -1);
875 __ b(l_unaligned);
876 __ delayed()->nop();
878 // now end_to, end_from point to 4-byte aligned high-ends
879 // end_count contains byte count that is not copied.
880 // copy 4 bytes at a time
881 __ bind(l_4_bytes_aligned);
883 __ move(T8, end_count);
884 __ daddi(AT, end_count, -3);
885 __ blez(AT, l_copy_suffix);
886 __ delayed()->nop();
888 //__ andi(T8, T8, 3);
889 __ lea(end_from, Address(end_from, -4));
890 __ lea(end_to, Address(end_to, -4));
892 __ dsrl(end_count, end_count, 2);
893 __ align(16);
894 __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes
895 __ lw(AT, end_from, 0);
896 __ sw(AT, end_to, 0);
897 __ addi(end_from, end_from, -4);
898 __ addi(end_to, end_to, -4);
899 __ addi(end_count, end_count, -1);
900 __ bne(end_count, R0, l_copy_4_bytes_loop);
901 __ delayed()->nop();
903 __ b(l_copy_suffix);
904 __ delayed()->nop();
905 // copy dwords aligned or not with repeat move
906 // l_copy_suffix
907 // copy suffix (0-3 bytes)
908 __ bind(l_copy_suffix);
909 __ andi(T8, T8, 3);
910 __ beq(T8, R0, l_exit);
911 __ delayed()->nop();
912 __ addi(end_from, end_from, 3);
913 __ addi(end_to, end_to, 3);
914 __ bind(l_copy_suffix_loop);
915 __ lb(AT, end_from, 0);
916 __ sb(AT, end_to, 0);
917 __ addi(end_from, end_from, -1);
918 __ addi(end_to, end_to, -1);
919 __ addi(T8, T8, -1);
920 __ bne(T8, R0, l_copy_suffix_loop);
921 __ delayed()->nop();
923 __ bind(l_copy_byte);
924 __ beq(end_count, R0, l_exit);
925 __ delayed()->nop();
926 __ lb(AT, end_from, -1);
927 __ sb(AT, end_to, -1);
928 __ daddi(end_from, end_from, -1);
929 __ daddi(end_to, end_to, -1);
930 __ daddi(end_count, end_count, -1);
931 __ b(l_copy_byte);
932 __ delayed()->nop();
934 __ bind(l_exit);
935 __ pop(T8);
936 __ pop(end_count);
937 __ pop(end_to);
938 __ pop(end_from);
939 __ jr(RA);
940 __ delayed()->nop();
941 return start;
942 }
944 // Generate stub for disjoint short copy. If "aligned" is true, the
945 // "from" and "to" addresses are assumed to be heapword aligned.
946 //
947 // Arguments for generated stub:
948 // from: A0
949 // to: A1
950 // elm.count: A2 treated as signed
951 // one element: 2 bytes
952 //
953 // Strategy for aligned==true:
954 //
955 // If length <= 9:
956 // 1. copy 1 elements at a time (l_5)
957 //
958 // If length > 9:
959 // 1. copy 4 elements at a time until less than 4 elements are left (l_7)
960 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
961 // 3. copy last element if one was left in step 2. (l_1)
962 //
963 //
964 // Strategy for aligned==false:
965 //
966 // If length <= 9: same as aligned==true case
967 //
968 // If length > 9:
969 // 1. continue with step 7. if the alignment of from and to mod 4
970 // is different.
971 // 2. align from and to to 4 bytes by copying 1 element if necessary
972 // 3. at l_2 from and to are 4 byte aligned; continue with
973 // 6. if they cannot be aligned to 8 bytes because they have
974 // got different alignment mod 8.
975 // 4. at this point we know that both, from and to, have the same
976 // alignment mod 8, now copy one element if necessary to get
977 // 8 byte alignment of from and to.
978 // 5. copy 4 elements at a time until less than 4 elements are
979 // left; depending on step 3. all load/stores are aligned.
980 // 6. copy 2 elements at a time until less than 2 elements are
981 // left. (l_6)
982 // 7. copy 1 element at a time. (l_5)
983 // 8. copy last element if one was left in step 6. (l_1)
984 //
985 // TODO:
986 //
987 // 1. use loongson 128-bit load/store
988 // 2. use loop unrolling optimization when len is big enough, for example if len > 0x2000:
989 // __ bind(l_x);
990 // __ ld(AT, tmp1, 0);
991 // __ ld(tmp, tmp1, 8);
992 // __ sd(AT, tmp2, 0);
993 // __ sd(tmp, tmp2, 8);
994 // __ ld(AT, tmp1, 16);
995 // __ ld(tmp, tmp1, 24);
996 // __ sd(AT, tmp2, 16);
997 // __ sd(tmp, tmp2, 24);
998 // __ daddi(tmp1, tmp1, 32);
999 // __ daddi(tmp2, tmp2, 32);
1000 // __ daddi(tmp3, tmp3, -16);
1001 // __ daddi(AT, tmp3, -16);
1002 // __ bgez(AT, l_x);
1003 // __ delayed()->nop();
1004 //
// Generate a stub that copies 'count' 16-bit elements (jshorts) between
// non-overlapping arrays.
//
// Inputs (Loongson/MIPS argument registers):
//   A0 - source array address
//   A1 - destination array address
//   A2 - element count (shorts), treated as ssize_t, can be zero
//
// aligned == true promises that both 'from' and 'to' start on a HeapWord
// (8-byte) boundary, so the alignment fix-up code can be skipped.
//
// Strategy: for more than 9 elements, bring both pointers to a common 4-
// and then 8-byte alignment (copying 1 short / 1 word as needed), copy
// 8 bytes (4 shorts) per iteration, then 4 bytes (2 shorts), and finish
// with a single-short tail loop.
address generate_disjoint_short_copy(bool aligned, const char * name) {
  StubCodeMark mark(this, "StubRoutines", name);
  __ align(CodeEntryAlignment);
  Register tmp1 = T0;  // running 'from' pointer
  Register tmp2 = T1;  // running 'to' pointer
  Register tmp3 = T3;  // remaining element count (counts down)
  address start = __ pc();

  // T0/T1/T3 are not callee-saved in this stub's convention; preserve them.
  __ push(tmp1);
  __ push(tmp2);
  __ push(tmp3);
  __ move(tmp1, A0);
  __ move(tmp2, A1);
  __ move(tmp3, A2);

  Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
  Label l_debug;
  // Don't try anything fancy if the array doesn't have many elements:
  // count <= 9 goes straight to the single-element tail loop at l_1.
  __ daddi(AT, tmp3, -9);
  __ blez(AT, l_1);
  __ delayed()->nop();

  if (!aligned) {
    __ xorr(AT, A0, A1);
    __ andi(AT, AT, 1);
    __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
    __ delayed()->nop();

    __ xorr(AT, A0, A1);
    __ andi(AT, AT, 3);
    __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
    __ delayed()->nop();

    // At this point it is guaranteed that both, from and to, have the same alignment mod 4.

    // Copy 1 element (2 bytes) if necessary to align 'from'/'to' to 4 bytes.
    __ andi(AT, A0, 3);
    __ beq(AT, R0, l_2);
    __ delayed()->nop();

    __ lhu(AT, tmp1, 0);
    __ daddi(tmp1, tmp1, 2);
    __ sh(AT, tmp2, 0);
    __ daddi(tmp2, tmp2, 2);
    __ daddi(tmp3, tmp3, -1);
    __ bind(l_2);

    // At this point the positions of both, from and to, are at least 4 byte aligned.

    // Copy 4 elements at a time.
    // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
    __ xorr(AT, tmp1, tmp2);
    __ andi(AT, AT, 7);
    __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2 at a time, either from or to would be unaligned
    __ delayed()->nop();

    // Copy a 2-element word if necessary to align to 8 bytes.
    __ andi(AT, tmp1, 7);
    __ beq(AT, R0, l_7);
    __ delayed()->nop();

    __ lw(AT, tmp1, 0);
    __ daddi(tmp3, tmp3, -2);
    __ sw(AT, tmp2, 0);
    { // FasterArrayCopy
      __ daddi(tmp1, tmp1, 4);
      __ daddi(tmp2, tmp2, 4);
    }
  }

  __ bind(l_7);

  // Main loop: copy 4 elements (8 bytes) per iteration. Both pointers are
  // 8-byte aligned here (or aligned == true was promised by the caller).
  { // FasterArrayCopy
    __ daddi(AT, tmp3, -15);
    __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain
    __ delayed()->nop();

    __ bind(l_8);
    // For Loongson, there is 128-bit memory access. TODO
    __ ld(AT, tmp1, 0);
    __ sd(AT, tmp2, 0);
    __ daddi(tmp1, tmp1, 8);
    __ daddi(tmp2, tmp2, 8);
    __ daddi(tmp3, tmp3, -4);
    __ daddi(AT, tmp3, -4);
    __ bgez(AT, l_8);        // loop while at least 4 elements remain
    __ delayed()->nop();
  }
  __ bind(l_6);

  // Copy 2 elements (4 bytes) at a time.
  { // FasterArrayCopy
    __ daddi(AT, tmp3, -1);
    __ blez(AT, l_1);
    __ delayed()->nop();

    __ bind(l_3);
    __ lw(AT, tmp1, 0);
    __ sw(AT, tmp2, 0);
    __ daddi(tmp1, tmp1, 4);
    __ daddi(tmp2, tmp2, 4);
    __ daddi(tmp3, tmp3, -2);
    __ daddi(AT, tmp3, -2);
    __ bgez(AT, l_3);        // loop while at least 2 elements remain
    __ delayed()->nop();
  }

  // Single element (16-bit) tail copy. (The original comment said "8 bit",
  // but lhu/sh move one jshort per iteration.)
  __ bind(l_1);
  __ beq(R0, tmp3, l_4);     // nothing left -> epilogue
  __ delayed()->nop();

  { // FasterArrayCopy
    __ bind(l_5);
    __ lhu(AT, tmp1, 0);
    __ daddi(tmp3, tmp3, -1);
    __ sh(AT, tmp2, 0);
    __ daddi(tmp1, tmp1, 2);
    __ daddi(tmp2, tmp2, 2);
    __ daddi(AT, tmp3, -1);
    __ bgez(AT, l_5);
    __ delayed()->nop();
  }

  // Epilogue: restore saved registers and return to caller.
  __ bind(l_4);
  __ pop(tmp3);
  __ pop(tmp2);
  __ pop(tmp1);

  __ jr(RA);
  __ delayed()->nop();

  __ bind(l_debug);
  __ stop("generate_disjoint_short_copy should not reach here");
  return start;
}
1148 // Arguments:
1149 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1150 // ignored
1151 // name - stub name string
1152 //
1153 // Inputs:
1154 // c_rarg0 - source array address
1155 // c_rarg1 - destination array address
1156 // c_rarg2 - element count, treated as ssize_t, can be zero
1157 //
1158 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1159 // let the hardware handle it. The two or four words within dwords
1160 // or qwords that span cache line boundaries will still be loaded
1161 // and stored atomically.
1162 //
// Generate a stub that copies 'count' 16-bit elements (jshorts) between
// possibly-overlapping arrays. If the ranges do not actually overlap,
// array_overlap_test tail-calls the faster disjoint stub; otherwise the
// copy is performed backwards (high addresses first) so an overlapping
// destination above the source is handled correctly.
//
// Inputs:
//   A0 - source array address
//   A1 - destination array address
//   A2 - element count (shorts), treated as ssize_t, can be zero
//
// Translated from the x86_32 stub (original x86 instructions kept below
// as reference comments).
address generate_conjoint_short_copy(bool aligned, const char *name) {
  Label l_1, l_2, l_3, l_4, l_5;
  StubCodeMark mark(this, "StubRoutines", name);
  __ align(CodeEntryAlignment);
  address start = __ pc();
  address nooverlap_target = aligned ?
          StubRoutines::arrayof_jshort_disjoint_arraycopy() :
          StubRoutines::jshort_disjoint_arraycopy();

  // Branches to nooverlap_target when a forward copy is safe; shift = 1
  // because elements are 2 bytes.
  array_overlap_test(nooverlap_target, 1);

  __ push(T3);   // from
  __ push(T0);   // to
  __ push(T1);   // count
  __ push(T8);   // saved original count (for the odd-element suffix)

  __ move(T1, A2);
  __ move(T3, A0);
  __ move(T0, A1);

  // Point both registers at the last full word:
  //   T3 = from + count*2 - 4,  T0 = to + count*2 - 4
  // __ leal(esi, Address(esi, ecx, Address::times_2, -4)); // from + count*2 - 4
  __ sll(AT, T1, Address::times_2);
  __ add(AT, T3, AT);
  __ lea(T3, Address(AT, -4));
  // __ leal(edi, Address(edi, ecx, Address::times_2, -4)); // to + count*2 - 4
  __ sll(AT, T1, Address::times_2);
  __ add(AT, T0, AT);
  __ lea(T0, Address(AT, -4));

  __ move(T8, T1);   // remember the original count; its low bit selects the suffix copy
  // NOTE(review): l_1 is bound here but no branch targets it — leftover of
  // the x86 structure this was translated from.
  __ bind(l_1);
  __ sra(T1, T1, 1);           // T1 = number of 4-byte words (2 shorts each)
  __ beq(T1, R0, l_4);         // no full words to move
  __ delayed()->nop();
  __ align(16);

  // Backward word-copy loop: 2 shorts per iteration, high to low.
  __ bind(l_2);
  __ lw(AT, T3, 0);
  __ sw(AT, T0, 0);
  __ addi(T3, T3, -4);
  __ addi(T0, T0, -4);
  __ addi(T1, T1, -1);
  __ bne(T1, R0, l_2);
  __ delayed()->nop();
  __ b(l_4);
  __ delayed()->nop();

  // NOTE(review): l_3 is bound but unreachable — it corresponded to the
  // x86 rep_movl fast path, which was not ported.
  __ bind(l_3);
  __ bind(l_4);
  __ andi(T8, T8, 1);          // suffix count (1 if original count was odd)
  __ beq(T8, R0, l_5);         // no suffix
  __ delayed()->nop();
  // Copy suffix: after the loop T3/T0 sit 2 bytes below the first element,
  // so offset +2 addresses element 0. lh sign-extends, but sh only stores
  // the low 16 bits, so the extension is harmless.
  __ lh(AT, T3, 2);
  __ sh(AT, T0, 2);

  __ bind(l_5);
  __ pop(T8);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);

  __ jr(RA);
  __ delayed()->nop();
  return start;
}
1259 // Arguments:
1260 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1261 // ignored
1262 // is_oop - true => oop array, so generate store check code
1263 // name - stub name string
1264 //
1265 // Inputs:
1266 // c_rarg0 - source array address
1267 // c_rarg1 - destination array address
1268 // c_rarg2 - element count, treated as ssize_t, can be zero
1269 //
1270 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1271 // the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
1273 //
1274 // Side Effects:
1275 // disjoint_int_copy_entry is set to the no-overlap entry point
1276 // used by generate_conjoint_int_oop_copy().
1277 //
// Generate a stub that copies 'count' 32-bit elements (jints, or narrow
// oops when is_oop) between non-overlapping arrays; when is_oop is true
// a store check (array_store_check) is emitted after the copy.
//
// Inputs:
//   A0 - source array address
//   A1 - destination array address
//   A2 - element count, treated as ssize_t, can be zero
//
// Translated from the x86_32 stub (x86 instructions kept as reference
// comments).
address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
  Label l_2, l_3, l_4, l_stchk;
  StubCodeMark mark(this, "StubRoutines", name);
  __ align(CodeEntryAlignment);
  address start = __ pc();

  __ push(T3);   // from
  __ push(T0);   // to
  __ push(T1);   // count
  __ push(T8);   // scratch (only pushed/popped here)
  __ move(T1, A2);
  __ move(T3, A0);
  __ move(T0, A1);

  // Unconditionally enter the copy loop.
  __ b(l_2);
  __ delayed()->nop();
  // NOTE(review): everything from here to bind(l_2) is unreachable — it
  // follows an unconditional branch and is a leftover of the x86
  // rep_movl fast path that was not ported.
  if (is_oop) {
    __ b(l_stchk);
    __ delayed()->nop();
  }
  __ pop(T8);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();

  __ bind(l_2);
  __ beq(T1, R0, l_4);         // zero elements: skip the loop
  __ delayed()->nop();
  __ align(16);

  // Forward copy loop: one 32-bit element per iteration.
  __ bind(l_3);
  __ lw(AT, T3, 0);
  __ sw(AT, T0, 0);
  __ addi(T3, T3, 4);
  __ addi(T0, T0, 4);
  __ addi(T1, T1, -1);
  __ bne(T1, R0, l_3);
  __ delayed()->nop();

  if (is_oop) {
    // Re-load 'to' and 'count' from the (unclobbered) argument registers
    // and emit the store check over the destination range.
    __ bind(l_stchk);
    __ move(T0, A1);
    __ move(T1, A2);
    array_store_check();
  }
  __ bind(l_4);
  __ pop(T8);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();
  return start;
}
1359 // Arguments:
1360 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1361 // ignored
1362 // is_oop - true => oop array, so generate store check code
1363 // name - stub name string
1364 //
1365 // Inputs:
1366 // c_rarg0 - source array address
1367 // c_rarg1 - destination array address
1368 // c_rarg2 - element count, treated as ssize_t, can be zero
1369 //
1370 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1371 // the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
1373 //
// Generate a stub that copies 'count' 32-bit elements (jints, or narrow
// oops when is_oop) between possibly-overlapping arrays. Non-overlapping
// calls are redirected to the disjoint stub by array_overlap_test; the
// overlapping case copies backwards (high addresses first). When is_oop
// is true a store check is emitted after the copy.
//
// Inputs:
//   A0 - source array address
//   A1 - destination array address
//   A2 - element count, treated as ssize_t, can be zero
address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
  Label l_2, l_3, l_4, l_stchk;
  StubCodeMark mark(this, "StubRoutines", name);
  __ align(CodeEntryAlignment);
  address start = __ pc();
  address nooverlap_target;

  // Choose the disjoint stub matching the element kind and alignment.
  if (is_oop) {
    nooverlap_target = aligned ?
            StubRoutines::arrayof_oop_disjoint_arraycopy() :
            StubRoutines::oop_disjoint_arraycopy();
  } else {
    nooverlap_target = aligned ?
            StubRoutines::arrayof_jint_disjoint_arraycopy() :
            StubRoutines::jint_disjoint_arraycopy();
  }

  // shift = 2: elements are 4 bytes.
  array_overlap_test(nooverlap_target, 2);

  __ push(T3);   // from
  __ push(T0);   // to
  __ push(T1);   // count
  __ push(T8);   // scratch (only pushed/popped here)

  __ move(T1, A2);
  __ move(T3, A0);
  __ move(T0, A1);

  // Point both registers at the last element:
  //   T3 = from + count*4 - 4,  T0 = to + count*4 - 4
  __ sll(AT, T1, Address::times_4);
  __ add(AT, T3, AT);
  __ lea(T3, Address(AT, -4));
  __ sll(AT, T1, Address::times_4);
  __ add(AT, T0, AT);
  __ lea(T0, Address(AT, -4));

  __ beq(T1, R0, l_4);         // zero elements: skip the loop
  __ delayed()->nop();
  __ align(16);

  // Backward copy loop: one 32-bit element per iteration, high to low.
  __ bind(l_2);
  __ lw(AT, T3, 0);
  __ sw(AT, T0, 0);
  __ addi(T3, T3, -4);
  __ addi(T0, T0, -4);
  __ addi(T1, T1, -1);
  __ bne(T1, R0, l_2);
  __ delayed()->nop();
  if (is_oop) {
    __ b(l_stchk);
    __ delayed()->nop();
  }
  // Exit path for the count == 0 / non-oop case.
  __ bind(l_4);
  __ pop(T8);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();
  // NOTE(review): l_3 is bound but never branched to — leftover of the
  // x86 rep_movl fast path that was not ported.
  __ bind(l_3);
  if (is_oop) {
    __ bind(l_stchk);
    // Re-load 'to' and 'count' from the argument registers for the
    // store check over the destination range.
    __ move(T0, A1);
    __ move(T1, A2);
    array_store_check();
  }
  __ pop(T8);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();
  return start;
}
1479 // Arguments:
1480 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1481 // ignored
1482 // is_oop - true => oop array, so generate store check code
1483 // name - stub name string
1484 //
1485 // Inputs:
1486 // c_rarg0 - source array address
1487 // c_rarg1 - destination array address
1488 // c_rarg2 - element count, treated as ssize_t, can be zero
1489 //
1490 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1491 // the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
1493 //
1494 // Side Effects:
1495 // disjoint_int_copy_entry is set to the no-overlap entry point
1496 // used by generate_conjoint_int_oop_copy().
1497 //
// Generate a stub that copies 'count' 64-bit elements (jlongs, or oops
// when is_oop) between non-overlapping arrays; when is_oop is true a
// store check is emitted after the copy.
//
// Inputs:
//   A0 - source array address
//   A1 - destination array address
//   A2 - element count, treated as ssize_t, can be zero
//
// Same structure as generate_disjoint_int_oop_copy, but moving 8 bytes
// (ld/sd) per element.
address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  Label l_2, l_3, l_4, l_stchk;
  StubCodeMark mark(this, "StubRoutines", name);
  __ align(CodeEntryAlignment);
  address start = __ pc();
  __ push(T3);   // from
  __ push(T0);   // to
  __ push(T1);   // count
  __ push(T8);   // scratch (only pushed/popped here)
  __ move(T1, A2);
  __ move(T3, A0);
  __ move(T0, A1);

  // Unconditionally enter the copy loop.
  __ b(l_2);
  __ delayed()->nop();
  // NOTE(review): everything from here to bind(l_2) is unreachable — it
  // follows an unconditional branch and is a leftover of the x86
  // rep_movl fast path that was not ported.
  if (is_oop) {
    __ b(l_stchk);
    __ delayed()->nop();
  }
  __ pop(T8);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();

  __ bind(l_2);
  __ beq(T1, R0, l_4);         // zero elements: skip the loop
  __ delayed()->nop();
  __ align(16);

  // Forward copy loop: one 64-bit element per iteration.
  __ bind(l_3);
  __ ld(AT, T3, 0);
  __ sd(AT, T0, 0);
  __ addi(T3, T3, 8);
  __ addi(T0, T0, 8);
  __ addi(T1, T1, -1);
  __ bne(T1, R0, l_3);
  __ delayed()->nop();
  if (is_oop) {
    // Re-load 'to' and 'count' from the argument registers and emit the
    // store check over the destination range.
    __ bind(l_stchk);
    __ move(T0, A1);
    __ move(T1, A2);
    array_store_check();
  }
  __ bind(l_4);
  __ pop(T8);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();
  return start;
}
1572 // Arguments:
1573 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1574 // ignored
1575 // is_oop - true => oop array, so generate store check code
1576 // name - stub name string
1577 //
1578 // Inputs:
1579 // c_rarg0 - source array address
1580 // c_rarg1 - destination array address
1581 // c_rarg2 - element count, treated as ssize_t, can be zero
1582 //
1583 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1584 // the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
1586 //
// Generate a stub that copies 'count' 64-bit elements (jlongs, or oops
// when is_oop) between possibly-overlapping arrays. Non-overlapping
// calls are redirected to the matching disjoint stub; the overlapping
// case copies backwards (high addresses first). When is_oop is true a
// store check is emitted after the copy.
//
// Inputs:
//   A0 - source array address
//   A1 - destination array address
//   A2 - element count, treated as ssize_t, can be zero
address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  Label l_2, l_3, l_4, l_stchk;
  StubCodeMark mark(this, "StubRoutines", name);
  __ align(CodeEntryAlignment);
  address start = __ pc();
  address nooverlap_target;

  // Choose the disjoint stub matching the element kind and alignment.
  if (is_oop) {
    nooverlap_target = aligned ?
            StubRoutines::arrayof_oop_disjoint_arraycopy() :
            StubRoutines::oop_disjoint_arraycopy();
  } else {
    nooverlap_target = aligned ?
            StubRoutines::arrayof_jlong_disjoint_arraycopy() :
            StubRoutines::jlong_disjoint_arraycopy();
  }

  // shift = 3: elements are 8 bytes.
  array_overlap_test(nooverlap_target, 3);

  __ push(T3);   // from
  __ push(T0);   // to
  __ push(T1);   // count
  __ push(T8);   // scratch (only pushed/popped here)

  __ move(T1, A2);
  __ move(T3, A0);
  __ move(T0, A1);

  // Point both registers at the last element:
  //   T3 = from + count*8 - 8,  T0 = to + count*8 - 8
  __ sll(AT, T1, Address::times_8);
  __ add(AT, T3, AT);
  __ lea(T3, Address(AT, -8));
  __ sll(AT, T1, Address::times_8);
  __ add(AT, T0, AT);
  __ lea(T0, Address(AT, -8));

  __ beq(T1, R0, l_4);         // zero elements: skip the loop
  __ delayed()->nop();
  __ align(16);

  // Backward copy loop: one 64-bit element per iteration, high to low.
  __ bind(l_2);
  __ ld(AT, T3, 0);
  __ sd(AT, T0, 0);
  __ addi(T3, T3, -8);
  __ addi(T0, T0, -8);
  __ addi(T1, T1, -1);
  __ bne(T1, R0, l_2);
  __ delayed()->nop();
  if (is_oop) {
    __ b(l_stchk);
    __ delayed()->nop();
  }
  // Exit path for the count == 0 / non-oop case.
  __ bind(l_4);
  __ pop(T8);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();
  // NOTE(review): l_3 is bound but never branched to — leftover of the
  // x86 rep_movl fast path that was not ported.
  __ bind(l_3);
  if (is_oop) {
    __ bind(l_stchk);
    // Re-load 'to' and 'count' from the argument registers for the
    // store check over the destination range.
    __ move(T0, A1);
    __ move(T1, A2);
    array_store_check();
  }
  __ pop(T8);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();
  return start;
}
1684 #if 0
1685 // Arguments:
1686 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1687 // ignored
1688 // is_oop - true => oop array, so generate store check code
1689 // name - stub name string
1690 //
1691 // Inputs:
1692 // c_rarg0 - source array address
1693 // c_rarg1 - destination array address
1694 // c_rarg2 - element count, treated as ssize_t, can be zero
1695 //
// NOTE(review): this whole definition lives inside an '#if 0' region —
// it is the original x86_64 stub kept as reference for the MIPS port
// above and is never compiled.
//
// Copies 'count' 64-bit elements (jlongs or oops) between possibly-
// overlapping arrays, copying backwards in 32-byte chunks with an
// 8-byte tail loop; emits GC write barriers when is_oop.
address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register qword_count = rdx;  // elements count
  const Register saved_count = rcx;

  __ enter();                      // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);  // Make sure 'count' is clean int.

  address disjoint_copy_entry = NULL;
  if (is_oop) {
    assert(!UseCompressedOops, "shouldn't be called for compressed oops");
    disjoint_copy_entry = disjoint_oop_copy_entry;
    oop_copy_entry = __ pc();
    array_overlap_test(disjoint_oop_copy_entry, Address::times_8);
  } else {
    disjoint_copy_entry = disjoint_long_copy_entry;
    long_copy_entry = __ pc();
    array_overlap_test(disjoint_long_copy_entry, Address::times_8);
  }
  BLOCK_COMMENT("Entry:");
  // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)

  array_overlap_test(disjoint_copy_entry, Address::times_8);
  setup_arg_regs();  // from => rdi, to => rsi, count => rdx
                     // r9 and r10 may be used to save non-volatile registers

  // 'from', 'to' and 'qword_count' are now valid

  if (is_oop) {
    // Save to and count for store barrier
    __ movptr(saved_count, qword_count);
    // No registers are destroyed by this call
    gen_write_ref_array_pre_barrier(to, saved_count);
  }

  __ jmp(L_copy_32_bytes);

  // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
  __ movq(rax, Address(from, qword_count, Address::times_8, -8));
  __ movq(Address(to, qword_count, Address::times_8, -8), rax);
  __ decrement(qword_count);
  __ jcc(Assembler::notZero, L_copy_8_bytes);

  if (is_oop) {
    __ jmp(L_exit);
  } else {
    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
    restore_arg_regs();
    __ xorptr(rax, rax);  // return 0
    __ leave();           // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
  }

  // Copy in 32-bytes chunks
  copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);

  if (is_oop) {
    __ BIND(L_exit);
    // Emit the post barrier over the copied oop range.
    __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
    gen_write_ref_array_post_barrier(to, rcx, rax);
    inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
  } else {
    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
  }
  restore_arg_regs();
  __ xorptr(rax, rax);  // return 0
  __ leave();           // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
1776 // Helper for generating a dynamic type check.
1777 // Smashes no registers.
// NOTE(review): inside an '#if 0' region — x86_64 reference code, never
// compiled in this MIPS port.
//
// Helper for generating a dynamic type check.
// Branches to L_success if sub_klass is a subtype of super_klass;
// falls through on failure. Smashes no registers.
void generate_type_check(Register sub_klass,
                         Register super_check_offset,
                         Register super_klass,
                         Label& L_success) {
  assert_different_registers(sub_klass, super_check_offset, super_klass);

  BLOCK_COMMENT("type_check:");

  Label L_miss;

  // a couple of useful fields in sub_klass:
  int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
                   Klass::secondary_supers_offset_in_bytes());
  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
                   Klass::secondary_super_cache_offset_in_bytes());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // if the pointers are equal, we are done (e.g., String[] elements)
  __ cmpptr(super_klass, sub_klass);
  __ jcc(Assembler::equal, L_success);

  // check the supertype display:
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  __ cmpptr(super_klass, super_check_addr); // test the super type
  __ jcc(Assembler::equal, L_success);

  // if it was a primary super, we can just fail immediately
  __ cmpl(super_check_offset, sc_offset);
  __ jcc(Assembler::notEqual, L_miss);

  // Now do a linear scan of the secondary super-klass chain.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // (We need a couple more temps in any case.)
  // This code is rarely used, so simplicity is a virtue here.
  inc_counter_np(SharedRuntime::_partial_subtype_ctr);
  {
    __ push(rax);
    __ push(rcx);
    __ push(rdi);
    assert_different_registers(sub_klass, super_klass, rax, rcx, rdi);

    __ movptr(rdi, secondary_supers_addr);
    // Load the array length.
    __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
    // Skip to start of data.
    __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    // Scan rcx words at [rdi] for occurrence of rax
    // Set NZ/Z based on last compare
    __ movptr(rax, super_klass);
    if (UseCompressedOops) {
      // Compare against compressed form. Don't need to uncompress because
      // looks like orig rax is restored in popq below.
      __ encode_heap_oop(rax);
      __ repne_scanl();
    } else {
      __ repne_scan();
    }

    // Unspill the temp. registers:
    __ pop(rdi);
    __ pop(rcx);
    __ pop(rax);

    __ jcc(Assembler::notEqual, L_miss);
  }

  // Success. Cache the super we found and proceed in triumph.
  __ movptr(super_cache_addr, super_klass); // note: rax is dead
  __ jmp(L_success);

  // Fall through on failure!
  __ BIND(L_miss);
}
1853 //
1854 // Generate checkcasting array copy stub
1855 //
1856 // Input:
1857 // c_rarg0 - source array address
1858 // c_rarg1 - destination array address
1859 // c_rarg2 - element count, treated as ssize_t, can be zero
1860 // c_rarg3 - size_t ckoff (super_check_offset)
1861 // not Win64
1862 // c_rarg4 - oop ckval (super_klass)
1863 // Win64
1864 // rsp+40 - oop ckval (super_klass)
1865 //
1866 // Output:
1867 // rax == 0 - success
1868 // rax == -1^K - failure, where K is partial transfer count
1869 //
// NOTE(review): inside an '#if 0' region — x86_64 reference code, never
// compiled in this MIPS port.
//
// Generate a checkcasting array copy stub: copies oops one at a time,
// type-checking each element against the destination element klass.
// Returns 0 in rax on full success, or -1^K where K is the number of
// elements copied before the first failing element.
address generate_checkcast_copy(const char *name) {

  Label L_load_element, L_store_element, L_do_card_marks, L_done;

  // Input registers (after setup_arg_regs)
  const Register from        = rdi;   // source array address
  const Register to          = rsi;   // destination array address
  const Register length      = rdx;   // elements count
  const Register ckoff       = rcx;   // super_check_offset
  const Register ckval       = r8;    // super_klass

  // Registers used as temps (r13, r14 are save-on-entry)
  const Register end_from    = from;  // source array end address
  const Register end_to      = r13;   // destination array end address
  const Register count       = rdx;   // -(count_remaining)
  const Register r14_length  = r14;   // saved copy of length
  // End pointers are inclusive, and if length is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  const Register rax_oop    = rax;    // actual oop copied
  const Register r11_klass  = r11;    // oop._klass

  //---------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the two arrays are subtypes of Object[] but the
  // destination array type is not equal to or a supertype
  // of the source type.  Each element must be separately
  // checked.

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  checkcast_copy_entry  = __ pc();
  BLOCK_COMMENT("Entry:");

#ifdef ASSERT
  // caller guarantees that the arrays really are different
  // otherwise, we would have to make conjoint checks
  { Label L;
    array_overlap_test(L, TIMES_OOP);
    __ stop("checkcast_copy within a single array");
    __ bind(L);
  }
#endif //ASSERT

  // allocate spill slots for r13, r14
  enum {
    saved_r13_offset,
    saved_r14_offset,
    saved_rbp_offset,
    saved_rip_offset,
    saved_rarg0_offset
  };
  __ subptr(rsp, saved_rbp_offset * wordSize);
  __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
  __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
  setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
                     // ckoff => rcx, ckval => r8
                     // r9 and r10 may be used to save non-volatile registers
#ifdef _WIN64
  // last argument (#4) is on stack on Win64
  const int ckval_offset = saved_rarg0_offset + 4;
  __ movptr(ckval, Address(rsp, ckval_offset * wordSize));
#endif

  // check that int operands are properly extended to size_t
  assert_clean_int(length, rax);
  assert_clean_int(ckoff, rax);

#ifdef ASSERT
  BLOCK_COMMENT("assert consistent ckoff/ckval");
  // The ckoff and ckval must be mutually consistent,
  // even though caller generates both.
  { Label L;
    int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
                      Klass::super_check_offset_offset_in_bytes());
    __ cmpl(ckoff, Address(ckval, sco_offset));
    __ jcc(Assembler::equal, L);
    __ stop("super_check_offset inconsistent");
    __ bind(L);
  }
#endif //ASSERT

  // Loop-invariant addresses.  They are exclusive end pointers.
  Address end_from_addr(from, length, TIMES_OOP, 0);
  Address   end_to_addr(to,   length, TIMES_OOP, 0);
  // Loop-variant addresses.  They assume post-incremented count < 0.
  Address from_element_addr(end_from, count, TIMES_OOP, 0);
  Address   to_element_addr(end_to,   count, TIMES_OOP, 0);

  gen_write_ref_array_pre_barrier(to, count);

  // Copy from low to high addresses, indexed from the end of each array.
  __ lea(end_from, end_from_addr);
  __ lea(end_to,   end_to_addr);
  __ movptr(r14_length, length);        // save a copy of the length
  assert(length == count, "");          // else fix next line:
  __ negptr(count);                     // negate and test the length
  __ jcc(Assembler::notZero, L_load_element);

  // Empty array:  Nothing to do.
  __ xorptr(rax, rax);                  // return 0 on (trivial) success
  __ jmp(L_done);

  // ======== begin loop ========
  // (Loop is rotated; its entry is L_load_element.)
  // Loop control:
  //   for (count = -count; count != 0; count++)
  // Base pointers src, dst are biased by 8*(count-1), to last element.
  __ align(16);

  __ BIND(L_store_element);
  __ store_heap_oop(rax_oop, to_element_addr);  // store the oop
  __ sync();
  __ increment(count);                          // increment the count toward zero
  __ jcc(Assembler::zero, L_do_card_marks);

  // ======== loop entry is here ========
  __ BIND(L_load_element);
  __ load_heap_oop(rax_oop, from_element_addr); // load the oop
  __ testptr(rax_oop, rax_oop);
  __ jcc(Assembler::zero, L_store_element);     // null oops need no check

  __ load_klass(r11_klass, rax_oop);            // query the object klass
  generate_type_check(r11_klass, ckoff, ckval, L_store_element);
  // ======== end loop ========

  // It was a real error; we must depend on the caller to finish the job.
  // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
  // Emit GC store barriers for the oops we have copied (r14 + rdx),
  // and report their number to the caller.
  assert_different_registers(rax, r14_length, count, to, end_to, rcx);
  __ lea(end_to, to_element_addr);
  gen_write_ref_array_post_barrier(to, end_to, rscratch1);
  __ movptr(rax, r14_length);           // original oops
  __ addptr(rax, count);                // K = (original - remaining) oops
  __ notptr(rax);                       // report (-1^K) to caller
  __ jmp(L_done);

  // Come here on success only.
  __ BIND(L_do_card_marks);
  __ addptr(end_to, -wordSize);         // make an inclusive end pointer
  gen_write_ref_array_post_barrier(to, end_to, rscratch1);
  __ xorptr(rax, rax);                  // return 0 on success

  // Common exit point (success or failure).
  __ BIND(L_done);
  __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
  __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
  inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
  restore_arg_regs();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
2030 //
2031 // Generate 'unsafe' array copy stub
2032 // Though just as safe as the other stubs, it takes an unscaled
2033 // size_t argument instead of an element count.
2034 //
2035 // Input:
2036 // c_rarg0 - source array address
2037 // c_rarg1 - destination array address
2038 // c_rarg2 - byte count, treated as ssize_t, can be zero
2039 //
2040 // Examines the alignment of the operands and dispatches
2041 // to a long, int, short, or byte copy loop.
2042 //
// NOTE(review): inside an '#if 0' region — x86_64 reference code, never
// compiled in this MIPS port.
//
// Generate 'unsafe' array copy stub.
// Though just as safe as the other stubs, it takes an unscaled
// size_t byte count instead of an element count; it examines the
// common alignment of from/to/size and dispatches to the long, int,
// short, or byte copy loop.
address generate_unsafe_copy(const char *name) {

  Label L_long_aligned, L_int_aligned, L_short_aligned;

  // Input registers (before setup_arg_regs)
  const Register from        = c_rarg0;  // source array address
  const Register to          = c_rarg1;  // destination array address
  const Register size        = c_rarg2;  // byte count (size_t)

  // Register used as a temp
  const Register bits        = rax;      // test copy of low bits

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

  // OR all three operands together: the result's low bits reflect the
  // coarsest alignment shared by from, to, and size.
  __ mov(bits, from);
  __ orptr(bits, to);
  __ orptr(bits, size);

  __ testb(bits, BytesPerLong-1);
  __ jccb(Assembler::zero, L_long_aligned);

  __ testb(bits, BytesPerInt-1);
  __ jccb(Assembler::zero, L_int_aligned);

  __ testb(bits, BytesPerShort-1);
  __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));

  __ BIND(L_short_aligned);
  __ shrptr(size, LogBytesPerShort); // size => short_count
  __ jump(RuntimeAddress(short_copy_entry));

  __ BIND(L_int_aligned);
  __ shrptr(size, LogBytesPerInt); // size => int_count
  __ jump(RuntimeAddress(int_copy_entry));

  __ BIND(L_long_aligned);
  __ shrptr(size, LogBytesPerLong); // size => qword_count
  __ jump(RuntimeAddress(long_copy_entry));

  return start;
}
2092 // Perform range checks on the proposed arraycopy.
2093 // Kills temp, but nothing else.
2094 // Also, clean the sign bits of src_pos and dst_pos.
void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
                            Register src_pos, // source position (c_rarg1)
                            Register dst, // destination array oop (c_rarg2)
                            Register dst_pos, // destination position (c_rarg3)
                            Register length, // element count (32-bit)
                            Register temp, // scratch register, clobbered
                            Label& L_failed) { // branched to on any range violation
  BLOCK_COMMENT("arraycopy_range_checks:");

  // if (src_pos + length > arrayOop(src)->length()) FAIL;
  __ movl(temp, length);
  __ addl(temp, src_pos); // src_pos + length
  // 'above' is an unsigned compare
  __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed);

  // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
  __ movl(temp, length);
  __ addl(temp, dst_pos); // dst_pos + length
  __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed);

  // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  // Move with sign extension can be used since they are positive.
  __ movslq(src_pos, src_pos);
  __ movslq(dst_pos, dst_pos);

  BLOCK_COMMENT("arraycopy_range_checks done");
}
2124 //
2125 // Generate generic array copy stubs
2126 //
2127 // Input:
2128 // c_rarg0 - src oop
2129 // c_rarg1 - src_pos (32-bits)
2130 // c_rarg2 - dst oop
2131 // c_rarg3 - dst_pos (32-bits)
2132 // not Win64
2133 // c_rarg4 - element count (32-bits)
2134 // Win64
2135 // rsp+40 - element count (32-bits)
2136 //
2137 // Output:
2138 // rax == 0 - success
2139 // rax == -1^K - failure, where K is partial transfer count
2140 //
address generate_generic_copy(const char *name) {
  // Fully generic arraycopy entry: validates all arguments at runtime
  // (see numbered conditions below), then dispatches to the matching
  // primitive/oop/checkcast copy stub. Returns 0 in rax on success or
  // ~K on failure (K = elements copied before the failure).

  Label L_failed, L_failed_0, L_objArray;
  Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

  // Input registers
  const Register src = c_rarg0; // source array oop
  const Register src_pos = c_rarg1; // source position
  const Register dst = c_rarg2; // destination array oop
  const Register dst_pos = c_rarg3; // destination position
  // elements count is on stack on Win64
#ifdef _WIN64
#define C_RARG4 Address(rsp, 6 * wordSize)
#else
#define C_RARG4 c_rarg4
#endif

  // Pad so the jmp(L_failed) emitted just below ends exactly on a
  // CodeEntryAlignment boundary (the assert after it checks this).
  { int modulus = CodeEntryAlignment;
    int target = modulus - 5; // 5 = sizeof jmp(L_failed)
    int advance = target - (__ offset() % modulus);
    if (advance < 0) advance += modulus;
    if (advance > 0) __ nop(advance);
  }
  StubCodeMark mark(this, "StubRoutines", name);

  // Short-hop target to L_failed. Makes for denser prologue code.
  __ BIND(L_failed_0);
  __ jmp(L_failed);
  assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

  __ align(CodeEntryAlignment);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

  //-----------------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the following conditions are met:
  //
  // (1) src and dst must not be null.
  // (2) src_pos must not be negative.
  // (3) dst_pos must not be negative.
  // (4) length must not be negative.
  // (5) src klass and dst klass should be the same and not NULL.
  // (6) src and dst should be arrays.
  // (7) src_pos + length must not exceed length of src.
  // (8) dst_pos + length must not exceed length of dst.
  //

  // if (src == NULL) return -1;
  __ testptr(src, src); // src oop
  size_t j1off = __ offset();
  __ jccb(Assembler::zero, L_failed_0);

  // if (src_pos < 0) return -1;
  __ testl(src_pos, src_pos); // src_pos (32-bits)
  __ jccb(Assembler::negative, L_failed_0);

  // if (dst == NULL) return -1;
  __ testptr(dst, dst); // dst oop
  __ jccb(Assembler::zero, L_failed_0);

  // if (dst_pos < 0) return -1;
  __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  size_t j4off = __ offset();
  __ jccb(Assembler::negative, L_failed_0);

  // The first four tests are very dense code,
  // but not quite dense enough to put four
  // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predicters
  // do not like jumps so close together.
  // Make sure of this.
  guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");

  // registers used as temp
  const Register r11_length = r11; // elements count to copy
  const Register r10_src_klass = r10; // array klass
  const Register r9_dst_klass = r9; // dest array klass

  // if (length < 0) return -1;
  __ movl(r11_length, C_RARG4); // length (elements count, 32-bits value)
  __ testl(r11_length, r11_length);
  __ jccb(Assembler::negative, L_failed_0);

  __ load_klass(r10_src_klass, src);
#ifdef ASSERT
  // assert(src->klass() != NULL);
  BLOCK_COMMENT("assert klasses not null");
  { Label L1, L2;
    __ testptr(r10_src_klass, r10_src_klass);
    __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL
    __ bind(L1);
    __ stop("broken null klass");
    __ bind(L2);
    __ load_klass(r9_dst_klass, dst);
    __ cmpq(r9_dst_klass, 0);
    __ jcc(Assembler::equal, L1); // this would be broken also
    BLOCK_COMMENT("assert done");
  }
#endif

  // Load layout helper (32-bits)
  //
  // |array_tag| | header_size | element_type | |log2_element_size|
  // 32 30 24 16 8 2 0
  //
  // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  //

  int lh_offset = klassOopDesc::header_size() * HeapWordSize +
                  Klass::layout_helper_offset_in_bytes();

  const Register rax_lh = rax; // layout helper

  __ movl(rax_lh, Address(r10_src_klass, lh_offset));

  // Handle objArrays completely differently...
  jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  __ cmpl(rax_lh, objArray_lh);
  __ jcc(Assembler::equal, L_objArray);

  // if (src->klass() != dst->klass()) return -1;
  __ load_klass(r9_dst_klass, dst);
  __ cmpq(r10_src_klass, r9_dst_klass);
  __ jcc(Assembler::notEqual, L_failed);

  // if (!src->is_Array()) return -1;
  // (non-array layout helpers are >= _lh_neutral_value)
  __ cmpl(rax_lh, Klass::_lh_neutral_value);
  __ jcc(Assembler::greaterEqual, L_failed);

  // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
  { Label L;
    __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
    __ jcc(Assembler::greaterEqual, L);
    __ stop("must be a primitive array");
    __ bind(L);
  }
#endif

  // Checks conditions (7) and (8); also zero-extends src_pos/dst_pos.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  // typeArrayKlass
  //
  // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  //

  const Register r10_offset = r10; // array offset
  const Register rax_elsize = rax_lh; // element size

  __ movl(r10_offset, rax_lh);
  __ shrl(r10_offset, Klass::_lh_header_size_shift);
  __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
  __ addptr(src, r10_offset); // src array offset
  __ addptr(dst, r10_offset); // dst array offset
  BLOCK_COMMENT("choose copy loop based on element size");
  __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize

  // next registers should be set before the jump to corresponding stub
  const Register from = c_rarg0; // source array address
  const Register to = c_rarg1; // destination array address
  const Register count = c_rarg2; // elements count

  // 'from', 'to', 'count' registers should be set in such order
  // since they are the same as 'src', 'src_pos', 'dst'.

  __ BIND(L_copy_bytes);
  __ cmpl(rax_elsize, 0);
  __ jccb(Assembler::notEqual, L_copy_shorts);
  __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
  __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
  __ cmpl(rax_elsize, LogBytesPerShort);
  __ jccb(Assembler::notEqual, L_copy_ints);
  __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
  __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
  __ cmpl(rax_elsize, LogBytesPerInt);
  __ jccb(Assembler::notEqual, L_copy_longs);
  __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
  __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
  { Label L;
    __ cmpl(rax_elsize, LogBytesPerLong);
    __ jcc(Assembler::equal, L);
    __ stop("must be long copy, but elsize is wrong");
    __ bind(L);
  }
#endif
  __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
  __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(long_copy_entry));

  // objArrayKlass
  __ BIND(L_objArray);
  // live at this point: r10_src_klass, src[_pos], dst[_pos]

  Label L_plain_copy, L_checkcast_copy;
  // test array classes for subtyping
  __ load_klass(r9_dst_klass, dst);
  __ cmpq(r10_src_klass, r9_dst_klass); // usual case is exact equality
  __ jcc(Assembler::notEqual, L_checkcast_copy);

  // Identically typed arrays can be copied without element-wise checks.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  __ lea(from, Address(src, src_pos, TIMES_OOP,
                       arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  __ lea(to, Address(dst, dst_pos, TIMES_OOP,
                     arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  __ movl2ptr(count, r11_length); // length
  __ BIND(L_plain_copy);
  __ jump(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
  // live at this point: r10_src_klass, !r11_length
  {
    // assert(r11_length == C_RARG4); // will reload from here
    Register r11_dst_klass = r11;
    __ load_klass(r11_dst_klass, dst);

    // Before looking at dst.length, make sure dst is also an objArray.
    __ cmpl(Address(r11_dst_klass, lh_offset), objArray_lh);
    __ jcc(Assembler::notEqual, L_failed);

    // It is safe to examine both src.length and dst.length.
#ifndef _WIN64
    arraycopy_range_checks(src, src_pos, dst, dst_pos, C_RARG4,
                           rax, L_failed);
#else
    __ movl(r11_length, C_RARG4); // reload
    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           rax, L_failed);
    __ load_klass(r11_dst_klass, dst); // reload
#endif

    // Marshal the base address arguments now, freeing registers.
    __ lea(from, Address(src, src_pos, TIMES_OOP,
                         arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ lea(to, Address(dst, dst_pos, TIMES_OOP,
                       arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ movl(count, C_RARG4); // length (reloaded)
    Register sco_temp = c_rarg3; // this register is free now
    assert_different_registers(from, to, count, sco_temp,
                               r11_dst_klass, r10_src_klass);
    assert_clean_int(count, sco_temp);

    // Generate the type check.
    int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
                      Klass::super_check_offset_offset_in_bytes());
    __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);
    generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);

    // Fetch destination element klass from the objArrayKlass header.
    int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
                     objArrayKlass::element_klass_offset_in_bytes());
    __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
    __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);

    // the checkcast_copy loop needs two extra arguments:
    assert(c_rarg3 == sco_temp, "#3 already in place");
    __ movptr(C_RARG4, r11_dst_klass); // dst.klass.element_klass
    __ jump(RuntimeAddress(checkcast_copy_entry));
  }

  __ BIND(L_failed);
  __ xorptr(rax, rax);
  __ notptr(rax); // return -1
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
2435 #undef length_arg
2436 #endif
2438 //FIXME
address generate_disjoint_long_copy(bool aligned, const char *name) {
  // Copies A2 jlong (8-byte) elements forward from A0 to A1, one
  // element per iteration. The commented x86 lines are the original
  // implementation this MIPS code was ported from.
  Label l_1, l_2;
  StubCodeMark mark(this, "StubRoutines", name);
  __ align(CodeEntryAlignment);
  address start = __ pc();

  // __ movl(ecx, Address(esp, 4+8)); // count
  // __ movl(eax, Address(esp, 4+0)); // from
  // __ movl(edx, Address(esp, 4+4)); // to
  __ move(T1, A2); // T1 = element count
  __ move(T3, A0); // T3 = source cursor
  __ move(T0, A1); // T0 = destination cursor
  // Preserve the work registers across the loop.
  __ push(T3);
  __ push(T0);
  __ push(T1);
  //__ subl(edx, eax);
  //__ jmp(l_2);
  // Enter at the decrement/test so a zero count copies nothing.
  __ b(l_2);
  __ delayed()->nop();
  __ align(16);
  __ bind(l_1);
  // Copy one 8-byte element per iteration via AT.
  // if (VM_Version::supports_mmx()) {
  // __ movq(mmx0, Address(eax));
  // __ movq(Address(eax, edx, Address::times_1), mmx0);
  // } else {
  // __ fild_d(Address(eax));
  __ ld(AT, T3, 0);
  // __ fistp_d(Address(eax, edx, Address::times_1));
  __ sd (AT, T0, 0);
  // }
  // __ addl(eax, 8);
  __ addi(T3, T3, 8);
  __ addi(T0, T0, 8);
  __ bind(l_2);
  // __ decl(ecx);
  __ addi(T1, T1, -1);
  // __ jcc(Assembler::greaterEqual, l_1);
  // Loop while the decremented count is still >= 0.
  __ bgez(T1, l_1);
  __ delayed()->nop();
  // if (VM_Version::supports_mmx()) {
  // __ emms();
  // }
  // __ ret(0);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();
  return start;
}
address generate_conjoint_long_copy(bool aligned, const char *name) {
  // Copies A2 jlong (8-byte) elements from A0 to A1 for possibly
  // overlapping arrays: if the ranges do not overlap the test below
  // tail-calls the disjoint stub, otherwise this loop copies backwards
  // (from the last element down) so overlap is handled correctly.
  Label l_1, l_2;
  StubCodeMark mark(this, "StubRoutines", name);
  __ align(CodeEntryAlignment);
  address start = __ pc();
  address nooverlap_target = aligned ?
  StubRoutines::arrayof_jlong_disjoint_arraycopy() :
  StubRoutines::jlong_disjoint_arraycopy();
  // 3 here matches the 8-byte element size (count is scaled by 1 << 3).
  array_overlap_test(nooverlap_target, 3);

  // Preserve the work registers across the loop.
  __ push(T3);
  __ push(T0);
  __ push(T1);

  /* __ movl(ecx, Address(esp, 4+8)); // count
  __ movl(eax, Address(esp, 4+0)); // from
  __ movl(edx, Address(esp, 4+4)); // to
  __ jmp(l_2);

  */
  __ move(T1, A2); // T1 = element count
  __ move(T3, A0); // T3 = source cursor
  __ move(T0, A1); // T0 = destination cursor
  // Point T3/T0 at the last element of each range: base + count*8 - 8.
  __ sll(AT, T1, Address::times_8);
  __ add(AT, T3, AT);
  __ lea(T3 , Address(AT, -8));
  __ sll(AT, T1, Address::times_8);
  __ add(AT, T0, AT);
  __ lea(T0 , Address(AT, -8));

  // Enter at the decrement/test so a zero count copies nothing.
  __ b(l_2);
  __ delayed()->nop();
  __ align(16);
  __ bind(l_1);
  /* if (VM_Version::supports_mmx()) {
  __ movq(mmx0, Address(eax, ecx, Address::times_8));
  __ movq(Address(edx, ecx,Address::times_8), mmx0);
  } else {
  __ fild_d(Address(eax, ecx, Address::times_8));
  __ fistp_d(Address(edx, ecx,Address::times_8));
  }
  */
  // Copy one 8-byte element per iteration, walking down through memory.
  __ ld(AT, T3, 0);
  __ sd (AT, T0, 0);
  __ addi(T3, T3, -8);
  __ addi(T0, T0,-8);
  __ bind(l_2);
  // __ decl(ecx);
  __ addi(T1, T1, -1);
  //__ jcc(Assembler::greaterEqual, l_1);
  // Loop while the decremented count is still >= 0.
  __ bgez(T1, l_1);
  __ delayed()->nop();
  // if (VM_Version::supports_mmx()) {
  // __ emms();
  // }
  // __ ret(0);
  __ pop(T1);
  __ pop(T0);
  __ pop(T3);
  __ jr(RA);
  __ delayed()->nop();
  return start;
}
2557 void generate_arraycopy_stubs() {
2558 if (UseCompressedOops) {
2559 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, "oop_disjoint_arraycopy");
2560 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, "oop_arraycopy");
2561 } else {
2562 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, "oop_disjoint_arraycopy");
2563 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, "oop_arraycopy");
2564 }
2566 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
2567 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
2568 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
2569 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
2570 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
2572 // if (VM_Version::supports_mmx())
2573 //if (false)
2574 // StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_mmx_copy_aligned("arrayof_jshort_disjoint_arraycopy");
2575 // else
2576 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
2577 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(true, false, "arrayof_jint_disjoint_arraycopy");
2578 //StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(true, true, "arrayof_oop_disjoint_arraycopy");
2579 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
2581 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
2582 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
2583 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
2584 StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, "jlong_arraycopy");
2586 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
2587 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
2588 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_oop_copy(true, false, "arrayof_jint_arraycopy");
2589 //StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_int_oop_copy(true, true, "arrayof_oop_arraycopy");
2590 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
2592 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
2593 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
2594 }
// Wang: generates the SafeFetch32 and SafeFetchN stubs.
void generate_safefetch(const char* name, int size, address* entry,
                        address* fault_pc, address* continuation_pc) {
  // safefetch signatures:
  // int SafeFetch32(int* adr, int errValue);
  // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
  //
  // arguments:
  // A0 = adr
  // A1 = errValue
  //
  // result:
  // V0 = *adr or errValue
  // (the signal handler resumes at *continuation_pc when the load at
  // *fault_pc faults, leaving the errValue argument in A1)

  StubCodeMark mark(this, "StubRoutines", name);

  // Entry point, pc or function descriptor.
  *entry = __ pc();

  // Load *adr into A1, may fault.
  *fault_pc = __ pc();
  switch (size) {
    case 4:
      // int32_t
      __ lw(A1, A0, 0);
      break;
    case 8:
      // int64_t
      __ ld(A1, A0, 0);
      break;
    default:
      ShouldNotReachHere();
  }

  // return errValue or *adr
  *continuation_pc = __ pc();
  __ addu(V0,A1,R0); // V0 = A1 + 0, i.e. a register move into the result
  __ jr(RA);
  __ delayed()->nop();
}
2638 #undef __
2639 #define __ masm->
2641 // Continuation point for throwing of implicit exceptions that are
2642 // not handled in the current activation. Fabricates an exception
2643 // oop and initiates normal exception dispatching in this
2644 // frame. Since we need to preserve callee-saved values (currently
2645 // only for C2, but done for C1 as well) we need a callee-saved oop
2646 // map and therefore have to make these stubs into RuntimeStubs
2647 // rather than BufferBlobs. If the compiler needs all registers to
2648 // be preserved between the fault point and the exception handler
2649 // then it must assume responsibility for that in
2650 // AbstractCompiler::continuation_for_implicit_null_exception or
2651 // continuation_for_implicit_division_by_zero_exception. All other
2652 // implicit exceptions (e.g., NullPointerException or
2653 // AbstractMethodError on entry) are either at call sites or
2654 // otherwise assume that stack unwinding will be initiated, so
2655 // caller saved registers were assumed volatile in the compiler.
address generate_throw_exception(const char* name,
                                 address runtime_entry,
                                 bool restore_saved_exception_pc) {
  // Builds a RuntimeStub that calls 'runtime_entry' (which fabricates
  // and posts the exception) and then jumps to the shared
  // forward_exception entry. If restore_saved_exception_pc is true,
  // RA is first reloaded from JavaThread::saved_exception_pc.
  //
  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  //#define aoqi_test
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  enum layout {
    thread_off, // last_java_sp
    S7_off, // callee saved register sp + 1
    S6_off, // callee saved register sp + 2
    S5_off, // callee saved register sp + 3
    S4_off, // callee saved register sp + 4
    S3_off, // callee saved register sp + 5
    S2_off, // callee saved register sp + 6
    S1_off, // callee saved register sp + 7
    S0_off, // callee saved register sp + 8
    FP_off,
    ret_address,
    framesize
  };

  int insts_size = 2048;
  int locs_size = 32;

  // CodeBuffer* code = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false,
  // NULL, NULL, NULL, false, NULL, name, false);
  CodeBuffer code (name , insts_size, locs_size);
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  OopMapSet* oop_maps = new OopMapSet();
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  MacroAssembler* masm = new MacroAssembler(&code);
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif

  address start = __ pc();
  //__ stop("generate_throw_exception");
  /*
  __ move(AT, (int)&jerome1 );
  __ sw(SP, AT, 0);
  __ move(AT, (int)&jerome2 );
  __ sw(FP, AT, 0);
  __ move(AT, (int)&jerome3 );
  __ sw(RA, AT, 0);
  __ move(AT, (int)&jerome4 );
  __ sw(R0, AT, 0);
  __ move(AT, (int)&jerome5 );
  __ sw(R0, AT, 0);
  __ move(AT, (int)&jerome6 );
  __ sw(R0, AT, 0);
  __ move(AT, (int)&jerome7 );
  __ sw(R0, AT, 0);
  __ move(AT, (int)&jerome10 );
  __ sw(R0, AT, 0);

  __ pushad();

  //__ enter();
  __ call(CAST_FROM_FN_PTR(address, SharedRuntime::print_call_statistics),
  relocInfo::runtime_call_type);
  __ delayed()->nop();

  //__ leave();
  __ popad();

  */

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM
#ifndef OPT_THREAD
  Register java_thread = TREG;
  __ get_thread(java_thread);
#else
  Register java_thread = TREG;
#endif
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  if (restore_saved_exception_pc) {
    __ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset())); // eax
  }

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // Allocate the frame and save the callee-saved registers S0..S7
  // at the offsets defined by the 'layout' enum above.
  __ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
  __ sd(S0, SP, S0_off * wordSize);
  __ sd(S1, SP, S1_off * wordSize);
  __ sd(S2, SP, S2_off * wordSize);
  __ sd(S3, SP, S3_off * wordSize);
  __ sd(S4, SP, S4_off * wordSize);
  __ sd(S5, SP, S5_off * wordSize);
  __ sd(S6, SP, S6_off * wordSize);
  __ sd(S7, SP, S7_off * wordSize);

  int frame_complete = __ pc() - start;
  // push java thread (becomes first argument of C function)
  __ sd(java_thread, SP, thread_off * wordSize);
  if (java_thread!=A0)
    __ move(A0, java_thread);

  // Set up last_Java_sp and last_Java_fp
  __ set_last_Java_frame(java_thread, SP, FP, NULL);
  // Record the pc the GC/stack walker will see as last_Java_pc; it is
  // computed relative to the li48/call sequence emitted just below.
  __ relocate(relocInfo::internal_pc_type);
  {
    intptr_t save_pc = (intptr_t)__ pc() + NativeMovConstReg::instruction_size + NativeCall::return_address_offset + 4;
    __ li48(AT, save_pc);
  }
  __ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));

  // Call runtime
  __ call(runtime_entry);
  __ delayed()->nop();
  // Generate oop map
  OopMap* map = new OopMap(framesize, 0);
  oop_maps->add_gc_map(__ offset(), map);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
#ifndef OPT_THREAD
  __ get_thread(java_thread);
#endif

  __ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
  // __ reset_last_Java_frame(java_thread, true);
  __ reset_last_Java_frame(java_thread, true, true);

  // Restore callee save registers. This must be done after resetting the Java frame
  __ ld(S0, SP, S0_off * wordSize);
  __ ld(S1, SP, S1_off * wordSize);
  __ ld(S2, SP, S2_off * wordSize);
  __ ld(S3, SP, S3_off * wordSize);
  __ ld(S4, SP, S4_off * wordSize);
  __ ld(S5, SP, S5_off * wordSize);
  __ ld(S6, SP, S6_off * wordSize);
  __ ld(S7, SP, S7_off * wordSize);

  // discard arguments
  __ addi(SP, SP, (framesize-2) * wordSize); // epilog
  // __ leave(); // required for proper stackwalking of RuntimeStub frame
  // Manual frame teardown: SP = FP + wordSize, reload caller's FP.
  __ addi(SP, FP, wordSize);
  __ ld(FP, SP, -1*wordSize);
  // check for pending exceptions
#ifdef ASSERT
  Label L;
  __ lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
  __ bne(AT, R0, L);
  __ delayed()->nop();
  __ should_not_reach_here();
  __ bind(L);
#endif //ASSERT
  __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  __ delayed()->nop();
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code,frame_complete,
                                                    framesize, oop_maps, false);
#ifdef aoqi_test
  tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
#endif
  return stub->entry_point();
}
2831 // Initialization
void generate_initial() {
  // First-phase stub generation: emits the small set of stubs the VM
  // needs early in bootstrap (call stub, exception forwarding, etc.).
  // The large commented-out region below is kept as a reference list
  // of the stubs other ports generate at this point.
  /*
  // Generates all stubs and initializes the entry points

  // This platform-specific stub is needed by generate_call_stub()
  StubRoutines::mips::_mxcsr_std = generate_fp_mask("mxcsr_std", 0x0000000000001F80);

  // entry points that exist in all platforms Note: This is code
  // that could be shared among different platforms - however the
  // benefit seems to be smaller than the disadvantage of having a
  // much more complicated generator structure. See also comment in
  // stubRoutines.hpp.

  StubRoutines::_forward_exception_entry = generate_forward_exception();

  StubRoutines::_call_stub_entry =
  generate_call_stub(StubRoutines::_call_stub_return_address);

  // is referenced by megamorphic call
  StubRoutines::_catch_exception_entry = generate_catch_exception();

  // atomic calls
  StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
  StubRoutines::_atomic_xchg_ptr_entry = generate_atomic_xchg_ptr();
  StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
  StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
  StubRoutines::_atomic_add_entry = generate_atomic_add();
  StubRoutines::_atomic_add_ptr_entry = generate_atomic_add_ptr();
  StubRoutines::_fence_entry = generate_orderaccess_fence();

  StubRoutines::_handler_for_unsafe_access_entry =
  generate_handler_for_unsafe_access();

  // platform dependent
  StubRoutines::mips::_get_previous_fp_entry = generate_get_previous_fp();

  StubRoutines::mips::_verify_mxcsr_entry = generate_verify_mxcsr();
  */
  // Generates all stubs and initializes the entry points

  //-------------------------------------------------------------
  //-----------------------------------------------------------
  // entry points that exist in all platforms
  // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller
  // than the disadvantage of having a much more complicated generator structure.
  // See also comment in stubRoutines.hpp.
  StubRoutines::_forward_exception_entry = generate_forward_exception();
  StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
  // is referenced by megamorphic call
  StubRoutines::_catch_exception_entry = generate_catch_exception();

  StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();

  // platform dependent
  StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
}
2889 void generate_all() {
2890 #ifdef aoqi_test
2891 tty->print_cr("%s:%d", __func__, __LINE__);
2892 #endif
2893 // Generates all stubs and initializes the entry points
2895 // These entry points require SharedInfo::stack0 to be set up in
2896 // non-core builds and need to be relocatable, so they each
2897 // fabricate a RuntimeStub internally.
2898 /*
2899 StubRoutines::_throw_AbstractMethodError_entry =
2900 generate_throw_exception("AbstractMethodError throw_exception",
2901 CAST_FROM_FN_PTR(address,
2902 SharedRuntime::
2903 throw_AbstractMethodError),
2904 false);
2906 StubRoutines::_throw_IncompatibleClassChangeError_entry =
2907 generate_throw_exception("IncompatibleClassChangeError throw_exception",
2908 CAST_FROM_FN_PTR(address,
2909 SharedRuntime::
2910 throw_IncompatibleClassChangeError),
2911 false);
2913 StubRoutines::_throw_ArithmeticException_entry =
2914 generate_throw_exception("ArithmeticException throw_exception",
2915 CAST_FROM_FN_PTR(address,
2916 SharedRuntime::
2917 throw_ArithmeticException),
2918 true);
2920 StubRoutines::_throw_NullPointerException_entry =
2921 generate_throw_exception("NullPointerException throw_exception",
2922 CAST_FROM_FN_PTR(address,
2923 SharedRuntime::
2924 throw_NullPointerException),
2925 true);
2927 StubRoutines::_throw_NullPointerException_at_call_entry =
2928 generate_throw_exception("NullPointerException at call throw_exception",
2929 CAST_FROM_FN_PTR(address,
2930 SharedRuntime::
2931 throw_NullPointerException_at_call),
2932 false);
2934 StubRoutines::_throw_StackOverflowError_entry =
2935 generate_throw_exception("StackOverflowError throw_exception",
2936 CAST_FROM_FN_PTR(address,
2937 SharedRuntime::
2938 throw_StackOverflowError),
2939 false);
2941 // entry points that are platform specific
2942 StubRoutines::mips::_f2i_fixup = generate_f2i_fixup();
2943 StubRoutines::mips::_f2l_fixup = generate_f2l_fixup();
2944 StubRoutines::mips::_d2i_fixup = generate_d2i_fixup();
2945 StubRoutines::mips::_d2l_fixup = generate_d2l_fixup();
2947 StubRoutines::mips::_float_sign_mask = generate_fp_mask("float_sign_mask", 0x7FFFFFFF7FFFFFFF);
2948 StubRoutines::mips::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000);
2949 StubRoutines::mips::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
2950 StubRoutines::mips::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
2952 // support for verify_oop (must happen after universe_init)
2953 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
2955 // arraycopy stubs used by compilers
2956 generate_arraycopy_stubs();
2957 */
2958 #ifdef aoqi_test
2959 tty->print_cr("%s:%d", __func__, __LINE__);
2960 #endif
2961 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
2962 #ifdef aoqi_test
2963 tty->print_cr("%s:%d", __func__, __LINE__);
2964 #endif
2965 // StubRoutines::_throw_ArithmeticException_entry = generate_throw_exception("ArithmeticException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException), true);
2966 #ifdef aoqi_test
2967 tty->print_cr("%s:%d", __func__, __LINE__);
2968 #endif
2969 // StubRoutines::_throw_NullPointerException_entry = generate_throw_exception("NullPointerException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
2970 #ifdef aoqi_test
2971 tty->print_cr("%s:%d", __func__, __LINE__);
2972 #endif
2973 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2974 #ifdef aoqi_test
2975 tty->print_cr("%s:%d", __func__, __LINE__);
2976 #endif
2977 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
2978 #ifdef aoqi_test
2979 tty->print_cr("%s:%d", __func__, __LINE__);
2980 #endif
2982 //------------------------------------------------------
2983 //------------------------------------------------------------------
2984 // entry points that are platform specific
2986 // support for verify_oop (must happen after universe_init)
2987 #ifdef aoqi_test
2988 tty->print_cr("%s:%d", __func__, __LINE__);
2989 #endif
2990 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
2991 #ifdef aoqi_test
2992 tty->print_cr("%s:%d", __func__, __LINE__);
2993 #endif
2994 #ifndef CORE
2995 // arraycopy stubs used by compilers
2996 generate_arraycopy_stubs();
2997 #ifdef aoqi_test
2998 tty->print_cr("%s:%d", __func__, __LINE__);
2999 #endif
3000 #endif
3002 // Safefetch stubs.
3003 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
3004 &StubRoutines::_safefetch32_fault_pc,
3005 &StubRoutines::_safefetch32_continuation_pc);
3006 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
3007 &StubRoutines::_safefetchN_fault_pc,
3008 &StubRoutines::_safefetchN_continuation_pc);
3009 }
3011 public:
3012 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3013 if (all) {
3014 generate_all();
3015 } else {
3016 generate_initial();
3017 }
3018 }
3019 }; // end class declaration
3020 /*
3021 address StubGenerator::disjoint_byte_copy_entry = NULL;
3022 address StubGenerator::disjoint_short_copy_entry = NULL;
3023 address StubGenerator::disjoint_int_copy_entry = NULL;
3024 address StubGenerator::disjoint_long_copy_entry = NULL;
3025 address StubGenerator::disjoint_oop_copy_entry = NULL;
3027 address StubGenerator::byte_copy_entry = NULL;
3028 address StubGenerator::short_copy_entry = NULL;
3029 address StubGenerator::int_copy_entry = NULL;
3030 address StubGenerator::long_copy_entry = NULL;
3031 address StubGenerator::oop_copy_entry = NULL;
3033 address StubGenerator::checkcast_copy_entry = NULL;
3034 */
3035 void StubGenerator_generate(CodeBuffer* code, bool all) {
3036 StubGenerator g(code, all);
3037 }