Mon, 30 May 2016 02:01:38 -0400
[C2] Rewrite generate_disjoint_short_copy.
Eliminated unaligned access and optimized the copy algorithm.
xml.transform improved by 50%, total GEO improved by 13%.
Copy Algorithm:
Generate stub for disjoint short copy. If "aligned" is true, the
"from" and "to" addresses are assumed to be heapword aligned.
Arguments for generated stub:
from: A0
to: A1
elm.count: A2 treated as signed
one element: 2 bytes
Strategy for aligned==true:
If length <= 9:
1. copy 1 element at a time (l_5)
If length > 9:
1. copy 4 elements at a time until less than 4 elements are left (l_7)
2. copy 2 elements at a time until less than 2 elements are left (l_6)
3. copy last element if one was left in step 2. (l_1)
Strategy for aligned==false:
If length <= 9: same as aligned==true case
If length > 9:
1. continue with step 7. if the alignment of from and to mod 4
is different.
2. align from and to to 4 bytes by copying 1 element if necessary
3. at l_2 from and to are 4 byte aligned; continue with
6. if they cannot be aligned to 8 bytes because they have
got different alignment mod 8.
4. at this point we know that both, from and to, have the same
alignment mod 8, now copy one element if necessary to get
8 byte alignment of from and to.
5. copy 4 elements at a time until less than 4 elements are
left; depending on step 3. all load/stores are aligned.
6. copy 2 elements at a time until less than 2 elements are
left. (l_6)
7. copy 1 element at a time. (l_5)
8. copy last element if one was left in step 6. (l_1)
TODO:
1. use loongson 128-bit load/store
2. use loop unrolling optimization when len is big enough, for example if
len > 0x2000:
__ bind(l_x);
__ ld(AT, tmp1, 0);
__ ld(tmp, tmp1, 8);
__ sd(AT, tmp2, 0);
__ sd(tmp, tmp2, 8);
__ ld(AT, tmp1, 16);
__ ld(tmp, tmp1, 24);
__ sd(AT, tmp2, 16);
__ sd(tmp, tmp2, 24);
__ daddi(tmp1, tmp1, 32);
__ daddi(tmp2, tmp2, 32);
__ daddi(tmp3, tmp3, -16);
__ daddi(AT, tmp3, -16);
__ bgez(AT, l_x);
__ delayed()->nop();
1 /*
2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.hpp"
28 #include "asm/macroAssembler.inline.hpp"
29 #include "interpreter/interpreter.hpp"
30 #include "nativeInst_mips.hpp"
31 #include "oops/instanceOop.hpp"
32 #include "oops/method.hpp"
33 #include "oops/objArrayKlass.hpp"
34 #include "oops/oop.inline.hpp"
35 #include "prims/methodHandles.hpp"
36 #include "runtime/frame.inline.hpp"
37 #include "runtime/handles.inline.hpp"
38 #include "runtime/sharedRuntime.hpp"
39 #include "runtime/stubCodeGenerator.hpp"
40 #include "runtime/stubRoutines.hpp"
41 #include "runtime/thread.inline.hpp"
42 #include "utilities/top.hpp"
43 #ifdef COMPILER2
44 #include "opto/runtime.hpp"
45 #endif
48 // Declaration and definition of StubGenerator (no .hpp file).
49 // For a more detailed description of the stub routine structure
50 // see the comment in stubRoutines.hpp
52 #define __ _masm->
53 //#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
54 //#define a__ ((Assembler*)_masm)->
56 //#ifdef PRODUCT
57 //#define BLOCK_COMMENT(str) /* nothing */
58 //#else
59 //#define BLOCK_COMMENT(str) __ block_comment(str)
60 //#endif
62 //#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Bit mask 0xFFC0.
// NOTE(review): nothing in the visible part of this file reads MXCSR_MASK and the
// name looks like an x86 leftover - confirm it is still needed.
63 const int MXCSR_MASK = 0xFFC0; // Mask out any pending exceptions
65 // Stub Code definitions
67 static address handle_unsafe_access() {
68 JavaThread* thread = JavaThread::current();
69 address pc = thread->saved_exception_pc();
70 // pc is the instruction which we must emulate
71 // doing a no-op is fine: return garbage from the load
72 // therefore, compute npc
73 //address npc = Assembler::locate_next_instruction(pc);
74 address npc = (address)((unsigned long)pc + sizeof(unsigned long));
76 // request an async exception
77 thread->set_pending_unsafe_access_error();
79 // return address of next instruction to execute
80 return npc;
81 }
83 class StubGenerator: public StubCodeGenerator {
84 private:
86 // ABI mips n64
87 // This figure is not the MIPS ABI; it shows the stack layout used when calling Java from C.
88 // Call stubs are used to call Java from C
89 //
90 // [ return_from_Java ]
91 // [ argument word n-1 ] <--- sp
92 // ...
93 // [ argument word 0 ]
94 // ...
95 //-10 [ S6 ]
96 // -9 [ S5 ]
97 // -8 [ S4 ]
98 // -7 [ S3 ]
99 // -6 [ S0 ]
100 // -5 [ TSR(S2) ]
101 // -4 [ LVP(S7) ]
102 // -3 [ BCP(S1) ]
103 // -2 [ saved fp ] <--- fp_after_call
104 // -1 [ return address ]
105 // 0 [ ptr. to call wrapper ] <--- a0 (old sp -->)fp
106 // 1 [ result ] <--- a1
107 // 2 [ result_type ] <--- a2
108 // 3 [ method ] <--- a3
109 // 4 [ entry_point ] <--- a4
110 // 5 [ parameters ] <--- a5
111 // 6 [ parameter_size ] <--- a6
112 // 7 [ thread ] <--- a7
114 //
115 // _LP64: n64 does not save paras in sp.
116 //
117 // [ return_from_Java ]
118 // [ argument word n-1 ] <--- sp
119 // ...
120 // [ argument word 0 ]
121 // ...
122 //-14 [ thread ]
123 //-13 [ result_type ] <--- a2
124 //-12 [ result ] <--- a1
125 //-11 [ ptr. to call wrapper ] <--- a0
126 //-10 [ S6 ]
127 // -9 [ S5 ]
128 // -8 [ S4 ]
129 // -7 [ S3 ]
130 // -6 [ S0 ]
131 // -5 [ TSR(S2) ]
132 // -4 [ LVP(S7) ]
133 // -3 [ BCP(S1) ]
134 // -2 [ saved fp ] <--- fp_after_call
135 // -1 [ return address ]
136 // 0 [ ] <--- old sp
137 /*
138 * 2014/01/16 Fu: Find a right place in the call_stub for GP.
139 * GP will point to the starting point of Interpreter::dispatch_table(itos).
140 * It should be saved/restored before/after Java calls.
141 *
142 */
// Slot offsets, in words, used by generate_call_stub() and generate_catch_exception().
//  - RA_off .. S6_off and GP_off are applied to SP as it is on entry to the stub
//    (and again on exit, after SP has been reset).
//  - result_off, result_type_off and thread_off are applied to FP, which the
//    call stub sets to the entry SP minus two words.
//  - total_off is the number of words by which the call stub lowers SP.
enum call_stub_layout {
  // callee-saved state, relative to the entry SP
  RA_off          =  -1,
  FP_off          =  -2,
  BCP_off         =  -3,
  LVP_off         =  -4,
  TSR_off         =  -5,
  S1_off          =  -6,
  S3_off          =  -7,
  S4_off          =  -8,
  S5_off          =  -9,
  S6_off          = -10,

  // incoming arguments kept for the return path, relative to FP
  result_off      = -11,
  result_type_off = -12,
  thread_off      = -13,

  // frame size and the GP slot
  total_off       = thread_off - 3,
  GP_off          = -16,
};
// Generates the call stub, the entry used to call Java code from C.
//
// Registers read by the generated code:
//   A0 - call wrapper pointer (stored at FP + frame::entry_frame_call_wrapper_offset)
//   A1 - address the result is written to
//   A2 - result type (compared against T_LONG, T_FLOAT and T_DOUBLE)
//   A3 - method (moved to Rmethod)
//   A4 - entry point (target of the jalr)
//   A5 - address of the parameter words
//   A6 - number of parameter words
//   A7 - thread
//
// return_address receives the pc that follows the call into Java and its delay slot.
// The function returns the start address of the stub.
161 address generate_call_stub(address& return_address) {
163 StubCodeMark mark(this, "StubRoutines", "call_stub");
164 address start = __ pc();
166 // same as in generate_catch_exception()!
168 // stub code
169 // save ra and fp
// All saves below go to negative offsets from the entry SP; SP itself is lowered afterwards.
170 __ sd(RA, SP, RA_off * wordSize);
171 __ sd(FP, SP, FP_off * wordSize);
172 __ sd(BCP, SP, BCP_off * wordSize);
173 __ sd(LVP, SP, LVP_off * wordSize);
174 __ sd(GP, SP, GP_off * wordSize);
175 __ sd(TSR, SP, TSR_off * wordSize);
176 __ sd(S1, SP, S1_off * wordSize);
177 __ sd(S3, SP, S3_off * wordSize);
178 __ sd(S4, SP, S4_off * wordSize);
179 __ sd(S5, SP, S5_off * wordSize);
180 __ sd(S6, SP, S6_off * wordSize);
// GP is loaded with the address of the interpreter dispatch table for itos.
183 __ li48(GP, (long)Interpreter::dispatch_table(itos));
185 // I think 14 is the max gap between argument and callee saved register
// FP = entry SP - 2 words; SP is lowered by -total_off words.
186 __ daddi(FP, SP, (-2) * wordSize);
187 __ daddi(SP, SP, total_off * wordSize);
188 //FIXME, aoqi. find a suitable place to save A1 & A2.
189 /*
190 __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
191 __ sd(A1, FP, 3 * wordSize);
192 __ sd(A2, FP, 4 * wordSize);
193 __ sd(A3, FP, 5 * wordSize);
194 __ sd(A4, FP, 6 * wordSize);
195 __ sd(A5, FP, 7 * wordSize);
196 __ sd(A6, FP, 8 * wordSize);
197 __ sd(A7, FP, 9 * wordSize);
198 */
// Keep the call wrapper, result address, result type and thread in FP-relative slots;
// result address and result type are reloaded after the Java call returns.
199 __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
200 __ sd(A1, FP, result_off * wordSize);
201 __ sd(A2, FP, result_type_off * wordSize);
202 __ sd(A7, FP, thread_off * wordSize);
204 #ifdef OPT_THREAD
205 //__ get_thread(TREG);
206 __ move(TREG, A7);
208 //__ ld(TREG, FP, thread_off * wordSize);
209 #endif
210 //add for compressedoops
211 __ reinit_heapbase();
213 #ifdef ASSERT
214 // make sure we have no pending exceptions
215 {
216 Label L;
217 __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
218 __ beq(AT, R0, L);
219 __ delayed()->nop();
220 /* FIXME: I do not know how to realize stop in mips arch, do it in the future */
221 __ stop("StubRoutines::call_stub: entered with pending exception");
222 __ bind(L);
223 }
224 #endif
226 // pass parameters if any
227 // A5: parameter
228 // A6: parameter_size
229 // T0: parameter_size_tmp(--)
230 // T2: offset(++)
231 // T3: tmp
232 Label parameters_done;
233 // judge if the parameter_size equals 0
234 __ beq(A6, R0, parameters_done);
235 __ delayed()->nop();
// Reserve A6 stack elements and round SP down to a multiple of StackAlignmentInBytes.
236 __ dsll(AT, A6, Interpreter::logStackElementSize);
237 __ dsub(SP, SP, AT);
238 __ move(AT, -StackAlignmentInBytes);
239 __ andr(SP, SP , AT);
240 // Copy Java parameters in reverse order (receiver last)
241 // Note that the argument order is inverted in the process
// source is A5[T0 - 1], with T0 counting down from A6 to 1
// dest is SP[T2] (plus Interpreter::expr_offset_in_bytes(0)), with T2 counting up from 0
242 // source is edx[ecx: N-1..0]
243 // dest is esp[ebx: 0..N-1]
244 Label loop;
245 __ move(T0, A6);
246 __ move(T2, R0);
247 __ bind(loop);
249 // get parameter
250 __ dsll(T3, T0, LogBytesPerWord);
251 __ dadd(T3, T3, A5);
252 __ ld(AT, T3, -wordSize);
253 __ dsll(T3, T2, LogBytesPerWord);
254 __ dadd(T3, T3, SP);
255 __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
256 __ daddi(T2, T2, 1);
257 __ daddi(T0, T0, -1);
258 __ bne(T0, R0, loop);
259 __ delayed()->nop();
260 // advance to next parameter
262 // call Java function
263 __ bind(parameters_done);
265 // receiver in V0, methodOop in Rmethod
267 __ move(Rmethod, A3);
268 __ move(Rsender, SP); //set sender sp
269 __ jalr(A4);
270 __ delayed()->nop();
// The pc following the call and its delay slot is handed back to the caller of this generator.
271 return_address = __ pc();
273 Label common_return;
274 __ bind(common_return);
276 // store result depending on type
277 // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
278 __ ld(T0, FP, result_off * wordSize); // result --> T0
279 Label is_long, is_float, is_double, exit;
280 __ ld(T2, FP, result_type_off * wordSize); // result_type --> T2
// Each comparison for the next type is computed in the delay slot of the previous branch.
281 __ daddi(T3, T2, (-1) * T_LONG);
282 __ beq(T3, R0, is_long);
283 __ delayed()->daddi(T3, T2, (-1) * T_FLOAT);
284 __ beq(T3, R0, is_float);
285 __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
286 __ beq(T3, R0, is_double);
287 __ delayed()->nop();
289 // handle T_INT case
290 __ sd(V0, T0, 0 * wordSize);
291 __ bind(exit);
293 // restore
// SP is reset to FP + 2 words, i.e. the value it had on entry, so the same
// SP-relative offsets used for the saves above apply again.
294 __ daddi(SP, FP, 2 * wordSize );
295 __ ld(RA, SP, RA_off * wordSize);
296 __ ld(FP, SP, FP_off * wordSize);
297 __ ld(BCP, SP, BCP_off * wordSize);
298 __ ld(LVP, SP, LVP_off * wordSize);
299 __ ld(GP, SP, GP_off * wordSize);
300 __ ld(TSR, SP, TSR_off * wordSize);
302 __ ld(S1, SP, S1_off * wordSize);
303 __ ld(S3, SP, S3_off * wordSize);
304 __ ld(S4, SP, S4_off * wordSize);
305 __ ld(S5, SP, S5_off * wordSize);
306 __ ld(S6, SP, S6_off * wordSize);
308 // return
309 __ jr(RA);
310 __ delayed()->nop();
312 // handle return types different from T_INT
313 __ bind(is_long);
314 __ sd(V0, T0, 0 * wordSize);
315 //__ sd(V1, T0, 1 * wordSize);
// NOTE(review): this also zeroes the word that follows the result slot;
// confirm the result buffer is at least two words long.
316 __ sd(R0, T0, 1 * wordSize);
317 __ b(exit);
318 __ delayed()->nop();
320 __ bind(is_float);
321 __ swc1(F0, T0, 0 * wordSize);
322 __ b(exit);
323 __ delayed()->nop();
325 __ bind(is_double);
326 __ sdc1(F0, T0, 0 * wordSize);
327 //__ sdc1(F1, T0, 1 * wordSize);
// NOTE(review): same second-word store as in the T_LONG case - confirm the buffer size.
328 __ sd(R0, T0, 1 * wordSize);
329 __ b(exit);
330 __ delayed()->nop();
331 //FIXME, 1.6 mips version add operation of fpu here
// The pc at this point is registered as the "compiled return" entry; the code
// emitted here simply branches to the common result handling above.
332 StubRoutines::gs2::set_call_stub_compiled_return(__ pc());
333 __ b(common_return);
334 __ delayed()->nop();
335 return start;
336 }
338 // Return point for a Java call if there's an exception thrown in
339 // Java code. The exception is caught and transformed into a
340 // pending exception stored in JavaThread that can be tested from
341 // within the VM.
342 //
343 // Note: Usually the parameters are removed by the callee. In case
344 // of an exception crossing an activation frame boundary, that is
345 // not the case if the callee is compiled code => need to setup the
346 // rsp.
347 //
348 // rax: exception oop
350 address generate_catch_exception() {
351 StubCodeMark mark(this, "StubRoutines", "catch_exception");
352 address start = __ pc();
354 Register thread = TREG;
356 // get thread directly
357 #ifndef OPT_THREAD
358 __ ld(thread, FP, thread_off * wordSize);
359 #endif
361 #ifdef ASSERT
362 // verify that threads correspond
363 { Label L;
364 __ get_thread(T8);
365 __ beq(T8, thread, L);
366 __ delayed()->nop();
367 __ stop("StubRoutines::catch_exception: threads must correspond");
368 __ bind(L);
369 }
370 #endif
371 // set pending exception
372 __ verify_oop(V0);
373 __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
374 __ li(AT, (long)__FILE__);
375 __ sd(AT, thread, in_bytes(Thread::exception_file_offset ()));
376 __ li(AT, (long)__LINE__);
377 __ sd(AT, thread, in_bytes(Thread::exception_line_offset ()));
379 // complete return to VM
380 assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
381 __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
382 __ delayed()->nop();
384 return start;
385 }
387 // Continuation point for runtime calls returning with a pending
388 // exception. The pending exception check happened in the runtime
389 // or native call stub. The pending exception in Thread is
390 // converted into a Java-level exception.
391 //
392 // Contract with Java-level exception handlers:
393 // rax: exception
394 // rdx: throwing pc
395 //
396 // NOTE: At entry of this stub, exception-pc must be on stack !!
398 address generate_forward_exception() {
399 StubCodeMark mark(this, "StubRoutines", "forward exception");
400 //Register thread = TREG;
401 Register thread = TREG;
402 address start = __ pc();
404 // Upon entry, the sp points to the return address returning into Java
405 // (interpreted or compiled) code; i.e., the return address becomes the
406 // throwing pc.
407 //
408 // Arguments pushed before the runtime call are still on the stack but
409 // the exception handler will reset the stack pointer -> ignore them.
410 // A potential result in registers can be ignored as well.
412 #ifdef ASSERT
413 // make sure this code is only executed if there is a pending exception
414 #ifndef OPT_THREAD
415 __ get_thread(thread);
416 #endif
417 { Label L;
418 __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
419 __ bne(AT, R0, L);
420 __ delayed()->nop();
421 __ stop("StubRoutines::forward exception: no pending exception (1)");
422 __ bind(L);
423 }
424 #endif
426 // compute exception handler into T9
427 __ ld(A1, SP, 0);
428 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
429 __ move(T9, V0);
430 __ pop(V1);
432 #ifndef OPT_THREAD
433 __ get_thread(thread);
434 #endif
435 __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
436 __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));
438 #ifdef ASSERT
439 // make sure exception is set
440 { Label L;
441 __ bne(V0, R0, L);
442 __ delayed()->nop();
443 __ stop("StubRoutines::forward exception: no pending exception (2)");
444 __ bind(L);
445 }
446 #endif
448 // continue at exception handler (return address removed)
449 // V0: exception
450 // T9: exception handler
451 // V1: throwing pc
452 __ verify_oop(V0);
453 __ jr(T9);
454 __ delayed()->nop();
456 return start;
457 }
459 // Support for intptr_t get_previous_fp()
460 //
461 // This routine is used to find the previous frame pointer for the
462 // caller (current_frame_guess). This is used as part of debugging
463 // ps() is seemingly lost trying to find frames.
464 // This code assumes that the caller (current_frame_guess) has a frame.
465 address generate_get_previous_fp() {
466 StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
467 const Address old_fp (FP, 0);
468 const Address older_fp (V0, 0);
469 address start = __ pc();
470 __ enter();
471 __ lw(V0, old_fp); // callers fp
472 __ lw(V0, older_fp); // the frame for ps()
473 __ leave();
474 __ jr(RA);
475 __ delayed()->nop();
476 return start;
477 }
478 // The following routine generates a subroutine to throw an
479 // asynchronous UnknownError when an unsafe access gets a fault that
480 // could not be reasonably prevented by the programmer. (Example:
481 // SIGBUS/OBJERR.)
482 address generate_handler_for_unsafe_access() {
483 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
484 address start = __ pc();
485 __ pushad(); // push registers
486 // Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
487 __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
488 __ delayed()->nop();
489 __ sw(V0, SP, RegisterImpl::number_of_registers * BytesPerWord);
490 __ popad();
491 __ jr(RA);
492 __ delayed()->nop();
493 return start;
494 }
496 // Non-destructive plausibility checks for oops
497 //
498 // Arguments:
499 // all args on stack!
500 //
501 // Stack after saving c_rarg3:
502 // [tos + 0]: saved c_rarg3
503 // [tos + 1]: saved c_rarg2
504 // [tos + 2]: saved r12 (several TemplateTable methods use it)
505 // [tos + 3]: saved flags
506 // [tos + 4]: return address
507 // * [tos + 5]: error message (char*)
508 // * [tos + 6]: object to verify (oop)
509 // * [tos + 7]: saved rax - saved by caller and bashed
510 // * = popped on exit
511 address generate_verify_oop() {
512 StubCodeMark mark(this, "StubRoutines", "verify_oop");
513 address start = __ pc();
514 __ reinit_heapbase();
515 __ verify_oop_subroutine();
516 address end = __ pc();
517 return start;
518 }
520 //
521 // Generate overlap test for array copy stubs
522 //
523 // Input:
524 // A0 - array1
525 // A1 - array2
526 // A2 - element count
527 //
528 // Note: this code can only use %eax, %ecx, and %edx
529 //
531 // use T9 as temp
532 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
533 int elem_size = 1 << log2_elem_size;
534 Address::ScaleFactor sf = Address::times_1;
536 switch (log2_elem_size) {
537 case 0: sf = Address::times_1; break;
538 case 1: sf = Address::times_2; break;
539 case 2: sf = Address::times_4; break;
540 case 3: sf = Address::times_8; break;
541 }
543 __ dsll(AT, A2, sf);
544 __ dadd(AT, AT, A0);
545 __ lea(T9, Address(AT, -elem_size));
546 __ dsub(AT, A1, A0);
547 __ blez(AT, no_overlap_target);
548 __ delayed()->nop();
549 __ dsub(AT, A1, T9);
550 __ bgtz(AT, no_overlap_target);
551 __ delayed()->nop();
553 // 2016/05/10 aoqi: If A0 = 0xf... and A1 = 0x0..., than goto no_overlap_target
554 Label L;
555 __ bgez(A0, L);
556 __ delayed()->nop();
557 __ bgtz(A1, no_overlap_target);
558 __ delayed()->nop();
559 __ bind(L);
561 }
563 //
564 // Generate store check for array
565 //
566 // Input:
567 // %edi - starting address
568 // %ecx - element count
569 //
570 // The 2 input registers are overwritten
571 //
573 //
574 // Generate store check for array
575 //
576 // Input:
577 // T0 - starting address(edi)
578 // T1 - element count (ecx)
579 //
580 // The 2 input registers are overwritten
581 //
583 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
585 void array_store_check() {
586 BarrierSet* bs = Universe::heap()->barrier_set();
587 assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
588 CardTableModRefBS* ct = (CardTableModRefBS*)bs;
589 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
590 Label l_0;
592 __ dsll(AT, T1, TIMES_OOP);
593 __ dadd(AT, T0, AT);
594 __ daddiu(T1, AT, - BytesPerHeapOop);
596 __ shr(T0, CardTableModRefBS::card_shift);
597 __ shr(T1, CardTableModRefBS::card_shift);
599 __ dsub(T1, T1, T0); // end --> cards count
600 __ bind(l_0);
602 __ li48(AT, (long)ct->byte_map_base);
603 __ dadd(AT, AT, T0);
604 __ dadd(AT, AT, T1);
605 __ sb(R0, AT, 0);
606 //__ daddi(T1, T1, -4);
607 __ daddi(T1, T1, - 1);
608 __ bgez(T1, l_0);
609 __ delayed()->nop();
610 }
612 // Arguments:
613 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
614 // ignored
615 // name - stub name string
616 //
617 // Inputs:
618 // c_rarg0 - source array address
619 // c_rarg1 - destination array address
620 // c_rarg2 - element count, treated as ssize_t, can be zero
621 //
622 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
623 // we let the hardware handle it. The one to eight bytes within words,
624 // dwords or qwords that span cache line boundaries will still be loaded
625 // and stored atomically.
626 //
627 // Side Effects:
628 // disjoint_byte_copy_entry is set to the no-overlap entry point
629 // used by generate_conjoint_byte_copy().
630 //
// Generates a stub that copies A2 bytes from A0 to A1, from low to high addresses.
//
//   aligned - when true, the alignment prologue below is not generated and the
//             4-byte copy loop is entered directly
//   name    - stub name
//
// T3, T0, T1 and T8 are pushed on entry and popped before returning.
// AT (and T9 when !aligned) are used as temporaries.
// Returns the start address of the stub.
631 address generate_disjoint_byte_copy(bool aligned, const char *name) {
632 StubCodeMark mark(this, "StubRoutines", name);
633 __ align(CodeEntryAlignment);
634 address start = __ pc();
635 Label l_0, l_1, l_2, l_3, l_4, l_5, l_6;
637 __ push(T3);
638 __ push(T0);
639 __ push(T1);
640 __ push(T8);
// T3 = from, T0 = to, T1 = working counter
641 __ move(T3, A0);
642 __ move(T0, A1);
643 __ move(T1, A2);
644 __ move(T8, T1); // T8 keeps the byte count; T1 is reused as a loop counter
// counts of 3 or less go straight to the byte-wise tail at l_4
645 __ daddi(AT, T1, -3);
646 __ blez(AT, l_4);
647 __ delayed()->nop();
648 if (!aligned) {
649 //TODO: copy 8 bytes at one time
650 // 2016/5/8 Jin: only when src and dest has the same alignment can we do lw/sw */
// different alignment mod 4: copy everything byte by byte (T1 still holds the full count)
651 __ andi(AT, T3, 3);
652 __ andi(T9, T0, 3);
653 __ bne(AT, T9, l_5);
654 __ delayed()->nop();
656 // align source address at a 4-byte boundary: T1 = (4 - from) & 3 bytes to copy first
657 __ move(T1, 4);
658 __ sub(T1, T1, T3);
659 __ andi(T1, T1, 3);
660 __ beq(T1, R0, l_1);
661 __ delayed()->nop();
// the prefix bytes are deducted from the remaining count in T8
662 __ sub(T8,T8,T1);
663 __ bind(l_0);
664 __ lb(AT, T3, 0);
665 __ sb(AT, T0, 0);
666 __ addi(T3, T3, 1);
667 __ addi(T0, T0, 1);
668 __ addi(T1 ,T1, -1);
669 __ bne(T1, R0, l_0);
670 __ delayed()->nop();
671 __ bind(l_1);
672 __ move(T1, T8);
673 }
// T1 = number of 4-byte words to copy
674 __ shr(T1, 2);
675 __ beq(T1, R0, l_4); // no 4-byte words to move
676 __ delayed()->nop();
677 // copy aligned 4-byte words
678 __ bind(l_2);
679 __ align(16);
680 __ bind(l_3);
681 __ lw(AT, T3, 0);
682 __ sw(AT, T0, 0 );
683 __ addi(T3, T3, 4);
684 __ addi(T0, T0, 4);
685 __ addi(T1, T1, -1);
686 __ bne(T1, R0, l_3);
687 __ delayed()->nop();
688 __ bind(l_4);
// T1 = remaining count & 3, the bytes left after the word loop
689 __ move(T1, T8);
690 __ andi(T1, T1, 3);
691 __ beq(T1, R0, l_6);
692 __ delayed()->nop();
693 // copy suffix
694 __ bind(l_5);
695 __ lb(AT, T3, 0);
696 __ sb(AT, T0, 0);
697 __ addi(T3, T3, 1);
698 __ addi(T0, T0, 1);
699 __ addi(T1, T1, -1);
700 __ bne(T1, R0, l_5 );
701 __ delayed()->nop();
702 __ bind(l_6);
703 __ pop(T8);
704 __ pop(T1);
705 __ pop(T0);
706 __ pop(T3);
707 __ jr(RA);
708 __ delayed()->nop();
709 return start;
710 }
712 // Arguments:
713 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
714 // ignored
715 // name - stub name string
716 //
717 // Inputs:
718 // A0 - source array address
719 // A1 - destination array address
720 // A2 - element count, treated as ssize_t, can be zero
721 //
722 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
723 // we let the hardware handle it. The one to eight bytes within words,
724 // dwords or qwords that span cache line boundaries will still be loaded
725 // and stored atomically.
726 //
727 address generate_conjoint_byte_copy(bool aligned, const char *name) {
728 __ align(CodeEntryAlignment);
729 StubCodeMark mark(this, "StubRoutines", name);
730 address start = __ pc();
732 Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
733 Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;
735 address nooverlap_target = aligned ?
736 StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
737 StubRoutines::jbyte_disjoint_arraycopy();
739 array_overlap_test(nooverlap_target, 0);
741 const Register from = A0; // source array address
742 const Register to = A1; // destination array address
743 const Register count = A2; // elements count
744 const Register end_from = T3; // source array end address
745 const Register end_to = T0; // destination array end address
746 const Register end_count = T1; // destination array end address
748 __ push(end_from);
749 __ push(end_to);
750 __ push(end_count);
751 __ push(T8);
753 // copy from high to low
754 __ move(end_count, count);
755 __ dadd(end_from, from, end_count);
756 __ dadd(end_to, to, end_count);
758 // 2016/05/08 aoqi: If end_from and end_to has differante alignment, unaligned copy is performed.
759 __ andi(AT, end_from, 3);
760 __ andi(T8, end_to, 3);
761 __ bne(AT, T8, l_copy_byte);
762 __ delayed()->nop();
764 // First deal with the unaligned data at the top.
765 __ bind(l_unaligned);
766 __ beq(end_count, R0, l_exit);
767 __ delayed()->nop();
769 __ andi(AT, end_from, 3);
770 __ bne(AT, R0, l_from_unaligned);
771 __ delayed()->nop();
773 __ andi(AT, end_to, 3);
774 __ beq(AT, R0, l_4_bytes_aligned);
775 __ delayed()->nop();
777 __ bind(l_from_unaligned);
778 __ lb(AT, end_from, -1);
779 __ sb(AT, end_to, -1);
780 __ daddi(end_from, end_from, -1);
781 __ daddi(end_to, end_to, -1);
782 __ daddi(end_count, end_count, -1);
783 __ b(l_unaligned);
784 __ delayed()->nop();
786 // now end_to, end_from point to 4-byte aligned high-ends
787 // end_count contains byte count that is not copied.
788 // copy 4 bytes at a time
789 __ bind(l_4_bytes_aligned);
791 __ move(T8, end_count);
792 __ daddi(AT, end_count, -3);
793 __ blez(AT, l_copy_suffix);
794 __ delayed()->nop();
796 //__ andi(T8, T8, 3);
797 __ lea(end_from, Address(end_from, -4));
798 __ lea(end_to, Address(end_to, -4));
800 __ dsrl(end_count, end_count, 2);
801 __ align(16);
802 __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes
803 __ lw(AT, end_from, 0);
804 __ sw(AT, end_to, 0);
805 __ addi(end_from, end_from, -4);
806 __ addi(end_to, end_to, -4);
807 __ addi(end_count, end_count, -1);
808 __ bne(end_count, R0, l_copy_4_bytes_loop);
809 __ delayed()->nop();
811 __ b(l_copy_suffix);
812 __ delayed()->nop();
813 // copy dwords aligned or not with repeat move
814 // l_copy_suffix
815 // copy suffix (0-3 bytes)
816 __ bind(l_copy_suffix);
817 __ andi(T8, T8, 3);
818 __ beq(T8, R0, l_exit);
819 __ delayed()->nop();
820 __ addi(end_from, end_from, 3);
821 __ addi(end_to, end_to, 3);
822 __ bind(l_copy_suffix_loop);
823 __ lb(AT, end_from, 0);
824 __ sb(AT, end_to, 0);
825 __ addi(end_from, end_from, -1);
826 __ addi(end_to, end_to, -1);
827 __ addi(T8, T8, -1);
828 __ bne(T8, R0, l_copy_suffix_loop);
829 __ delayed()->nop();
831 __ bind(l_copy_byte);
832 __ beq(end_count, R0, l_exit);
833 __ delayed()->nop();
834 __ lb(AT, end_from, -1);
835 __ sb(AT, end_to, -1);
836 __ daddi(end_from, end_from, -1);
837 __ daddi(end_to, end_to, -1);
838 __ daddi(end_count, end_count, -1);
839 __ b(l_copy_byte);
840 __ delayed()->nop();
842 __ bind(l_exit);
843 __ pop(T8);
844 __ pop(end_count);
845 __ pop(end_to);
846 __ pop(end_from);
847 __ jr(RA);
848 __ delayed()->nop();
849 return start;
850 }
852 // Generate stub for disjoint short copy. If "aligned" is true, the
853 // "from" and "to" addresses are assumed to be heapword aligned.
854 //
855 // Arguments for generated stub:
856 // from: A0
857 // to: A1
858 // elm.count: A2 treated as signed
859 // one element: 2 bytes
860 //
861 // Strategy for aligned==true:
862 //
863 // If length <= 9:
864 // 1. copy 1 elements at a time (l_5)
865 //
866 // If length > 9:
867 // 1. copy 4 elements at a time until less than 4 elements are left (l_7)
868 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
869 // 3. copy last element if one was left in step 2. (l_1)
870 //
871 //
872 // Strategy for aligned==false:
873 //
874 // If length <= 9: same as aligned==true case
875 //
876 // If length > 9:
877 // 1. continue with step 7. if the alignment of from and to mod 4
878 // is different.
879 // 2. align from and to to 4 bytes by copying 1 element if necessary
880 // 3. at l_2 from and to are 4 byte aligned; continue with
881 // 6. if they cannot be aligned to 8 bytes because they have
882 // got different alignment mod 8.
883 // 4. at this point we know that both, from and to, have the same
884 // alignment mod 8, now copy one element if necessary to get
885 // 8 byte alignment of from and to.
886 // 5. copy 4 elements at a time until less than 4 elements are
887 // left; depending on step 3. all load/stores are aligned.
888 // 6. copy 2 elements at a time until less than 2 elements are
889 // left. (l_6)
890 // 7. copy 1 element at a time. (l_5)
891 // 8. copy last element if one was left in step 6. (l_1)
892 //
893 // TODO:
894 //
895 // 1. use loongson 128-bit load/store
896 // 2. use loop unrolling optimization when len is big enough, for example if len > 0x2000:
897 // __ bind(l_x);
898 // __ ld(AT, tmp1, 0);
899 // __ ld(tmp, tmp1, 8);
900 // __ sd(AT, tmp2, 0);
901 // __ sd(tmp, tmp2, 8);
902 // __ ld(AT, tmp1, 16);
903 // __ ld(tmp, tmp1, 24);
904 // __ sd(AT, tmp2, 16);
905 // __ sd(tmp, tmp2, 24);
906 // __ daddi(tmp1, tmp1, 32);
907 // __ daddi(tmp2, tmp2, 32);
908 // __ daddi(tmp3, tmp3, -16);
909 // __ daddi(AT, tmp3, -16);
910 // __ bgez(AT, l_x);
911 // __ delayed()->nop();
912 //
// Generates the disjoint short (2-byte element) copy stub.
//
//   aligned - when true the whole alignment prologue is skipped and the
//             8-byte copy section at l_7 is entered directly
//   name    - stub name
//
// Registers read by the generated code: A0 (from), A1 (to), A2 (element count).
// T0, T1 and T3 are pushed on entry and popped before returning; AT is used
// as a temporary.
// Returns the start address of the stub.
913 address generate_disjoint_short_copy(bool aligned, const char * name) {
914 StubCodeMark mark(this, "StubRoutines", name);
915 __ align(CodeEntryAlignment);
// tmp1 = current source address, tmp2 = current destination address, tmp3 = elements left
917 Register tmp1 = T0;
918 Register tmp2 = T1;
919 Register tmp3 = T3;
921 address start = __ pc();
923 __ push(tmp1);
924 __ push(tmp2);
925 __ push(tmp3);
926 __ move(tmp1, A0);
927 __ move(tmp2, A1);
928 __ move(tmp3, A2);
930 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
931 Label l_debug;
932 // don't try anything fancy if arrays don't have many elements
// 9 elements or fewer: element-wise copy at l_1
933 __ daddi(AT, tmp3, -9);
934 __ blez(AT, l_1);
935 __ delayed()->nop();
937 if (!aligned) {
938 __ xorr(AT, A0, A1);
939 __ andi(AT, AT, 1);
940 __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
941 __ delayed()->nop();
943 __ xorr(AT, A0, A1);
944 __ andi(AT, AT, 3);
945 __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
946 __ delayed()->nop();
948 // At this point it is guaranteed that both, from and to have the same alignment mod 4.
950 // Copy 1 element if necessary to align to 4 bytes.
951 __ andi(AT, A0, 3);
952 __ beq(AT, R0, l_2);
953 __ delayed()->nop();
955 __ lhu(AT, tmp1, 0);
956 __ daddi(tmp1, tmp1, 2);
957 __ sh(AT, tmp2, 0);
958 __ daddi(tmp2, tmp2, 2);
959 __ daddi(tmp3, tmp3, -1);
960 __ bind(l_2);
962 // At this point the positions of both, from and to, are at least 4 byte aligned.
964 // Copy 4 elements at a time.
965 // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
966 __ xorr(AT, tmp1, tmp2);
967 __ andi(AT, AT, 7);
968 __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned
969 __ delayed()->nop();
971 // Copy a 2-element word if necessary to align to 8 bytes.
972 __ andi(AT, tmp1, 7);
973 __ beq(AT, R0, l_7);
974 __ delayed()->nop();
976 __ lw(AT, tmp1, 0);
977 __ daddi(tmp3, tmp3, -2);
978 __ sw(AT, tmp2, 0);
979 { // FasterArrayCopy
980 __ daddi(tmp1, tmp1, 4);
981 __ daddi(tmp2, tmp2, 4);
982 }
983 }
985 __ bind(l_7);
987 // Copy 4 elements at a time; either the loads or the stores can
988 // be unaligned if aligned == false.
990 { // FasterArrayCopy
991 __ daddi(AT, tmp3, -15);
992 __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain
993 __ delayed()->nop();
995 __ bind(l_8);
996 // For Loongson, there is 128-bit memory access. TODO
// one ld/sd pair moves 8 bytes (4 elements); the loop repeats while at least 4 elements remain
997 __ ld(AT, tmp1, 0);
998 __ sd(AT, tmp2, 0);
999 __ daddi(tmp1, tmp1, 8);
1000 __ daddi(tmp2, tmp2, 8);
1001 __ daddi(tmp3, tmp3, -4);
1002 __ daddi(AT, tmp3, -4);
1003 __ bgez(AT, l_8);
1004 __ delayed()->nop();
1005 }
1006 __ bind(l_6);
1008 // copy 2 element at a time
1009 { // FasterArrayCopy
// one element or none left: go to the single-element code
1010 __ daddi(AT, tmp3, -1);
1011 __ blez(AT, l_1);
1012 __ delayed()->nop();
1014 __ bind(l_3);
// one lw/sw pair moves 4 bytes (2 elements); the loop repeats while at least 2 elements remain
1015 __ lw(AT, tmp1, 0);
1016 __ sw(AT, tmp2, 0);
1017 __ daddi(tmp1, tmp1, 4);
1018 __ daddi(tmp2, tmp2, 4);
1019 __ daddi(tmp3, tmp3, -2);
1020 __ daddi(AT, tmp3, -2);
1021 __ bgez(AT, l_3);
1022 __ delayed()->nop();
1024 }
1026 // do single element copy (16 bit, one lhu/sh pair per element)
// NOTE(review): only a count of exactly zero skips the loop, so a negative
// count would still copy one element - confirm callers never pass one.
1027 __ bind(l_1);
1028 __ beq(R0, tmp3, l_4);
1029 __ delayed()->nop();
1031 { // FasterArrayCopy
1033 __ bind(l_5);
1034 __ lhu(AT, tmp1, 0);
1035 __ daddi(tmp3, tmp3, -1);
1036 __ sh(AT, tmp2, 0);
1037 __ daddi(tmp1, tmp1, 2);
1038 __ daddi(tmp2, tmp2, 2);
1039 __ daddi(AT, tmp3, -1);
1040 __ bgez(AT, l_5);
1041 __ delayed()->nop();
1042 }
1043 __ bind(l_4);
1044 __ pop(tmp3);
1045 __ pop(tmp2);
1046 __ pop(tmp1);
1048 __ jr(RA);
1049 __ delayed()->nop();
// only reached from the mod-2 alignment check above (generated when !aligned)
1051 __ bind(l_debug);
1052 __ stop("generate_disjoint_short_copy should not reach here");
1053 return start;
1054 }
1056 // Arguments:
1057 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1058 // ignored
1059 // name - stub name string
1060 //
1061 // Inputs:
1062 // c_rarg0 - source array address
1063 // c_rarg1 - destination array address
1064 // c_rarg2 - element count, treated as ssize_t, can be zero
1065 //
1066 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1067 // let the hardware handle it. The two or four words within dwords
1068 // or qwords that span cache line boundaries will still be loaded
1069 // and stored atomically.
1070 //
// Emits the stub for an overlapping ("conjoint") jshort array copy and
// returns its entry address. The copy runs from the highest address down:
// 4 bytes (two elements) per loop iteration, then the one remaining
// lowest-addressed element when the count is odd.
// NOTE(review): array_overlap_test() is defined outside this view; presumably
// it diverts to nooverlap_target when the ranges do not overlap - confirm.
// NOTE(review): l_1 and l_3 are bound below but nothing in this function
// branches to them.
1071 address generate_conjoint_short_copy(bool aligned, const char *name) {
1072 Label l_1, l_2, l_3, l_4, l_5;
1073 StubCodeMark mark(this, "StubRoutines", name);
1074 __ align(CodeEntryAlignment);
1075 address start = __ pc();
// Select the disjoint jshort stub that matches the 'aligned' flavour.
1076 address nooverlap_target = aligned ?
1077 StubRoutines::arrayof_jshort_disjoint_arraycopy() :
1078 StubRoutines::jshort_disjoint_arraycopy();
1080 array_overlap_test(nooverlap_target, 1);
// Save the scratch registers used below; they are popped in reverse order at l_5.
1082 __ push(T3);
1083 __ push(T0);
1084 __ push(T1);
1085 __ push(T8);
1087 /*
1088 __ pushl(esi);
1089 __ movl(ecx, Address(esp, 4+12)); // count
1090 __ pushl(edi);
1091 __ movl(esi, Address(esp, 8+ 4)); // from
1092 __ movl(edi, Address(esp, 8+ 8)); // to
1093 */
1094 __ move(T1, A2); // T1 = element count
1095 __ move(T3, A0); // T3 = from
1096 __ move(T0, A1); // T0 = to
// NOTE(review): sll/add/addi are applied to address values here, whereas the
// disjoint short copy earlier in this file uses daddi; confirm how this
// assembler defines them for 64-bit pointers.
1099 // copy dwords from high to low
1100 // __ leal(esi, Address(esi, ecx, Address::times_2, -4)); // from + count*2 - 4
1101 __ sll(AT, T1, Address::times_2);
1102 __ add(AT, T3, AT);
1103 __ lea(T3, Address( AT, -4));
1104 //__ std();
1105 //__ leal(edi, Address(edi, ecx, Address::times_2, -4)); // to + count*2 - 4
1106 __ sll(AT,T1 , Address::times_2);
1107 __ add(AT, T0, AT);
1108 __ lea(T0, Address( AT, -4));
1109 // __ movl(eax, ecx);
1110 __ move(T8, T1); // keep the full count for the odd-element test at l_4
1111 __ bind(l_1);
1112 // __ sarl(ecx, 1); // dword count
1113 __ sra(T1,T1, 1); // T1 = number of 4-byte words (count / 2)
1114 //__ jcc(Assembler::equal, l_4); // no dwords to move
1115 __ beq(T1, R0, l_4);
1116 __ delayed()->nop();
1117 /* __ cmpl(ecx, 32);
1118 __ jcc(Assembler::above, l_3); // > 32 dwords
1119 // copy dwords with loop
1120 __ subl(edi, esi);
1121 */ __ align(16);
// Word loop: copy 4 bytes, then step both pointers down by 4.
// NOTE(review): lw/sw are issued even when from/to are only 2-byte aligned;
// the header comment says the hardware is left to handle that - confirm.
1122 __ bind(l_2);
1123 //__ movl(edx, Address(esi));
1124 __ lw(AT, T3, 0);
1125 //__ movl(Address(edi, esi, Address::times_1), edx);
1126 __ sw(AT, T0, 0);
1127 //__ subl(esi, 4);
1128 __ addi(T3, T3, -4);
1129 __ addi(T0, T0, -4);
1130 //__ decl(ecx);
1131 __ addi(T1, T1, -1);
1132 // __ jcc(Assembler::notEqual, l_2);
1133 __ bne(T1, R0, l_2);
1134 __ delayed()->nop();
1135 // __ addl(edi, esi);
1136 // __ jmp(l_4);
1137 __ b(l_4);
1138 __ delayed()->nop();
1139 // copy dwords with repeat move
1140 __ bind(l_3);
1141 // __ rep_movl();
1142 __ bind(l_4);
1143 // __ andl(eax, 1); // suffix count
1144 __ andi(T8, T8, 1); // suffix count
1145 //__ jcc(Assembler::equal, l_5); // no suffix
1146 __ beq(T8, R0, l_5 );
1147 __ delayed()->nop();
// Odd count: T3/T0 now sit 2 bytes below the last uncopied element, hence the +2 offset.
1148 // copy suffix
1149 // __ movw(edx, Address(esi, 2));
1150 __ lh(AT, T3, 2);
1151 // __ movw(Address(edi, 2), edx);
1152 __ sh(AT, T0, 2);
1153 __ bind(l_5);
1154 // __ cld();
1155 // __ popl(edi);
1156 // __ popl(esi);
1157 // __ ret(0);
1158 __ pop(T8);
1159 __ pop(T1);
1160 __ pop(T0);
1161 __ pop(T3);
1162 __ jr(RA);
1163 __ delayed()->nop();
1164 return start;
1165 }
1167 // Arguments:
1168 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1169 // ignored
1170 // is_oop - true => oop array, so generate store check code
1171 // name - stub name string
1172 //
1173 // Inputs:
1174 // c_rarg0 - source array address
1175 // c_rarg1 - destination array address
1176 // c_rarg2 - element count, treated as ssize_t, can be zero
1177 //
1178 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1179 // the hardware handle it. The two dwords within qwords that span
1180 // cache line boundaries will still be loaded and stored atomically.
1181 //
1182 // Side Effects:
1183 // NOTE(review): the stub below does not assign disjoint_int_copy_entry;
1184 // generate_conjoint_int_oop_copy() looks up its no-overlap target via StubRoutines.
1185 //
// Emits the forward ("disjoint") copy stub for 4-byte elements and returns
// its entry address. One element is copied per loop iteration with lw/sw.
// When is_oop is true the stub additionally reloads 'to' and 'count' and
// calls array_store_check() after the loop (helper defined outside this
// view; presumably the GC store barrier - confirm).
// NOTE(review): T8 is pushed and popped but not otherwise used in this stub.
1186 address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
1187 Label l_2, l_3, l_4, l_stchk;
1188 StubCodeMark mark(this, "StubRoutines", name);
1189 __ align(CodeEntryAlignment);
1190 address start = __ pc();
1191 /*
1192 __ pushl(esi);
1193 __ movl(ecx, Address(esp, 4+12)); // count
1194 __ pushl(edi);
1195 __ movl(esi, Address(esp, 8+ 4)); // from
1196 __ movl(edi, Address(esp, 8+ 8)); // to
1197 */
1198 __ push(T3);
1199 __ push(T0);
1200 __ push(T1);
1201 __ push(T8);
1202 __ move(T1, A2); // T1 = element count
1203 __ move(T3, A0); // T3 = from
1204 __ move(T0, A1); // T0 = to
1206 // __ cmpl(ecx, 32);
1207 // __ jcc(Assembler::belowEqual, l_2); // <= 32 dwords
1208 // __ rep_movl();
1209 __ b(l_2);
1210 __ delayed()->nop();
// NOTE(review): b(l_2) above is unconditional and no label is bound between
// here and bind(l_2), so the branch and epilogue emitted below look unreachable.
1211 if (is_oop) {
1212 // __ jmp(l_stchk);
1213 __ b(l_stchk);
1214 __ delayed()->nop();
1215 }
1216 // __ popl(edi);
1217 // __ popl(esi);
1218 // __ ret(0);
1219 __ pop(T8);
1220 __ pop(T1);
1221 __ pop(T0);
1222 __ pop(T3);
1223 __ jr(RA);
1224 __ delayed()->nop();
1226 __ bind(l_2);
1227 // __ subl(edi, esi);
1228 // __ testl(ecx, ecx);
1229 // __ jcc(Assembler::zero, l_4);
1230 __ beq(T1, R0, l_4); // count == 0: skip the loop (and, for oops, the store check)
1231 __ delayed()->nop();
1232 __ align(16);
// Copy loop: one 4-byte element per iteration, pointers advance by 4.
1233 __ bind(l_3);
1234 //__ movl(edx, Address(esi));
1235 __ lw(AT, T3, 0);
1236 // __ movl(Address(edi, esi, Address::times_1), edx);
1237 __ sw(AT, T0, 0);
1238 // __ addl(esi, 4);
1239 __ addi(T3, T3, 4);
1240 __ addi(T0, T0, 4);
1241 // __ decl(ecx);
1242 __ addi(T1, T1, -1);
1243 // __ jcc(Assembler::notEqual, l_3);
1244 __ bne(T1, R0, l_3);
1245 __ delayed()->nop();
1246 if (is_oop) {
1247 __ bind(l_stchk);
// T0/T1 were modified by the loop; reload the original 'to' and 'count' from A1/A2.
1248 // __ movl(edi, Address(esp, 8+ 8));
1249 // __ movl(ecx, Address(esp, 8+ 12));
1250 __ move(T0, A1);
1251 __ move(T1, A2);
1252 array_store_check();
1253 }
// Common exit: restore the saved registers in reverse push order and return.
1254 __ bind(l_4);
1255 // __ popl(edi);
1256 // __ popl(esi);
1257 // __ ret(0);
1258 __ pop(T8);
1259 __ pop(T1);
1260 __ pop(T0);
1261 __ pop(T3);
1262 __ jr(RA);
1263 __ delayed()->nop();
1264 return start;
1265 }
1267 // Arguments:
1268 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1269 // ignored
1270 // is_oop - true => oop array, so generate store check code
1271 // name - stub name string
1272 //
1273 // Inputs:
1274 // c_rarg0 - source array address
1275 // c_rarg1 - destination array address
1276 // c_rarg2 - element count, treated as ssize_t, can be zero
1277 //
1278 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1279 // the hardware handle it. The two dwords within qwords that span
1280 // cache line boundaries will still be loaded and stored atomically.
1281 //
// Emits the overlapping ("conjoint") copy stub for 4-byte elements and returns
// its entry address. Elements are copied one at a time from the highest
// address down to the lowest. When is_oop is true, control jumps after the
// loop to l_stchk, which reloads 'to'/'count', calls array_store_check()
// (defined outside this view) and then runs its own epilogue.
// NOTE(review): array_overlap_test() is defined outside this view; presumably
// it diverts to nooverlap_target when the ranges do not overlap - confirm.
// NOTE(review): T8 is pushed and popped but not otherwise used in this stub.
1282 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
1283 Label l_2, l_3, l_4, l_stchk;
1284 StubCodeMark mark(this, "StubRoutines", name);
1285 __ align(CodeEntryAlignment);
1286 address start = __ pc();
1287 address nooverlap_target;
// Select the matching disjoint stub: oop or jint flavour, aligned or not.
1289 if (is_oop) {
1290 nooverlap_target = aligned ?
1291 StubRoutines::arrayof_oop_disjoint_arraycopy() :
1292 StubRoutines::oop_disjoint_arraycopy();
1293 }else {
1294 nooverlap_target = aligned ?
1295 StubRoutines::arrayof_jint_disjoint_arraycopy() :
1296 StubRoutines::jint_disjoint_arraycopy();
1297 }
1299 array_overlap_test(nooverlap_target, 2);
1301 __ push(T3);
1302 __ push(T0);
1303 __ push(T1);
1304 __ push(T8);
1306 /*
1307 __ pushl(esi);
1308 __ movl(ecx, Address(esp, 4+12)); // count
1309 __ pushl(edi);
1310 __ movl(esi, Address(esp, 8+ 4)); // from
1311 __ movl(edi, Address(esp, 8+ 8)); // to
1312 */
1313 __ move(T1, A2); // T1 = element count
1314 __ move(T3, A0); // T3 = from
1315 __ move(T0, A1); // T0 = to
// Point T3/T0 at the last element of each array.
// NOTE(review): sll/add/addi are applied to address values here; confirm how
// this assembler defines them for 64-bit pointers.
1317 //__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
1318 __ sll(AT, T1, Address::times_4);
1319 __ add(AT, T3, AT);
1320 __ lea(T3 , Address(AT, -4));
1321 //__ std();
1322 //__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
1323 __ sll(AT, T1, Address::times_4);
1324 __ add(AT, T0, AT);
1325 __ lea(T0 , Address(AT, -4));
1327 // __ cmpl(ecx, 32);
1328 // __ jcc(Assembler::above, l_3); // > 32 dwords
1329 // __ testl(ecx, ecx);
1330 //__ jcc(Assembler::zero, l_4);
1331 __ beq(T1, R0, l_4); // count == 0: nothing to copy (store check is skipped too)
1332 __ delayed()->nop();
1333 // __ subl(edi, esi);
1334 __ align(16);
// Copy loop: one 4-byte element per iteration, pointers step down by 4.
1335 __ bind(l_2);
1336 // __ movl(edx, Address(esi));
1337 __ lw(AT, T3, 0);
1338 // __ movl(Address(esi, edi, Address::times_1), edx);
1339 __ sw(AT, T0, 0);
1340 // __ subl(esi, 4);
1341 __ addi(T3, T3, -4);
1342 __ addi(T0, T0, -4);
1343 // __ decl(ecx);
1344 __ addi(T1, T1, -1);
1345 //__ jcc(Assembler::notEqual, l_2);
1346 __ bne(T1, R0, l_2);
1347 __ delayed()->nop();
1348 if (is_oop) {
1349 // __ jmp(l_stchk);
1350 __ b( l_stchk);
1351 __ delayed()->nop();
1352 }
// Epilogue for the non-oop path and for count == 0.
1353 __ bind(l_4);
1354 // __ cld();
1355 // __ popl(edi);
1356 // __ popl(esi);
1357 // __ ret(0);
1358 __ pop(T8);
1359 __ pop(T1);
1360 __ pop(T0);
1361 __ pop(T3);
1362 __ jr(RA);
1363 __ delayed()->nop();
// NOTE(review): no branch in this function targets l_3, so when is_oop is
// false the epilogue emitted after it looks unreachable.
1364 __ bind(l_3);
1365 // __ rep_movl();
1366 if (is_oop) {
1367 __ bind(l_stchk);
1368 // __ movl(edi, Address(esp, 8+ 8));
1369 __ move(T0, A1);
1370 // __ movl(ecx, Address(esp, 8+ 12));
1371 __ move(T1, A2);
1372 array_store_check();
1373 }
1374 // __ cld();
1375 // __ popl(edi);
1376 // __ popl(esi);
1377 // __ ret(0);
1378 __ pop(T8);
1379 __ pop(T1);
1380 __ pop(T0);
1381 __ pop(T3);
1382 __ jr(RA);
1383 __ delayed()->nop();
1384 return start;
1385 }
1387 // Arguments:
1388 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1389 // ignored
1390 // is_oop - true => oop array, so generate store check code
1391 // name - stub name string
1392 //
1393 // Inputs:
1394 // c_rarg0 - source array address
1395 // c_rarg1 - destination array address
1396 // c_rarg2 - element count, treated as ssize_t, can be zero
1397 //
1398 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1399 // the hardware handle it. The two dwords within qwords that span
1400 // cache line boundaries will still be loaded and stored atomically.
1401 //
1402 // Side Effects:
1403 // NOTE(review): none visible - the stub below assigns no entry-point variable;
1404 // the disjoint_int_copy_entry remark belongs to the int copy stub, if anywhere.
1405 //
// Emits the forward ("disjoint") copy stub for 8-byte elements and returns
// its entry address. One element is copied per loop iteration with ld/sd.
// When is_oop is true the stub additionally reloads 'to' and 'count' and
// calls array_store_check() after the loop (helper defined outside this
// view; presumably the GC store barrier - confirm).
// NOTE(review): T8 is pushed and popped but not otherwise used in this stub.
1406 address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
1407 Label l_2, l_3, l_4, l_stchk;
1408 StubCodeMark mark(this, "StubRoutines", name);
1409 __ align(CodeEntryAlignment);
1410 address start = __ pc();
1411 __ push(T3);
1412 __ push(T0);
1413 __ push(T1);
1414 __ push(T8);
1415 __ move(T1, A2); // T1 = element count
1416 __ move(T3, A0); // T3 = from
1417 __ move(T0, A1); // T0 = to
1419 // __ cmpl(ecx, 32);
1420 // __ jcc(Assembler::belowEqual, l_2); // <= 32 dwords
1421 // __ rep_movl();
1422 __ b(l_2);
1423 __ delayed()->nop();
// NOTE(review): b(l_2) above is unconditional and no label is bound between
// here and bind(l_2), so the branch and epilogue emitted below look unreachable.
1424 if (is_oop) {
1425 // __ jmp(l_stchk);
1426 __ b(l_stchk);
1427 __ delayed()->nop();
1428 }
1429 // __ popl(edi);
1430 // __ popl(esi);
1431 // __ ret(0);
1432 __ pop(T8);
1433 __ pop(T1);
1434 __ pop(T0);
1435 __ pop(T3);
1436 __ jr(RA);
1437 __ delayed()->nop();
1439 __ bind(l_2);
1440 // __ subl(edi, esi);
1441 // __ testl(ecx, ecx);
1442 // __ jcc(Assembler::zero, l_4);
1443 __ beq(T1, R0, l_4); // count == 0: skip the loop (and, for oops, the store check)
1444 __ delayed()->nop();
1445 __ align(16);
// Copy loop: one 8-byte element per iteration, pointers advance by 8.
// (The commented x86 reference lines show a 4-byte step; the emitted code uses 8.)
1446 __ bind(l_3);
1447 //__ movl(edx, Address(esi));
1448 __ ld(AT, T3, 0);
1449 // __ movl(Address(edi, esi, Address::times_1), edx);
1450 __ sd(AT, T0, 0);
1451 // __ addl(esi, 4);
1452 __ addi(T3, T3, 8);
1453 __ addi(T0, T0, 8);
1454 // __ decl(ecx);
1455 __ addi(T1, T1, -1);
1456 // __ jcc(Assembler::notEqual, l_3);
1457 __ bne(T1, R0, l_3);
1458 __ delayed()->nop();
1459 if (is_oop) {
1460 __ bind(l_stchk);
// T0/T1 were modified by the loop; reload the original 'to' and 'count' from A1/A2.
1461 // __ movl(edi, Address(esp, 8+ 8));
1462 // __ movl(ecx, Address(esp, 8+ 12));
1463 __ move(T0, A1);
1464 __ move(T1, A2);
1465 array_store_check();
1466 }
// Common exit: restore the saved registers in reverse push order and return.
1467 __ bind(l_4);
1468 // __ popl(edi);
1469 // __ popl(esi);
1470 // __ ret(0);
1471 __ pop(T8);
1472 __ pop(T1);
1473 __ pop(T0);
1474 __ pop(T3);
1475 __ jr(RA);
1476 __ delayed()->nop();
1477 return start;
1478 }
1480 // Arguments:
1481 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1482 // ignored
1483 // is_oop - true => oop array, so generate store check code
1484 // name - stub name string
1485 //
1486 // Inputs:
1487 // c_rarg0 - source array address
1488 // c_rarg1 - destination array address
1489 // c_rarg2 - element count, treated as ssize_t, can be zero
1490 //
1491 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1492 // the hardware handle it. The two dwords within qwords that span
1493 // cache line boundaries will still be loaded and stored atomically.
1494 //
// Emits the overlapping ("conjoint") copy stub for 8-byte elements and returns
// its entry address. Elements are copied one at a time with ld/sd from the
// highest address down to the lowest. When is_oop is true, control jumps
// after the loop to l_stchk, which reloads 'to'/'count', calls
// array_store_check() (defined outside this view) and runs its own epilogue.
// NOTE(review): array_overlap_test() is defined outside this view; presumably
// it diverts to nooverlap_target when the ranges do not overlap - confirm.
// NOTE(review): T8 is pushed and popped but not otherwise used in this stub.
1495 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
1496 Label l_2, l_3, l_4, l_stchk;
1497 StubCodeMark mark(this, "StubRoutines", name);
1498 __ align(CodeEntryAlignment);
1499 address start = __ pc();
1500 address nooverlap_target;
// Select the matching disjoint stub: oop or jlong flavour, aligned or not.
1502 if (is_oop) {
1503 nooverlap_target = aligned ?
1504 StubRoutines::arrayof_oop_disjoint_arraycopy() :
1505 StubRoutines::oop_disjoint_arraycopy();
1506 }else {
1507 nooverlap_target = aligned ?
1508 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
1509 StubRoutines::jlong_disjoint_arraycopy();
1510 }
1512 array_overlap_test(nooverlap_target, 3);
1514 __ push(T3);
1515 __ push(T0);
1516 __ push(T1);
1517 __ push(T8);
1519 __ move(T1, A2); // T1 = element count
1520 __ move(T3, A0); // T3 = from
1521 __ move(T0, A1); // T0 = to
// Point T3/T0 at the last element of each array.
// NOTE: the commented x86 reference lines say times_4 / -4, but the code
// emitted here scales by Address::times_8 and backs up by 8.
// NOTE(review): sll/add/addi are applied to address values here; confirm how
// this assembler defines them for 64-bit pointers.
1523 //__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
1524 __ sll(AT, T1, Address::times_8);
1525 __ add(AT, T3, AT);
1526 __ lea(T3 , Address(AT, -8));
1527 //__ std();
1528 //__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
1529 __ sll(AT, T1, Address::times_8);
1530 __ add(AT, T0, AT);
1531 __ lea(T0 , Address(AT, -8));
1533 // __ cmpl(ecx, 32);
1534 // __ jcc(Assembler::above, l_3); // > 32 dwords
1535 // __ testl(ecx, ecx);
1536 //__ jcc(Assembler::zero, l_4);
1537 __ beq(T1, R0, l_4); // count == 0: nothing to copy (store check is skipped too)
1538 __ delayed()->nop();
1539 // __ subl(edi, esi);
1540 __ align(16);
// Copy loop: one 8-byte element per iteration, pointers step down by 8.
1541 __ bind(l_2);
1542 // __ movl(edx, Address(esi));
1543 __ ld(AT, T3, 0);
1544 // __ movl(Address(esi, edi, Address::times_1), edx);
1545 __ sd(AT, T0, 0);
1546 // __ subl(esi, 4);
1547 __ addi(T3, T3, -8);
1548 __ addi(T0, T0, -8);
1549 // __ decl(ecx);
1550 __ addi(T1, T1, -1);
1551 //__ jcc(Assembler::notEqual, l_2);
1552 __ bne(T1, R0, l_2);
1553 __ delayed()->nop();
1554 if (is_oop) {
1555 // __ jmp(l_stchk);
1556 __ b( l_stchk);
1557 __ delayed()->nop();
1558 }
// Epilogue for the non-oop path and for count == 0.
1559 __ bind(l_4);
1560 // __ cld();
1561 // __ popl(edi);
1562 // __ popl(esi);
1563 // __ ret(0);
1564 __ pop(T8);
1565 __ pop(T1);
1566 __ pop(T0);
1567 __ pop(T3);
1568 __ jr(RA);
1569 __ delayed()->nop();
// NOTE(review): no branch in this function targets l_3, so when is_oop is
// false the epilogue emitted after it looks unreachable.
1570 __ bind(l_3);
1571 // __ rep_movl();
1572 if (is_oop) {
1573 __ bind(l_stchk);
1574 // __ movl(edi, Address(esp, 8+ 8));
1575 __ move(T0, A1);
1576 // __ movl(ecx, Address(esp, 8+ 12));
1577 __ move(T1, A2);
1578 array_store_check();
1579 }
1580 // __ cld();
1581 // __ popl(edi);
1582 // __ popl(esi);
1583 // __ ret(0);
1584 __ pop(T8);
1585 __ pop(T1);
1586 __ pop(T0);
1587 __ pop(T3);
1588 __ jr(RA);
1589 __ delayed()->nop();
1590 return start;
1591 }
1592 #if 0
1593 // Arguments:
1594 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1595 // ignored
1596 // is_oop - true => oop array, so generate store check code
1597 // name - stub name string
1598 //
1599 // Inputs:
1600 // c_rarg0 - source array address
1601 // c_rarg1 - destination array address
1602 // c_rarg2 - element count, treated as ssize_t, can be zero
1603 //
// Compiled out: this definition sits inside the '#if 0' region opened just
// above it and uses x86-64 register names (rdi, rsi, rdx, rcx, rax).
// As written it copies qwords backward: the bulk goes through
// copy_32_bytes_backward() and the remainder one qword at a time at
// L_copy_8_bytes; for oops it wraps the copy in the pre/post write barriers.
1604 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
1605 __ align(CodeEntryAlignment);
1606 StubCodeMark mark(this, "StubRoutines", name);
1607 address start = __ pc();
1609 Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
1610 const Register from = rdi; // source array address
1611 const Register to = rsi; // destination array address
1612 const Register qword_count = rdx; // elements count
1613 const Register saved_count = rcx;
1615 __ enter(); // required for proper stackwalking of RuntimeStub frame
1616 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1618 address disjoint_copy_entry = NULL;
// Record this stub's entry point and pick the matching disjoint entry.
1619 if (is_oop) {
1620 assert(!UseCompressedOops, "shouldn't be called for compressed oops");
1621 disjoint_copy_entry = disjoint_oop_copy_entry;
1622 oop_copy_entry = __ pc();
1623 array_overlap_test(disjoint_oop_copy_entry, Address::times_8);
1624 } else {
1625 disjoint_copy_entry = disjoint_long_copy_entry;
1626 long_copy_entry = __ pc();
1627 array_overlap_test(disjoint_long_copy_entry, Address::times_8);
1628 }
1629 BLOCK_COMMENT("Entry:");
1630 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
// NOTE(review): array_overlap_test() was already emitted in both branches
// above; this third call looks like a duplicate - confirm before re-enabling.
1632 array_overlap_test(disjoint_copy_entry, Address::times_8);
1633 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1634 // r9 and r10 may be used to save non-volatile registers
1636 // 'from', 'to' and 'qword_count' are now valid
1638 if (is_oop) {
1639 // Save to and count for store barrier
1640 __ movptr(saved_count, qword_count);
1641 // No registers are destroyed by this call
1642 gen_write_ref_array_pre_barrier(to, saved_count);
1643 }
1645 __ jmp(L_copy_32_bytes);
1647 // Copy trailing qwords
1648 __ BIND(L_copy_8_bytes);
1649 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1650 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1651 __ decrement(qword_count);
1652 __ jcc(Assembler::notZero, L_copy_8_bytes);
// Oops still need the post barrier at L_exit; plain longs can return right here.
1654 if (is_oop) {
1655 __ jmp(L_exit);
1656 } else {
1657 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
1658 restore_arg_regs();
1659 __ xorptr(rax, rax); // return 0
1660 __ leave(); // required for proper stackwalking of RuntimeStub frame
1661 __ ret(0);
1662 }
1664 // Copy in 32-bytes chunks
1665 copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1667 if (is_oop) {
1668 __ BIND(L_exit);
1669 __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
1670 gen_write_ref_array_post_barrier(to, rcx, rax);
1671 inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
1672 } else {
1673 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
1674 }
1675 restore_arg_regs();
1676 __ xorptr(rax, rax); // return 0
1677 __ leave(); // required for proper stackwalking of RuntimeStub frame
1678 __ ret(0);
1680 return start;
1681 }
1684 // Helper for generating a dynamic type check.
1685 // Smashes no registers.
// Compiled out: this definition sits inside the '#if 0' region opened earlier
// in the file and uses x86-64 register names.
// Emits a subtype test of sub_klass against super_klass. On a match the
// emitted code jumps to L_success; on a miss it falls through past L_miss,
// which is bound at the very end of this function.
//   sub_klass          - klass being tested
//   super_check_offset - offset into sub_klass used for the display check
//   super_klass        - klass tested against
//   L_success          - label jumped to when the test succeeds
1686 void generate_type_check(Register sub_klass,
1687 Register super_check_offset,
1688 Register super_klass,
1689 Label& L_success) {
1690 assert_different_registers(sub_klass, super_check_offset, super_klass);
1692 BLOCK_COMMENT("type_check:");
1694 Label L_miss;
1696 // a couple of useful fields in sub_klass:
1697 int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
1698 Klass::secondary_supers_offset_in_bytes());
1699 int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
1700 Klass::secondary_super_cache_offset_in_bytes());
1701 Address secondary_supers_addr(sub_klass, ss_offset);
1702 Address super_cache_addr( sub_klass, sc_offset);
1704 // if the pointers are equal, we are done (e.g., String[] elements)
1705 __ cmpptr(super_klass, sub_klass);
1706 __ jcc(Assembler::equal, L_success);
1708 // check the supertype display:
1709 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
1710 __ cmpptr(super_klass, super_check_addr); // test the super type
1711 __ jcc(Assembler::equal, L_success);
1713 // if it was a primary super, we can just fail immediately
1714 __ cmpl(super_check_offset, sc_offset);
1715 __ jcc(Assembler::notEqual, L_miss);
1717 // Now do a linear scan of the secondary super-klass chain.
1718 // The repne_scan instruction uses fixed registers, which we must spill.
1719 // (We need a couple more temps in any case.)
1720 // This code is rarely used, so simplicity is a virtue here.
1721 inc_counter_np(SharedRuntime::_partial_subtype_ctr);
1722 {
1723 __ push(rax);
1724 __ push(rcx);
1725 __ push(rdi);
1726 assert_different_registers(sub_klass, super_klass, rax, rcx, rdi);
1728 __ movptr(rdi, secondary_supers_addr);
1729 // Load the array length.
1730 __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
1731 // Skip to start of data.
1732 __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1733 // Scan rcx words at [rdi] for occurrence of rax
1734 // Set NZ/Z based on last compare
1735 __ movptr(rax, super_klass);
1736 if (UseCompressedOops) {
1737 // Compare against compressed form. Don't need to uncompress because
1738 // looks like orig rax is restored in popq below.
1739 __ encode_heap_oop(rax);
1740 __ repne_scanl();
1741 } else {
1742 __ repne_scan();
1743 }
1745 // Unspill the temp. registers:
1746 __ pop(rdi);
1747 __ pop(rcx);
1748 __ pop(rax);
// The jcc below consumes the flags left by the scan; the pops above sit between the two.
1750 __ jcc(Assembler::notEqual, L_miss);
1751 }
1753 // Success. Cache the super we found and proceed in triumph.
1754 __ movptr(super_cache_addr, super_klass); // note: rax is dead
1755 __ jmp(L_success);
1757 // Fall through on failure!
1758 __ BIND(L_miss);
1759 }
1761 //
1762 // Generate checkcasting array copy stub
1763 //
1764 // Input:
1765 // c_rarg0 - source array address
1766 // c_rarg1 - destination array address
1767 // c_rarg2 - element count, treated as ssize_t, can be zero
1768 // c_rarg3 - size_t ckoff (super_check_offset)
1769 // not Win64
1770 // c_rarg4 - oop ckval (super_klass)
1771 // Win64
1772 // rsp+40 - oop ckval (super_klass)
1773 //
1774 // Output:
1775 // rax == 0 - success
1776 // rax == -1^K - failure, where K is partial transfer count
1777 //
// Compiled out: this definition sits inside the '#if 0' region opened earlier
// in the file and uses x86-64 register names.
// Emits the checkcasting oop-array copy stub and returns its entry address.
// Each non-null element is type-checked via generate_type_check() before it
// is stored; null elements are stored without a check. rax returns 0 on
// success, or ~K (the "-1^K" of the header comment) where K is the number of
// elements copied before the first failing element.
1778 address generate_checkcast_copy(const char *name) {
1780 Label L_load_element, L_store_element, L_do_card_marks, L_done;
1782 // Input registers (after setup_arg_regs)
1783 const Register from = rdi; // source array address
1784 const Register to = rsi; // destination array address
1785 const Register length = rdx; // elements count
1786 const Register ckoff = rcx; // super_check_offset
1787 const Register ckval = r8; // super_klass
1789 // Registers used as temps (r13, r14 are save-on-entry)
1790 const Register end_from = from; // source array end address
1791 const Register end_to = r13; // destination array end address
1792 const Register count = rdx; // -(count_remaining)
1793 const Register r14_length = r14; // saved copy of length
1794 // End pointers are inclusive, and if length is not zero they point
1795 // to the last unit copied: end_to[0] := end_from[0]
1797 const Register rax_oop = rax; // actual oop copied
1798 const Register r11_klass = r11; // oop._klass
1800 //---------------------------------------------------------------
1801 // Assembler stub will be used for this call to arraycopy
1802 // if the two arrays are subtypes of Object[] but the
1803 // destination array type is not equal to or a supertype
1804 // of the source type. Each element must be separately
1805 // checked.
1807 __ align(CodeEntryAlignment);
1808 StubCodeMark mark(this, "StubRoutines", name);
1809 address start = __ pc();
1811 __ enter(); // required for proper stackwalking of RuntimeStub frame
1813 checkcast_copy_entry = __ pc();
1814 BLOCK_COMMENT("Entry:");
1816 #ifdef ASSERT
1817 // caller guarantees that the arrays really are different
1818 // otherwise, we would have to make conjoint checks
1819 { Label L;
1820 array_overlap_test(L, TIMES_OOP);
1821 __ stop("checkcast_copy within a single array");
1822 __ bind(L);
1823 }
1824 #endif //ASSERT
1826 // allocate spill slots for r13, r14
1827 enum {
1828 saved_r13_offset,
1829 saved_r14_offset,
1830 saved_rbp_offset,
1831 saved_rip_offset,
1832 saved_rarg0_offset
1833 };
1834 __ subptr(rsp, saved_rbp_offset * wordSize); // saved_rbp_offset == 2: room for the r13 and r14 slots
1835 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
1836 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
1837 setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
1838 // ckoff => rcx, ckval => r8
1839 // r9 and r10 may be used to save non-volatile registers
1840 #ifdef _WIN64
1841 // last argument (#4) is on stack on Win64
1842 const int ckval_offset = saved_rarg0_offset + 4;
1843 __ movptr(ckval, Address(rsp, ckval_offset * wordSize));
1844 #endif
1846 // check that int operands are properly extended to size_t
1847 assert_clean_int(length, rax);
1848 assert_clean_int(ckoff, rax);
1850 #ifdef ASSERT
1851 BLOCK_COMMENT("assert consistent ckoff/ckval");
1852 // The ckoff and ckval must be mutually consistent,
1853 // even though caller generates both.
1854 { Label L;
1855 int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
1856 Klass::super_check_offset_offset_in_bytes());
1857 __ cmpl(ckoff, Address(ckval, sco_offset));
1858 __ jcc(Assembler::equal, L);
1859 __ stop("super_check_offset inconsistent");
1860 __ bind(L);
1861 }
1862 #endif //ASSERT
1864 // Loop-invariant addresses. They are exclusive end pointers.
1865 Address end_from_addr(from, length, TIMES_OOP, 0);
1866 Address end_to_addr(to, length, TIMES_OOP, 0);
1867 // Loop-variant addresses. They assume post-incremented count < 0.
1868 Address from_element_addr(end_from, count, TIMES_OOP, 0);
1869 Address to_element_addr(end_to, count, TIMES_OOP, 0);
1871 gen_write_ref_array_pre_barrier(to, count);
1873 // Copy from low to high addresses, indexed from the end of each array.
1874 __ lea(end_from, end_from_addr);
1875 __ lea(end_to, end_to_addr);
1876 __ movptr(r14_length, length); // save a copy of the length
1877 assert(length == count, ""); // else fix next line:
1878 __ negptr(count); // negate and test the length
1879 __ jcc(Assembler::notZero, L_load_element);
1881 // Empty array: Nothing to do.
1882 __ xorptr(rax, rax); // return 0 on (trivial) success
1883 __ jmp(L_done);
1885 // ======== begin loop ========
1886 // (Loop is rotated; its entry is L_load_element.)
1887 // Loop control:
1888 // for (count = -count; count != 0; count++)
1889 // Base pointers src, dst are biased by 8*(count-1),to last element.
1890 __ align(16);
1892 __ BIND(L_store_element);
1893 __ store_heap_oop(rax_oop, to_element_addr); // store the oop
1894 __ increment(count); // increment the count toward zero
1895 __ jcc(Assembler::zero, L_do_card_marks);
1897 // ======== loop entry is here ========
1898 __ BIND(L_load_element);
1899 __ load_heap_oop(rax_oop, from_element_addr); // load the oop
1900 __ testptr(rax_oop, rax_oop);
1901 __ jcc(Assembler::zero, L_store_element); // null oop: store it without a type check
1903 __ load_klass(r11_klass, rax_oop);// query the object klass
1904 generate_type_check(r11_klass, ckoff, ckval, L_store_element);
1905 // ======== end loop ========
1907 // It was a real error; we must depend on the caller to finish the job.
1908 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
1909 // Emit GC store barriers for the oops we have copied (r14 + rdx),
1910 // and report their number to the caller.
1911 assert_different_registers(rax, r14_length, count, to, end_to, rcx);
1912 __ lea(end_to, to_element_addr);
1913 gen_write_ref_array_post_barrier(to, end_to, rscratch1);
1914 __ movptr(rax, r14_length); // original oops
1915 __ addptr(rax, count); // K = (original - remaining) oops
1916 __ notptr(rax); // report (-1^K) to caller
1917 __ jmp(L_done);
1919 // Come here on success only.
1920 __ BIND(L_do_card_marks);
1921 __ addptr(end_to, -wordSize); // make an inclusive end pointer
1922 gen_write_ref_array_post_barrier(to, end_to, rscratch1);
1923 __ xorptr(rax, rax); // return 0 on success
1925 // Common exit point (success or failure).
1926 __ BIND(L_done);
1927 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
1928 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
1929 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1930 restore_arg_regs();
1931 __ leave(); // required for proper stackwalking of RuntimeStub frame
1932 __ ret(0);
1934 return start;
1935 }
1937 //
1938 // Generate 'unsafe' array copy stub
1939 // Though just as safe as the other stubs, it takes an unscaled
1940 // size_t argument instead of an element count.
1941 //
1942 // Input:
1943 // c_rarg0 - source array address
1944 // c_rarg1 - destination array address
1945 // c_rarg2 - byte count, treated as ssize_t, can be zero
1946 //
1947 // Examines the alignment of the operands and dispatches
1948 // to a long, int, short, or byte copy loop.
1949 //
// Compiled out: this definition sits inside the '#if 0' region opened earlier
// in the file and uses x86-64 register names.
// Emits the 'unsafe' copy dispatcher and returns its entry address. It ORs
// together from, to and the byte size, tests the low bits of the result, and
// jumps to the long, int, short or byte copy entry. For the long/int/short
// cases the byte size is first shifted right into an element count.
1950 address generate_unsafe_copy(const char *name) {
1952 Label L_long_aligned, L_int_aligned, L_short_aligned;
1954 // Input registers (before setup_arg_regs)
1955 const Register from = c_rarg0; // source array address
1956 const Register to = c_rarg1; // destination array address
1957 const Register size = c_rarg2; // byte count (size_t)
1959 // Register used as a temp
1960 const Register bits = rax; // test copy of low bits
1962 __ align(CodeEntryAlignment);
1963 StubCodeMark mark(this, "StubRoutines", name);
1964 address start = __ pc();
1966 __ enter(); // required for proper stackwalking of RuntimeStub frame
1968 // bump this on entry, not on exit:
1969 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
// bits = from | to | size: a low bit is clear only if it is clear in all three.
1971 __ mov(bits, from);
1972 __ orptr(bits, to);
1973 __ orptr(bits, size);
1975 __ testb(bits, BytesPerLong-1);
1976 __ jccb(Assembler::zero, L_long_aligned);
1978 __ testb(bits, BytesPerInt-1);
1979 __ jccb(Assembler::zero, L_int_aligned);
1981 __ testb(bits, BytesPerShort-1);
1982 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
// L_short_aligned is reached by fall-through only; nothing here jumps to it.
1984 __ BIND(L_short_aligned);
1985 __ shrptr(size, LogBytesPerShort); // size => short_count
1986 __ jump(RuntimeAddress(short_copy_entry));
1988 __ BIND(L_int_aligned);
1989 __ shrptr(size, LogBytesPerInt); // size => int_count
1990 __ jump(RuntimeAddress(int_copy_entry));
1992 __ BIND(L_long_aligned);
1993 __ shrptr(size, LogBytesPerLong); // size => qword_count
1994 __ jump(RuntimeAddress(long_copy_entry));
1996 return start;
1997 }
1999 // Perform range checks on the proposed arraycopy.
2000 // Kills temp, but nothing else.
2001 // Also, clean the sign bits of src_pos and dst_pos.
// Compiled out: this definition sits inside the '#if 0' region opened earlier
// in the file and uses x86-64 instructions.
// Emits code that jumps to L_failed when src_pos + length exceeds the source
// array length or dst_pos + length exceeds the destination array length, and
// otherwise sign-extends src_pos and dst_pos to 64 bits.
2002 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2003 Register src_pos, // source position (c_rarg1)
2004 Register dst, // destination array oop (c_rarg2)
2005 Register dst_pos, // destination position (c_rarg3)
2006 Register length,
2007 Register temp,
2008 Label& L_failed) {
2009 BLOCK_COMMENT("arraycopy_range_checks:");
2011 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2012 __ movl(temp, length);
2013 __ addl(temp, src_pos); // src_pos + length
2014 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2015 __ jcc(Assembler::above, L_failed); // 'above' is the unsigned greater-than condition
2017 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2018 __ movl(temp, length);
2019 __ addl(temp, dst_pos); // dst_pos + length
2020 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2021 __ jcc(Assembler::above, L_failed);
2023 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2024 // Move with sign extension can be used since they are positive.
2025 __ movslq(src_pos, src_pos);
2026 __ movslq(dst_pos, dst_pos);
2028 BLOCK_COMMENT("arraycopy_range_checks done");
2029 }
2031 //
2032 // Generate generic array copy stubs
2033 //
2034 // Input:
2035 // c_rarg0 - src oop
2036 // c_rarg1 - src_pos (32-bits)
2037 // c_rarg2 - dst oop
2038 // c_rarg3 - dst_pos (32-bits)
2039 // not Win64
2040 // c_rarg4 - element count (32-bits)
2041 // Win64
2042 // rsp+40 - element count (32-bits)
2043 //
2044 // Output:
2045 // rax == 0 - success
2046 // rax == -1^K - failure, where K is partial transfer count
2047 //
2048 address generate_generic_copy(const char *name) {
2050 Label L_failed, L_failed_0, L_objArray;
2051 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2053 // Input registers
2054 const Register src = c_rarg0; // source array oop
2055 const Register src_pos = c_rarg1; // source position
2056 const Register dst = c_rarg2; // destination array oop
2057 const Register dst_pos = c_rarg3; // destination position
2058 // elements count is on stack on Win64
2059 #ifdef _WIN64
2060 #define C_RARG4 Address(rsp, 6 * wordSize)
2061 #else
2062 #define C_RARG4 c_rarg4
2063 #endif
2065 { int modulus = CodeEntryAlignment;
2066 int target = modulus - 5; // 5 = sizeof jmp(L_failed)
2067 int advance = target - (__ offset() % modulus);
2068 if (advance < 0) advance += modulus;
2069 if (advance > 0) __ nop(advance);
2070 }
2071 StubCodeMark mark(this, "StubRoutines", name);
2073 // Short-hop target to L_failed. Makes for denser prologue code.
2074 __ BIND(L_failed_0);
2075 __ jmp(L_failed);
2076 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2078 __ align(CodeEntryAlignment);
2079 address start = __ pc();
2081 __ enter(); // required for proper stackwalking of RuntimeStub frame
2083 // bump this on entry, not on exit:
2084 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2086 //-----------------------------------------------------------------------
2087 // Assembler stub will be used for this call to arraycopy
2088 // if the following conditions are met:
2089 //
2090 // (1) src and dst must not be null.
2091 // (2) src_pos must not be negative.
2092 // (3) dst_pos must not be negative.
2093 // (4) length must not be negative.
2094 // (5) src klass and dst klass should be the same and not NULL.
2095 // (6) src and dst should be arrays.
2096 // (7) src_pos + length must not exceed length of src.
2097 // (8) dst_pos + length must not exceed length of dst.
2098 //
2100 // if (src == NULL) return -1;
2101 __ testptr(src, src); // src oop
2102 size_t j1off = __ offset();
2103 __ jccb(Assembler::zero, L_failed_0);
2105 // if (src_pos < 0) return -1;
2106 __ testl(src_pos, src_pos); // src_pos (32-bits)
2107 __ jccb(Assembler::negative, L_failed_0);
2109 // if (dst == NULL) return -1;
2110 __ testptr(dst, dst); // dst oop
2111 __ jccb(Assembler::zero, L_failed_0);
2113 // if (dst_pos < 0) return -1;
2114 __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2115 size_t j4off = __ offset();
2116 __ jccb(Assembler::negative, L_failed_0);
2118 // The first four tests are very dense code,
2119 // but not quite dense enough to put four
2120 // jumps in a 16-byte instruction fetch buffer.
2121 // That's good, because some branch predicters
2122 // do not like jumps so close together.
2123 // Make sure of this.
2124 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2126 // registers used as temp
2127 const Register r11_length = r11; // elements count to copy
2128 const Register r10_src_klass = r10; // array klass
2129 const Register r9_dst_klass = r9; // dest array klass
2131 // if (length < 0) return -1;
2132 __ movl(r11_length, C_RARG4); // length (elements count, 32-bits value)
2133 __ testl(r11_length, r11_length);
2134 __ jccb(Assembler::negative, L_failed_0);
2136 __ load_klass(r10_src_klass, src);
2137 #ifdef ASSERT
2138 // assert(src->klass() != NULL);
2139 BLOCK_COMMENT("assert klasses not null");
2140 { Label L1, L2;
2141 __ testptr(r10_src_klass, r10_src_klass);
2142 __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL
2143 __ bind(L1);
2144 __ stop("broken null klass");
2145 __ bind(L2);
2146 __ load_klass(r9_dst_klass, dst);
2147 __ cmpq(r9_dst_klass, 0);
2148 __ jcc(Assembler::equal, L1); // this would be broken also
2149 BLOCK_COMMENT("assert done");
2150 }
2151 #endif
2153 // Load layout helper (32-bits)
2154 //
2155 // |array_tag| | header_size | element_type | |log2_element_size|
2156 // 32 30 24 16 8 2 0
2157 //
2158 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2159 //
2161 int lh_offset = klassOopDesc::header_size() * HeapWordSize +
2162 Klass::layout_helper_offset_in_bytes();
2164 const Register rax_lh = rax; // layout helper
2166 __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2168 // Handle objArrays completely differently...
2169 jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2170 __ cmpl(rax_lh, objArray_lh);
2171 __ jcc(Assembler::equal, L_objArray);
2173 // if (src->klass() != dst->klass()) return -1;
2174 __ load_klass(r9_dst_klass, dst);
2175 __ cmpq(r10_src_klass, r9_dst_klass);
2176 __ jcc(Assembler::notEqual, L_failed);
2178 // if (!src->is_Array()) return -1;
2179 __ cmpl(rax_lh, Klass::_lh_neutral_value);
2180 __ jcc(Assembler::greaterEqual, L_failed);
2182 // At this point, it is known to be a typeArray (array_tag 0x3).
2183 #ifdef ASSERT
2184 { Label L;
2185 __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2186 __ jcc(Assembler::greaterEqual, L);
2187 __ stop("must be a primitive array");
2188 __ bind(L);
2189 }
2190 #endif
2192 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2193 r10, L_failed);
2195 // typeArrayKlass
2196 //
2197 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2198 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2199 //
2201 const Register r10_offset = r10; // array offset
2202 const Register rax_elsize = rax_lh; // element size
2204 __ movl(r10_offset, rax_lh);
2205 __ shrl(r10_offset, Klass::_lh_header_size_shift);
2206 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
2207 __ addptr(src, r10_offset); // src array offset
2208 __ addptr(dst, r10_offset); // dst array offset
2209 BLOCK_COMMENT("choose copy loop based on element size");
2210 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2212 // next registers should be set before the jump to corresponding stub
2213 const Register from = c_rarg0; // source array address
2214 const Register to = c_rarg1; // destination array address
2215 const Register count = c_rarg2; // elements count
2217 // 'from', 'to', 'count' registers should be set in such order
2218 // since they are the same as 'src', 'src_pos', 'dst'.
2220 __ BIND(L_copy_bytes);
2221 __ cmpl(rax_elsize, 0);
2222 __ jccb(Assembler::notEqual, L_copy_shorts);
2223 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2224 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2225 __ movl2ptr(count, r11_length); // length
2226 __ jump(RuntimeAddress(byte_copy_entry));
2228 __ BIND(L_copy_shorts);
2229 __ cmpl(rax_elsize, LogBytesPerShort);
2230 __ jccb(Assembler::notEqual, L_copy_ints);
2231 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2232 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2233 __ movl2ptr(count, r11_length); // length
2234 __ jump(RuntimeAddress(short_copy_entry));
2236 __ BIND(L_copy_ints);
2237 __ cmpl(rax_elsize, LogBytesPerInt);
2238 __ jccb(Assembler::notEqual, L_copy_longs);
2239 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2240 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2241 __ movl2ptr(count, r11_length); // length
2242 __ jump(RuntimeAddress(int_copy_entry));
2244 __ BIND(L_copy_longs);
2245 #ifdef ASSERT
2246 { Label L;
2247 __ cmpl(rax_elsize, LogBytesPerLong);
2248 __ jcc(Assembler::equal, L);
2249 __ stop("must be long copy, but elsize is wrong");
2250 __ bind(L);
2251 }
2252 #endif
2253 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2254 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2255 __ movl2ptr(count, r11_length); // length
2256 __ jump(RuntimeAddress(long_copy_entry));
2258 // objArrayKlass
2259 __ BIND(L_objArray);
2260 // live at this point: r10_src_klass, src[_pos], dst[_pos]
2262 Label L_plain_copy, L_checkcast_copy;
2263 // test array classes for subtyping
2264 __ load_klass(r9_dst_klass, dst);
2265 __ cmpq(r10_src_klass, r9_dst_klass); // usual case is exact equality
2266 __ jcc(Assembler::notEqual, L_checkcast_copy);
2268 // Identically typed arrays can be copied without element-wise checks.
2269 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2270 r10, L_failed);
2272 __ lea(from, Address(src, src_pos, TIMES_OOP,
2273 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2274 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
2275 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2276 __ movl2ptr(count, r11_length); // length
2277 __ BIND(L_plain_copy);
2278 __ jump(RuntimeAddress(oop_copy_entry));
2280 __ BIND(L_checkcast_copy);
2281 // live at this point: r10_src_klass, !r11_length
2282 {
2283 // assert(r11_length == C_RARG4); // will reload from here
2284 Register r11_dst_klass = r11;
2285 __ load_klass(r11_dst_klass, dst);
2287 // Before looking at dst.length, make sure dst is also an objArray.
2288 __ cmpl(Address(r11_dst_klass, lh_offset), objArray_lh);
2289 __ jcc(Assembler::notEqual, L_failed);
2291 // It is safe to examine both src.length and dst.length.
2292 #ifndef _WIN64
2293 arraycopy_range_checks(src, src_pos, dst, dst_pos, C_RARG4,
2294 rax, L_failed);
2295 #else
2296 __ movl(r11_length, C_RARG4); // reload
2297 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2298 rax, L_failed);
2299 __ load_klass(r11_dst_klass, dst); // reload
2300 #endif
2302 // Marshal the base address arguments now, freeing registers.
2303 __ lea(from, Address(src, src_pos, TIMES_OOP,
2304 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2305 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
2306 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2307 __ movl(count, C_RARG4); // length (reloaded)
2308 Register sco_temp = c_rarg3; // this register is free now
2309 assert_different_registers(from, to, count, sco_temp,
2310 r11_dst_klass, r10_src_klass);
2311 assert_clean_int(count, sco_temp);
2313 // Generate the type check.
2314 int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
2315 Klass::super_check_offset_offset_in_bytes());
2316 __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
2317 assert_clean_int(sco_temp, rax);
2318 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
2320 // Fetch destination element klass from the objArrayKlass header.
2321 int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
2322 objArrayKlass::element_klass_offset_in_bytes());
2323 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
2324 __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
2325 assert_clean_int(sco_temp, rax);
2327 // the checkcast_copy loop needs two extra arguments:
2328 assert(c_rarg3 == sco_temp, "#3 already in place");
2329 __ movptr(C_RARG4, r11_dst_klass); // dst.klass.element_klass
2330 __ jump(RuntimeAddress(checkcast_copy_entry));
2331 }
2333 __ BIND(L_failed);
2334 __ xorptr(rax, rax);
2335 __ notptr(rax); // return -1
2336 __ leave(); // required for proper stackwalking of RuntimeStub frame
2337 __ ret(0);
2339 return start;
2340 }
2342 #undef length_arg
2343 #endif
2345 //FIXME
2346 address generate_disjoint_long_copy(bool aligned, const char *name) {
2347 Label l_1, l_2;
2348 StubCodeMark mark(this, "StubRoutines", name);
2349 __ align(CodeEntryAlignment);
2350 address start = __ pc();
2352 // __ movl(ecx, Address(esp, 4+8)); // count
2353 // __ movl(eax, Address(esp, 4+0)); // from
2354 // __ movl(edx, Address(esp, 4+4)); // to
2355 __ move(T1, A2);
2356 __ move(T3, A0);
2357 __ move(T0, A1);
2358 __ push(T3);
2359 __ push(T0);
2360 __ push(T1);
2361 //__ subl(edx, eax);
2362 //__ jmp(l_2);
2363 __ b(l_2);
2364 __ delayed()->nop();
2365 __ align(16);
2366 __ bind(l_1);
2367 // if (VM_Version::supports_mmx()) {
2368 // __ movq(mmx0, Address(eax));
2369 // __ movq(Address(eax, edx, Address::times_1), mmx0);
2370 // } else {
2371 // __ fild_d(Address(eax));
2372 __ ld(AT, T3, 0);
2373 // __ fistp_d(Address(eax, edx, Address::times_1));
2374 __ sd (AT, T0, 0);
2375 // }
2376 // __ addl(eax, 8);
2377 __ addi(T3, T3, 8);
2378 __ addi(T0, T0, 8);
2379 __ bind(l_2);
2380 // __ decl(ecx);
2381 __ addi(T1, T1, -1);
2382 // __ jcc(Assembler::greaterEqual, l_1);
2383 __ bgez(T1, l_1);
2384 __ delayed()->nop();
2385 // if (VM_Version::supports_mmx()) {
2386 // __ emms();
2387 // }
2388 // __ ret(0);
2389 __ pop(T1);
2390 __ pop(T0);
2391 __ pop(T3);
2392 __ jr(RA);
2393 __ delayed()->nop();
2394 return start;
2395 }
2398 address generate_conjoint_long_copy(bool aligned, const char *name) {
2399 Label l_1, l_2;
2400 StubCodeMark mark(this, "StubRoutines", name);
2401 __ align(CodeEntryAlignment);
2402 address start = __ pc();
2403 address nooverlap_target = aligned ?
2404 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
2405 StubRoutines::jlong_disjoint_arraycopy();
2406 array_overlap_test(nooverlap_target, 3);
2408 __ push(T3);
2409 __ push(T0);
2410 __ push(T1);
2412 /* __ movl(ecx, Address(esp, 4+8)); // count
2413 __ movl(eax, Address(esp, 4+0)); // from
2414 __ movl(edx, Address(esp, 4+4)); // to
2415 __ jmp(l_2);
2417 */
2418 __ move(T1, A2);
2419 __ move(T3, A0);
2420 __ move(T0, A1);
2421 __ sll(AT, T1, Address::times_8);
2422 __ add(AT, T3, AT);
2423 __ lea(T3 , Address(AT, -8));
2424 __ sll(AT, T1, Address::times_8);
2425 __ add(AT, T0, AT);
2426 __ lea(T0 , Address(AT, -8));
2430 __ b(l_2);
2431 __ delayed()->nop();
2432 __ align(16);
2433 __ bind(l_1);
2434 /* if (VM_Version::supports_mmx()) {
2435 __ movq(mmx0, Address(eax, ecx, Address::times_8));
2436 __ movq(Address(edx, ecx,Address::times_8), mmx0);
2437 } else {
2438 __ fild_d(Address(eax, ecx, Address::times_8));
2439 __ fistp_d(Address(edx, ecx,Address::times_8));
2440 }
2441 */
2442 __ ld(AT, T3, 0);
2443 __ sd (AT, T0, 0);
2444 __ addi(T3, T3, -8);
2445 __ addi(T0, T0,-8);
2446 __ bind(l_2);
2447 // __ decl(ecx);
2448 __ addi(T1, T1, -1);
2449 //__ jcc(Assembler::greaterEqual, l_1);
2450 __ bgez(T1, l_1);
2451 __ delayed()->nop();
2452 // if (VM_Version::supports_mmx()) {
2453 // __ emms();
2454 // }
2455 // __ ret(0);
2456 __ pop(T1);
2457 __ pop(T0);
2458 __ pop(T3);
2459 __ jr(RA);
2460 __ delayed()->nop();
2461 return start;
2462 }
2464 void generate_arraycopy_stubs() {
2465 if (UseCompressedOops) {
2466 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, "oop_disjoint_arraycopy");
2467 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, "oop_arraycopy");
2468 } else {
2469 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, "oop_disjoint_arraycopy");
2470 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, "oop_arraycopy");
2471 }
2473 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
2474 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
2475 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
2476 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
2477 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
2479 // if (VM_Version::supports_mmx())
2480 //if (false)
2481 // StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_mmx_copy_aligned("arrayof_jshort_disjoint_arraycopy");
2482 // else
2483 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
2484 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(true, false, "arrayof_jint_disjoint_arraycopy");
2485 //StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(true, true, "arrayof_oop_disjoint_arraycopy");
2486 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
2488 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
2489 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
2490 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
2491 StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, "jlong_arraycopy");
2493 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
2494 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
2495 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_oop_copy(true, false, "arrayof_jint_arraycopy");
2496 //StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_int_oop_copy(true, true, "arrayof_oop_arraycopy");
2497 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
2499 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
2500 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
2501 }
2503 //Wang: add a function to implement SafeFetch32 and SafeFetchN
2504 void generate_safefetch(const char* name, int size, address* entry,
2505 address* fault_pc, address* continuation_pc) {
2506 // safefetch signatures:
2507 // int SafeFetch32(int* adr, int errValue);
2508 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2509 //
2510 // arguments:
2511 // A0 = adr
2512 // A1 = errValue
2513 //
2514 // result:
2515 // PPC_RET = *adr or errValue
2517 StubCodeMark mark(this, "StubRoutines", name);
2519 // Entry point, pc or function descriptor.
2520 *entry = __ pc();
2522 // Load *adr into A1, may fault.
2523 *fault_pc = __ pc();
2524 switch (size) {
2525 case 4:
2526 // int32_t
2527 __ lw(A1, A0, 0);
2528 break;
2529 case 8:
2530 // int64_t
2531 __ ld(A1, A0, 0);
2532 break;
2533 default:
2534 ShouldNotReachHere();
2535 }
2537 // return errValue or *adr
2538 *continuation_pc = __ pc();
2539 __ addu(V0,A1,R0);
2540 __ jr(RA);
2541 __ delayed()->nop();
2542 }
2545 #undef __
2546 #define __ masm->
2548 // Continuation point for throwing of implicit exceptions that are
2549 // not handled in the current activation. Fabricates an exception
2550 // oop and initiates normal exception dispatching in this
2551 // frame. Since we need to preserve callee-saved values (currently
2552 // only for C2, but done for C1 as well) we need a callee-saved oop
2553 // map and therefore have to make these stubs into RuntimeStubs
2554 // rather than BufferBlobs. If the compiler needs all registers to
2555 // be preserved between the fault point and the exception handler
2556 // then it must assume responsibility for that in
2557 // AbstractCompiler::continuation_for_implicit_null_exception or
2558 // continuation_for_implicit_division_by_zero_exception. All other
2559 // implicit exceptions (e.g., NullPointerException or
2560 // AbstractMethodError on entry) are either at call sites or
2561 // otherwise assume that stack unwinding will be initiated, so
2562 // caller saved registers were assumed volatile in the compiler.
2563 address generate_throw_exception(const char* name,
2564 address runtime_entry,
2565 bool restore_saved_exception_pc) {
2566 // Information about frame layout at time of blocking runtime call.
2567 // Note that we only have to preserve callee-saved registers since
2568 // the compilers are responsible for supplying a continuation point
2569 // if they expect all registers to be preserved.
2570 //#define aoqi_test
2571 #ifdef aoqi_test
2572 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
2573 #endif
2574 enum layout {
2575 thread_off, // last_java_sp
2576 S7_off, // callee saved register sp + 1
2577 S6_off, // callee saved register sp + 2
2578 S5_off, // callee saved register sp + 3
2579 S4_off, // callee saved register sp + 4
2580 S3_off, // callee saved register sp + 5
2581 S2_off, // callee saved register sp + 6
2582 S1_off, // callee saved register sp + 7
2583 S0_off, // callee saved register sp + 8
2584 FP_off,
2585 ret_address,
2586 framesize
2587 };
2589 int insts_size = 2048;
2590 int locs_size = 32;
2592 // CodeBuffer* code = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false,
2593 // NULL, NULL, NULL, false, NULL, name, false);
2594 CodeBuffer code (name , insts_size, locs_size);
2595 #ifdef aoqi_test
2596 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
2597 #endif
2598 OopMapSet* oop_maps = new OopMapSet();
2599 #ifdef aoqi_test
2600 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
2601 #endif
2602 MacroAssembler* masm = new MacroAssembler(&code);
2603 #ifdef aoqi_test
2604 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
2605 #endif
2607 address start = __ pc();
2608 //__ stop("generate_throw_exception");
2609 /*
2610 __ move(AT, (int)&jerome1 );
2611 __ sw(SP, AT, 0);
2612 __ move(AT, (int)&jerome2 );
2613 __ sw(FP, AT, 0);
2614 __ move(AT, (int)&jerome3 );
2615 __ sw(RA, AT, 0);
2616 __ move(AT, (int)&jerome4 );
2617 __ sw(R0, AT, 0);
2618 __ move(AT, (int)&jerome5 );
2619 __ sw(R0, AT, 0);
2620 __ move(AT, (int)&jerome6 );
2621 __ sw(R0, AT, 0);
2622 __ move(AT, (int)&jerome7 );
2623 __ sw(R0, AT, 0);
2624 __ move(AT, (int)&jerome10 );
2625 __ sw(R0, AT, 0);
2627 __ pushad();
2629 //__ enter();
2630 __ call(CAST_FROM_FN_PTR(address, SharedRuntime::print_call_statistics),
2631 relocInfo::runtime_call_type);
2632 __ delayed()->nop();
2634 //__ leave();
2635 __ popad();
2637 */
2639 // This is an inlined and slightly modified version of call_VM
2640 // which has the ability to fetch the return PC out of
2641 // thread-local storage and also sets up last_Java_sp slightly
2642 // differently than the real call_VM
2643 #ifndef OPT_THREAD
2644 Register java_thread = TREG;
2645 __ get_thread(java_thread);
2646 #else
2647 Register java_thread = TREG;
2648 #endif
2649 #ifdef aoqi_test
2650 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
2651 #endif
2652 if (restore_saved_exception_pc) {
2653 __ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset())); // eax
2654 }
2656 __ enter(); // required for proper stackwalking of RuntimeStub frame
2658 __ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
2659 __ sd(S0, SP, S0_off * wordSize);
2660 __ sd(S1, SP, S1_off * wordSize);
2661 __ sd(S2, SP, S2_off * wordSize);
2662 __ sd(S3, SP, S3_off * wordSize);
2663 __ sd(S4, SP, S4_off * wordSize);
2664 __ sd(S5, SP, S5_off * wordSize);
2665 __ sd(S6, SP, S6_off * wordSize);
2666 __ sd(S7, SP, S7_off * wordSize);
2668 int frame_complete = __ pc() - start;
2669 // push java thread (becomes first argument of C function)
2670 __ sd(java_thread, SP, thread_off * wordSize);
2671 if (java_thread!=A0)
2672 __ move(A0, java_thread);
2674 // Set up last_Java_sp and last_Java_fp
2675 __ set_last_Java_frame(java_thread, SP, FP, NULL);
2676 __ relocate(relocInfo::internal_pc_type);
2677 {
2678 intptr_t save_pc = (intptr_t)__ pc() + NativeMovConstReg::instruction_size + NativeCall::return_address_offset + 4;
2679 __ li48(AT, save_pc);
2680 }
2681 __ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
2683 // Call runtime
2684 __ call(runtime_entry);
2685 __ delayed()->nop();
2686 // Generate oop map
2687 OopMap* map = new OopMap(framesize, 0);
2688 oop_maps->add_gc_map(__ offset(), map);
2690 // restore the thread (cannot use the pushed argument since arguments
2691 // may be overwritten by C code generated by an optimizing compiler);
2692 // however can use the register value directly if it is callee saved.
2693 #ifndef OPT_THREAD
2694 __ get_thread(java_thread);
2695 #endif
2697 __ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
2698 // __ reset_last_Java_frame(java_thread, true);
2699 __ reset_last_Java_frame(java_thread, true, true);
2701 // Restore callee save registers. This must be done after resetting the Java frame
2702 __ ld(S0, SP, S0_off * wordSize);
2703 __ ld(S1, SP, S1_off * wordSize);
2704 __ ld(S2, SP, S2_off * wordSize);
2705 __ ld(S3, SP, S3_off * wordSize);
2706 __ ld(S4, SP, S4_off * wordSize);
2707 __ ld(S5, SP, S5_off * wordSize);
2708 __ ld(S6, SP, S6_off * wordSize);
2709 __ ld(S7, SP, S7_off * wordSize);
2711 // discard arguments
2712 __ addi(SP, SP, (framesize-2) * wordSize); // epilog
2713 // __ leave(); // required for proper stackwalking of RuntimeStub frame
2714 __ addi(SP, FP, wordSize);
2715 __ ld(FP, SP, -1*wordSize);
2716 // check for pending exceptions
2717 #ifdef ASSERT
2718 Label L;
2719 __ lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
2720 __ bne(AT, R0, L);
2721 __ delayed()->nop();
2722 __ should_not_reach_here();
2723 __ bind(L);
2724 #endif //ASSERT
2725 __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
2726 __ delayed()->nop();
2727 #ifdef aoqi_test
2728 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
2729 #endif
2730 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code,frame_complete,
2731 framesize, oop_maps, false);
2732 #ifdef aoqi_test
2733 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
2734 #endif
2735 return stub->entry_point();
2736 }
2738 // Initialization
2739 void generate_initial() {
2740 /*
2741 // Generates all stubs and initializes the entry points
2743 // This platform-specific stub is needed by generate_call_stub()
2744 StubRoutines::mips::_mxcsr_std = generate_fp_mask("mxcsr_std", 0x0000000000001F80);
2746 // entry points that exist in all platforms Note: This is code
2747 // that could be shared among different platforms - however the
2748 // benefit seems to be smaller than the disadvantage of having a
2749 // much more complicated generator structure. See also comment in
2750 // stubRoutines.hpp.
2752 StubRoutines::_forward_exception_entry = generate_forward_exception();
2754 StubRoutines::_call_stub_entry =
2755 generate_call_stub(StubRoutines::_call_stub_return_address);
2757 // is referenced by megamorphic call
2758 StubRoutines::_catch_exception_entry = generate_catch_exception();
2760 // atomic calls
2761 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
2762 StubRoutines::_atomic_xchg_ptr_entry = generate_atomic_xchg_ptr();
2763 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
2764 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
2765 StubRoutines::_atomic_add_entry = generate_atomic_add();
2766 StubRoutines::_atomic_add_ptr_entry = generate_atomic_add_ptr();
2767 StubRoutines::_fence_entry = generate_orderaccess_fence();
2769 StubRoutines::_handler_for_unsafe_access_entry =
2770 generate_handler_for_unsafe_access();
2772 // platform dependent
2773 StubRoutines::mips::_get_previous_fp_entry = generate_get_previous_fp();
2775 StubRoutines::mips::_verify_mxcsr_entry = generate_verify_mxcsr();
2776 */
2777 // Generates all stubs and initializes the entry points
2779 //-------------------------------------------------------------
2780 //-----------------------------------------------------------
2781 // entry points that exist in all platforms
2782 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller
2783 // than the disadvantage of having a much more complicated generator structure.
2784 // See also comment in stubRoutines.hpp.
2785 StubRoutines::_forward_exception_entry = generate_forward_exception();
2786 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
2787 // is referenced by megamorphic call
2788 StubRoutines::_catch_exception_entry = generate_catch_exception();
2790 StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
2792 // platform dependent
2793 StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
2794 }
2796 void generate_all() {
2797 #ifdef aoqi_test
2798 tty->print_cr("%s:%d", __func__, __LINE__);
2799 #endif
2800 // Generates all stubs and initializes the entry points
2802 // These entry points require SharedInfo::stack0 to be set up in
2803 // non-core builds and need to be relocatable, so they each
2804 // fabricate a RuntimeStub internally.
2805 /*
2806 StubRoutines::_throw_AbstractMethodError_entry =
2807 generate_throw_exception("AbstractMethodError throw_exception",
2808 CAST_FROM_FN_PTR(address,
2809 SharedRuntime::
2810 throw_AbstractMethodError),
2811 false);
2813 StubRoutines::_throw_IncompatibleClassChangeError_entry =
2814 generate_throw_exception("IncompatibleClassChangeError throw_exception",
2815 CAST_FROM_FN_PTR(address,
2816 SharedRuntime::
2817 throw_IncompatibleClassChangeError),
2818 false);
2820 StubRoutines::_throw_ArithmeticException_entry =
2821 generate_throw_exception("ArithmeticException throw_exception",
2822 CAST_FROM_FN_PTR(address,
2823 SharedRuntime::
2824 throw_ArithmeticException),
2825 true);
2827 StubRoutines::_throw_NullPointerException_entry =
2828 generate_throw_exception("NullPointerException throw_exception",
2829 CAST_FROM_FN_PTR(address,
2830 SharedRuntime::
2831 throw_NullPointerException),
2832 true);
2834 StubRoutines::_throw_NullPointerException_at_call_entry =
2835 generate_throw_exception("NullPointerException at call throw_exception",
2836 CAST_FROM_FN_PTR(address,
2837 SharedRuntime::
2838 throw_NullPointerException_at_call),
2839 false);
2841 StubRoutines::_throw_StackOverflowError_entry =
2842 generate_throw_exception("StackOverflowError throw_exception",
2843 CAST_FROM_FN_PTR(address,
2844 SharedRuntime::
2845 throw_StackOverflowError),
2846 false);
2848 // entry points that are platform specific
2849 StubRoutines::mips::_f2i_fixup = generate_f2i_fixup();
2850 StubRoutines::mips::_f2l_fixup = generate_f2l_fixup();
2851 StubRoutines::mips::_d2i_fixup = generate_d2i_fixup();
2852 StubRoutines::mips::_d2l_fixup = generate_d2l_fixup();
2854 StubRoutines::mips::_float_sign_mask = generate_fp_mask("float_sign_mask", 0x7FFFFFFF7FFFFFFF);
2855 StubRoutines::mips::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000);
2856 StubRoutines::mips::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
2857 StubRoutines::mips::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
2859 // support for verify_oop (must happen after universe_init)
2860 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
2862 // arraycopy stubs used by compilers
2863 generate_arraycopy_stubs();
2864 */
2865 #ifdef aoqi_test
2866 tty->print_cr("%s:%d", __func__, __LINE__);
2867 #endif
2868 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
2869 #ifdef aoqi_test
2870 tty->print_cr("%s:%d", __func__, __LINE__);
2871 #endif
2872 // StubRoutines::_throw_ArithmeticException_entry = generate_throw_exception("ArithmeticException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException), true);
2873 #ifdef aoqi_test
2874 tty->print_cr("%s:%d", __func__, __LINE__);
2875 #endif
2876 // StubRoutines::_throw_NullPointerException_entry = generate_throw_exception("NullPointerException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
2877 #ifdef aoqi_test
2878 tty->print_cr("%s:%d", __func__, __LINE__);
2879 #endif
2880 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2881 #ifdef aoqi_test
2882 tty->print_cr("%s:%d", __func__, __LINE__);
2883 #endif
2884 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
2885 #ifdef aoqi_test
2886 tty->print_cr("%s:%d", __func__, __LINE__);
2887 #endif
2889 //------------------------------------------------------
2890 //------------------------------------------------------------------
2891 // entry points that are platform specific
2893 // support for verify_oop (must happen after universe_init)
2894 #ifdef aoqi_test
2895 tty->print_cr("%s:%d", __func__, __LINE__);
2896 #endif
2897 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
2898 #ifdef aoqi_test
2899 tty->print_cr("%s:%d", __func__, __LINE__);
2900 #endif
2901 #ifndef CORE
2902 // arraycopy stubs used by compilers
2903 generate_arraycopy_stubs();
2904 #ifdef aoqi_test
2905 tty->print_cr("%s:%d", __func__, __LINE__);
2906 #endif
2907 #endif
2909 // Safefetch stubs.
2910 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
2911 &StubRoutines::_safefetch32_fault_pc,
2912 &StubRoutines::_safefetch32_continuation_pc);
2913 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
2914 &StubRoutines::_safefetchN_fault_pc,
2915 &StubRoutines::_safefetchN_continuation_pc);
2916 }
2918 public:
2919 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2920 if (all) {
2921 generate_all();
2922 } else {
2923 generate_initial();
2924 }
2925 }
2926 }; // end class declaration
2927 /*
2928 address StubGenerator::disjoint_byte_copy_entry = NULL;
2929 address StubGenerator::disjoint_short_copy_entry = NULL;
2930 address StubGenerator::disjoint_int_copy_entry = NULL;
2931 address StubGenerator::disjoint_long_copy_entry = NULL;
2932 address StubGenerator::disjoint_oop_copy_entry = NULL;
2934 address StubGenerator::byte_copy_entry = NULL;
2935 address StubGenerator::short_copy_entry = NULL;
2936 address StubGenerator::int_copy_entry = NULL;
2937 address StubGenerator::long_copy_entry = NULL;
2938 address StubGenerator::oop_copy_entry = NULL;
2940 address StubGenerator::checkcast_copy_entry = NULL;
2941 */
2942 void StubGenerator_generate(CodeBuffer* code, bool all) {
2943 StubGenerator g(code, all);
2944 }