src/cpu/mips/vm/stubGenerator_mips_64.cpp

Mon, 30 May 2016 02:01:38 -0400

author
aoqi
date
Mon, 30 May 2016 02:01:38 -0400
changeset 13
bc227c49eaae
parent 8
cf5765c81f87
child 32
3b95e10c12fa
permissions
-rw-r--r--

[C2] Rewrite generate_disjoint_short_copy.
Eliminated unaligned accesses and optimized the copy algorithm.
xml.transform improved by 50%, total GEO improved by 13%.
Copy Algorithm:
Generate stub for disjoint short copy. If "aligned" is true, the
"from" and "to" addresses are assumed to be heapword aligned.

Arguments for generated stub:
from: A0
to: A1
elm.count: A2 treated as signed
one element: 2 bytes

Strategy for aligned==true:

If length <= 9:
1. copy 1 element at a time (l_5)

If length > 9:
1. copy 4 elements at a time until less than 4 elements are left (l_7)
2. copy 2 elements at a time until less than 2 elements are left (l_6)
3. copy last element if one was left in step 2. (l_1)


Strategy for aligned==false:

If length <= 9: same as aligned==true case

If length > 9:
1. if from and to have different alignment mod 4, continue with
step 7.
2. align from and to to 4 bytes by copying 1 element if necessary
3. at l_2 from and to are 4 byte aligned; if they have different
alignment mod 8 they cannot both be aligned to 8 bytes, so
continue with step 6.
4. at this point we know that both, from and to, have the same
alignment mod 8, now copy one element if necessary to get
8 byte alignment of from and to.
5. copy 4 elements at a time until less than 4 elements are
left; depending on step 3. all load/stores are aligned.
6. copy 2 elements at a time until less than 2 elements are
left. (l_6)
7. copy 1 element at a time. (l_5)
8. copy last element if one was left in step 6. (l_1)

TODO:

1. use loongson 128-bit load/store
2. use loop unrolling optimization when len is big enough, for example if
len > 0x2000:
__ bind(l_x);
__ ld(AT, tmp1, 0);
__ ld(tmp, tmp1, 8);
__ sd(AT, tmp2, 0);
__ sd(tmp, tmp2, 8);
__ ld(AT, tmp1, 16);
__ ld(tmp, tmp1, 24);
__ sd(AT, tmp2, 16);
__ sd(tmp, tmp2, 24);
__ daddi(tmp1, tmp1, 32);
__ daddi(tmp2, tmp2, 32);
__ daddi(tmp3, tmp3, -16);
__ daddi(AT, tmp3, -16);
__ bgez(AT, l_x);
__ delayed()->nop();

aoqi@1 1 /*
aoqi@1 2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
aoqi@1 3 * Copyright (c) 2015, 2016, Loongson Technology. All rights reserved.
aoqi@1 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
aoqi@1 5 *
aoqi@1 6 * This code is free software; you can redistribute it and/or modify it
aoqi@1 7 * under the terms of the GNU General Public License version 2 only, as
aoqi@1 8 * published by the Free Software Foundation.
aoqi@1 9 *
aoqi@1 10 * This code is distributed in the hope that it will be useful, but WITHOUT
aoqi@1 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
aoqi@1 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
aoqi@1 13 * version 2 for more details (a copy is included in the LICENSE file that
aoqi@1 14 * accompanied this code).
aoqi@1 15 *
aoqi@1 16 * You should have received a copy of the GNU General Public License version
aoqi@1 17 * 2 along with this work; if not, write to the Free Software Foundation,
aoqi@1 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
aoqi@1 19 *
aoqi@1 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
aoqi@1 21 * or visit www.oracle.com if you need additional information or have any
aoqi@1 22 * questions.
aoqi@1 23 *
aoqi@1 24 */
aoqi@1 25
aoqi@1 26 #include "precompiled.hpp"
aoqi@1 27 #include "asm/macroAssembler.hpp"
aoqi@1 28 #include "asm/macroAssembler.inline.hpp"
aoqi@1 29 #include "interpreter/interpreter.hpp"
aoqi@1 30 #include "nativeInst_mips.hpp"
aoqi@1 31 #include "oops/instanceOop.hpp"
aoqi@1 32 #include "oops/method.hpp"
aoqi@1 33 #include "oops/objArrayKlass.hpp"
aoqi@1 34 #include "oops/oop.inline.hpp"
aoqi@1 35 #include "prims/methodHandles.hpp"
aoqi@1 36 #include "runtime/frame.inline.hpp"
aoqi@1 37 #include "runtime/handles.inline.hpp"
aoqi@1 38 #include "runtime/sharedRuntime.hpp"
aoqi@1 39 #include "runtime/stubCodeGenerator.hpp"
aoqi@1 40 #include "runtime/stubRoutines.hpp"
aoqi@1 41 #include "runtime/thread.inline.hpp"
aoqi@1 42 #include "utilities/top.hpp"
aoqi@1 43 #ifdef COMPILER2
aoqi@1 44 #include "opto/runtime.hpp"
aoqi@1 45 #endif
aoqi@1 46
aoqi@1 47
aoqi@1 48 // Declaration and definition of StubGenerator (no .hpp file).
aoqi@1 49 // For a more detailed description of the stub routine structure
aoqi@1 50 // see the comment in stubRoutines.hpp
aoqi@1 51
aoqi@1 52 #define __ _masm->
aoqi@1 53 //#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
aoqi@1 54 //#define a__ ((Assembler*)_masm)->
aoqi@1 55
aoqi@1 56 //#ifdef PRODUCT
aoqi@1 57 //#define BLOCK_COMMENT(str) /* nothing */
aoqi@1 58 //#else
aoqi@1 59 //#define BLOCK_COMMENT(str) __ block_comment(str)
aoqi@1 60 //#endif
aoqi@1 61
aoqi@1 62 //#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
aoqi@1 63 const int MXCSR_MASK = 0xFFC0; // Mask out any pending exceptions
aoqi@1 64
aoqi@1 65 // Stub Code definitions
aoqi@1 66
aoqi@1 67 static address handle_unsafe_access() { // Returns the address at which execution resumes after a faulting unsafe access and flags the current thread with a pending unsafe-access error.
aoqi@1 68 JavaThread* thread = JavaThread::current();
aoqi@1 69 address pc = thread->saved_exception_pc();
aoqi@1 70 // pc is the instruction which we must emulate;
aoqi@1 71 // doing a no-op is fine: return garbage from the load.
aoqi@1 72 // Therefore, compute npc (the address to resume at).
aoqi@1 73 //address npc = Assembler::locate_next_instruction(pc);
aoqi@1 74 address npc = (address)((unsigned long)pc + sizeof(unsigned long)); // NOTE(review): advances by sizeof(unsigned long) bytes rather than by one instruction as the disabled line above would -- confirm this skip distance is intended
aoqi@1 75
aoqi@1 76 // request an async exception
aoqi@1 77 thread->set_pending_unsafe_access_error();
aoqi@1 78
aoqi@1 79 // return address of next instruction to execute
aoqi@1 80 return npc;
aoqi@1 81 }
aoqi@1 82
aoqi@1 83 class StubGenerator: public StubCodeGenerator {
aoqi@1 84 private:
aoqi@1 85
aoqi@1 86 // ABI mips n64
aoqi@1 87 // This figure does not show the MIPS ABI itself; it shows the layout used when calling Java from C.
aoqi@1 88 // Call stubs are used to call Java from C
aoqi@1 89 //
aoqi@1 90 // [ return_from_Java ]
aoqi@1 91 // [ argument word n-1 ] <--- sp
aoqi@1 92 // ...
aoqi@1 93 // [ argument word 0 ]
aoqi@1 94 // ...
aoqi@1 95 //-10 [ S6 ]
aoqi@1 96 // -9 [ S5 ]
aoqi@1 97 // -8 [ S4 ]
aoqi@1 98 // -7 [ S3 ]
aoqi@1 99 // -6 [ S0 ]
aoqi@1 100 // -5 [ TSR(S2) ]
aoqi@1 101 // -4 [ LVP(S7) ]
aoqi@1 102 // -3 [ BCP(S1) ]
aoqi@1 103 // -2 [ saved fp ] <--- fp_after_call
aoqi@1 104 // -1 [ return address ]
aoqi@1 105 // 0 [ ptr. to call wrapper ] <--- a0 (old sp -->)fp
aoqi@1 106 // 1 [ result ] <--- a1
aoqi@1 107 // 2 [ result_type ] <--- a2
aoqi@1 108 // 3 [ method ] <--- a3
aoqi@1 109 // 4 [ entry_point ] <--- a4
aoqi@1 110 // 5 [ parameters ] <--- a5
aoqi@1 111 // 6 [ parameter_size ] <--- a6
aoqi@1 112 // 7 [ thread ] <--- a7
aoqi@1 113
aoqi@1 114 //
aoqi@1 115 // _LP64: n64 does not save paras in sp.
aoqi@1 116 //
aoqi@1 117 // [ return_from_Java ]
aoqi@1 118 // [ argument word n-1 ] <--- sp
aoqi@1 119 // ...
aoqi@1 120 // [ argument word 0 ]
aoqi@1 121 // ...
aoqi@1 122 //-14 [ thread ]
aoqi@1 123 //-13 [ result_type ] <--- a2
aoqi@1 124 //-12 [ result ] <--- a1
aoqi@1 125 //-11 [ ptr. to call wrapper ] <--- a0
aoqi@1 126 //-10 [ S6 ]
aoqi@1 127 // -9 [ S5 ]
aoqi@1 128 // -8 [ S4 ]
aoqi@1 129 // -7 [ S3 ]
aoqi@1 130 // -6 [ S0 ]
aoqi@1 131 // -5 [ TSR(S2) ]
aoqi@1 132 // -4 [ LVP(S7) ]
aoqi@1 133 // -3 [ BCP(S1) ]
aoqi@1 134 // -2 [ saved fp ] <--- fp_after_call
aoqi@1 135 // -1 [ return address ]
aoqi@1 136 // 0 [ ] <--- old sp
aoqi@1 137 /*
aoqi@1 138 * 2014/01/16 Fu: Find a right place in the call_stub for GP.
aoqi@1 139 * GP will point to the starting point of Interpreter::dispatch_table(itos).
aoqi@1 140 * It should be saved/restored before/after Java calls.
aoqi@1 141 *
aoqi@1 142 */
aoqi@1 143 enum call_stub_layout { // Slot offsets, in words, used by the call stub; every use multiplies by wordSize.
aoqi@1 144 RA_off = -1,
aoqi@1 145 FP_off = -2,
aoqi@1 146 BCP_off = -3,
aoqi@1 147 LVP_off = -4,
aoqi@1 148 TSR_off = -5,
aoqi@1 149 S1_off = -6, // NOTE(review): the layout comment above labels slot -6 as S0, while this slot is named and used for S1 -- confirm which register is meant
aoqi@1 150 S3_off = -7,
aoqi@1 151 S4_off = -8,
aoqi@1 152 S5_off = -9,
aoqi@1 153 S6_off = -10,
aoqi@1 154 result_off = -11, // result, result_type and thread are stored relative to FP in generate_call_stub
aoqi@1 155 result_type_off = -12,
aoqi@1 156 thread_off = -13,
aoqi@1 157 total_off = thread_off - 3, // number of words (negative) by which generate_call_stub moves SP
aoqi@1 158 GP_off = -16, // same value as total_off
aoqi@1 159 };
aoqi@1 160
aoqi@1 161 address generate_call_stub(address& return_address) { // Emits the "call_stub": saves registers, copies the Java arguments onto the stack, calls the entry point passed in A4 and stores the result through the pointer passed in A1. Returns the stub start; return_address receives the pc right after the call.
aoqi@1 162
aoqi@1 163 StubCodeMark mark(this, "StubRoutines", "call_stub");
aoqi@1 164 address start = __ pc();
aoqi@1 165
aoqi@1 166 // same as in generate_catch_exception()!
aoqi@1 167
aoqi@1 168 // stub code
aoqi@1 169 // save ra, fp and the other registers listed in call_stub_layout below the incoming SP
aoqi@1 170 __ sd(RA, SP, RA_off * wordSize);
aoqi@1 171 __ sd(FP, SP, FP_off * wordSize);
aoqi@1 172 __ sd(BCP, SP, BCP_off * wordSize);
aoqi@1 173 __ sd(LVP, SP, LVP_off * wordSize);
aoqi@1 174 __ sd(GP, SP, GP_off * wordSize);
aoqi@1 175 __ sd(TSR, SP, TSR_off * wordSize);
aoqi@1 176 __ sd(S1, SP, S1_off * wordSize);
aoqi@1 177 __ sd(S3, SP, S3_off * wordSize);
aoqi@1 178 __ sd(S4, SP, S4_off * wordSize);
aoqi@1 179 __ sd(S5, SP, S5_off * wordSize);
aoqi@1 180 __ sd(S6, SP, S6_off * wordSize);
aoqi@1 181
aoqi@1 182
aoqi@1 183 __ li48(GP, (long)Interpreter::dispatch_table(itos)); // GP now holds the address of the itos dispatch table
aoqi@1 184
aoqi@1 185 // I think 14 is the max gap between argument and callee saved register
aoqi@1 186 __ daddi(FP, SP, (-2) * wordSize); // FP = incoming SP - 2 words
aoqi@1 187 __ daddi(SP, SP, total_off * wordSize); // SP moves down past all save slots
aoqi@1 188 //FIXME, aoqi. find a suitable place to save A1 & A2.
aoqi@1 189 /*
aoqi@1 190 __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
aoqi@1 191 __ sd(A1, FP, 3 * wordSize);
aoqi@1 192 __ sd(A2, FP, 4 * wordSize);
aoqi@1 193 __ sd(A3, FP, 5 * wordSize);
aoqi@1 194 __ sd(A4, FP, 6 * wordSize);
aoqi@1 195 __ sd(A5, FP, 7 * wordSize);
aoqi@1 196 __ sd(A6, FP, 8 * wordSize);
aoqi@1 197 __ sd(A7, FP, 9 * wordSize);
aoqi@1 198 */
aoqi@1 199 __ sd(A0, FP, frame::entry_frame_call_wrapper_offset * wordSize);
aoqi@1 200 __ sd(A1, FP, result_off * wordSize); // reloaded after the call to find where to store the result
aoqi@1 201 __ sd(A2, FP, result_type_off * wordSize); // reloaded after the call to select the store width/kind
aoqi@1 202 __ sd(A7, FP, thread_off * wordSize);
aoqi@1 203
aoqi@1 204 #ifdef OPT_THREAD
aoqi@1 205 //__ get_thread(TREG);
aoqi@1 206 __ move(TREG, A7);
aoqi@1 207
aoqi@1 208 //__ ld(TREG, FP, thread_off * wordSize);
aoqi@1 209 #endif
aoqi@1 210 //add for compressedoops
aoqi@1 211 __ reinit_heapbase();
aoqi@1 212
aoqi@1 213 #ifdef ASSERT
aoqi@1 214 // make sure we have no pending exceptions
aoqi@1 215 {
aoqi@1 216 Label L;
aoqi@1 217 __ ld(AT, A7, in_bytes(Thread::pending_exception_offset()));
aoqi@1 218 __ beq(AT, R0, L);
aoqi@1 219 __ delayed()->nop();
aoqi@1 220 /* FIXME: I do not know how to realize stop in mips arch, do it in the future */
aoqi@1 221 __ stop("StubRoutines::call_stub: entered with pending exception");
aoqi@1 222 __ bind(L);
aoqi@1 223 }
aoqi@1 224 #endif
aoqi@1 225
aoqi@1 226 // pass parameters if any
aoqi@1 227 // A5: parameter
aoqi@1 228 // A6: parameter_size
aoqi@1 229 // T0: parameter_size_tmp(--)
aoqi@1 230 // T2: offset(++)
aoqi@1 231 // T3: tmp
aoqi@1 232 Label parameters_done;
aoqi@1 233 // skip the copy if parameter_size equals 0
aoqi@1 234 __ beq(A6, R0, parameters_done);
aoqi@1 235 __ delayed()->nop();
aoqi@1 236 __ dsll(AT, A6, Interpreter::logStackElementSize);
aoqi@1 237 __ dsub(SP, SP, AT);
aoqi@1 238 __ move(AT, -StackAlignmentInBytes);
aoqi@1 239 __ andr(SP, SP , AT); // round SP down to a StackAlignmentInBytes boundary
aoqi@1 240 // Copy Java parameters in reverse order (receiver last)
aoqi@1 241 // Note that the argument order is inverted in the process
aoqi@1 242 // source is A5 indexed by T0 (N-1..0)
aoqi@1 243 // dest is SP indexed by T2 (0..N-1)
aoqi@1 244 Label loop;
aoqi@1 245 __ move(T0, A6);
aoqi@1 246 __ move(T2, R0);
aoqi@1 247 __ bind(loop);
aoqi@1 248
aoqi@1 249 // get parameter
aoqi@1 250 __ dsll(T3, T0, LogBytesPerWord);
aoqi@1 251 __ dadd(T3, T3, A5);
aoqi@1 252 __ ld(AT, T3, -wordSize); // AT = parameters[T0 - 1]
aoqi@1 253 __ dsll(T3, T2, LogBytesPerWord);
aoqi@1 254 __ dadd(T3, T3, SP);
aoqi@1 255 __ sd(AT, T3, Interpreter::expr_offset_in_bytes(0));
aoqi@1 256 __ daddi(T2, T2, 1);
aoqi@1 257 __ daddi(T0, T0, -1);
aoqi@1 258 __ bne(T0, R0, loop);
aoqi@1 259 __ delayed()->nop();
aoqi@1 260 // advance to next parameter
aoqi@1 261
aoqi@1 262 // call Java function
aoqi@1 263 __ bind(parameters_done);
aoqi@1 264
aoqi@1 265 // receiver in V0, methodOop in Rmethod
aoqi@1 266
aoqi@1 267 __ move(Rmethod, A3);
aoqi@1 268 __ move(Rsender, SP); //set sender sp
aoqi@1 269 __ jalr(A4); // A4 holds the entry point
aoqi@1 270 __ delayed()->nop();
aoqi@1 271 return_address = __ pc();
aoqi@1 272
aoqi@1 273 Label common_return;
aoqi@1 274 __ bind(common_return);
aoqi@1 275
aoqi@1 276 // store result depending on type
aoqi@1 277 // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
aoqi@1 278 __ ld(T0, FP, result_off * wordSize); // result --> T0
aoqi@1 279 Label is_long, is_float, is_double, exit;
aoqi@1 280 __ ld(T2, FP, result_type_off * wordSize); // result_type --> T2
aoqi@1 281 __ daddi(T3, T2, (-1) * T_LONG);
aoqi@1 282 __ beq(T3, R0, is_long);
aoqi@1 283 __ delayed()->daddi(T3, T2, (-1) * T_FLOAT); // next comparison value is computed in the delay slot
aoqi@1 284 __ beq(T3, R0, is_float);
aoqi@1 285 __ delayed()->daddi(T3, T2, (-1) * T_DOUBLE);
aoqi@1 286 __ beq(T3, R0, is_double);
aoqi@1 287 __ delayed()->nop();
aoqi@1 288
aoqi@1 289 // handle T_INT case
aoqi@1 290 __ sd(V0, T0, 0 * wordSize);
aoqi@1 291 __ bind(exit);
aoqi@1 292
aoqi@1 293 // restore: SP goes back to its value on entry (FP + 2 words), then the saved registers are reloaded
aoqi@1 294 __ daddi(SP, FP, 2 * wordSize );
aoqi@1 295 __ ld(RA, SP, RA_off * wordSize);
aoqi@1 296 __ ld(FP, SP, FP_off * wordSize);
aoqi@1 297 __ ld(BCP, SP, BCP_off * wordSize);
aoqi@1 298 __ ld(LVP, SP, LVP_off * wordSize);
aoqi@1 299 __ ld(GP, SP, GP_off * wordSize);
aoqi@1 300 __ ld(TSR, SP, TSR_off * wordSize);
aoqi@1 301
aoqi@1 302 __ ld(S1, SP, S1_off * wordSize);
aoqi@1 303 __ ld(S3, SP, S3_off * wordSize);
aoqi@1 304 __ ld(S4, SP, S4_off * wordSize);
aoqi@1 305 __ ld(S5, SP, S5_off * wordSize);
aoqi@1 306 __ ld(S6, SP, S6_off * wordSize);
aoqi@1 307
aoqi@1 308 // return
aoqi@1 309 __ jr(RA);
aoqi@1 310 __ delayed()->nop();
aoqi@1 311
aoqi@1 312 // handle return types different from T_INT
aoqi@1 313 __ bind(is_long);
aoqi@1 314 __ sd(V0, T0, 0 * wordSize);
aoqi@1 315 //__ sd(V1, T0, 1 * wordSize);
aoqi@1 316 __ sd(R0, T0, 1 * wordSize); // NOTE(review): also zeroes the word following the result slot -- confirm the result buffer is at least two words long
aoqi@1 317 __ b(exit);
aoqi@1 318 __ delayed()->nop();
aoqi@1 319
aoqi@1 320 __ bind(is_float);
aoqi@1 321 __ swc1(F0, T0, 0 * wordSize);
aoqi@1 322 __ b(exit);
aoqi@1 323 __ delayed()->nop();
aoqi@1 324
aoqi@1 325 __ bind(is_double);
aoqi@1 326 __ sdc1(F0, T0, 0 * wordSize);
aoqi@1 327 //__ sdc1(F1, T0, 1 * wordSize);
aoqi@1 328 __ sd(R0, T0, 1 * wordSize); // NOTE(review): same two-word assumption as in the is_long case
aoqi@1 329 __ b(exit);
aoqi@1 330 __ delayed()->nop();
aoqi@1 331 //FIXME, 1.6 mips version add operation of fpu here
aoqi@1 332 StubRoutines::gs2::set_call_stub_compiled_return(__ pc()); // records a second return point that simply branches to common_return
aoqi@1 333 __ b(common_return);
aoqi@1 334 __ delayed()->nop();
aoqi@1 335 return start;
aoqi@1 336 }
aoqi@1 337
aoqi@1 338 // Return point for a Java call if there's an exception thrown in
aoqi@1 339 // Java code. The exception is caught and transformed into a
aoqi@1 340 // pending exception stored in JavaThread that can be tested from
aoqi@1 341 // within the VM.
aoqi@1 342 //
aoqi@1 343 // Note: Usually the parameters are removed by the callee. In case
aoqi@1 344 // of an exception crossing an activation frame boundary, that is
aoqi@1 345 // not the case if the callee is compiled code => need to setup the
aoqi@1 346 // rsp.
aoqi@1 347 //
aoqi@1 348 // rax: exception oop
aoqi@1 349
aoqi@1 350 address generate_catch_exception() { // Emits the "catch_exception" stub: stores the exception held in V0 as the thread's pending exception and jumps to the call stub's return address. Returns the stub start.
aoqi@1 351 StubCodeMark mark(this, "StubRoutines", "catch_exception");
aoqi@1 352 address start = __ pc();
aoqi@1 353
aoqi@1 354 Register thread = TREG;
aoqi@1 355
aoqi@1 356 // get thread directly from the slot written by the call stub
aoqi@1 357 #ifndef OPT_THREAD
aoqi@1 358 __ ld(thread, FP, thread_off * wordSize);
aoqi@1 359 #endif
aoqi@1 360
aoqi@1 361 #ifdef ASSERT
aoqi@1 362 // verify that threads correspond
aoqi@1 363 { Label L;
aoqi@1 364 __ get_thread(T8);
aoqi@1 365 __ beq(T8, thread, L);
aoqi@1 366 __ delayed()->nop();
aoqi@1 367 __ stop("StubRoutines::catch_exception: threads must correspond");
aoqi@1 368 __ bind(L);
aoqi@1 369 }
aoqi@1 370 #endif
aoqi@1 371 // set pending exception, together with this file name and line as its recorded origin
aoqi@1 372 __ verify_oop(V0);
aoqi@1 373 __ sd(V0, thread, in_bytes(Thread::pending_exception_offset()));
aoqi@1 374 __ li(AT, (long)__FILE__);
aoqi@1 375 __ sd(AT, thread, in_bytes(Thread::exception_file_offset ()));
aoqi@1 376 __ li(AT, (long)__LINE__);
aoqi@1 377 __ sd(AT, thread, in_bytes(Thread::exception_line_offset ()));
aoqi@1 378
aoqi@1 379 // complete return to VM
aoqi@1 380 assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
aoqi@1 381 __ jmp(StubRoutines::_call_stub_return_address, relocInfo::none);
aoqi@1 382 __ delayed()->nop();
aoqi@1 383
aoqi@1 384 return start;
aoqi@1 385 }
aoqi@1 386
aoqi@1 387 // Continuation point for runtime calls returning with a pending
aoqi@1 388 // exception. The pending exception check happened in the runtime
aoqi@1 389 // or native call stub. The pending exception in Thread is
aoqi@1 390 // converted into a Java-level exception.
aoqi@1 391 //
aoqi@1 392 // Contract with Java-level exception handlers:
aoqi@1 393 // rax: exception
aoqi@1 394 // rdx: throwing pc
aoqi@1 395 //
aoqi@1 396 // NOTE: At entry of this stub, exception-pc must be on stack !!
aoqi@1 397
aoqi@1 398 address generate_forward_exception() { // Emits the "forward exception" stub: looks up the handler for the return address on top of the stack, moves the thread's pending exception into V0 (clearing it in the thread) and jumps to the handler. Returns the stub start.
aoqi@1 399 StubCodeMark mark(this, "StubRoutines", "forward exception");
aoqi@1 400 //Register thread = TREG;
aoqi@1 401 Register thread = TREG;
aoqi@1 402 address start = __ pc();
aoqi@1 403
aoqi@1 404 // Upon entry, the sp points to the return address returning into Java
aoqi@1 405 // (interpreted or compiled) code; i.e., the return address becomes the
aoqi@1 406 // throwing pc.
aoqi@1 407 //
aoqi@1 408 // Arguments pushed before the runtime call are still on the stack but
aoqi@1 409 // the exception handler will reset the stack pointer -> ignore them.
aoqi@1 410 // A potential result in registers can be ignored as well.
aoqi@1 411
aoqi@1 412 #ifdef ASSERT
aoqi@1 413 // make sure this code is only executed if there is a pending exception
aoqi@1 414 #ifndef OPT_THREAD
aoqi@1 415 __ get_thread(thread);
aoqi@1 416 #endif
aoqi@1 417 { Label L;
aoqi@1 418 __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
aoqi@1 419 __ bne(AT, R0, L);
aoqi@1 420 __ delayed()->nop();
aoqi@1 421 __ stop("StubRoutines::forward exception: no pending exception (1)");
aoqi@1 422 __ bind(L);
aoqi@1 423 }
aoqi@1 424 #endif
aoqi@1 425
aoqi@1 426 // compute exception handler into T9
aoqi@1 427 __ ld(A1, SP, 0); // A1 = return address at the top of the stack
aoqi@1 428 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, A1);
aoqi@1 429 __ move(T9, V0);
aoqi@1 430 __ pop(V1); // removes the return address from the stack; it is kept in V1 as the throwing pc
aoqi@1 431
aoqi@1 432 #ifndef OPT_THREAD
aoqi@1 433 __ get_thread(thread);
aoqi@1 434 #endif
aoqi@1 435 __ ld(V0, thread, in_bytes(Thread::pending_exception_offset()));
aoqi@1 436 __ sd(R0, thread, in_bytes(Thread::pending_exception_offset())); // clear the pending exception in the thread
aoqi@1 437
aoqi@1 438 #ifdef ASSERT
aoqi@1 439 // make sure exception is set
aoqi@1 440 { Label L;
aoqi@1 441 __ bne(V0, R0, L);
aoqi@1 442 __ delayed()->nop();
aoqi@1 443 __ stop("StubRoutines::forward exception: no pending exception (2)");
aoqi@1 444 __ bind(L);
aoqi@1 445 }
aoqi@1 446 #endif
aoqi@1 447
aoqi@1 448 // continue at exception handler (return address removed)
aoqi@1 449 // V0: exception
aoqi@1 450 // T9: exception handler
aoqi@1 451 // V1: throwing pc
aoqi@1 452 __ verify_oop(V0);
aoqi@1 453 __ jr(T9);
aoqi@1 454 __ delayed()->nop();
aoqi@1 455
aoqi@1 456 return start;
aoqi@1 457 }
aoqi@1 458
aoqi@1 459 // Support for intptr_t get_previous_fp()
aoqi@1 460 //
aoqi@1 461 // This routine is used to find the previous frame pointer for the
aoqi@1 462 // caller (current_frame_guess). This is used as part of debugging
aoqi@1 463 // when ps() is seemingly lost trying to find frames.
aoqi@1 464 // This code assumes that the caller (current_frame_guess) has a frame.
aoqi@1 465 address generate_get_previous_fp() { // Emits the "get_previous_fp" stub, which returns in V0 the frame pointer found by following two saved-fp links from its own frame. Returns the stub start.
aoqi@1 466 StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
aoqi@1 469 address start = __ pc();
aoqi@1 470 __ enter();
aoqi@1 471 __ ld(V0, FP, 0); // callers fp; 64-bit load, a 32-bit lw would truncate the pointer
aoqi@1 472 __ ld(V0, V0, 0); // the frame for ps(); 64-bit load for the same reason
aoqi@1 473 __ leave();
aoqi@1 474 __ jr(RA);
aoqi@1 475 __ delayed()->nop();
aoqi@1 476 return start;
aoqi@1 477 }
aoqi@1 478 // The following routine generates a subroutine to throw an
aoqi@1 479 // asynchronous UnknownError when an unsafe access gets a fault that
aoqi@1 480 // could not be reasonably prevented by the programmer. (Example:
aoqi@1 481 // SIGBUS/OBJERR.)
aoqi@1 482 address generate_handler_for_unsafe_access() { // Emits the "handler_for_unsafe_access" stub: saves all registers, calls handle_unsafe_access() and stores the address it returns into the stack slot just above the saved registers. Returns the stub start.
aoqi@1 483 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
aoqi@1 484 address start = __ pc();
aoqi@1 485 __ pushad(); // push registers
aoqi@1 486 // Address next_pc(esp, RegisterImpl::number_of_registers * BytesPerWord);
aoqi@1 487 __ call(CAST_FROM_FN_PTR(address, handle_unsafe_access), relocInfo::runtime_call_type);
aoqi@1 488 __ delayed()->nop();
aoqi@1 489 __ sd(V0, SP, RegisterImpl::number_of_registers * BytesPerWord); // V0 is a full 64-bit address and the slot is BytesPerWord wide, so store all of it (sw kept only the low 32 bits)
aoqi@1 490 __ popad();
aoqi@1 491 __ jr(RA);
aoqi@1 492 __ delayed()->nop();
aoqi@1 493 return start;
aoqi@1 494 }
aoqi@1 495
aoqi@1 496 // Non-destructive plausibility checks for oops
aoqi@1 497 //
aoqi@1 498 // Arguments:
aoqi@1 499 // all args on stack!
aoqi@1 500 //
aoqi@1 501 // Stack after saving c_rarg3:
aoqi@1 502 // [tos + 0]: saved c_rarg3
aoqi@1 503 // [tos + 1]: saved c_rarg2
aoqi@1 504 // [tos + 2]: saved r12 (several TemplateTable methods use it)
aoqi@1 505 // [tos + 3]: saved flags
aoqi@1 506 // [tos + 4]: return address
aoqi@1 507 // * [tos + 5]: error message (char*)
aoqi@1 508 // * [tos + 6]: object to verify (oop)
aoqi@1 509 // * [tos + 7]: saved rax - saved by caller and bashed
aoqi@1 510 // * = popped on exit
aoqi@1 511 address generate_verify_oop() { // Emits the "verify_oop" stub, whose body is produced by verify_oop_subroutine(). Returns the stub start.
aoqi@1 512 StubCodeMark mark(this, "StubRoutines", "verify_oop");
aoqi@1 513 address start = __ pc();
aoqi@1 514 __ reinit_heapbase();
aoqi@1 515 __ verify_oop_subroutine();
aoqi@1 516 address end = __ pc(); // NOTE(review): 'end' is never read in this function
aoqi@1 517 return start;
aoqi@1 518 }
aoqi@1 519
aoqi@1 520 //
aoqi@1 521 // Generate overlap test for array copy stubs
aoqi@1 522 //
aoqi@1 523 // Input:
aoqi@1 524 // A0 - array1
aoqi@1 525 // A1 - array2
aoqi@1 526 // A2 - element count
aoqi@1 527 //
aoqi@1 528 // Note: this code can only use %eax, %ecx, and %edx
aoqi@1 529 //
aoqi@1 530
aoqi@1 531 // use T9 as temp
aoqi@1 532 void array_overlap_test(address no_overlap_target, int log2_elem_size) { // Emits a test that branches to no_overlap_target when the destination (A1) does not start inside the source range [A0, A0 + count*elem_size); otherwise execution falls through. Clobbers AT and T9.
aoqi@1 533 int elem_size = 1 << log2_elem_size;
aoqi@1 534 Address::ScaleFactor sf = Address::times_1;
aoqi@1 535
aoqi@1 536 switch (log2_elem_size) { // values other than 0..3 keep the times_1 default
aoqi@1 537 case 0: sf = Address::times_1; break;
aoqi@1 538 case 1: sf = Address::times_2; break;
aoqi@1 539 case 2: sf = Address::times_4; break;
aoqi@1 540 case 3: sf = Address::times_8; break;
aoqi@1 541 }
aoqi@1 542
aoqi@1 543 __ dsll(AT, A2, sf); // AT = count scaled to bytes
aoqi@1 544 __ dadd(AT, AT, A0);
aoqi@1 545 __ lea(T9, Address(AT, -elem_size)); // T9 = address of the last source element
aoqi@1 546 __ dsub(AT, A1, A0);
aoqi@1 547 __ blez(AT, no_overlap_target); // destination starts at or below the source start
aoqi@1 548 __ delayed()->nop();
aoqi@1 549 __ dsub(AT, A1, T9);
aoqi@1 550 __ bgtz(AT, no_overlap_target); // destination starts above the last source element
aoqi@1 551 __ delayed()->nop();
aoqi@1 552
aoqi@8 553 // 2016/05/10 aoqi: If A0 = 0xf... and A1 = 0x0..., then go to no_overlap_target
aoqi@8 554 Label L;
aoqi@8 555 __ bgez(A0, L);
aoqi@8 556 __ delayed()->nop();
aoqi@8 557 __ bgtz(A1, no_overlap_target); // reached only when A0 is negative as a signed value
aoqi@8 558 __ delayed()->nop();
aoqi@8 559 __ bind(L);
aoqi@8 560
aoqi@1 561 }
aoqi@1 562
aoqi@1 563 //
aoqi@1 564 // Generate store check for array
aoqi@1 565 //
aoqi@1 566 // Input:
aoqi@1 567 // %edi - starting address
aoqi@1 568 // %ecx - element count
aoqi@1 569 //
aoqi@1 570 // The 2 input registers are overwritten
aoqi@1 571 //
aoqi@1 572
aoqi@1 573 //
aoqi@1 574 // Generate store check for array
aoqi@1 575 //
aoqi@1 576 // Input:
aoqi@1 577 // T0 - starting address(edi)
aoqi@1 578 // T1 - element count (ecx)
aoqi@1 579 //
aoqi@1 580 // The 2 input registers are overwritten
aoqi@1 581 //
aoqi@1 582
aoqi@1 583 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
aoqi@1 584
aoqi@1 585 void array_store_check() { // Emits a loop that writes a zero byte to every card-table entry covering the oop array that starts at T0 and holds T1 elements. Overwrites T0, T1 and AT.
aoqi@1 586 BarrierSet* bs = Universe::heap()->barrier_set();
aoqi@1 587 assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
aoqi@1 588 CardTableModRefBS* ct = (CardTableModRefBS*)bs;
aoqi@1 589 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
aoqi@1 590 Label l_0;
aoqi@1 591
aoqi@1 592 __ dsll(AT, T1, TIMES_OOP);
aoqi@1 593 __ dadd(AT, T0, AT);
aoqi@1 594 __ daddiu(T1, AT, - BytesPerHeapOop); // T1 = address of the last element
aoqi@1 595
aoqi@1 596 __ shr(T0, CardTableModRefBS::card_shift); // T0 = card index of the first element
aoqi@1 597 __ shr(T1, CardTableModRefBS::card_shift); // T1 = card index of the last element
aoqi@1 598
aoqi@1 599 __ dsub(T1, T1, T0); // end --> cards count
aoqi@1 600 __ bind(l_0);
aoqi@1 601
aoqi@1 602 __ li48(AT, (long)ct->byte_map_base);
aoqi@1 603 __ dadd(AT, AT, T0);
aoqi@1 604 __ dadd(AT, AT, T1);
aoqi@1 605 __ sb(R0, AT, 0); // byte_map_base[first card + T1] = 0
aoqi@1 606 //__ daddi(T1, T1, -4);
aoqi@1 607 __ daddi(T1, T1, - 1);
aoqi@1 608 __ bgez(T1, l_0); // walks from the last card down to the first
aoqi@1 609 __ delayed()->nop();
aoqi@1 610 }
aoqi@1 611
aoqi@1 612 // Arguments:
aoqi@1 613 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
aoqi@1 614 // ignored
aoqi@1 615 // name - stub name string
aoqi@1 616 //
aoqi@1 617 // Inputs:
aoqi@1 618 // c_rarg0 - source array address
aoqi@1 619 // c_rarg1 - destination array address
aoqi@1 620 // c_rarg2 - element count, treated as ssize_t, can be zero
aoqi@1 621 //
aoqi@1 622 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
aoqi@1 623 // we let the hardware handle it. The one to eight bytes within words,
aoqi@1 624 // dwords or qwords that span cache line boundaries will still be loaded
aoqi@1 625 // and stored atomically.
aoqi@1 626 //
aoqi@1 627 // Side Effects:
aoqi@1 628 // disjoint_byte_copy_entry is set to the no-overlap entry point
aoqi@1 629 // used by generate_conjoint_byte_copy().
aoqi@1 630 //
aoqi@1 631 address generate_disjoint_byte_copy(bool aligned, const char *name) { // Emits a forward (low-to-high) byte copy stub: source in A0, destination in A1, byte count in A2. Copies 4 bytes at a time where possible and single bytes otherwise. T3, T0, T1 and T8 are saved and restored; AT (and T9 when !aligned) are clobbered. Returns the stub start.
aoqi@1 632 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 633 __ align(CodeEntryAlignment);
aoqi@1 634 address start = __ pc();
aoqi@1 635 Label l_0, l_1, l_2, l_3, l_4, l_5, l_6;
aoqi@1 636
aoqi@1 637 __ push(T3);
aoqi@1 638 __ push(T0);
aoqi@1 639 __ push(T1);
aoqi@1 640 __ push(T8);
aoqi@1 641 __ move(T3, A0); // T3 = running source pointer
aoqi@1 642 __ move(T0, A1); // T0 = running destination pointer
aoqi@1 643 __ move(T1, A2);
aoqi@1 644 __ move(T8, T1); // T8 keeps the byte count; T1 is the working counter
aoqi@1 645 __ daddi(AT, T1, -3);
aoqi@1 646 __ blez(AT, l_4); // 3 bytes or fewer: go straight to the byte-wise tail
aoqi@1 647 __ delayed()->nop();
aoqi@1 648 if (!aligned) {
aoqi@8 649 //TODO: copy 8 bytes at one time
Jin@7 650 // 2016/5/8 Jin: only when src and dest have the same alignment can we do lw/sw
Jin@7 651 __ andi(AT, T3, 3);
Jin@7 652 __ andi(T9, T0, 3);
Jin@7 653 __ bne(AT, T9, l_5); // different alignment mod 4: copy everything byte by byte (T1 still holds the full count)
Jin@7 654 __ delayed()->nop();
Jin@7 655
aoqi@1 656 // align source address at a 4-byte boundary
aoqi@1 657 __ move(T1, 4);
aoqi@1 658 __ sub(T1, T1, T3);
aoqi@1 659 __ andi(T1, T1, 3); // T1 = number of leading bytes before the next 4-byte boundary
aoqi@1 660 __ beq(T1, R0, l_1);
aoqi@1 661 __ delayed()->nop();
aoqi@1 662 __ sub(T8,T8,T1); // T8 = bytes remaining after the leading bytes
aoqi@1 663 __ bind(l_0);
aoqi@1 664 __ lb(AT, T3, 0);
aoqi@1 665 __ sb(AT, T0, 0);
aoqi@1 666 __ addi(T3, T3, 1);
aoqi@1 667 __ addi(T0, T0, 1);
aoqi@1 668 __ addi(T1 ,T1, -1);
aoqi@1 669 __ bne(T1, R0, l_0);
aoqi@1 670 __ delayed()->nop();
aoqi@1 671 __ bind(l_1);
aoqi@1 672 __ move(T1, T8);
aoqi@1 673 }
aoqi@1 674 __ shr(T1, 2); // T1 = number of 4-byte words to copy
aoqi@1 675 __ beq(T1, R0, l_4); // no 4-byte words to move
aoqi@1 676 __ delayed()->nop();
aoqi@1 677 // copy aligned 4-byte words
aoqi@1 678 __ bind(l_2);
aoqi@1 679 __ align(16);
aoqi@1 680 __ bind(l_3);
aoqi@1 681 __ lw(AT, T3, 0);
aoqi@1 682 __ sw(AT, T0, 0 );
aoqi@1 683 __ addi(T3, T3, 4);
aoqi@1 684 __ addi(T0, T0, 4);
aoqi@1 685 __ addi(T1, T1, -1);
aoqi@1 686 __ bne(T1, R0, l_3);
aoqi@1 687 __ delayed()->nop();
aoqi@1 688 __ bind(l_4);
aoqi@1 689 __ move(T1, T8);
aoqi@1 690 __ andi(T1, T1, 3); // T1 = trailing bytes left after the word loop
aoqi@1 691 __ beq(T1, R0, l_6);
aoqi@1 692 __ delayed()->nop();
aoqi@1 693 // copy suffix, one byte per iteration
aoqi@1 694 __ bind(l_5);
aoqi@1 695 __ lb(AT, T3, 0);
aoqi@1 696 __ sb(AT, T0, 0);
aoqi@1 697 __ addi(T3, T3, 1);
aoqi@1 698 __ addi(T0, T0, 1);
aoqi@1 699 __ addi(T1, T1, -1);
aoqi@1 700 __ bne(T1, R0, l_5 );
aoqi@1 701 __ delayed()->nop();
aoqi@1 702 __ bind(l_6);
aoqi@1 703 __ pop(T8);
aoqi@1 704 __ pop(T1);
aoqi@1 705 __ pop(T0);
aoqi@1 706 __ pop(T3);
aoqi@1 707 __ jr(RA);
aoqi@1 708 __ delayed()->nop();
aoqi@1 709 return start;
aoqi@1 710 }
aoqi@1 711
aoqi@1 712 // Arguments:
aoqi@1 713 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
aoqi@1 714 // ignored
aoqi@1 715 // name - stub name string
aoqi@1 716 //
aoqi@1 717 // Inputs:
aoqi@8 718 // A0 - source array address
aoqi@8 719 // A1 - destination array address
aoqi@8 720 // A2 - element count, treated as ssize_t, can be zero
aoqi@1 721 //
aoqi@1 722 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
aoqi@1 723 // we let the hardware handle it. The one to eight bytes within words,
aoqi@1 724 // dwords or qwords that span cache line boundaries will still be loaded
aoqi@1 725 // and stored atomically.
aoqi@1 726 //
aoqi@1 727 address generate_conjoint_byte_copy(bool aligned, const char *name) { // Emits a byte copy stub for possibly overlapping arrays: source in A0, destination in A1, byte count in A2. Non-overlapping calls are redirected to the disjoint stub; otherwise the copy runs from the high end down. T3, T0, T1 and T8 are saved and restored; AT is clobbered. Returns the stub start.
aoqi@8 728 __ align(CodeEntryAlignment);
aoqi@8 729 StubCodeMark mark(this, "StubRoutines", name);
aoqi@8 730 address start = __ pc();
aoqi@1 731
aoqi@8 732 Label l_copy_4_bytes_loop, l_copy_suffix, l_copy_suffix_loop, l_exit;
aoqi@8 733 Label l_copy_byte, l_from_unaligned, l_unaligned, l_4_bytes_aligned;
aoqi@1 734
aoqi@8 735 address nooverlap_target = aligned ?
aoqi@8 736 StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
aoqi@8 737 StubRoutines::jbyte_disjoint_arraycopy();
aoqi@1 738
aoqi@8 739 array_overlap_test(nooverlap_target, 0);
Jin@7 740
aoqi@8 741 const Register from = A0; // source array address
aoqi@8 742 const Register to = A1; // destination array address
aoqi@8 743 const Register count = A2; // elements count
aoqi@8 744 const Register end_from = T3; // source array end address
aoqi@8 745 const Register end_to = T0; // destination array end address
aoqi@8 746 const Register end_count = T1; // number of bytes not yet copied
Jin@7 747
aoqi@8 748 __ push(end_from);
aoqi@8 749 __ push(end_to);
aoqi@8 750 __ push(end_count);
aoqi@8 751 __ push(T8);
Jin@7 752
aoqi@8 753 // copy from high to low
aoqi@8 754 __ move(end_count, count);
aoqi@8 755 __ dadd(end_from, from, end_count); // one past the last source byte
aoqi@8 756 __ dadd(end_to, to, end_count); // one past the last destination byte
Jin@7 757
aoqi@8 758 // 2016/05/08 aoqi: If end_from and end_to have different alignment, a byte-wise copy is performed.
aoqi@8 759 __ andi(AT, end_from, 3);
aoqi@8 760 __ andi(T8, end_to, 3);
aoqi@8 761 __ bne(AT, T8, l_copy_byte);
aoqi@8 762 __ delayed()->nop();
Jin@7 763
aoqi@8 764 // First deal with the unaligned data at the top.
aoqi@8 765 __ bind(l_unaligned);
aoqi@8 766 __ beq(end_count, R0, l_exit);
aoqi@8 767 __ delayed()->nop();
aoqi@8 768
aoqi@8 769 __ andi(AT, end_from, 3);
aoqi@8 770 __ bne(AT, R0, l_from_unaligned);
aoqi@8 771 __ delayed()->nop();
aoqi@8 772
aoqi@8 773 __ andi(AT, end_to, 3);
aoqi@8 774 __ beq(AT, R0, l_4_bytes_aligned);
aoqi@8 775 __ delayed()->nop();
aoqi@8 776
aoqi@8 777 __ bind(l_from_unaligned);
aoqi@8 778 __ lb(AT, end_from, -1);
aoqi@8 779 __ sb(AT, end_to, -1);
aoqi@8 780 __ daddi(end_from, end_from, -1);
aoqi@8 781 __ daddi(end_to, end_to, -1);
aoqi@8 782 __ daddi(end_count, end_count, -1);
aoqi@8 783 __ b(l_unaligned);
aoqi@8 784 __ delayed()->nop();
aoqi@8 785
aoqi@8 786 // now end_to, end_from point to 4-byte aligned high-ends
aoqi@8 787 // end_count contains byte count that is not copied.
aoqi@8 788 // copy 4 bytes at a time
aoqi@8 789 __ bind(l_4_bytes_aligned);
aoqi@8 790
aoqi@8 791 __ move(T8, end_count);
aoqi@8 792 __ daddi(AT, end_count, -3);
aoqi@8 793 __ blez(AT, l_copy_byte); // 1..3 bytes left: the pointers are still unbiased high ends, so use the byte loop; l_copy_suffix expects the -4 bias applied below and would touch bytes above the high end
aoqi@8 794 __ delayed()->nop();
aoqi@8 795
aoqi@8 796 //__ andi(T8, T8, 3);
aoqi@8 797 __ lea(end_from, Address(end_from, -4)); // bias both pointers to the start of the highest word
aoqi@8 798 __ lea(end_to, Address(end_to, -4));
aoqi@8 799
aoqi@8 800 __ dsrl(end_count, end_count, 2); // end_count = number of 4-byte words
aoqi@8 801 __ align(16);
aoqi@8 802 __ bind(l_copy_4_bytes_loop); //l_copy_4_bytes
aoqi@8 803 __ lw(AT, end_from, 0);
aoqi@8 804 __ sw(AT, end_to, 0);
aoqi@8 805 __ addi(end_from, end_from, -4); // NOTE(review): addi is used on pointers here while the byte loops use daddi -- confirm addi is safe for 64-bit addresses in this assembler
aoqi@8 806 __ addi(end_to, end_to, -4);
aoqi@8 807 __ addi(end_count, end_count, -1);
aoqi@8 808 __ bne(end_count, R0, l_copy_4_bytes_loop);
aoqi@8 809 __ delayed()->nop();
aoqi@8 810
aoqi@8 811 __ b(l_copy_suffix);
aoqi@8 812 __ delayed()->nop();
aoqi@8 813 // l_copy_suffix is reached only after the word loop:
aoqi@8 814 // both pointers are 4 below the lowest copied word and end_count is 0
aoqi@8 815 // copy suffix (0-3 bytes)
aoqi@8 816 __ bind(l_copy_suffix);
aoqi@8 817 __ andi(T8, T8, 3);
aoqi@8 818 __ beq(T8, R0, l_exit);
aoqi@8 819 __ delayed()->nop();
aoqi@8 820 __ addi(end_from, end_from, 3); // point at the highest remaining byte
aoqi@8 821 __ addi(end_to, end_to, 3);
aoqi@8 822 __ bind(l_copy_suffix_loop);
aoqi@8 823 __ lb(AT, end_from, 0);
aoqi@8 824 __ sb(AT, end_to, 0);
aoqi@8 825 __ addi(end_from, end_from, -1);
aoqi@8 826 __ addi(end_to, end_to, -1);
aoqi@8 827 __ addi(T8, T8, -1);
aoqi@8 828 __ bne(T8, R0, l_copy_suffix_loop);
aoqi@8 829 __ delayed()->nop();
aoqi@8 830
aoqi@8 831 __ bind(l_copy_byte); // the suffix loop falls through to here with end_count == 0 and exits at once
aoqi@8 832 __ beq(end_count, R0, l_exit);
aoqi@8 833 __ delayed()->nop();
aoqi@8 834 __ lb(AT, end_from, -1);
aoqi@8 835 __ sb(AT, end_to, -1);
aoqi@8 836 __ daddi(end_from, end_from, -1);
aoqi@8 837 __ daddi(end_to, end_to, -1);
aoqi@8 838 __ daddi(end_count, end_count, -1);
aoqi@8 839 __ b(l_copy_byte);
aoqi@8 840 __ delayed()->nop();
aoqi@8 841
aoqi@8 842 __ bind(l_exit);
aoqi@8 843 __ pop(T8);
aoqi@8 844 __ pop(end_count);
aoqi@8 845 __ pop(end_to);
aoqi@8 846 __ pop(end_from);
aoqi@8 847 __ jr(RA);
aoqi@8 848 __ delayed()->nop();
aoqi@8 849 return start;
aoqi@1 850 }
aoqi@1 851
aoqi@13 852 // Generate stub for disjoint short copy. If "aligned" is true, the
aoqi@13 853 // "from" and "to" addresses are assumed to be heapword aligned.
aoqi@1 854 //
aoqi@13 855 // Arguments for generated stub:
aoqi@13 856 // from: A0
aoqi@13 857 // to: A1
aoqi@13 858 // elm.count: A2 treated as signed
aoqi@13 859 // one element: 2 bytes
aoqi@1 860 //
aoqi@13 861 // Strategy for aligned==true:
aoqi@1 862 //
aoqi@13 863 // If length <= 9:
aoqi@13 864 // 1. copy 1 elements at a time (l_5)
aoqi@1 865 //
aoqi@13 866 // If length > 9:
aoqi@13 867 // 1. copy 4 elements at a time until less than 4 elements are left (l_7)
aoqi@13 868 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
aoqi@13 869 // 3. copy last element if one was left in step 2. (l_1)
aoqi@13 870 //
aoqi@13 871 //
aoqi@13 872 // Strategy for aligned==false:
aoqi@13 873 //
aoqi@13 874 // If length <= 9: same as aligned==true case
aoqi@13 875 //
aoqi@13 876 // If length > 9:
aoqi@13 877 // 1. continue with step 7. if the alignment of from and to mod 4
aoqi@13 878 // is different.
aoqi@13 879 // 2. align from and to to 4 bytes by copying 1 element if necessary
aoqi@13 880 // 3. at l_2 from and to are 4 byte aligned; continue with
aoqi@13 881 // 6. if they cannot be aligned to 8 bytes because they have
aoqi@13 882 // got different alignment mod 8.
aoqi@13 883 // 4. at this point we know that both, from and to, have the same
aoqi@13 884 // alignment mod 8, now copy one element if necessary to get
aoqi@13 885 // 8 byte alignment of from and to.
aoqi@13 886 // 5. copy 4 elements at a time until less than 4 elements are
aoqi@13 887 // left; depending on step 3. all load/stores are aligned.
aoqi@13 888 // 6. copy 2 elements at a time until less than 2 elements are
aoqi@13 889 // left. (l_6)
aoqi@13 890 // 7. copy 1 element at a time. (l_5)
aoqi@13 891 // 8. copy last element if one was left in step 6. (l_1)
aoqi@13 892 //
aoqi@13 893 // TODO:
aoqi@13 894 //
aoqi@13 895 // 1. use loongson 128-bit load/store
aoqi@13 896 // 2. use loop unrolling optimization when len is big enough, for example if len > 0x2000:
aoqi@13 897 // __ bind(l_x);
aoqi@13 898 // __ ld(AT, tmp1, 0);
aoqi@13 899 // __ ld(tmp, tmp1, 8);
aoqi@13 900 // __ sd(AT, tmp2, 0);
aoqi@13 901 // __ sd(tmp, tmp2, 8);
aoqi@13 902 // __ ld(AT, tmp1, 16);
aoqi@13 903 // __ ld(tmp, tmp1, 24);
aoqi@13 904 // __ sd(AT, tmp2, 16);
aoqi@13 905 // __ sd(tmp, tmp2, 24);
aoqi@13 906 // __ daddi(tmp1, tmp1, 32);
aoqi@13 907 // __ daddi(tmp2, tmp2, 32);
aoqi@13 908 // __ daddi(tmp3, tmp3, -16);
aoqi@13 909 // __ daddi(AT, tmp3, -16);
aoqi@13 910 // __ bgez(AT, l_x);
aoqi@13 911 // __ delayed()->nop();
aoqi@13 912 //
aoqi@13 913 address generate_disjoint_short_copy(bool aligned, const char * name) {
aoqi@13 914 StubCodeMark mark(this, "StubRoutines", name);
aoqi@13 915 __ align(CodeEntryAlignment);
aoqi@1 916
aoqi@13 917 Register tmp1 = T0;
aoqi@13 918 Register tmp2 = T1;
aoqi@13 919 Register tmp3 = T3;
aoqi@1 920
aoqi@13 921 address start = __ pc();
aoqi@13 922
aoqi@13 923 __ push(tmp1);
aoqi@13 924 __ push(tmp2);
aoqi@13 925 __ push(tmp3);
aoqi@13 926 __ move(tmp1, A0);
aoqi@13 927 __ move(tmp2, A1);
aoqi@13 928 __ move(tmp3, A2);
aoqi@13 929
aoqi@13 930 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
aoqi@13 931 Label l_debug;
aoqi@13 932 // don't try anything fancy if arrays don't have many elements
aoqi@13 933 __ daddi(AT, tmp3, -9);
aoqi@13 934 __ blez(AT, l_1);
aoqi@13 935 __ delayed()->nop();
aoqi@13 936
aoqi@13 937 if (!aligned) {
aoqi@13 938 __ xorr(AT, A0, A1);
aoqi@13 939 __ andi(AT, AT, 1);
aoqi@13 940 __ bne(AT, R0, l_debug); // if arrays don't have the same alignment mod 2, can this happen?
aoqi@13 941 __ delayed()->nop();
aoqi@13 942
aoqi@13 943 __ xorr(AT, A0, A1);
aoqi@13 944 __ andi(AT, AT, 3);
aoqi@13 945 __ bne(AT, R0, l_1); // if arrays don't have the same alignment mod 4, do 1 element copy
aoqi@13 946 __ delayed()->nop();
aoqi@13 947
aoqi@13 948 // At this point it is guaranteed that both, from and to have the same alignment mod 4.
aoqi@13 949
aoqi@13 950 // Copy 1 element if necessary to align to 4 bytes.
aoqi@13 951 __ andi(AT, A0, 3);
aoqi@13 952 __ beq(AT, R0, l_2);
aoqi@13 953 __ delayed()->nop();
aoqi@13 954
aoqi@13 955 __ lhu(AT, tmp1, 0);
aoqi@13 956 __ daddi(tmp1, tmp1, 2);
aoqi@13 957 __ sh(AT, tmp2, 0);
aoqi@13 958 __ daddi(tmp2, tmp2, 2);
aoqi@13 959 __ daddi(tmp3, tmp3, -1);
aoqi@13 960 __ bind(l_2);
aoqi@13 961
aoqi@13 962 // At this point the positions of both, from and to, are at least 4 byte aligned.
aoqi@13 963
aoqi@13 964 // Copy 4 elements at a time.
aoqi@13 965 // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
aoqi@13 966 __ xorr(AT, tmp1, tmp2);
aoqi@13 967 __ andi(AT, AT, 7);
aoqi@13 968 __ bne(AT, R0, l_6); // not same alignment mod 8 -> copy 2, either from or to will be unaligned
aoqi@13 969 __ delayed()->nop();
aoqi@13 970
aoqi@13 971 // Copy a 2-element word if necessary to align to 8 bytes.
aoqi@13 972 __ andi(AT, tmp1, 7);
aoqi@13 973 __ beq(AT, R0, l_7);
aoqi@13 974 __ delayed()->nop();
aoqi@13 975
aoqi@13 976 __ lw(AT, tmp1, 0);
aoqi@13 977 __ daddi(tmp3, tmp3, -2);
aoqi@13 978 __ sw(AT, tmp2, 0);
aoqi@13 979 { // FasterArrayCopy
aoqi@13 980 __ daddi(tmp1, tmp1, 4);
aoqi@13 981 __ daddi(tmp2, tmp2, 4);
aoqi@13 982 }
aoqi@13 983 }
aoqi@13 984
aoqi@13 985 __ bind(l_7);
aoqi@13 986
aoqi@13 987 // Copy 4 elements at a time; either the loads or the stores can
aoqi@13 988 // be unaligned if aligned == false.
aoqi@13 989
aoqi@13 990 { // FasterArrayCopy
aoqi@13 991 __ daddi(AT, tmp3, -15);
aoqi@13 992 __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain
aoqi@13 993 __ delayed()->nop();
aoqi@13 994
aoqi@13 995 __ bind(l_8);
aoqi@13 996 // For Loongson, there is 128-bit memory access. TODO
aoqi@13 997 __ ld(AT, tmp1, 0);
aoqi@13 998 __ sd(AT, tmp2, 0);
aoqi@13 999 __ daddi(tmp1, tmp1, 8);
aoqi@13 1000 __ daddi(tmp2, tmp2, 8);
aoqi@13 1001 __ daddi(tmp3, tmp3, -4);
aoqi@13 1002 __ daddi(AT, tmp3, -4);
aoqi@13 1003 __ bgez(AT, l_8);
aoqi@13 1004 __ delayed()->nop();
aoqi@13 1005 }
aoqi@13 1006 __ bind(l_6);
aoqi@13 1007
aoqi@13 1008 // copy 2 element at a time
aoqi@13 1009 { // FasterArrayCopy
aoqi@13 1010 __ daddi(AT, tmp3, -1);
aoqi@13 1011 __ blez(AT, l_1);
aoqi@13 1012 __ delayed()->nop();
aoqi@13 1013
aoqi@13 1014 __ bind(l_3);
aoqi@13 1015 __ lw(AT, tmp1, 0);
aoqi@13 1016 __ sw(AT, tmp2, 0);
aoqi@13 1017 __ daddi(tmp1, tmp1, 4);
aoqi@13 1018 __ daddi(tmp2, tmp2, 4);
aoqi@13 1019 __ daddi(tmp3, tmp3, -2);
aoqi@13 1020 __ daddi(AT, tmp3, -2);
aoqi@13 1021 __ bgez(AT, l_3);
aoqi@13 1022 __ delayed()->nop();
aoqi@13 1023
aoqi@13 1024 }
aoqi@13 1025
aoqi@13 1026 // do single element copy (8 bit), can this happen?
aoqi@13 1027 __ bind(l_1);
aoqi@13 1028 __ beq(R0, tmp3, l_4);
aoqi@13 1029 __ delayed()->nop();
aoqi@13 1030
aoqi@13 1031 { // FasterArrayCopy
aoqi@13 1032
aoqi@13 1033 __ bind(l_5);
aoqi@13 1034 __ lhu(AT, tmp1, 0);
aoqi@13 1035 __ daddi(tmp3, tmp3, -1);
aoqi@13 1036 __ sh(AT, tmp2, 0);
aoqi@13 1037 __ daddi(tmp1, tmp1, 2);
aoqi@13 1038 __ daddi(tmp2, tmp2, 2);
aoqi@13 1039 __ daddi(AT, tmp3, -1);
aoqi@13 1040 __ bgez(AT, l_5);
aoqi@13 1041 __ delayed()->nop();
aoqi@13 1042 }
aoqi@13 1043 __ bind(l_4);
aoqi@13 1044 __ pop(tmp3);
aoqi@13 1045 __ pop(tmp2);
aoqi@13 1046 __ pop(tmp1);
aoqi@13 1047
aoqi@13 1048 __ jr(RA);
aoqi@13 1049 __ delayed()->nop();
aoqi@13 1050
aoqi@13 1051 __ bind(l_debug);
aoqi@13 1052 __ stop("generate_disjoint_short_copy should not reach here");
aoqi@13 1053 return start;
aoqi@1 1054 }
aoqi@1 1055
aoqi@1 1056 // Arguments:
aoqi@1 1057 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
aoqi@1 1058 // ignored
aoqi@1 1059 // name - stub name string
aoqi@1 1060 //
aoqi@1 1061 // Inputs:
aoqi@1 1062 // c_rarg0 - source array address
aoqi@1 1063 // c_rarg1 - destination array address
aoqi@1 1064 // c_rarg2 - element count, treated as ssize_t, can be zero
aoqi@1 1065 //
aoqi@1 1066 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
aoqi@1 1067 // let the hardware handle it. The two or four words within dwords
aoqi@1 1068 // or qwords that span cache line boundaries will still be loaded
aoqi@1 1069 // and stored atomically.
aoqi@1 1070 //
aoqi@1 1071 address generate_conjoint_short_copy(bool aligned, const char *name) {
aoqi@1 1072 Label l_1, l_2, l_3, l_4, l_5;
aoqi@1 1073 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 1074 __ align(CodeEntryAlignment);
aoqi@1 1075 address start = __ pc();
aoqi@1 1076 address nooverlap_target = aligned ?
aoqi@1 1077 StubRoutines::arrayof_jshort_disjoint_arraycopy() :
aoqi@1 1078 StubRoutines::jshort_disjoint_arraycopy();
aoqi@1 1079
aoqi@1 1080 array_overlap_test(nooverlap_target, 1);
aoqi@1 1081
aoqi@1 1082 __ push(T3);
aoqi@1 1083 __ push(T0);
aoqi@1 1084 __ push(T1);
aoqi@1 1085 __ push(T8);
aoqi@1 1086
aoqi@1 1087 /*
aoqi@1 1088 __ pushl(esi);
aoqi@1 1089 __ movl(ecx, Address(esp, 4+12)); // count
aoqi@1 1090 __ pushl(edi);
aoqi@1 1091 __ movl(esi, Address(esp, 8+ 4)); // from
aoqi@1 1092 __ movl(edi, Address(esp, 8+ 8)); // to
aoqi@1 1093 */
aoqi@1 1094 __ move(T1, A2);
aoqi@1 1095 __ move(T3, A0);
aoqi@1 1096 __ move(T0, A1);
aoqi@1 1097
aoqi@1 1098
aoqi@1 1099 // copy dwords from high to low
aoqi@1 1100 // __ leal(esi, Address(esi, ecx, Address::times_2, -4)); // from + count*2 - 4
aoqi@1 1101 __ sll(AT, T1, Address::times_2);
aoqi@1 1102 __ add(AT, T3, AT);
aoqi@1 1103 __ lea(T3, Address( AT, -4));
aoqi@1 1104 //__ std();
aoqi@1 1105 //__ leal(edi, Address(edi, ecx, Address::times_2, -4)); // to + count*2 - 4
aoqi@1 1106 __ sll(AT,T1 , Address::times_2);
aoqi@1 1107 __ add(AT, T0, AT);
aoqi@1 1108 __ lea(T0, Address( AT, -4));
aoqi@1 1109 // __ movl(eax, ecx);
aoqi@1 1110 __ move(T8, T1);
aoqi@1 1111 __ bind(l_1);
aoqi@1 1112 // __ sarl(ecx, 1); // dword count
aoqi@1 1113 __ sra(T1,T1, 1);
aoqi@1 1114 //__ jcc(Assembler::equal, l_4); // no dwords to move
aoqi@1 1115 __ beq(T1, R0, l_4);
aoqi@1 1116 __ delayed()->nop();
aoqi@1 1117 /* __ cmpl(ecx, 32);
aoqi@1 1118 __ jcc(Assembler::above, l_3); // > 32 dwords
aoqi@1 1119 // copy dwords with loop
aoqi@1 1120 __ subl(edi, esi);
aoqi@1 1121 */ __ align(16);
aoqi@1 1122 __ bind(l_2);
aoqi@1 1123 //__ movl(edx, Address(esi));
aoqi@1 1124 __ lw(AT, T3, 0);
aoqi@1 1125 //__ movl(Address(edi, esi, Address::times_1), edx);
aoqi@1 1126 __ sw(AT, T0, 0);
aoqi@1 1127 //__ subl(esi, 4);
aoqi@1 1128 __ addi(T3, T3, -4);
aoqi@1 1129 __ addi(T0, T0, -4);
aoqi@1 1130 //__ decl(ecx);
aoqi@1 1131 __ addi(T1, T1, -1);
aoqi@1 1132 // __ jcc(Assembler::notEqual, l_2);
aoqi@1 1133 __ bne(T1, R0, l_2);
aoqi@1 1134 __ delayed()->nop();
aoqi@1 1135 // __ addl(edi, esi);
aoqi@1 1136 // __ jmp(l_4);
aoqi@1 1137 __ b(l_4);
aoqi@1 1138 __ delayed()->nop();
aoqi@1 1139 // copy dwords with repeat move
aoqi@1 1140 __ bind(l_3);
aoqi@1 1141 // __ rep_movl();
aoqi@1 1142 __ bind(l_4);
aoqi@1 1143 // __ andl(eax, 1); // suffix count
aoqi@1 1144 __ andi(T8, T8, 1); // suffix count
aoqi@1 1145 //__ jcc(Assembler::equal, l_5); // no suffix
aoqi@1 1146 __ beq(T8, R0, l_5 );
aoqi@1 1147 __ delayed()->nop();
aoqi@1 1148 // copy suffix
aoqi@1 1149 // __ movw(edx, Address(esi, 2));
aoqi@1 1150 __ lh(AT, T3, 2);
aoqi@1 1151 // __ movw(Address(edi, 2), edx);
aoqi@1 1152 __ sh(AT, T0, 2);
aoqi@1 1153 __ bind(l_5);
aoqi@1 1154 // __ cld();
aoqi@1 1155 // __ popl(edi);
aoqi@1 1156 // __ popl(esi);
aoqi@1 1157 // __ ret(0);
aoqi@1 1158 __ pop(T8);
aoqi@1 1159 __ pop(T1);
aoqi@1 1160 __ pop(T0);
aoqi@1 1161 __ pop(T3);
aoqi@1 1162 __ jr(RA);
aoqi@1 1163 __ delayed()->nop();
aoqi@1 1164 return start;
aoqi@1 1165 }
aoqi@1 1166
aoqi@1 1167 // Arguments:
aoqi@1 1168 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
aoqi@1 1169 // ignored
aoqi@1 1170 // is_oop - true => oop array, so generate store check code
aoqi@1 1171 // name - stub name string
aoqi@1 1172 //
aoqi@1 1173 // Inputs:
aoqi@1 1174 // c_rarg0 - source array address
aoqi@1 1175 // c_rarg1 - destination array address
aoqi@1 1176 // c_rarg2 - element count, treated as ssize_t, can be zero
aoqi@1 1177 //
aoqi@1 1178 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
aoqi@1 1179 // the hardware handle it. The two dwords within qwords that span
aoqi@1 1180 // cache line boundaries will still be loaded and stored atomicly.
aoqi@1 1181 //
aoqi@1 1182 // Side Effects:
aoqi@1 1183 // disjoint_int_copy_entry is set to the no-overlap entry point
aoqi@1 1184 // used by generate_conjoint_int_oop_copy().
aoqi@1 1185 //
aoqi@1 1186 address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
aoqi@1 1187 Label l_2, l_3, l_4, l_stchk;
aoqi@1 1188 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 1189 __ align(CodeEntryAlignment);
aoqi@1 1190 address start = __ pc();
aoqi@1 1191 /*
aoqi@1 1192 __ pushl(esi);
aoqi@1 1193 __ movl(ecx, Address(esp, 4+12)); // count
aoqi@1 1194 __ pushl(edi);
aoqi@1 1195 __ movl(esi, Address(esp, 8+ 4)); // from
aoqi@1 1196 __ movl(edi, Address(esp, 8+ 8)); // to
aoqi@1 1197 */
aoqi@1 1198 __ push(T3);
aoqi@1 1199 __ push(T0);
aoqi@1 1200 __ push(T1);
aoqi@1 1201 __ push(T8);
aoqi@1 1202 __ move(T1, A2);
aoqi@1 1203 __ move(T3, A0);
aoqi@1 1204 __ move(T0, A1);
aoqi@1 1205
aoqi@1 1206 // __ cmpl(ecx, 32);
aoqi@1 1207 // __ jcc(Assembler::belowEqual, l_2); // <= 32 dwords
aoqi@1 1208 // __ rep_movl();
aoqi@1 1209 __ b(l_2);
aoqi@1 1210 __ delayed()->nop();
aoqi@1 1211 if (is_oop) {
aoqi@1 1212 // __ jmp(l_stchk);
aoqi@1 1213 __ b(l_stchk);
aoqi@1 1214 __ delayed()->nop();
aoqi@1 1215 }
aoqi@1 1216 // __ popl(edi);
aoqi@1 1217 // __ popl(esi);
aoqi@1 1218 // __ ret(0);
aoqi@1 1219 __ pop(T8);
aoqi@1 1220 __ pop(T1);
aoqi@1 1221 __ pop(T0);
aoqi@1 1222 __ pop(T3);
aoqi@1 1223 __ jr(RA);
aoqi@1 1224 __ delayed()->nop();
aoqi@1 1225
aoqi@1 1226 __ bind(l_2);
aoqi@1 1227 // __ subl(edi, esi);
aoqi@1 1228 // __ testl(ecx, ecx);
aoqi@1 1229 // __ jcc(Assembler::zero, l_4);
aoqi@1 1230 __ beq(T1, R0, l_4);
aoqi@1 1231 __ delayed()->nop();
aoqi@1 1232 __ align(16);
aoqi@1 1233 __ bind(l_3);
aoqi@1 1234 //__ movl(edx, Address(esi));
aoqi@1 1235 __ lw(AT, T3, 0);
aoqi@1 1236 // __ movl(Address(edi, esi, Address::times_1), edx);
aoqi@1 1237 __ sw(AT, T0, 0);
aoqi@1 1238 // __ addl(esi, 4);
aoqi@1 1239 __ addi(T3, T3, 4);
aoqi@1 1240 __ addi(T0, T0, 4);
aoqi@1 1241 // __ decl(ecx);
aoqi@1 1242 __ addi(T1, T1, -1);
aoqi@1 1243 // __ jcc(Assembler::notEqual, l_3);
aoqi@1 1244 __ bne(T1, R0, l_3);
aoqi@1 1245 __ delayed()->nop();
aoqi@1 1246 if (is_oop) {
aoqi@1 1247 __ bind(l_stchk);
aoqi@1 1248 // __ movl(edi, Address(esp, 8+ 8));
aoqi@1 1249 // __ movl(ecx, Address(esp, 8+ 12));
aoqi@1 1250 __ move(T0, A1);
aoqi@1 1251 __ move(T1, A2);
aoqi@1 1252 array_store_check();
aoqi@1 1253 }
aoqi@1 1254 __ bind(l_4);
aoqi@1 1255 // __ popl(edi);
aoqi@1 1256 // __ popl(esi);
aoqi@1 1257 // __ ret(0);
aoqi@1 1258 __ pop(T8);
aoqi@1 1259 __ pop(T1);
aoqi@1 1260 __ pop(T0);
aoqi@1 1261 __ pop(T3);
aoqi@1 1262 __ jr(RA);
aoqi@1 1263 __ delayed()->nop();
aoqi@1 1264 return start;
aoqi@1 1265 }
aoqi@1 1266
aoqi@1 1267 // Arguments:
aoqi@1 1268 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
aoqi@1 1269 // ignored
aoqi@1 1270 // is_oop - true => oop array, so generate store check code
aoqi@1 1271 // name - stub name string
aoqi@1 1272 //
aoqi@1 1273 // Inputs:
aoqi@1 1274 // c_rarg0 - source array address
aoqi@1 1275 // c_rarg1 - destination array address
aoqi@1 1276 // c_rarg2 - element count, treated as ssize_t, can be zero
aoqi@1 1277 //
aoqi@1 1278 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
aoqi@1 1279 // the hardware handle it. The two dwords within qwords that span
aoqi@1 1280 // cache line boundaries will still be loaded and stored atomicly.
aoqi@1 1281 //
aoqi@1 1282 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
aoqi@1 1283 Label l_2, l_3, l_4, l_stchk;
aoqi@1 1284 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 1285 __ align(CodeEntryAlignment);
aoqi@1 1286 address start = __ pc();
aoqi@1 1287 address nooverlap_target;
aoqi@1 1288
aoqi@1 1289 if (is_oop) {
aoqi@1 1290 nooverlap_target = aligned ?
aoqi@1 1291 StubRoutines::arrayof_oop_disjoint_arraycopy() :
aoqi@1 1292 StubRoutines::oop_disjoint_arraycopy();
aoqi@1 1293 }else {
aoqi@1 1294 nooverlap_target = aligned ?
aoqi@1 1295 StubRoutines::arrayof_jint_disjoint_arraycopy() :
aoqi@1 1296 StubRoutines::jint_disjoint_arraycopy();
aoqi@1 1297 }
aoqi@1 1298
aoqi@1 1299 array_overlap_test(nooverlap_target, 2);
aoqi@1 1300
aoqi@1 1301 __ push(T3);
aoqi@1 1302 __ push(T0);
aoqi@1 1303 __ push(T1);
aoqi@1 1304 __ push(T8);
aoqi@1 1305
aoqi@1 1306 /*
aoqi@1 1307 __ pushl(esi);
aoqi@1 1308 __ movl(ecx, Address(esp, 4+12)); // count
aoqi@1 1309 __ pushl(edi);
aoqi@1 1310 __ movl(esi, Address(esp, 8+ 4)); // from
aoqi@1 1311 __ movl(edi, Address(esp, 8+ 8)); // to
aoqi@1 1312 */
aoqi@1 1313 __ move(T1, A2);
aoqi@1 1314 __ move(T3, A0);
aoqi@1 1315 __ move(T0, A1);
aoqi@1 1316
aoqi@1 1317 //__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
aoqi@1 1318 __ sll(AT, T1, Address::times_4);
aoqi@1 1319 __ add(AT, T3, AT);
aoqi@1 1320 __ lea(T3 , Address(AT, -4));
aoqi@1 1321 //__ std();
aoqi@1 1322 //__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
aoqi@1 1323 __ sll(AT, T1, Address::times_4);
aoqi@1 1324 __ add(AT, T0, AT);
aoqi@1 1325 __ lea(T0 , Address(AT, -4));
aoqi@1 1326
aoqi@1 1327 // __ cmpl(ecx, 32);
aoqi@1 1328 // __ jcc(Assembler::above, l_3); // > 32 dwords
aoqi@1 1329 // __ testl(ecx, ecx);
aoqi@1 1330 //__ jcc(Assembler::zero, l_4);
aoqi@1 1331 __ beq(T1, R0, l_4);
aoqi@1 1332 __ delayed()->nop();
aoqi@1 1333 // __ subl(edi, esi);
aoqi@1 1334 __ align(16);
aoqi@1 1335 __ bind(l_2);
aoqi@1 1336 // __ movl(edx, Address(esi));
aoqi@1 1337 __ lw(AT, T3, 0);
aoqi@1 1338 // __ movl(Address(esi, edi, Address::times_1), edx);
aoqi@1 1339 __ sw(AT, T0, 0);
aoqi@1 1340 // __ subl(esi, 4);
aoqi@1 1341 __ addi(T3, T3, -4);
aoqi@1 1342 __ addi(T0, T0, -4);
aoqi@1 1343 // __ decl(ecx);
aoqi@1 1344 __ addi(T1, T1, -1);
aoqi@1 1345 //__ jcc(Assembler::notEqual, l_2);
aoqi@1 1346 __ bne(T1, R0, l_2);
aoqi@1 1347 __ delayed()->nop();
aoqi@1 1348 if (is_oop) {
aoqi@1 1349 // __ jmp(l_stchk);
aoqi@1 1350 __ b( l_stchk);
aoqi@1 1351 __ delayed()->nop();
aoqi@1 1352 }
aoqi@1 1353 __ bind(l_4);
aoqi@1 1354 // __ cld();
aoqi@1 1355 // __ popl(edi);
aoqi@1 1356 // __ popl(esi);
aoqi@1 1357 // __ ret(0);
aoqi@1 1358 __ pop(T8);
aoqi@1 1359 __ pop(T1);
aoqi@1 1360 __ pop(T0);
aoqi@1 1361 __ pop(T3);
aoqi@1 1362 __ jr(RA);
aoqi@1 1363 __ delayed()->nop();
aoqi@1 1364 __ bind(l_3);
aoqi@1 1365 // __ rep_movl();
aoqi@1 1366 if (is_oop) {
aoqi@1 1367 __ bind(l_stchk);
aoqi@1 1368 // __ movl(edi, Address(esp, 8+ 8));
aoqi@1 1369 __ move(T0, A1);
aoqi@1 1370 // __ movl(ecx, Address(esp, 8+ 12));
aoqi@1 1371 __ move(T1, A2);
aoqi@1 1372 array_store_check();
aoqi@1 1373 }
aoqi@1 1374 // __ cld();
aoqi@1 1375 // __ popl(edi);
aoqi@1 1376 // __ popl(esi);
aoqi@1 1377 // __ ret(0);
aoqi@1 1378 __ pop(T8);
aoqi@1 1379 __ pop(T1);
aoqi@1 1380 __ pop(T0);
aoqi@1 1381 __ pop(T3);
aoqi@1 1382 __ jr(RA);
aoqi@1 1383 __ delayed()->nop();
aoqi@1 1384 return start;
aoqi@1 1385 }
aoqi@1 1386
aoqi@1 1387 // Arguments:
aoqi@1 1388 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
aoqi@1 1389 // ignored
aoqi@1 1390 // is_oop - true => oop array, so generate store check code
aoqi@1 1391 // name - stub name string
aoqi@1 1392 //
aoqi@1 1393 // Inputs:
aoqi@1 1394 // c_rarg0 - source array address
aoqi@1 1395 // c_rarg1 - destination array address
aoqi@1 1396 // c_rarg2 - element count, treated as ssize_t, can be zero
aoqi@1 1397 //
aoqi@1 1398 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
aoqi@1 1399 // the hardware handle it. The two dwords within qwords that span
aoqi@1 1400 // cache line boundaries will still be loaded and stored atomicly.
aoqi@1 1401 //
aoqi@1 1402 // Side Effects:
aoqi@1 1403 // disjoint_int_copy_entry is set to the no-overlap entry point
aoqi@1 1404 // used by generate_conjoint_int_oop_copy().
aoqi@1 1405 //
aoqi@1 1406 address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
aoqi@1 1407 Label l_2, l_3, l_4, l_stchk;
aoqi@1 1408 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 1409 __ align(CodeEntryAlignment);
aoqi@1 1410 address start = __ pc();
aoqi@1 1411 __ push(T3);
aoqi@1 1412 __ push(T0);
aoqi@1 1413 __ push(T1);
aoqi@1 1414 __ push(T8);
aoqi@1 1415 __ move(T1, A2);
aoqi@1 1416 __ move(T3, A0);
aoqi@1 1417 __ move(T0, A1);
aoqi@1 1418
aoqi@1 1419 // __ cmpl(ecx, 32);
aoqi@1 1420 // __ jcc(Assembler::belowEqual, l_2); // <= 32 dwords
aoqi@1 1421 // __ rep_movl();
aoqi@1 1422 __ b(l_2);
aoqi@1 1423 __ delayed()->nop();
aoqi@1 1424 if (is_oop) {
aoqi@1 1425 // __ jmp(l_stchk);
aoqi@1 1426 __ b(l_stchk);
aoqi@1 1427 __ delayed()->nop();
aoqi@1 1428 }
aoqi@1 1429 // __ popl(edi);
aoqi@1 1430 // __ popl(esi);
aoqi@1 1431 // __ ret(0);
aoqi@1 1432 __ pop(T8);
aoqi@1 1433 __ pop(T1);
aoqi@1 1434 __ pop(T0);
aoqi@1 1435 __ pop(T3);
aoqi@1 1436 __ jr(RA);
aoqi@1 1437 __ delayed()->nop();
aoqi@1 1438
aoqi@1 1439 __ bind(l_2);
aoqi@1 1440 // __ subl(edi, esi);
aoqi@1 1441 // __ testl(ecx, ecx);
aoqi@1 1442 // __ jcc(Assembler::zero, l_4);
aoqi@1 1443 __ beq(T1, R0, l_4);
aoqi@1 1444 __ delayed()->nop();
aoqi@1 1445 __ align(16);
aoqi@1 1446 __ bind(l_3);
aoqi@1 1447 //__ movl(edx, Address(esi));
aoqi@1 1448 __ ld(AT, T3, 0);
aoqi@1 1449 // __ movl(Address(edi, esi, Address::times_1), edx);
aoqi@1 1450 __ sd(AT, T0, 0);
aoqi@1 1451 // __ addl(esi, 4);
aoqi@1 1452 __ addi(T3, T3, 8);
aoqi@1 1453 __ addi(T0, T0, 8);
aoqi@1 1454 // __ decl(ecx);
aoqi@1 1455 __ addi(T1, T1, -1);
aoqi@1 1456 // __ jcc(Assembler::notEqual, l_3);
aoqi@1 1457 __ bne(T1, R0, l_3);
aoqi@1 1458 __ delayed()->nop();
aoqi@1 1459 if (is_oop) {
aoqi@1 1460 __ bind(l_stchk);
aoqi@1 1461 // __ movl(edi, Address(esp, 8+ 8));
aoqi@1 1462 // __ movl(ecx, Address(esp, 8+ 12));
aoqi@1 1463 __ move(T0, A1);
aoqi@1 1464 __ move(T1, A2);
aoqi@1 1465 array_store_check();
aoqi@1 1466 }
aoqi@1 1467 __ bind(l_4);
aoqi@1 1468 // __ popl(edi);
aoqi@1 1469 // __ popl(esi);
aoqi@1 1470 // __ ret(0);
aoqi@1 1471 __ pop(T8);
aoqi@1 1472 __ pop(T1);
aoqi@1 1473 __ pop(T0);
aoqi@1 1474 __ pop(T3);
aoqi@1 1475 __ jr(RA);
aoqi@1 1476 __ delayed()->nop();
aoqi@1 1477 return start;
aoqi@1 1478 }
aoqi@1 1479
aoqi@1 1480 // Arguments:
aoqi@1 1481 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
aoqi@1 1482 // ignored
aoqi@1 1483 // is_oop - true => oop array, so generate store check code
aoqi@1 1484 // name - stub name string
aoqi@1 1485 //
aoqi@1 1486 // Inputs:
aoqi@1 1487 // c_rarg0 - source array address
aoqi@1 1488 // c_rarg1 - destination array address
aoqi@1 1489 // c_rarg2 - element count, treated as ssize_t, can be zero
aoqi@1 1490 //
aoqi@1 1491 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
aoqi@1 1492 // the hardware handle it. The two dwords within qwords that span
aoqi@1 1493 // cache line boundaries will still be loaded and stored atomicly.
aoqi@1 1494 //
aoqi@1 1495 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
aoqi@1 1496 Label l_2, l_3, l_4, l_stchk;
aoqi@1 1497 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 1498 __ align(CodeEntryAlignment);
aoqi@1 1499 address start = __ pc();
aoqi@1 1500 address nooverlap_target;
aoqi@1 1501
aoqi@1 1502 if (is_oop) {
aoqi@1 1503 nooverlap_target = aligned ?
aoqi@1 1504 StubRoutines::arrayof_oop_disjoint_arraycopy() :
aoqi@1 1505 StubRoutines::oop_disjoint_arraycopy();
aoqi@1 1506 }else {
aoqi@1 1507 nooverlap_target = aligned ?
aoqi@1 1508 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
aoqi@1 1509 StubRoutines::jlong_disjoint_arraycopy();
aoqi@1 1510 }
aoqi@1 1511
aoqi@1 1512 array_overlap_test(nooverlap_target, 3);
aoqi@1 1513
aoqi@1 1514 __ push(T3);
aoqi@1 1515 __ push(T0);
aoqi@1 1516 __ push(T1);
aoqi@1 1517 __ push(T8);
aoqi@1 1518
aoqi@1 1519 __ move(T1, A2);
aoqi@1 1520 __ move(T3, A0);
aoqi@1 1521 __ move(T0, A1);
aoqi@1 1522
aoqi@1 1523 //__ leal(esi, Address(esi, ecx, Address::times_4, -4)); // from + count*4 - 4
aoqi@1 1524 __ sll(AT, T1, Address::times_8);
aoqi@1 1525 __ add(AT, T3, AT);
aoqi@1 1526 __ lea(T3 , Address(AT, -8));
aoqi@1 1527 //__ std();
aoqi@1 1528 //__ leal(edi, Address(edi, ecx, Address::times_4, -4)); // to + count*4 - 4
aoqi@1 1529 __ sll(AT, T1, Address::times_8);
aoqi@1 1530 __ add(AT, T0, AT);
aoqi@1 1531 __ lea(T0 , Address(AT, -8));
aoqi@1 1532
aoqi@1 1533 // __ cmpl(ecx, 32);
aoqi@1 1534 // __ jcc(Assembler::above, l_3); // > 32 dwords
aoqi@1 1535 // __ testl(ecx, ecx);
aoqi@1 1536 //__ jcc(Assembler::zero, l_4);
aoqi@1 1537 __ beq(T1, R0, l_4);
aoqi@1 1538 __ delayed()->nop();
aoqi@1 1539 // __ subl(edi, esi);
aoqi@1 1540 __ align(16);
aoqi@1 1541 __ bind(l_2);
aoqi@1 1542 // __ movl(edx, Address(esi));
aoqi@1 1543 __ ld(AT, T3, 0);
aoqi@1 1544 // __ movl(Address(esi, edi, Address::times_1), edx);
aoqi@1 1545 __ sd(AT, T0, 0);
aoqi@1 1546 // __ subl(esi, 4);
aoqi@1 1547 __ addi(T3, T3, -8);
aoqi@1 1548 __ addi(T0, T0, -8);
aoqi@1 1549 // __ decl(ecx);
aoqi@1 1550 __ addi(T1, T1, -1);
aoqi@1 1551 //__ jcc(Assembler::notEqual, l_2);
aoqi@1 1552 __ bne(T1, R0, l_2);
aoqi@1 1553 __ delayed()->nop();
aoqi@1 1554 if (is_oop) {
aoqi@1 1555 // __ jmp(l_stchk);
aoqi@1 1556 __ b( l_stchk);
aoqi@1 1557 __ delayed()->nop();
aoqi@1 1558 }
aoqi@1 1559 __ bind(l_4);
aoqi@1 1560 // __ cld();
aoqi@1 1561 // __ popl(edi);
aoqi@1 1562 // __ popl(esi);
aoqi@1 1563 // __ ret(0);
aoqi@1 1564 __ pop(T8);
aoqi@1 1565 __ pop(T1);
aoqi@1 1566 __ pop(T0);
aoqi@1 1567 __ pop(T3);
aoqi@1 1568 __ jr(RA);
aoqi@1 1569 __ delayed()->nop();
aoqi@1 1570 __ bind(l_3);
aoqi@1 1571 // __ rep_movl();
aoqi@1 1572 if (is_oop) {
aoqi@1 1573 __ bind(l_stchk);
aoqi@1 1574 // __ movl(edi, Address(esp, 8+ 8));
aoqi@1 1575 __ move(T0, A1);
aoqi@1 1576 // __ movl(ecx, Address(esp, 8+ 12));
aoqi@1 1577 __ move(T1, A2);
aoqi@1 1578 array_store_check();
aoqi@1 1579 }
aoqi@1 1580 // __ cld();
aoqi@1 1581 // __ popl(edi);
aoqi@1 1582 // __ popl(esi);
aoqi@1 1583 // __ ret(0);
aoqi@1 1584 __ pop(T8);
aoqi@1 1585 __ pop(T1);
aoqi@1 1586 __ pop(T0);
aoqi@1 1587 __ pop(T3);
aoqi@1 1588 __ jr(RA);
aoqi@1 1589 __ delayed()->nop();
aoqi@1 1590 return start;
aoqi@1 1591 }
aoqi@1 1592 #if 0
aoqi@1 1593 // Arguments:
aoqi@1 1594 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
aoqi@1 1595 // ignored
aoqi@1 1596 // is_oop - true => oop array, so generate store check code
aoqi@1 1597 // name - stub name string
aoqi@1 1598 //
aoqi@1 1599 // Inputs:
aoqi@1 1600 // c_rarg0 - source array address
aoqi@1 1601 // c_rarg1 - destination array address
aoqi@1 1602 // c_rarg2 - element count, treated as ssize_t, can be zero
aoqi@1 1603 //
// NOTE(review): this definition follows an "#if 0" and is therefore not
// compiled. It is written against rdi/rsi/rdx/rcx/rax and helpers such as
// setup_arg_regs()/restore_arg_regs(), so it looks like x86_64 code kept as
// a porting reference - confirm before reviving or deleting it.
//
// Shape of the code as written: overlap test, optional pre-barrier for oop
// arrays, jump to the 32-byte backward copy, a one-qword-at-a-time loop for
// the remainder, then (for oops) the post-barrier; the stub returns 0 in rax.
aoqi@1 1604 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, const char *name) {
aoqi@1 1605 __ align(CodeEntryAlignment);
aoqi@1 1606 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 1607 address start = __ pc();
aoqi@1 1608
aoqi@1 1609 Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
aoqi@1 1610 const Register from = rdi; // source array address
aoqi@1 1611 const Register to = rsi; // destination array address
aoqi@1 1612 const Register qword_count = rdx; // elements count
aoqi@1 1613 const Register saved_count = rcx;
aoqi@1 1614
aoqi@1 1615 __ enter(); // required for proper stackwalking of RuntimeStub frame
aoqi@1 1616 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
aoqi@1 1617
// Pick the matching disjoint entry and record this stub's own entry point
// (oop_copy_entry / long_copy_entry) before emitting the overlap test.
aoqi@1 1618 address disjoint_copy_entry = NULL;
aoqi@1 1619 if (is_oop) {
aoqi@1 1620 assert(!UseCompressedOops, "shouldn't be called for compressed oops");
aoqi@1 1621 disjoint_copy_entry = disjoint_oop_copy_entry;
aoqi@1 1622 oop_copy_entry = __ pc();
aoqi@1 1623 array_overlap_test(disjoint_oop_copy_entry, Address::times_8);
aoqi@1 1624 } else {
aoqi@1 1625 disjoint_copy_entry = disjoint_long_copy_entry;
aoqi@1 1626 long_copy_entry = __ pc();
aoqi@1 1627 array_overlap_test(disjoint_long_copy_entry, Address::times_8);
aoqi@1 1628 }
aoqi@1 1629 BLOCK_COMMENT("Entry:");
aoqi@1 1630 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
aoqi@1 1631
// NOTE(review): array_overlap_test() was already called in both branches
// above, so this is a second call with the same target - looks redundant;
// confirm which of the two is intended.
aoqi@1 1632 array_overlap_test(disjoint_copy_entry, Address::times_8);
aoqi@1 1633 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
aoqi@1 1634 // r9 and r10 may be used to save non-volatile registers
aoqi@1 1635
aoqi@1 1636 // 'from', 'to' and 'qword_count' are now valid
aoqi@1 1637
aoqi@1 1638 if (is_oop) {
aoqi@1 1639 // Save to and count for store barrier
aoqi@1 1640 __ movptr(saved_count, qword_count);
aoqi@1 1641 // No registers are destroyed by this call
aoqi@1 1642 gen_write_ref_array_pre_barrier(to, saved_count);
aoqi@1 1643 }
aoqi@1 1644
aoqi@1 1645 __ jmp(L_copy_32_bytes);
aoqi@1 1646
aoqi@1 1647 // Copy trailing qwords
// Each pass moves the qword at index qword_count-1 and decrements the
// count, i.e. it walks from the high end towards the low end.
aoqi@1 1648 __ BIND(L_copy_8_bytes);
aoqi@1 1649 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
aoqi@1 1650 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
aoqi@1 1651 __ decrement(qword_count);
aoqi@1 1652 __ jcc(Assembler::notZero, L_copy_8_bytes);
aoqi@1 1653
// Oop copies continue at L_exit for the post-barrier; plain jlong copies
// return right here.
aoqi@1 1654 if (is_oop) {
aoqi@1 1655 __ jmp(L_exit);
aoqi@1 1656 } else {
aoqi@1 1657 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
aoqi@1 1658 restore_arg_regs();
aoqi@1 1659 __ xorptr(rax, rax); // return 0
aoqi@1 1660 __ leave(); // required for proper stackwalking of RuntimeStub frame
aoqi@1 1661 __ ret(0);
aoqi@1 1662 }
aoqi@1 1663
aoqi@1 1664 // Copy in 32-bytes chunks
aoqi@1 1665 copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
aoqi@1 1666
aoqi@1 1667 if (is_oop) {
aoqi@1 1668 __ BIND(L_exit);
// rcx = address of the last copied element (to + saved_count*8 - 8),
// passed to the post-barrier together with 'to'.
aoqi@1 1669 __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
aoqi@1 1670 gen_write_ref_array_post_barrier(to, rcx, rax);
aoqi@1 1671 inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
aoqi@1 1672 } else {
aoqi@1 1673 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
aoqi@1 1674 }
aoqi@1 1675 restore_arg_regs();
aoqi@1 1676 __ xorptr(rax, rax); // return 0
aoqi@1 1677 __ leave(); // required for proper stackwalking of RuntimeStub frame
aoqi@1 1678 __ ret(0);
aoqi@1 1679
aoqi@1 1680 return start;
aoqi@1 1681 }
aoqi@1 1682
aoqi@1 1683
aoqi@1 1684 // Helper for generating a dynamic type check.
aoqi@1 1685 // Smashes no registers.
//
// Emits a subtype test of sub_klass against super_klass. Control reaches
// L_success when the test passes; on failure the emitted code falls through
// past the end of this sequence (L_miss is bound last).
//
// The checks are emitted in this order:
//   1. super_klass == sub_klass
//   2. the pointer stored at sub_klass + super_check_offset == super_klass
//   3. if super_check_offset != sc_offset (the secondary super cache slot),
//      the test fails without scanning
//   4. a linear scan of sub_klass's secondary supers array; on a hit,
//      super_klass is written to sub_klass's secondary super cache
//
// rax, rcx and rdi are used by the scan; they are pushed before it and
// popped after it.
aoqi@1 1686 void generate_type_check(Register sub_klass,
aoqi@1 1687 Register super_check_offset,
aoqi@1 1688 Register super_klass,
aoqi@1 1689 Label& L_success) {
aoqi@1 1690 assert_different_registers(sub_klass, super_check_offset, super_klass);
aoqi@1 1691
aoqi@1 1692 BLOCK_COMMENT("type_check:");
aoqi@1 1693
aoqi@1 1694 Label L_miss;
aoqi@1 1695
aoqi@1 1696 // a couple of useful fields in sub_klass:
aoqi@1 1697 int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
aoqi@1 1698 Klass::secondary_supers_offset_in_bytes());
aoqi@1 1699 int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
aoqi@1 1700 Klass::secondary_super_cache_offset_in_bytes());
aoqi@1 1701 Address secondary_supers_addr(sub_klass, ss_offset);
aoqi@1 1702 Address super_cache_addr( sub_klass, sc_offset);
aoqi@1 1703
aoqi@1 1704 // if the pointers are equal, we are done (e.g., String[] elements)
aoqi@1 1705 __ cmpptr(super_klass, sub_klass);
aoqi@1 1706 __ jcc(Assembler::equal, L_success);
aoqi@1 1707
aoqi@1 1708 // check the supertype display:
aoqi@1 1709 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
aoqi@1 1710 __ cmpptr(super_klass, super_check_addr); // test the super type
aoqi@1 1711 __ jcc(Assembler::equal, L_success);
aoqi@1 1712
aoqi@1 1713 // if it was a primary super, we can just fail immediately
// (i.e. the slot just compared was not the secondary super cache slot)
aoqi@1 1714 __ cmpl(super_check_offset, sc_offset);
aoqi@1 1715 __ jcc(Assembler::notEqual, L_miss);
aoqi@1 1716
aoqi@1 1717 // Now do a linear scan of the secondary super-klass chain.
aoqi@1 1718 // The repne_scan instruction uses fixed registers, which we must spill.
aoqi@1 1719 // (We need a couple more temps in any case.)
aoqi@1 1720 // This code is rarely used, so simplicity is a virtue here.
aoqi@1 1721 inc_counter_np(SharedRuntime::_partial_subtype_ctr);
aoqi@1 1722 {
aoqi@1 1723 __ push(rax);
aoqi@1 1724 __ push(rcx);
aoqi@1 1725 __ push(rdi);
aoqi@1 1726 assert_different_registers(sub_klass, super_klass, rax, rcx, rdi);
aoqi@1 1727
aoqi@1 1728 __ movptr(rdi, secondary_supers_addr);
aoqi@1 1729 // Load the array length.
aoqi@1 1730 __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
aoqi@1 1731 // Skip to start of data.
aoqi@1 1732 __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
aoqi@1 1733 // Scan rcx words at [rdi] for occurrence of rax
aoqi@1 1734 // Set NZ/Z based on last compare
aoqi@1 1735 __ movptr(rax, super_klass);
aoqi@1 1736 if (UseCompressedOops) {
aoqi@1 1737 // Compare against compressed form. Don't need to uncompress because
aoqi@1 1738 // looks like orig rax is restored in popq below.
aoqi@1 1739 __ encode_heap_oop(rax);
aoqi@1 1740 __ repne_scanl();
aoqi@1 1741 } else {
aoqi@1 1742 __ repne_scan();
aoqi@1 1743 }
aoqi@1 1744
aoqi@1 1745 // Unspill the temp. registers:
aoqi@1 1746 __ pop(rdi);
aoqi@1 1747 __ pop(rcx);
aoqi@1 1748 __ pop(rax);
aoqi@1 1749
// The branch below tests the condition left by the scan, so it depends on
// the three pops above leaving the flags untouched.
// NOTE(review): true for x86 pop as far as I know - confirm.
aoqi@1 1750 __ jcc(Assembler::notEqual, L_miss);
aoqi@1 1751 }
aoqi@1 1752
aoqi@1 1753 // Success. Cache the super we found and proceed in triumph.
aoqi@1 1754 __ movptr(super_cache_addr, super_klass); // note: rax is dead
aoqi@1 1755 __ jmp(L_success);
aoqi@1 1756
aoqi@1 1757 // Fall through on failure!
aoqi@1 1758 __ BIND(L_miss);
aoqi@1 1759 }
aoqi@1 1760
aoqi@1 1761 //
aoqi@1 1762 // Generate checkcasting array copy stub
aoqi@1 1763 //
aoqi@1 1764 // Input:
aoqi@1 1765 // c_rarg0 - source array address
aoqi@1 1766 // c_rarg1 - destination array address
aoqi@1 1767 // c_rarg2 - element count, treated as ssize_t, can be zero
aoqi@1 1768 // c_rarg3 - size_t ckoff (super_check_offset)
aoqi@1 1769 // not Win64
aoqi@1 1770 // c_rarg4 - oop ckval (super_klass)
aoqi@1 1771 // Win64
aoqi@1 1772 // rsp+40 - oop ckval (super_klass)
aoqi@1 1773 //
aoqi@1 1774 // Output:
aoqi@1 1775 // rax == 0 - success
aoqi@1 1776 // rax == -1^K - failure, where K is partial transfer count
aoqi@1 1777 //
// The generated loop copies one oop at a time. Null elements are stored
// without a check; every other element has its klass tested with
// generate_type_check() against ckoff/ckval before it is stored. The first
// element that fails the test ends the copy. Also records the post-enter()
// pc in checkcast_copy_entry and returns the stub start address.
aoqi@1 1778 address generate_checkcast_copy(const char *name) {
aoqi@1 1779
aoqi@1 1780 Label L_load_element, L_store_element, L_do_card_marks, L_done;
aoqi@1 1781
aoqi@1 1782 // Input registers (after setup_arg_regs)
aoqi@1 1783 const Register from = rdi; // source array address
aoqi@1 1784 const Register to = rsi; // destination array address
aoqi@1 1785 const Register length = rdx; // elements count
aoqi@1 1786 const Register ckoff = rcx; // super_check_offset
aoqi@1 1787 const Register ckval = r8; // super_klass
aoqi@1 1788
aoqi@1 1789 // Registers used as temps (r13, r14 are save-on-entry)
// Note that end_from aliases from (rdi) and count aliases length (rdx).
aoqi@1 1790 const Register end_from = from; // source array end address
aoqi@1 1791 const Register end_to = r13; // destination array end address
aoqi@1 1792 const Register count = rdx; // -(count_remaining)
aoqi@1 1793 const Register r14_length = r14; // saved copy of length
aoqi@1 1794 // End pointers are inclusive, and if length is not zero they point
aoqi@1 1795 // to the last unit copied: end_to[0] := end_from[0]
aoqi@1 1796
aoqi@1 1797 const Register rax_oop = rax; // actual oop copied
aoqi@1 1798 const Register r11_klass = r11; // oop._klass
aoqi@1 1799
aoqi@1 1800 //---------------------------------------------------------------
aoqi@1 1801 // Assembler stub will be used for this call to arraycopy
aoqi@1 1802 // if the two arrays are subtypes of Object[] but the
aoqi@1 1803 // destination array type is not equal to or a supertype
aoqi@1 1804 // of the source type. Each element must be separately
aoqi@1 1805 // checked.
aoqi@1 1806
aoqi@1 1807 __ align(CodeEntryAlignment);
aoqi@1 1808 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 1809 address start = __ pc();
aoqi@1 1810
aoqi@1 1811 __ enter(); // required for proper stackwalking of RuntimeStub frame
aoqi@1 1812
aoqi@1 1813 checkcast_copy_entry = __ pc();
aoqi@1 1814 BLOCK_COMMENT("Entry:");
aoqi@1 1815
aoqi@1 1816 #ifdef ASSERT
aoqi@1 1817 // caller guarantees that the arrays really are different
aoqi@1 1818 // otherwise, we would have to make conjoint checks
aoqi@1 1819 { Label L;
aoqi@1 1820 array_overlap_test(L, TIMES_OOP);
aoqi@1 1821 __ stop("checkcast_copy within a single array");
aoqi@1 1822 __ bind(L);
aoqi@1 1823 }
aoqi@1 1824 #endif //ASSERT
aoqi@1 1825
aoqi@1 1826 // allocate spill slots for r13, r14
// The enumerators are word indices relative to rsp after the subptr below.
// Only saved_rbp_offset (== 2) words are allocated, i.e. the r13 and r14
// slots. The remaining names label what sits above them.
// NOTE(review): presumably the rbp/return address left by enter() and the
// caller's argument area - confirm.
aoqi@1 1827 enum {
aoqi@1 1828 saved_r13_offset,
aoqi@1 1829 saved_r14_offset,
aoqi@1 1830 saved_rbp_offset,
aoqi@1 1831 saved_rip_offset,
aoqi@1 1832 saved_rarg0_offset
aoqi@1 1833 };
aoqi@1 1834 __ subptr(rsp, saved_rbp_offset * wordSize);
aoqi@1 1835 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
aoqi@1 1836 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
aoqi@1 1837 setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
aoqi@1 1838 // ckoff => rcx, ckval => r8
aoqi@1 1839 // r9 and r10 may be used to save non-volatile registers
aoqi@1 1840 #ifdef _WIN64
aoqi@1 1841 // last argument (#4) is on stack on Win64
aoqi@1 1842 const int ckval_offset = saved_rarg0_offset + 4;
aoqi@1 1843 __ movptr(ckval, Address(rsp, ckval_offset * wordSize));
aoqi@1 1844 #endif
aoqi@1 1845
aoqi@1 1846 // check that int operands are properly extended to size_t
aoqi@1 1847 assert_clean_int(length, rax);
aoqi@1 1848 assert_clean_int(ckoff, rax);
aoqi@1 1849
aoqi@1 1850 #ifdef ASSERT
aoqi@1 1851 BLOCK_COMMENT("assert consistent ckoff/ckval");
aoqi@1 1852 // The ckoff and ckval must be mutually consistent,
aoqi@1 1853 // even though caller generates both.
aoqi@1 1854 { Label L;
aoqi@1 1855 int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
aoqi@1 1856 Klass::super_check_offset_offset_in_bytes());
aoqi@1 1857 __ cmpl(ckoff, Address(ckval, sco_offset));
aoqi@1 1858 __ jcc(Assembler::equal, L);
aoqi@1 1859 __ stop("super_check_offset inconsistent");
aoqi@1 1860 __ bind(L);
aoqi@1 1861 }
aoqi@1 1862 #endif //ASSERT
aoqi@1 1863
aoqi@1 1864 // Loop-invariant addresses. They are exclusive end pointers.
aoqi@1 1865 Address end_from_addr(from, length, TIMES_OOP, 0);
aoqi@1 1866 Address end_to_addr(to, length, TIMES_OOP, 0);
aoqi@1 1867 // Loop-variant addresses. They assume post-incremented count < 0.
aoqi@1 1868 Address from_element_addr(end_from, count, TIMES_OOP, 0);
aoqi@1 1869 Address to_element_addr(end_to, count, TIMES_OOP, 0);
aoqi@1 1870
// count is the same register as length and has not been negated yet, so the
// pre-barrier receives the positive element count.
aoqi@1 1871 gen_write_ref_array_pre_barrier(to, count);
aoqi@1 1872
aoqi@1 1873 // Copy from low to high addresses, indexed from the end of each array.
aoqi@1 1874 __ lea(end_from, end_from_addr);
aoqi@1 1875 __ lea(end_to, end_to_addr);
aoqi@1 1876 __ movptr(r14_length, length); // save a copy of the length
aoqi@1 1877 assert(length == count, ""); // else fix next line:
aoqi@1 1878 __ negptr(count); // negate and test the length
aoqi@1 1879 __ jcc(Assembler::notZero, L_load_element);
aoqi@1 1880
aoqi@1 1881 // Empty array: Nothing to do.
aoqi@1 1882 __ xorptr(rax, rax); // return 0 on (trivial) success
aoqi@1 1883 __ jmp(L_done);
aoqi@1 1884
aoqi@1 1885 // ======== begin loop ========
aoqi@1 1886 // (Loop is rotated; its entry is L_load_element.)
aoqi@1 1887 // Loop control:
aoqi@1 1888 // for (count = -count; count != 0; count++)
aoqi@1 1889 // Base pointers src, dst are biased by 8*(count-1),to last element.
aoqi@1 1890 __ align(16);
aoqi@1 1891
aoqi@1 1892 __ BIND(L_store_element);
aoqi@1 1893 __ store_heap_oop(rax_oop, to_element_addr); // store the oop
aoqi@1 1894 __ increment(count); // increment the count toward zero
aoqi@1 1895 __ jcc(Assembler::zero, L_do_card_marks);
aoqi@1 1896
aoqi@1 1897 // ======== loop entry is here ========
aoqi@1 1898 __ BIND(L_load_element);
aoqi@1 1899 __ load_heap_oop(rax_oop, from_element_addr); // load the oop
aoqi@1 1900 __ testptr(rax_oop, rax_oop);
// a null element needs no type check: store it directly
aoqi@1 1901 __ jcc(Assembler::zero, L_store_element);
aoqi@1 1902
aoqi@1 1903 __ load_klass(r11_klass, rax_oop);// query the object klass
aoqi@1 1904 generate_type_check(r11_klass, ckoff, ckval, L_store_element);
aoqi@1 1905 // ======== end loop ========
aoqi@1 1906
aoqi@1 1907 // It was a real error; we must depend on the caller to finish the job.
aoqi@1 1908 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
aoqi@1 1909 // Emit GC store barriers for the oops we have copied (r14 + rdx),
aoqi@1 1910 // and report their number to the caller.
aoqi@1 1911 assert_different_registers(rax, r14_length, count, to, end_to, rcx);
// to_element_addr addresses the element that just failed the check; that
// element was not stored.
// NOTE(review): end_to is handed to the post barrier as-is here, whereas the
// success path below first steps it back to make it inclusive - confirm the
// barrier range is meant to cover the failing slot.
aoqi@1 1912 __ lea(end_to, to_element_addr);
aoqi@1 1913 gen_write_ref_array_post_barrier(to, end_to, rscratch1);
aoqi@1 1914 __ movptr(rax, r14_length); // original oops
aoqi@1 1915 __ addptr(rax, count); // K = (original - remaining) oops
aoqi@1 1916 __ notptr(rax); // report (-1^K) to caller
aoqi@1 1917 __ jmp(L_done);
aoqi@1 1918
aoqi@1 1919 // Come here on success only.
aoqi@1 1920 __ BIND(L_do_card_marks);
// NOTE(review): the step back is wordSize while elements are addressed with
// TIMES_OOP - confirm the two agree when UseCompressedOops is enabled.
aoqi@1 1921 __ addptr(end_to, -wordSize); // make an inclusive end pointer
aoqi@1 1922 gen_write_ref_array_post_barrier(to, end_to, rscratch1);
aoqi@1 1923 __ xorptr(rax, rax); // return 0 on success
aoqi@1 1924
aoqi@1 1925 // Common exit point (success or failure).
// rax already holds the return value; only r13/r14 are reloaded here.
aoqi@1 1926 __ BIND(L_done);
aoqi@1 1927 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
aoqi@1 1928 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
aoqi@1 1929 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
aoqi@1 1930 restore_arg_regs();
aoqi@1 1931 __ leave(); // required for proper stackwalking of RuntimeStub frame
aoqi@1 1932 __ ret(0);
aoqi@1 1933
aoqi@1 1934 return start;
aoqi@1 1935 }
aoqi@1 1936
aoqi@1 1937 //
aoqi@1 1938 // Generate 'unsafe' array copy stub
aoqi@1 1939 // Though just as safe as the other stubs, it takes an unscaled
aoqi@1 1940 // size_t argument instead of an element count.
aoqi@1 1941 //
aoqi@1 1942 // Input:
aoqi@1 1943 // c_rarg0 - source array address
aoqi@1 1944 // c_rarg1 - destination array address
aoqi@1 1945 // c_rarg2 - byte count, treated as ssize_t, can be zero
aoqi@1 1946 //
aoqi@1 1947 // Examines the alignment of the operands and dispatches
aoqi@1 1948 // to a long, int, short, or byte copy loop.
aoqi@1 1949 //
// The stub has no copy loop of its own: it converts the byte count into an
// element count for the widest element size that divides 'from', 'to' and
// 'size', then jumps to the matching *_copy_entry. Returns the stub start.
aoqi@1 1950 address generate_unsafe_copy(const char *name) {
aoqi@1 1951
aoqi@1 1952 Label L_long_aligned, L_int_aligned, L_short_aligned;
aoqi@1 1953
aoqi@1 1954 // Input registers (before setup_arg_regs)
aoqi@1 1955 const Register from = c_rarg0; // source array address
aoqi@1 1956 const Register to = c_rarg1; // destination array address
aoqi@1 1957 const Register size = c_rarg2; // byte count (size_t)
aoqi@1 1958
aoqi@1 1959 // Register used as a temp
aoqi@1 1960 const Register bits = rax; // test copy of low bits
aoqi@1 1961
aoqi@1 1962 __ align(CodeEntryAlignment);
aoqi@1 1963 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 1964 address start = __ pc();
aoqi@1 1965
aoqi@1 1966 __ enter(); // required for proper stackwalking of RuntimeStub frame
aoqi@1 1967
aoqi@1 1968 // bump this on entry, not on exit:
aoqi@1 1969 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
aoqi@1 1970
// bits = from | to | size: a low bit is clear only if it is clear in all
// three operands, so one test covers both addresses and the length.
aoqi@1 1971 __ mov(bits, from);
aoqi@1 1972 __ orptr(bits, to);
aoqi@1 1973 __ orptr(bits, size);
aoqi@1 1974
aoqi@1 1975 __ testb(bits, BytesPerLong-1);
aoqi@1 1976 __ jccb(Assembler::zero, L_long_aligned);
aoqi@1 1977
aoqi@1 1978 __ testb(bits, BytesPerInt-1);
aoqi@1 1979 __ jccb(Assembler::zero, L_int_aligned);
aoqi@1 1980
aoqi@1 1981 __ testb(bits, BytesPerShort-1);
// not even 2-byte aligned: 'size' is already a byte count, go copy bytes
aoqi@1 1982 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
aoqi@1 1983
// L_short_aligned is bound here but is only reached by falling through.
aoqi@1 1984 __ BIND(L_short_aligned);
aoqi@1 1985 __ shrptr(size, LogBytesPerShort); // size => short_count
aoqi@1 1986 __ jump(RuntimeAddress(short_copy_entry));
aoqi@1 1987
aoqi@1 1988 __ BIND(L_int_aligned);
aoqi@1 1989 __ shrptr(size, LogBytesPerInt); // size => int_count
aoqi@1 1990 __ jump(RuntimeAddress(int_copy_entry));
aoqi@1 1991
aoqi@1 1992 __ BIND(L_long_aligned);
aoqi@1 1993 __ shrptr(size, LogBytesPerLong); // size => qword_count
aoqi@1 1994 __ jump(RuntimeAddress(long_copy_entry));
aoqi@1 1995
aoqi@1 1996 return start;
aoqi@1 1997 }
aoqi@1 1998
aoqi@1 1999 // Perform range checks on the proposed arraycopy.
aoqi@1 2000 // Kills temp, but nothing else.
aoqi@1 2001 // Also, clean the sign bits of src_pos and dst_pos.
//
// Branches to L_failed when src_pos + length exceeds the length of src, or
// dst_pos + length exceeds the length of dst. Both sums are formed with
// 32-bit adds and compared with the unsigned condition 'above'. When both
// checks pass, the emitted code falls through with src_pos and dst_pos
// sign-extended from 32 to 64 bits.
aoqi@1 2002 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
aoqi@1 2003 Register src_pos, // source position (c_rarg1)
aoqi@1 2004 Register dst, // destination array oop (c_rarg2)
aoqi@1 2005 Register dst_pos, // destination position (c_rarg3)
aoqi@1 2006 Register length,
aoqi@1 2007 Register temp,
aoqi@1 2008 Label& L_failed) {
aoqi@1 2009 BLOCK_COMMENT("arraycopy_range_checks:");
aoqi@1 2010
aoqi@1 2011 // if (src_pos + length > arrayOop(src)->length()) FAIL;
aoqi@1 2012 __ movl(temp, length);
aoqi@1 2013 __ addl(temp, src_pos); // src_pos + length
aoqi@1 2014 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
aoqi@1 2015 __ jcc(Assembler::above, L_failed);
aoqi@1 2016
aoqi@1 2017 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
aoqi@1 2018 __ movl(temp, length);
aoqi@1 2019 __ addl(temp, dst_pos); // dst_pos + length
aoqi@1 2020 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
aoqi@1 2021 __ jcc(Assembler::above, L_failed);
aoqi@1 2022
aoqi@1 2023 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
aoqi@1 2024 // Move with sign extension can be used since they are positive.
aoqi@1 2025 __ movslq(src_pos, src_pos);
aoqi@1 2026 __ movslq(dst_pos, dst_pos);
aoqi@1 2027
aoqi@1 2028 BLOCK_COMMENT("arraycopy_range_checks done");
aoqi@1 2029 }
aoqi@1 2030
aoqi@1 2031 //
aoqi@1 2032 // Generate generic array copy stubs
aoqi@1 2033 //
aoqi@1 2034 // Input:
aoqi@1 2035 // c_rarg0 - src oop
aoqi@1 2036 // c_rarg1 - src_pos (32-bits)
aoqi@1 2037 // c_rarg2 - dst oop
aoqi@1 2038 // c_rarg3 - dst_pos (32-bits)
aoqi@1 2039 // not Win64
aoqi@1 2040 // c_rarg4 - element count (32-bits)
aoqi@1 2041 // Win64
aoqi@1 2042 // rsp+40 - element count (32-bits)
aoqi@1 2043 //
aoqi@1 2044 // Output:
aoqi@1 2045 // rax == 0 - success
aoqi@1 2046 // rax == -1^K - failure, where K is partial transfer count
aoqi@1 2047 //
// The stub validates its arguments, then tail-jumps to one of the
// specialised copy entries (byte/short/int/long/oop/checkcast). A failed
// validation returns -1 through L_failed. Returns the stub start address,
// which is the aligned pc after the L_failed_0 trampoline.
aoqi@1 2048 address generate_generic_copy(const char *name) {
aoqi@1 2049
aoqi@1 2050 Label L_failed, L_failed_0, L_objArray;
aoqi@1 2051 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
aoqi@1 2052
aoqi@1 2053 // Input registers
aoqi@1 2054 const Register src = c_rarg0; // source array oop
aoqi@1 2055 const Register src_pos = c_rarg1; // source position
aoqi@1 2056 const Register dst = c_rarg2; // destination array oop
aoqi@1 2057 const Register dst_pos = c_rarg3; // destination position
aoqi@1 2058 // elements count is on stack on Win64
// NOTE(review): no matching '#undef C_RARG4' is visible after this function;
// the directive that follows it undefines 'length_arg' instead - confirm.
aoqi@1 2059 #ifdef _WIN64
aoqi@1 2060 #define C_RARG4 Address(rsp, 6 * wordSize)
aoqi@1 2061 #else
aoqi@1 2062 #define C_RARG4 c_rarg4
aoqi@1 2063 #endif
aoqi@1 2064
// Pad with nops so that the 5-byte jmp emitted below ends exactly on a
// CodeEntryAlignment boundary (checked by the assert after it).
aoqi@1 2065 { int modulus = CodeEntryAlignment;
aoqi@1 2066 int target = modulus - 5; // 5 = sizeof jmp(L_failed)
aoqi@1 2067 int advance = target - (__ offset() % modulus);
aoqi@1 2068 if (advance < 0) advance += modulus;
aoqi@1 2069 if (advance > 0) __ nop(advance);
aoqi@1 2070 }
aoqi@1 2071 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 2072
aoqi@1 2073 // Short-hop target to L_failed. Makes for denser prologue code.
aoqi@1 2074 __ BIND(L_failed_0);
aoqi@1 2075 __ jmp(L_failed);
aoqi@1 2076 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
aoqi@1 2077
aoqi@1 2078 __ align(CodeEntryAlignment);
aoqi@1 2079 address start = __ pc();
aoqi@1 2080
aoqi@1 2081 __ enter(); // required for proper stackwalking of RuntimeStub frame
aoqi@1 2082
aoqi@1 2083 // bump this on entry, not on exit:
aoqi@1 2084 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
aoqi@1 2085
aoqi@1 2086 //-----------------------------------------------------------------------
aoqi@1 2087 // Assembler stub will be used for this call to arraycopy
aoqi@1 2088 // if the following conditions are met:
aoqi@1 2089 //
aoqi@1 2090 // (1) src and dst must not be null.
aoqi@1 2091 // (2) src_pos must not be negative.
aoqi@1 2092 // (3) dst_pos must not be negative.
aoqi@1 2093 // (4) length must not be negative.
aoqi@1 2094 // (5) src klass and dst klass should be the same and not NULL.
aoqi@1 2095 // (6) src and dst should be arrays.
aoqi@1 2096 // (7) src_pos + length must not exceed length of src.
aoqi@1 2097 // (8) dst_pos + length must not exceed length of dst.
aoqi@1 2098 //
aoqi@1 2099
aoqi@1 2100 // if (src == NULL) return -1;
aoqi@1 2101 __ testptr(src, src); // src oop
aoqi@1 2102 size_t j1off = __ offset();
aoqi@1 2103 __ jccb(Assembler::zero, L_failed_0);
aoqi@1 2104
aoqi@1 2105 // if (src_pos < 0) return -1;
aoqi@1 2106 __ testl(src_pos, src_pos); // src_pos (32-bits)
aoqi@1 2107 __ jccb(Assembler::negative, L_failed_0);
aoqi@1 2108
aoqi@1 2109 // if (dst == NULL) return -1;
aoqi@1 2110 __ testptr(dst, dst); // dst oop
aoqi@1 2111 __ jccb(Assembler::zero, L_failed_0);
aoqi@1 2112
aoqi@1 2113 // if (dst_pos < 0) return -1;
aoqi@1 2114 __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
aoqi@1 2115 size_t j4off = __ offset();
aoqi@1 2116 __ jccb(Assembler::negative, L_failed_0);
aoqi@1 2117
aoqi@1 2118 // The first four tests are very dense code,
aoqi@1 2119 // but not quite dense enough to put four
aoqi@1 2120 // jumps in a 16-byte instruction fetch buffer.
aoqi@1 2121 // That's good, because some branch predictors
aoqi@1 2122 // do not like jumps so close together.
aoqi@1 2123 // Make sure of this.
aoqi@1 2124 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
aoqi@1 2125
aoqi@1 2126 // registers used as temp
aoqi@1 2127 const Register r11_length = r11; // elements count to copy
aoqi@1 2128 const Register r10_src_klass = r10; // array klass
aoqi@1 2129 const Register r9_dst_klass = r9; // dest array klass
aoqi@1 2130
aoqi@1 2131 // if (length < 0) return -1;
aoqi@1 2132 __ movl(r11_length, C_RARG4); // length (elements count, 32-bits value)
aoqi@1 2133 __ testl(r11_length, r11_length);
aoqi@1 2134 __ jccb(Assembler::negative, L_failed_0);
aoqi@1 2135
aoqi@1 2136 __ load_klass(r10_src_klass, src);
aoqi@1 2137 #ifdef ASSERT
aoqi@1 2138 // assert(src->klass() != NULL);
aoqi@1 2139 BLOCK_COMMENT("assert klasses not null");
aoqi@1 2140 { Label L1, L2;
aoqi@1 2141 __ testptr(r10_src_klass, r10_src_klass);
aoqi@1 2142 __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL
aoqi@1 2143 __ bind(L1);
aoqi@1 2144 __ stop("broken null klass");
aoqi@1 2145 __ bind(L2);
aoqi@1 2146 __ load_klass(r9_dst_klass, dst);
aoqi@1 2147 __ cmpq(r9_dst_klass, 0);
aoqi@1 2148 __ jcc(Assembler::equal, L1); // this would be broken also
aoqi@1 2149 BLOCK_COMMENT("assert done");
aoqi@1 2150 }
aoqi@1 2151 #endif
aoqi@1 2152
aoqi@1 2153 // Load layout helper (32-bits)
aoqi@1 2154 //
aoqi@1 2155 // |array_tag| | header_size | element_type | |log2_element_size|
aoqi@1 2156 // 32 30 24 16 8 2 0
aoqi@1 2157 //
aoqi@1 2158 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
aoqi@1 2159 //
aoqi@1 2160
aoqi@1 2161 int lh_offset = klassOopDesc::header_size() * HeapWordSize +
aoqi@1 2162 Klass::layout_helper_offset_in_bytes();
aoqi@1 2163
aoqi@1 2164 const Register rax_lh = rax; // layout helper
aoqi@1 2165
aoqi@1 2166 __ movl(rax_lh, Address(r10_src_klass, lh_offset));
aoqi@1 2167
aoqi@1 2168 // Handle objArrays completely differently...
aoqi@1 2169 jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
aoqi@1 2170 __ cmpl(rax_lh, objArray_lh);
aoqi@1 2171 __ jcc(Assembler::equal, L_objArray);
aoqi@1 2172
aoqi@1 2173 // if (src->klass() != dst->klass()) return -1;
aoqi@1 2174 __ load_klass(r9_dst_klass, dst);
aoqi@1 2175 __ cmpq(r10_src_klass, r9_dst_klass);
aoqi@1 2176 __ jcc(Assembler::notEqual, L_failed);
aoqi@1 2177
aoqi@1 2178 // if (!src->is_Array()) return -1;
aoqi@1 2179 __ cmpl(rax_lh, Klass::_lh_neutral_value);
aoqi@1 2180 __ jcc(Assembler::greaterEqual, L_failed);
aoqi@1 2181
aoqi@1 2182 // At this point, it is known to be a typeArray (array_tag 0x3).
aoqi@1 2183 #ifdef ASSERT
aoqi@1 2184 { Label L;
aoqi@1 2185 __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
aoqi@1 2186 __ jcc(Assembler::greaterEqual, L);
aoqi@1 2187 __ stop("must be a primitive array");
aoqi@1 2188 __ bind(L);
aoqi@1 2189 }
aoqi@1 2190 #endif
aoqi@1 2191
// r10 is passed as the scratch register, so r10_src_klass is dead after this
// call; r10 is reused as r10_offset below.
aoqi@1 2192 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
aoqi@1 2193 r10, L_failed);
aoqi@1 2194
aoqi@1 2195 // typeArrayKlass
aoqi@1 2196 //
aoqi@1 2197 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
aoqi@1 2198 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
aoqi@1 2199 //
aoqi@1 2200
aoqi@1 2201 const Register r10_offset = r10; // array offset
aoqi@1 2202 const Register rax_elsize = rax_lh; // element size
aoqi@1 2203
aoqi@1 2204 __ movl(r10_offset, rax_lh);
aoqi@1 2205 __ shrl(r10_offset, Klass::_lh_header_size_shift);
aoqi@1 2206 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
aoqi@1 2207 __ addptr(src, r10_offset); // src array offset
aoqi@1 2208 __ addptr(dst, r10_offset); // dst array offset
aoqi@1 2209 BLOCK_COMMENT("choose copy loop based on element size");
aoqi@1 2210 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
aoqi@1 2211
aoqi@1 2212 // next registers should be set before the jump to corresponding stub
aoqi@1 2213 const Register from = c_rarg0; // source array address
aoqi@1 2214 const Register to = c_rarg1; // destination array address
aoqi@1 2215 const Register count = c_rarg2; // elements count
aoqi@1 2216
aoqi@1 2217 // 'from', 'to', 'count' registers should be set in such order
aoqi@1 2218 // since they are the same as 'src', 'src_pos', 'dst'.
aoqi@1 2219
// Each case below compares the log2 element size, falls to the next case on
// a mismatch, and otherwise sets from/to/count and jumps to the copy entry.
aoqi@1 2220 __ BIND(L_copy_bytes);
aoqi@1 2221 __ cmpl(rax_elsize, 0);
aoqi@1 2222 __ jccb(Assembler::notEqual, L_copy_shorts);
aoqi@1 2223 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
aoqi@1 2224 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr
aoqi@1 2225 __ movl2ptr(count, r11_length); // length
aoqi@1 2226 __ jump(RuntimeAddress(byte_copy_entry));
aoqi@1 2227
aoqi@1 2228 __ BIND(L_copy_shorts);
aoqi@1 2229 __ cmpl(rax_elsize, LogBytesPerShort);
aoqi@1 2230 __ jccb(Assembler::notEqual, L_copy_ints);
aoqi@1 2231 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
aoqi@1 2232 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr
aoqi@1 2233 __ movl2ptr(count, r11_length); // length
aoqi@1 2234 __ jump(RuntimeAddress(short_copy_entry));
aoqi@1 2235
aoqi@1 2236 __ BIND(L_copy_ints);
aoqi@1 2237 __ cmpl(rax_elsize, LogBytesPerInt);
aoqi@1 2238 __ jccb(Assembler::notEqual, L_copy_longs);
aoqi@1 2239 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
aoqi@1 2240 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr
aoqi@1 2241 __ movl2ptr(count, r11_length); // length
aoqi@1 2242 __ jump(RuntimeAddress(int_copy_entry));
aoqi@1 2243
aoqi@1 2244 __ BIND(L_copy_longs);
aoqi@1 2245 #ifdef ASSERT
aoqi@1 2246 { Label L;
aoqi@1 2247 __ cmpl(rax_elsize, LogBytesPerLong);
aoqi@1 2248 __ jcc(Assembler::equal, L);
aoqi@1 2249 __ stop("must be long copy, but elsize is wrong");
aoqi@1 2250 __ bind(L);
aoqi@1 2251 }
aoqi@1 2252 #endif
aoqi@1 2253 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
aoqi@1 2254 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr
aoqi@1 2255 __ movl2ptr(count, r11_length); // length
aoqi@1 2256 __ jump(RuntimeAddress(long_copy_entry));
aoqi@1 2257
aoqi@1 2258 // objArrayKlass
aoqi@1 2259 __ BIND(L_objArray);
aoqi@1 2260 // live at this point: r10_src_klass, src[_pos], dst[_pos]
aoqi@1 2261
aoqi@1 2262 Label L_plain_copy, L_checkcast_copy;
aoqi@1 2263 // test array classes for subtyping
aoqi@1 2264 __ load_klass(r9_dst_klass, dst);
aoqi@1 2265 __ cmpq(r10_src_klass, r9_dst_klass); // usual case is exact equality
aoqi@1 2266 __ jcc(Assembler::notEqual, L_checkcast_copy);
aoqi@1 2267
aoqi@1 2268 // Identically typed arrays can be copied without element-wise checks.
aoqi@1 2269 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
aoqi@1 2270 r10, L_failed);
aoqi@1 2271
aoqi@1 2272 __ lea(from, Address(src, src_pos, TIMES_OOP,
aoqi@1 2273 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
aoqi@1 2274 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
aoqi@1 2275 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
aoqi@1 2276 __ movl2ptr(count, r11_length); // length
// L_plain_copy is also the success target of the array-klass type check in
// the checkcast path below, which sets from/to/count itself before testing.
aoqi@1 2277 __ BIND(L_plain_copy);
aoqi@1 2278 __ jump(RuntimeAddress(oop_copy_entry));
aoqi@1 2279
aoqi@1 2280 __ BIND(L_checkcast_copy);
aoqi@1 2281 // live at this point: r10_src_klass, !r11_length
aoqi@1 2282 {
aoqi@1 2283 // assert(r11_length == C_RARG4); // will reload from here
// r11 is reused for the destination klass, which overwrites r11_length; the
// length is re-read from C_RARG4 where needed.
aoqi@1 2284 Register r11_dst_klass = r11;
aoqi@1 2285 __ load_klass(r11_dst_klass, dst);
aoqi@1 2286
aoqi@1 2287 // Before looking at dst.length, make sure dst is also an objArray.
aoqi@1 2288 __ cmpl(Address(r11_dst_klass, lh_offset), objArray_lh);
aoqi@1 2289 __ jcc(Assembler::notEqual, L_failed);
aoqi@1 2290
aoqi@1 2291 // It is safe to examine both src.length and dst.length.
aoqi@1 2292 #ifndef _WIN64
aoqi@1 2293 arraycopy_range_checks(src, src_pos, dst, dst_pos, C_RARG4,
aoqi@1 2294 rax, L_failed);
aoqi@1 2295 #else
aoqi@1 2296 __ movl(r11_length, C_RARG4); // reload
aoqi@1 2297 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
aoqi@1 2298 rax, L_failed);
aoqi@1 2299 __ load_klass(r11_dst_klass, dst); // reload
aoqi@1 2300 #endif
aoqi@1 2301
aoqi@1 2302 // Marshal the base address arguments now, freeing registers.
aoqi@1 2303 __ lea(from, Address(src, src_pos, TIMES_OOP,
aoqi@1 2304 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
aoqi@1 2305 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
aoqi@1 2306 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
aoqi@1 2307 __ movl(count, C_RARG4); // length (reloaded)
aoqi@1 2308 Register sco_temp = c_rarg3; // this register is free now
aoqi@1 2309 assert_different_registers(from, to, count, sco_temp,
aoqi@1 2310 r11_dst_klass, r10_src_klass);
aoqi@1 2311 assert_clean_int(count, sco_temp);
aoqi@1 2312
aoqi@1 2313 // Generate the type check.
// If the source array klass is a subtype of the destination array klass,
// no per-element check is needed and control goes to L_plain_copy.
aoqi@1 2314 int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
aoqi@1 2315 Klass::super_check_offset_offset_in_bytes());
aoqi@1 2316 __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
aoqi@1 2317 assert_clean_int(sco_temp, rax);
aoqi@1 2318 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
aoqi@1 2319
aoqi@1 2320 // Fetch destination element klass from the objArrayKlass header.
aoqi@1 2321 int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
aoqi@1 2322 objArrayKlass::element_klass_offset_in_bytes());
aoqi@1 2323 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
aoqi@1 2324 __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
aoqi@1 2325 assert_clean_int(sco_temp, rax);
aoqi@1 2326
aoqi@1 2327 // the checkcast_copy loop needs two extra arguments:
aoqi@1 2328 assert(c_rarg3 == sco_temp, "#3 already in place");
aoqi@1 2329 __ movptr(C_RARG4, r11_dst_klass); // dst.klass.element_klass
aoqi@1 2330 __ jump(RuntimeAddress(checkcast_copy_entry));
aoqi@1 2331 }
aoqi@1 2332
aoqi@1 2333 __ BIND(L_failed);
aoqi@1 2334 __ xorptr(rax, rax);
aoqi@1 2335 __ notptr(rax); // return -1
aoqi@1 2336 __ leave(); // required for proper stackwalking of RuntimeStub frame
aoqi@1 2337 __ ret(0);
aoqi@1 2338
aoqi@1 2339 return start;
aoqi@1 2340 }
aoqi@1 2341
aoqi@1 2342 #undef length_arg
aoqi@1 2343 #endif
aoqi@1 2344
aoqi@1 2345 //FIXME
aoqi@1 2346 address generate_disjoint_long_copy(bool aligned, const char *name) {
aoqi@1 2347 Label l_1, l_2;
aoqi@1 2348 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 2349 __ align(CodeEntryAlignment);
aoqi@1 2350 address start = __ pc();
aoqi@1 2351
aoqi@1 2352 // __ movl(ecx, Address(esp, 4+8)); // count
aoqi@1 2353 // __ movl(eax, Address(esp, 4+0)); // from
aoqi@1 2354 // __ movl(edx, Address(esp, 4+4)); // to
aoqi@1 2355 __ move(T1, A2);
aoqi@1 2356 __ move(T3, A0);
aoqi@1 2357 __ move(T0, A1);
aoqi@1 2358 __ push(T3);
aoqi@1 2359 __ push(T0);
aoqi@1 2360 __ push(T1);
aoqi@1 2361 //__ subl(edx, eax);
aoqi@1 2362 //__ jmp(l_2);
aoqi@1 2363 __ b(l_2);
aoqi@1 2364 __ delayed()->nop();
aoqi@1 2365 __ align(16);
aoqi@1 2366 __ bind(l_1);
aoqi@1 2367 // if (VM_Version::supports_mmx()) {
aoqi@1 2368 // __ movq(mmx0, Address(eax));
aoqi@1 2369 // __ movq(Address(eax, edx, Address::times_1), mmx0);
aoqi@1 2370 // } else {
aoqi@1 2371 // __ fild_d(Address(eax));
aoqi@1 2372 __ ld(AT, T3, 0);
aoqi@1 2373 // __ fistp_d(Address(eax, edx, Address::times_1));
aoqi@1 2374 __ sd (AT, T0, 0);
aoqi@1 2375 // }
aoqi@1 2376 // __ addl(eax, 8);
aoqi@1 2377 __ addi(T3, T3, 8);
aoqi@1 2378 __ addi(T0, T0, 8);
aoqi@1 2379 __ bind(l_2);
aoqi@1 2380 // __ decl(ecx);
aoqi@1 2381 __ addi(T1, T1, -1);
aoqi@1 2382 // __ jcc(Assembler::greaterEqual, l_1);
aoqi@1 2383 __ bgez(T1, l_1);
aoqi@1 2384 __ delayed()->nop();
aoqi@1 2385 // if (VM_Version::supports_mmx()) {
aoqi@1 2386 // __ emms();
aoqi@1 2387 // }
aoqi@1 2388 // __ ret(0);
aoqi@1 2389 __ pop(T1);
aoqi@1 2390 __ pop(T0);
aoqi@1 2391 __ pop(T3);
aoqi@1 2392 __ jr(RA);
aoqi@1 2393 __ delayed()->nop();
aoqi@1 2394 return start;
aoqi@1 2395 }
aoqi@1 2396
aoqi@1 2397
// Generate the stub that copies 8-byte elements between two regions that may
// overlap.
//
// Arguments for generated stub:
//      from:      A0
//      to:        A1
//      elm.count: A2; the loop runs while the decremented counter is >= 0
//
// array_overlap_test() is emitted first with the disjoint stub as its
// no-overlap target. Otherwise the copy walks from the last element down to
// the first. T3/T0/T1 are saved on the stack around the copy; AT is used as
// a temporary. Returns the entry address of the generated stub.
//
// NOTE(review): the disjoint stub address is read from StubRoutines while
// this generator runs, so presumably the matching disjoint stub must already
// have been generated - confirm the ordering in generate_arraycopy_stubs.
aoqi@1 2398 address generate_conjoint_long_copy(bool aligned, const char *name) {
aoqi@1 2399 Label l_1, l_2;
aoqi@1 2400 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 2401 __ align(CodeEntryAlignment);
aoqi@1 2402 address start = __ pc();
aoqi@1 2403 address nooverlap_target = aligned ?
aoqi@1 2404 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
aoqi@1 2405 StubRoutines::jlong_disjoint_arraycopy();
// NOTE(review): the second argument (3) is presumably log2 of the element
// size in bytes - confirm against array_overlap_test.
aoqi@1 2406 array_overlap_test(nooverlap_target, 3);
aoqi@1 2407
// Save the temporaries before they are overwritten below.
aoqi@1 2408 __ push(T3);
aoqi@1 2409 __ push(T0);
aoqi@1 2410 __ push(T1);
aoqi@1 2411
aoqi@1 2412 /* __ movl(ecx, Address(esp, 4+8)); // count
aoqi@1 2413 __ movl(eax, Address(esp, 4+0)); // from
aoqi@1 2414 __ movl(edx, Address(esp, 4+4)); // to
aoqi@1 2415 __ jmp(l_2);
aoqi@1 2416
aoqi@1 2417 */
// T1 = count, T3 = from, T0 = to
aoqi@1 2418 __ move(T1, A2);
aoqi@1 2419 __ move(T3, A0);
aoqi@1 2420 __ move(T0, A1);
// T3 = from + count*8 - 8 and T0 = to + count*8 - 8, i.e. the last element
// of each region.
// NOTE(review): if sll/add here are the 32-bit MIPS forms rather than 64-bit
// aliases, the scaled count and the address sum would be truncated - confirm
// against the assembler.
aoqi@1 2421 __ sll(AT, T1, Address::times_8);
aoqi@1 2422 __ add(AT, T3, AT);
aoqi@1 2423 __ lea(T3 , Address(AT, -8));
aoqi@1 2424 __ sll(AT, T1, Address::times_8);
aoqi@1 2425 __ add(AT, T0, AT);
aoqi@1 2426 __ lea(T0 , Address(AT, -8));
aoqi@1 2427
aoqi@1 2428
aoqi@1 2429
// Enter the loop at its test (l_2) so that a zero count copies nothing.
aoqi@1 2430 __ b(l_2);
aoqi@1 2431 __ delayed()->nop();
aoqi@1 2432 __ align(16);
aoqi@1 2433 __ bind(l_1);
aoqi@1 2434 /* if (VM_Version::supports_mmx()) {
aoqi@1 2435 __ movq(mmx0, Address(eax, ecx, Address::times_8));
aoqi@1 2436 __ movq(Address(edx, ecx,Address::times_8), mmx0);
aoqi@1 2437 } else {
aoqi@1 2438 __ fild_d(Address(eax, ecx, Address::times_8));
aoqi@1 2439 __ fistp_d(Address(edx, ecx,Address::times_8));
aoqi@1 2440 }
aoqi@1 2441 */
// copy one 8-byte element, then step both cursors down by one element
aoqi@1 2442 __ ld(AT, T3, 0);
aoqi@1 2443 __ sd (AT, T0, 0);
aoqi@1 2444 __ addi(T3, T3, -8);
aoqi@1 2445 __ addi(T0, T0,-8);
aoqi@1 2446 __ bind(l_2);
aoqi@1 2447 // __ decl(ecx);
aoqi@1 2448 __ addi(T1, T1, -1);
aoqi@1 2449 //__ jcc(Assembler::greaterEqual, l_1);
aoqi@1 2450 __ bgez(T1, l_1);
aoqi@1 2451 __ delayed()->nop();
aoqi@1 2452 // if (VM_Version::supports_mmx()) {
aoqi@1 2453 // __ emms();
aoqi@1 2454 // }
aoqi@1 2455 // __ ret(0);
// restore in reverse push order and return
aoqi@1 2456 __ pop(T1);
aoqi@1 2457 __ pop(T0);
aoqi@1 2458 __ pop(T3);
aoqi@1 2459 __ jr(RA);
aoqi@1 2460 __ delayed()->nop();
aoqi@1 2461 return start;
aoqi@1 2462 }
aoqi@1 2463
aoqi@1 2464 void generate_arraycopy_stubs() {
aoqi@1 2465 if (UseCompressedOops) {
aoqi@1 2466 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, "oop_disjoint_arraycopy");
aoqi@1 2467 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, "oop_arraycopy");
aoqi@1 2468 } else {
aoqi@1 2469 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, "oop_disjoint_arraycopy");
aoqi@1 2470 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, "oop_arraycopy");
aoqi@1 2471 }
aoqi@1 2472
aoqi@1 2473 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
aoqi@1 2474 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
aoqi@1 2475 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
aoqi@1 2476 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
aoqi@1 2477 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
aoqi@1 2478
aoqi@1 2479 // if (VM_Version::supports_mmx())
aoqi@1 2480 //if (false)
aoqi@1 2481 // StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_mmx_copy_aligned("arrayof_jshort_disjoint_arraycopy");
aoqi@1 2482 // else
aoqi@1 2483 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
aoqi@1 2484 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(true, false, "arrayof_jint_disjoint_arraycopy");
aoqi@1 2485 //StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(true, true, "arrayof_oop_disjoint_arraycopy");
aoqi@1 2486 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
aoqi@1 2487
aoqi@1 2488 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
aoqi@1 2489 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
aoqi@1 2490 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
aoqi@1 2491 StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, "jlong_arraycopy");
aoqi@1 2492
aoqi@1 2493 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
aoqi@1 2494 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
aoqi@1 2495 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_oop_copy(true, false, "arrayof_jint_arraycopy");
aoqi@1 2496 //StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_int_oop_copy(true, true, "arrayof_oop_arraycopy");
aoqi@1 2497 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
aoqi@1 2498
aoqi@1 2499 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
aoqi@1 2500 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
aoqi@1 2501 }
aoqi@1 2502
aoqi@1 2503 //Wang: add a function to implement SafeFetch32 and SafeFetchN
aoqi@1 2504 void generate_safefetch(const char* name, int size, address* entry,
aoqi@1 2505 address* fault_pc, address* continuation_pc) {
aoqi@1 2506 // safefetch signatures:
aoqi@1 2507 // int SafeFetch32(int* adr, int errValue);
aoqi@1 2508 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
aoqi@1 2509 //
aoqi@1 2510 // arguments:
aoqi@1 2511 // A0 = adr
aoqi@1 2512 // A1 = errValue
aoqi@1 2513 //
aoqi@1 2514 // result:
aoqi@1 2515 // PPC_RET = *adr or errValue
aoqi@1 2516
aoqi@1 2517 StubCodeMark mark(this, "StubRoutines", name);
aoqi@1 2518
aoqi@1 2519 // Entry point, pc or function descriptor.
aoqi@1 2520 *entry = __ pc();
aoqi@1 2521
aoqi@1 2522 // Load *adr into A1, may fault.
aoqi@1 2523 *fault_pc = __ pc();
aoqi@1 2524 switch (size) {
aoqi@1 2525 case 4:
aoqi@1 2526 // int32_t
aoqi@1 2527 __ lw(A1, A0, 0);
aoqi@1 2528 break;
aoqi@1 2529 case 8:
aoqi@1 2530 // int64_t
aoqi@1 2531 __ ld(A1, A0, 0);
aoqi@1 2532 break;
aoqi@1 2533 default:
aoqi@1 2534 ShouldNotReachHere();
aoqi@1 2535 }
aoqi@1 2536
aoqi@1 2537 // return errValue or *adr
aoqi@1 2538 *continuation_pc = __ pc();
aoqi@1 2539 __ addu(V0,A1,R0);
aoqi@1 2540 __ jr(RA);
aoqi@1 2541 __ delayed()->nop();
aoqi@1 2542 }
aoqi@1 2543
aoqi@1 2544
aoqi@1 2545 #undef __
aoqi@1 2546 #define __ masm->
aoqi@1 2547
aoqi@1 2548 // Continuation point for throwing of implicit exceptions that are
aoqi@1 2549 // not handled in the current activation. Fabricates an exception
aoqi@1 2550 // oop and initiates normal exception dispatching in this
aoqi@1 2551 // frame. Since we need to preserve callee-saved values (currently
aoqi@1 2552 // only for C2, but done for C1 as well) we need a callee-saved oop
aoqi@1 2553 // map and therefore have to make these stubs into RuntimeStubs
aoqi@1 2554 // rather than BufferBlobs. If the compiler needs all registers to
aoqi@1 2555 // be preserved between the fault point and the exception handler
aoqi@1 2556 // then it must assume responsibility for that in
aoqi@1 2557 // AbstractCompiler::continuation_for_implicit_null_exception or
aoqi@1 2558 // continuation_for_implicit_division_by_zero_exception. All other
aoqi@1 2559 // implicit exceptions (e.g., NullPointerException or
aoqi@1 2560 // AbstractMethodError on entry) are either at call sites or
aoqi@1 2561 // otherwise assume that stack unwinding will be initiated, so
aoqi@1 2562 // caller saved registers were assumed volatile in the compiler.
aoqi@1 2563 address generate_throw_exception(const char* name,
aoqi@1 2564 address runtime_entry,
aoqi@1 2565 bool restore_saved_exception_pc) {
aoqi@1 2566 // Information about frame layout at time of blocking runtime call.
aoqi@1 2567 // Note that we only have to preserve callee-saved registers since
aoqi@1 2568 // the compilers are responsible for supplying a continuation point
aoqi@1 2569 // if they expect all registers to be preserved.
aoqi@1 2570 //#define aoqi_test
aoqi@1 2571 #ifdef aoqi_test
aoqi@1 2572 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
aoqi@1 2573 #endif
aoqi@1 2574 enum layout {
aoqi@1 2575 thread_off, // last_java_sp
aoqi@1 2576 S7_off, // callee saved register sp + 1
aoqi@1 2577 S6_off, // callee saved register sp + 2
aoqi@1 2578 S5_off, // callee saved register sp + 3
aoqi@1 2579 S4_off, // callee saved register sp + 4
aoqi@1 2580 S3_off, // callee saved register sp + 5
aoqi@1 2581 S2_off, // callee saved register sp + 6
aoqi@1 2582 S1_off, // callee saved register sp + 7
aoqi@1 2583 S0_off, // callee saved register sp + 8
aoqi@1 2584 FP_off,
aoqi@1 2585 ret_address,
aoqi@1 2586 framesize
aoqi@1 2587 };
aoqi@1 2588
aoqi@1 2589 int insts_size = 2048;
aoqi@1 2590 int locs_size = 32;
aoqi@1 2591
aoqi@1 2592 // CodeBuffer* code = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false,
aoqi@1 2593 // NULL, NULL, NULL, false, NULL, name, false);
aoqi@1 2594 CodeBuffer code (name , insts_size, locs_size);
aoqi@1 2595 #ifdef aoqi_test
aoqi@1 2596 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
aoqi@1 2597 #endif
aoqi@1 2598 OopMapSet* oop_maps = new OopMapSet();
aoqi@1 2599 #ifdef aoqi_test
aoqi@1 2600 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
aoqi@1 2601 #endif
aoqi@1 2602 MacroAssembler* masm = new MacroAssembler(&code);
aoqi@1 2603 #ifdef aoqi_test
aoqi@1 2604 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
aoqi@1 2605 #endif
aoqi@1 2606
aoqi@1 2607 address start = __ pc();
aoqi@1 2608 //__ stop("generate_throw_exception");
aoqi@1 2609 /*
aoqi@1 2610 __ move(AT, (int)&jerome1 );
aoqi@1 2611 __ sw(SP, AT, 0);
aoqi@1 2612 __ move(AT, (int)&jerome2 );
aoqi@1 2613 __ sw(FP, AT, 0);
aoqi@1 2614 __ move(AT, (int)&jerome3 );
aoqi@1 2615 __ sw(RA, AT, 0);
aoqi@1 2616 __ move(AT, (int)&jerome4 );
aoqi@1 2617 __ sw(R0, AT, 0);
aoqi@1 2618 __ move(AT, (int)&jerome5 );
aoqi@1 2619 __ sw(R0, AT, 0);
aoqi@1 2620 __ move(AT, (int)&jerome6 );
aoqi@1 2621 __ sw(R0, AT, 0);
aoqi@1 2622 __ move(AT, (int)&jerome7 );
aoqi@1 2623 __ sw(R0, AT, 0);
aoqi@1 2624 __ move(AT, (int)&jerome10 );
aoqi@1 2625 __ sw(R0, AT, 0);
aoqi@1 2626
aoqi@1 2627 __ pushad();
aoqi@1 2628
aoqi@1 2629 //__ enter();
aoqi@1 2630 __ call(CAST_FROM_FN_PTR(address, SharedRuntime::print_call_statistics),
aoqi@1 2631 relocInfo::runtime_call_type);
aoqi@1 2632 __ delayed()->nop();
aoqi@1 2633
aoqi@1 2634 //__ leave();
aoqi@1 2635 __ popad();
aoqi@1 2636
aoqi@1 2637 */
aoqi@1 2638
aoqi@1 2639 // This is an inlined and slightly modified version of call_VM
aoqi@1 2640 // which has the ability to fetch the return PC out of
aoqi@1 2641 // thread-local storage and also sets up last_Java_sp slightly
aoqi@1 2642 // differently than the real call_VM
aoqi@1 2643 #ifndef OPT_THREAD
aoqi@1 2644 Register java_thread = TREG;
aoqi@1 2645 __ get_thread(java_thread);
aoqi@1 2646 #else
aoqi@1 2647 Register java_thread = TREG;
aoqi@1 2648 #endif
aoqi@1 2649 #ifdef aoqi_test
aoqi@1 2650 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
aoqi@1 2651 #endif
aoqi@1 2652 if (restore_saved_exception_pc) {
aoqi@1 2653 __ ld(RA, java_thread, in_bytes(JavaThread::saved_exception_pc_offset())); // eax
aoqi@1 2654 }
aoqi@1 2655
aoqi@1 2656 __ enter(); // required for proper stackwalking of RuntimeStub frame
aoqi@1 2657
aoqi@1 2658 __ addi(SP, SP, (-1) * (framesize-2) * wordSize); // prolog
aoqi@1 2659 __ sd(S0, SP, S0_off * wordSize);
aoqi@1 2660 __ sd(S1, SP, S1_off * wordSize);
aoqi@1 2661 __ sd(S2, SP, S2_off * wordSize);
aoqi@1 2662 __ sd(S3, SP, S3_off * wordSize);
aoqi@1 2663 __ sd(S4, SP, S4_off * wordSize);
aoqi@1 2664 __ sd(S5, SP, S5_off * wordSize);
aoqi@1 2665 __ sd(S6, SP, S6_off * wordSize);
aoqi@1 2666 __ sd(S7, SP, S7_off * wordSize);
aoqi@1 2667
aoqi@1 2668 int frame_complete = __ pc() - start;
aoqi@1 2669 // push java thread (becomes first argument of C function)
aoqi@1 2670 __ sd(java_thread, SP, thread_off * wordSize);
aoqi@1 2671 if (java_thread!=A0)
aoqi@1 2672 __ move(A0, java_thread);
aoqi@1 2673
aoqi@1 2674 // Set up last_Java_sp and last_Java_fp
aoqi@1 2675 __ set_last_Java_frame(java_thread, SP, FP, NULL);
aoqi@1 2676 __ relocate(relocInfo::internal_pc_type);
aoqi@1 2677 {
aoqi@1 2678 intptr_t save_pc = (intptr_t)__ pc() + NativeMovConstReg::instruction_size + NativeCall::return_address_offset + 4;
aoqi@1 2679 __ li48(AT, save_pc);
aoqi@1 2680 }
aoqi@1 2681 __ sd(AT, java_thread, in_bytes(JavaThread::last_Java_pc_offset()));
aoqi@1 2682
aoqi@1 2683 // Call runtime
aoqi@1 2684 __ call(runtime_entry);
aoqi@1 2685 __ delayed()->nop();
aoqi@1 2686 // Generate oop map
aoqi@1 2687 OopMap* map = new OopMap(framesize, 0);
aoqi@1 2688 oop_maps->add_gc_map(__ offset(), map);
aoqi@1 2689
aoqi@1 2690 // restore the thread (cannot use the pushed argument since arguments
aoqi@1 2691 // may be overwritten by C code generated by an optimizing compiler);
aoqi@1 2692 // however can use the register value directly if it is callee saved.
aoqi@1 2693 #ifndef OPT_THREAD
aoqi@1 2694 __ get_thread(java_thread);
aoqi@1 2695 #endif
aoqi@1 2696
aoqi@1 2697 __ ld(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
aoqi@1 2698 // __ reset_last_Java_frame(java_thread, true);
aoqi@1 2699 __ reset_last_Java_frame(java_thread, true, true);
aoqi@1 2700
aoqi@1 2701 // Restore callee save registers. This must be done after resetting the Java frame
aoqi@1 2702 __ ld(S0, SP, S0_off * wordSize);
aoqi@1 2703 __ ld(S1, SP, S1_off * wordSize);
aoqi@1 2704 __ ld(S2, SP, S2_off * wordSize);
aoqi@1 2705 __ ld(S3, SP, S3_off * wordSize);
aoqi@1 2706 __ ld(S4, SP, S4_off * wordSize);
aoqi@1 2707 __ ld(S5, SP, S5_off * wordSize);
aoqi@1 2708 __ ld(S6, SP, S6_off * wordSize);
aoqi@1 2709 __ ld(S7, SP, S7_off * wordSize);
aoqi@1 2710
aoqi@1 2711 // discard arguments
aoqi@1 2712 __ addi(SP, SP, (framesize-2) * wordSize); // epilog
aoqi@1 2713 // __ leave(); // required for proper stackwalking of RuntimeStub frame
aoqi@1 2714 __ addi(SP, FP, wordSize);
aoqi@1 2715 __ ld(FP, SP, -1*wordSize);
aoqi@1 2716 // check for pending exceptions
aoqi@1 2717 #ifdef ASSERT
aoqi@1 2718 Label L;
aoqi@1 2719 __ lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
aoqi@1 2720 __ bne(AT, R0, L);
aoqi@1 2721 __ delayed()->nop();
aoqi@1 2722 __ should_not_reach_here();
aoqi@1 2723 __ bind(L);
aoqi@1 2724 #endif //ASSERT
aoqi@1 2725 __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
aoqi@1 2726 __ delayed()->nop();
aoqi@1 2727 #ifdef aoqi_test
aoqi@1 2728 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
aoqi@1 2729 #endif
aoqi@1 2730 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code,frame_complete,
aoqi@1 2731 framesize, oop_maps, false);
aoqi@1 2732 #ifdef aoqi_test
aoqi@1 2733 tty->print_cr("%s:%d name:%s", __func__, __LINE__, name);
aoqi@1 2734 #endif
aoqi@1 2735 return stub->entry_point();
aoqi@1 2736 }
aoqi@1 2737
aoqi@1 2738 // Initialization
aoqi@1 2739 void generate_initial() {
aoqi@1 2740 /*
aoqi@1 2741 // Generates all stubs and initializes the entry points
aoqi@1 2742
aoqi@1 2743 // This platform-specific stub is needed by generate_call_stub()
aoqi@1 2744 StubRoutines::mips::_mxcsr_std = generate_fp_mask("mxcsr_std", 0x0000000000001F80);
aoqi@1 2745
aoqi@1 2746 // entry points that exist in all platforms Note: This is code
aoqi@1 2747 // that could be shared among different platforms - however the
aoqi@1 2748 // benefit seems to be smaller than the disadvantage of having a
aoqi@1 2749 // much more complicated generator structure. See also comment in
aoqi@1 2750 // stubRoutines.hpp.
aoqi@1 2751
aoqi@1 2752 StubRoutines::_forward_exception_entry = generate_forward_exception();
aoqi@1 2753
aoqi@1 2754 StubRoutines::_call_stub_entry =
aoqi@1 2755 generate_call_stub(StubRoutines::_call_stub_return_address);
aoqi@1 2756
aoqi@1 2757 // is referenced by megamorphic call
aoqi@1 2758 StubRoutines::_catch_exception_entry = generate_catch_exception();
aoqi@1 2759
aoqi@1 2760 // atomic calls
aoqi@1 2761 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
aoqi@1 2762 StubRoutines::_atomic_xchg_ptr_entry = generate_atomic_xchg_ptr();
aoqi@1 2763 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
aoqi@1 2764 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
aoqi@1 2765 StubRoutines::_atomic_add_entry = generate_atomic_add();
aoqi@1 2766 StubRoutines::_atomic_add_ptr_entry = generate_atomic_add_ptr();
aoqi@1 2767 StubRoutines::_fence_entry = generate_orderaccess_fence();
aoqi@1 2768
aoqi@1 2769 StubRoutines::_handler_for_unsafe_access_entry =
aoqi@1 2770 generate_handler_for_unsafe_access();
aoqi@1 2771
aoqi@1 2772 // platform dependent
aoqi@1 2773 StubRoutines::mips::_get_previous_fp_entry = generate_get_previous_fp();
aoqi@1 2774
aoqi@1 2775 StubRoutines::mips::_verify_mxcsr_entry = generate_verify_mxcsr();
aoqi@1 2776 */
aoqi@1 2777 // Generates all stubs and initializes the entry points
aoqi@1 2778
aoqi@1 2779 //-------------------------------------------------------------
aoqi@1 2780 //-----------------------------------------------------------
aoqi@1 2781 // entry points that exist in all platforms
aoqi@1 2782 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller
aoqi@1 2783 // than the disadvantage of having a much more complicated generator structure.
aoqi@1 2784 // See also comment in stubRoutines.hpp.
aoqi@1 2785 StubRoutines::_forward_exception_entry = generate_forward_exception();
aoqi@1 2786 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
aoqi@1 2787 // is referenced by megamorphic call
aoqi@1 2788 StubRoutines::_catch_exception_entry = generate_catch_exception();
aoqi@1 2789
aoqi@1 2790 StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
aoqi@1 2791
aoqi@1 2792 // platform dependent
aoqi@1 2793 StubRoutines::gs2::_get_previous_fp_entry = generate_get_previous_fp();
aoqi@1 2794 }
aoqi@1 2795
aoqi@1 2796 void generate_all() {
aoqi@1 2797 #ifdef aoqi_test
aoqi@1 2798 tty->print_cr("%s:%d", __func__, __LINE__);
aoqi@1 2799 #endif
aoqi@1 2800 // Generates all stubs and initializes the entry points
aoqi@1 2801
aoqi@1 2802 // These entry points require SharedInfo::stack0 to be set up in
aoqi@1 2803 // non-core builds and need to be relocatable, so they each
aoqi@1 2804 // fabricate a RuntimeStub internally.
aoqi@1 2805 /*
aoqi@1 2806 StubRoutines::_throw_AbstractMethodError_entry =
aoqi@1 2807 generate_throw_exception("AbstractMethodError throw_exception",
aoqi@1 2808 CAST_FROM_FN_PTR(address,
aoqi@1 2809 SharedRuntime::
aoqi@1 2810 throw_AbstractMethodError),
aoqi@1 2811 false);
aoqi@1 2812
aoqi@1 2813 StubRoutines::_throw_IncompatibleClassChangeError_entry =
aoqi@1 2814 generate_throw_exception("IncompatibleClassChangeError throw_exception",
aoqi@1 2815 CAST_FROM_FN_PTR(address,
aoqi@1 2816 SharedRuntime::
aoqi@1 2817 throw_IncompatibleClassChangeError),
aoqi@1 2818 false);
aoqi@1 2819
aoqi@1 2820 StubRoutines::_throw_ArithmeticException_entry =
aoqi@1 2821 generate_throw_exception("ArithmeticException throw_exception",
aoqi@1 2822 CAST_FROM_FN_PTR(address,
aoqi@1 2823 SharedRuntime::
aoqi@1 2824 throw_ArithmeticException),
aoqi@1 2825 true);
aoqi@1 2826
aoqi@1 2827 StubRoutines::_throw_NullPointerException_entry =
aoqi@1 2828 generate_throw_exception("NullPointerException throw_exception",
aoqi@1 2829 CAST_FROM_FN_PTR(address,
aoqi@1 2830 SharedRuntime::
aoqi@1 2831 throw_NullPointerException),
aoqi@1 2832 true);
aoqi@1 2833
aoqi@1 2834 StubRoutines::_throw_NullPointerException_at_call_entry =
aoqi@1 2835 generate_throw_exception("NullPointerException at call throw_exception",
aoqi@1 2836 CAST_FROM_FN_PTR(address,
aoqi@1 2837 SharedRuntime::
aoqi@1 2838 throw_NullPointerException_at_call),
aoqi@1 2839 false);
aoqi@1 2840
aoqi@1 2841 StubRoutines::_throw_StackOverflowError_entry =
aoqi@1 2842 generate_throw_exception("StackOverflowError throw_exception",
aoqi@1 2843 CAST_FROM_FN_PTR(address,
aoqi@1 2844 SharedRuntime::
aoqi@1 2845 throw_StackOverflowError),
aoqi@1 2846 false);
aoqi@1 2847
aoqi@1 2848 // entry points that are platform specific
aoqi@1 2849 StubRoutines::mips::_f2i_fixup = generate_f2i_fixup();
aoqi@1 2850 StubRoutines::mips::_f2l_fixup = generate_f2l_fixup();
aoqi@1 2851 StubRoutines::mips::_d2i_fixup = generate_d2i_fixup();
aoqi@1 2852 StubRoutines::mips::_d2l_fixup = generate_d2l_fixup();
aoqi@1 2853
aoqi@1 2854 StubRoutines::mips::_float_sign_mask = generate_fp_mask("float_sign_mask", 0x7FFFFFFF7FFFFFFF);
aoqi@1 2855 StubRoutines::mips::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000);
aoqi@1 2856 StubRoutines::mips::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
aoqi@1 2857 StubRoutines::mips::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
aoqi@1 2858
aoqi@1 2859 // support for verify_oop (must happen after universe_init)
aoqi@1 2860 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
aoqi@1 2861
aoqi@1 2862 // arraycopy stubs used by compilers
aoqi@1 2863 generate_arraycopy_stubs();
aoqi@1 2864 */
aoqi@1 2865 #ifdef aoqi_test
aoqi@1 2866 tty->print_cr("%s:%d", __func__, __LINE__);
aoqi@1 2867 #endif
aoqi@1 2868 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
aoqi@1 2869 #ifdef aoqi_test
aoqi@1 2870 tty->print_cr("%s:%d", __func__, __LINE__);
aoqi@1 2871 #endif
aoqi@1 2872 // StubRoutines::_throw_ArithmeticException_entry = generate_throw_exception("ArithmeticException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException), true);
aoqi@1 2873 #ifdef aoqi_test
aoqi@1 2874 tty->print_cr("%s:%d", __func__, __LINE__);
aoqi@1 2875 #endif
aoqi@1 2876 // StubRoutines::_throw_NullPointerException_entry = generate_throw_exception("NullPointerException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
aoqi@1 2877 #ifdef aoqi_test
aoqi@1 2878 tty->print_cr("%s:%d", __func__, __LINE__);
aoqi@1 2879 #endif
aoqi@1 2880 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
aoqi@1 2881 #ifdef aoqi_test
aoqi@1 2882 tty->print_cr("%s:%d", __func__, __LINE__);
aoqi@1 2883 #endif
aoqi@1 2884 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
aoqi@1 2885 #ifdef aoqi_test
aoqi@1 2886 tty->print_cr("%s:%d", __func__, __LINE__);
aoqi@1 2887 #endif
aoqi@1 2888
aoqi@1 2889 //------------------------------------------------------
aoqi@1 2890 //------------------------------------------------------------------
aoqi@1 2891 // entry points that are platform specific
aoqi@1 2892
aoqi@1 2893 // support for verify_oop (must happen after universe_init)
aoqi@1 2894 #ifdef aoqi_test
aoqi@1 2895 tty->print_cr("%s:%d", __func__, __LINE__);
aoqi@1 2896 #endif
aoqi@1 2897 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
aoqi@1 2898 #ifdef aoqi_test
aoqi@1 2899 tty->print_cr("%s:%d", __func__, __LINE__);
aoqi@1 2900 #endif
aoqi@1 2901 #ifndef CORE
aoqi@1 2902 // arraycopy stubs used by compilers
aoqi@1 2903 generate_arraycopy_stubs();
aoqi@1 2904 #ifdef aoqi_test
aoqi@1 2905 tty->print_cr("%s:%d", __func__, __LINE__);
aoqi@1 2906 #endif
aoqi@1 2907 #endif
aoqi@1 2908
aoqi@1 2909 // Safefetch stubs.
aoqi@1 2910 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
aoqi@1 2911 &StubRoutines::_safefetch32_fault_pc,
aoqi@1 2912 &StubRoutines::_safefetch32_continuation_pc);
aoqi@1 2913 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
aoqi@1 2914 &StubRoutines::_safefetchN_fault_pc,
aoqi@1 2915 &StubRoutines::_safefetchN_continuation_pc);
aoqi@1 2916 }
aoqi@1 2917
aoqi@1 2918 public:
aoqi@1 2919 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
aoqi@1 2920 if (all) {
aoqi@1 2921 generate_all();
aoqi@1 2922 } else {
aoqi@1 2923 generate_initial();
aoqi@1 2924 }
aoqi@1 2925 }
aoqi@1 2926 }; // end class declaration
aoqi@1 2927 /*
aoqi@1 2928 address StubGenerator::disjoint_byte_copy_entry = NULL;
aoqi@1 2929 address StubGenerator::disjoint_short_copy_entry = NULL;
aoqi@1 2930 address StubGenerator::disjoint_int_copy_entry = NULL;
aoqi@1 2931 address StubGenerator::disjoint_long_copy_entry = NULL;
aoqi@1 2932 address StubGenerator::disjoint_oop_copy_entry = NULL;
aoqi@1 2933
aoqi@1 2934 address StubGenerator::byte_copy_entry = NULL;
aoqi@1 2935 address StubGenerator::short_copy_entry = NULL;
aoqi@1 2936 address StubGenerator::int_copy_entry = NULL;
aoqi@1 2937 address StubGenerator::long_copy_entry = NULL;
aoqi@1 2938 address StubGenerator::oop_copy_entry = NULL;
aoqi@1 2939
aoqi@1 2940 address StubGenerator::checkcast_copy_entry = NULL;
aoqi@1 2941 */
aoqi@1 2942 void StubGenerator_generate(CodeBuffer* code, bool all) {
aoqi@1 2943 StubGenerator g(code, all);
aoqi@1 2944 }

mercurial