src/cpu/ppc/vm/stubGenerator_ppc.cpp

author:      aoqi
date:        Wed, 15 Apr 2020 11:49:55 +0800
changeset:   9852:70aa912cebe5
parent:      9756:2be326848943
permissions: -rw-r--r--
description: Merge
goetz@6458 1 /*
gromero@9662 2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
gromero@9662 3 * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
goetz@6458 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
goetz@6458 5 *
goetz@6458 6 * This code is free software; you can redistribute it and/or modify it
goetz@6458 7 * under the terms of the GNU General Public License version 2 only, as
goetz@6458 8 * published by the Free Software Foundation.
goetz@6458 9 *
goetz@6458 10 * This code is distributed in the hope that it will be useful, but WITHOUT
goetz@6458 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
goetz@6458 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
goetz@6458 13 * version 2 for more details (a copy is included in the LICENSE file that
goetz@6458 14 * accompanied this code).
goetz@6458 15 *
goetz@6458 16 * You should have received a copy of the GNU General Public License version
goetz@6458 17 * 2 along with this work; if not, write to the Free Software Foundation,
goetz@6458 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
goetz@6458 19 *
goetz@6458 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
goetz@6458 21 * or visit www.oracle.com if you need additional information or have any
goetz@6458 22 * questions.
goetz@6458 23 *
goetz@6458 24 */
goetz@6458 25
goetz@6458 26 #include "precompiled.hpp"
goetz@6458 27 #include "asm/macroAssembler.inline.hpp"
goetz@6458 28 #include "interpreter/interpreter.hpp"
goetz@6458 29 #include "nativeInst_ppc.hpp"
goetz@6458 30 #include "oops/instanceOop.hpp"
goetz@6458 31 #include "oops/method.hpp"
goetz@6458 32 #include "oops/objArrayKlass.hpp"
goetz@6458 33 #include "oops/oop.inline.hpp"
goetz@6458 34 #include "prims/methodHandles.hpp"
goetz@6458 35 #include "runtime/frame.inline.hpp"
goetz@6458 36 #include "runtime/handles.inline.hpp"
goetz@6458 37 #include "runtime/sharedRuntime.hpp"
goetz@6458 38 #include "runtime/stubCodeGenerator.hpp"
goetz@6458 39 #include "runtime/stubRoutines.hpp"
goetz@6458 40 #include "utilities/top.hpp"
goetz@6512 41 #include "runtime/thread.inline.hpp"
goetz@6458 42
goetz@6458 43 #define __ _masm->
goetz@6458 44
goetz@6458 45 #ifdef PRODUCT
goetz@6458 46 #define BLOCK_COMMENT(str) // nothing
goetz@6458 47 #else
goetz@6458 48 #define BLOCK_COMMENT(str) __ block_comment(str)
goetz@6458 49 #endif
goetz@6458 50
goetz@6458 51 class StubGenerator: public StubCodeGenerator {
goetz@6458 52 private:
goetz@6458 53
goetz@6458 54 // Call stubs are used to call Java from C
goetz@6458 55 //
goetz@6458 56 // Arguments:
goetz@6458 57 //
goetz@6458 58 // R3 - call wrapper address : address
goetz@6458 59 // R4 - result : intptr_t*
goetz@6458 60 // R5 - result type : BasicType
goetz@6458 61 // R6 - method : Method
goetz@6458 62 // R7 - frame mgr entry point : address
goetz@6458 63 // R8 - parameter block : intptr_t*
goetz@6458 64 // R9 - parameter count in words : int
goetz@6458 65 // R10 - thread : Thread*
goetz@6458 66 //
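// Illustrative sketch only (not part of the original file): a function-pointer
// type mirroring the register assignments listed above. The typedef name and
// parameter names are hypothetical; callers obtain the real entry point via
// StubRoutines::call_stub().
typedef void (*call_stub_sketch_t)(address   call_wrapper,    // R3
                                   intptr_t* result,          // R4
                                   BasicType result_type,     // R5
                                   Method*   method,          // R6
                                   address   entry_point,     // R7
                                   intptr_t* parameters,      // R8
                                   int       parameter_words, // R9
                                   Thread*   thread);         // R10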
goetz@6458 67 address generate_call_stub(address& return_address) {
goetz@6458 68 // Set up a new C frame, copy the Java arguments, call the frame manager or
goetz@6458 69 // native_entry, and process the result.
goetz@6458 70
goetz@6458 71 StubCodeMark mark(this, "StubRoutines", "call_stub");
goetz@6458 72
goetz@6511 73 address start = __ function_entry();
goetz@6458 74
goetz@6458 75 // some sanity checks
goetz@6511 76 assert((sizeof(frame::abi_minframe) % 16) == 0, "unaligned");
goetz@6511 77 assert((sizeof(frame::abi_reg_args) % 16) == 0, "unaligned");
goetz@6458 78 assert((sizeof(frame::spill_nonvolatiles) % 16) == 0, "unaligned");
goetz@6458 79 assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
goetz@6458 80 assert((sizeof(frame::entry_frame_locals) % 16) == 0, "unaligned");
goetz@6458 81
goetz@6458 82 Register r_arg_call_wrapper_addr = R3;
goetz@6458 83 Register r_arg_result_addr = R4;
goetz@6458 84 Register r_arg_result_type = R5;
goetz@6458 85 Register r_arg_method = R6;
goetz@6458 86 Register r_arg_entry = R7;
goetz@6458 87 Register r_arg_thread = R10;
goetz@6458 88
goetz@6458 89 Register r_temp = R24;
goetz@6458 90 Register r_top_of_arguments_addr = R25;
goetz@6458 91 Register r_entryframe_fp = R26;
goetz@6458 92
goetz@6458 93 {
goetz@6458 94 // Stack on entry to call_stub:
goetz@6458 95 //
goetz@6458 96 // F1 [C_FRAME]
goetz@6458 97 // ...
goetz@6458 98
goetz@6458 99 Register r_arg_argument_addr = R8;
goetz@6458 100 Register r_arg_argument_count = R9;
goetz@6458 101 Register r_frame_alignment_in_bytes = R27;
goetz@6458 102 Register r_argument_addr = R28;
goetz@6458 103 Register r_argumentcopy_addr = R29;
goetz@6458 104 Register r_argument_size_in_bytes = R30;
goetz@6458 105 Register r_frame_size = R23;
goetz@6458 106
goetz@6458 107 Label arguments_copied;
goetz@6458 108
goetz@6458 109 // Save LR/CR to caller's C_FRAME.
goetz@6458 110 __ save_LR_CR(R0);
goetz@6458 111
goetz@6458 112 // Zero extend arg_argument_count.
goetz@6458 113 __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
goetz@6458 114
goetz@6458 115 // Save non-volatile GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
goetz@6458 116 __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
goetz@6458 117
goetz@6458 118 // Keep copy of our frame pointer (caller's SP).
goetz@6458 119 __ mr(r_entryframe_fp, R1_SP);
goetz@6458 120
goetz@6458 121 BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
goetz@6458 122 // Push ENTRY_FRAME including arguments:
goetz@6458 123 //
goetz@6458 124 // F0 [TOP_IJAVA_FRAME_ABI]
goetz@6458 125 // alignment (optional)
goetz@6458 126 // [outgoing Java arguments]
goetz@6458 127 // [ENTRY_FRAME_LOCALS]
goetz@6458 128 // F1 [C_FRAME]
goetz@6458 129 // ...
goetz@6458 130
goetz@6458 131 // calculate frame size
goetz@6458 132
goetz@6458 133 // unaligned size of arguments
goetz@6458 134 __ sldi(r_argument_size_in_bytes,
goetz@6458 135 r_arg_argument_count, Interpreter::logStackElementSize);
goetz@6458 136 // arguments alignment (max 1 slot)
goetz@6458 137 // FIXME: use round_to() here
goetz@6458 138 __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
goetz@6458 139 __ sldi(r_frame_alignment_in_bytes,
goetz@6495 140 r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
goetz@6458 141
goetz@6458 142 // size = unaligned size of arguments + top abi's size
goetz@6458 143 __ addi(r_frame_size, r_argument_size_in_bytes,
goetz@6458 144 frame::top_ijava_frame_abi_size);
goetz@6458 145 // size += arguments alignment
goetz@6458 146 __ add(r_frame_size,
goetz@6495 147 r_frame_size, r_frame_alignment_in_bytes);
goetz@6458 148 // size += size of call_stub locals
goetz@6458 149 __ addi(r_frame_size,
goetz@6458 150 r_frame_size, frame::entry_frame_locals_size);
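// In C terms, the frame size computed above is (illustrative only; both ABI
// and locals sizes are asserted 16-byte aligned at the top of this stub):
//   frame_size = align_up(argument_count * 8, 16)
//              + frame::top_ijava_frame_abi_size
//              + frame::entry_frame_locals_size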
goetz@6458 151
goetz@6458 152 // push ENTRY_FRAME
goetz@6458 153 __ push_frame(r_frame_size, r_temp);
goetz@6458 154
goetz@6458 155 // initialize call_stub locals (step 1)
goetz@6458 156 __ std(r_arg_call_wrapper_addr,
goetz@6458 157 _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
goetz@6458 158 __ std(r_arg_result_addr,
goetz@6458 159 _entry_frame_locals_neg(result_address), r_entryframe_fp);
goetz@6458 160 __ std(r_arg_result_type,
goetz@6458 161 _entry_frame_locals_neg(result_type), r_entryframe_fp);
goetz@6458 162 // we will save arguments_tos_address later
goetz@6458 163
goetz@6458 164
goetz@6458 165 BLOCK_COMMENT("Copy Java arguments");
goetz@6458 166 // copy Java arguments
goetz@6458 167
goetz@6458 168 // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
goetz@6458 169 // FIXME: why not simply use SP+frame::top_ijava_frame_size?
goetz@6458 170 __ addi(r_top_of_arguments_addr,
goetz@6458 171 R1_SP, frame::top_ijava_frame_abi_size);
goetz@6458 172 __ add(r_top_of_arguments_addr,
goetz@6495 173 r_top_of_arguments_addr, r_frame_alignment_in_bytes);
goetz@6458 174
goetz@6458 175 // any arguments to copy?
goetz@6458 176 __ cmpdi(CCR0, r_arg_argument_count, 0);
goetz@6458 177 __ beq(CCR0, arguments_copied);
goetz@6458 178
goetz@6458 179 // prepare loop and copy arguments in reverse order
goetz@6458 180 {
goetz@6458 181 // init CTR with arg_argument_count
goetz@6458 182 __ mtctr(r_arg_argument_count);
goetz@6458 183
goetz@6458 184 // Let r_argumentcopy_addr point to the last outgoing Java argument.
goetz@6458 185 __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
goetz@6458 186
goetz@6458 187 // Let r_argument_addr point to the last incoming Java argument.
goetz@6458 188 __ add(r_argument_addr,
goetz@6458 189 r_arg_argument_addr, r_argument_size_in_bytes);
goetz@6458 190 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
goetz@6458 191
goetz@6458 192 // now loop while CTR > 0 and copy arguments
goetz@6458 193 {
goetz@6458 194 Label next_argument;
goetz@6458 195 __ bind(next_argument);
goetz@6458 196
goetz@6458 197 __ ld(r_temp, 0, r_argument_addr);
goetz@6458 198 // argument_addr--;
goetz@6458 199 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
goetz@6458 200 __ std(r_temp, 0, r_argumentcopy_addr);
goetz@6458 201 // argumentcopy_addr++;
goetz@6458 202 __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
goetz@6458 203
goetz@6458 204 __ bdnz(next_argument);
goetz@6458 205 }
goetz@6458 206 }
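// Equivalent C sketch of the copy loop above (illustrative only; names are
// hypothetical): the incoming arguments are walked backwards while the
// outgoing slots are filled forwards.
//   intptr_t* src = arg_block + parameter_words - 1; // last incoming argument
//   intptr_t* dst = top_of_arguments;                // outgoing slot for that argument
//   for (int i = 0; i < parameter_words; i++) { *dst++ = *src--; }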
goetz@6458 207
goetz@6458 208 // Arguments copied, continue.
goetz@6458 209 __ bind(arguments_copied);
goetz@6458 210 }
goetz@6458 211
goetz@6458 212 {
goetz@6458 213 BLOCK_COMMENT("Call frame manager or native entry.");
goetz@6458 214 // Call frame manager or native entry.
goetz@7222 215 Register r_new_arg_entry = R14;
goetz@6458 216 assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
goetz@6458 217 r_arg_method, r_arg_thread);
goetz@6458 218
goetz@6458 219 __ mr(r_new_arg_entry, r_arg_entry);
goetz@6458 220
goetz@6458 221 // Register state on entry to frame manager / native entry:
goetz@6458 222 //
goetz@6495 223 // tos - intptr_t* sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
goetz@6458 224 // R19_method - Method
goetz@6458 225 // R16_thread - JavaThread*
goetz@6458 226
goetz@6495 227 // Tos must point to last argument - element_size.
goetz@6512 228 #ifdef CC_INTERP
goetz@6495 229 const Register tos = R17_tos;
goetz@6512 230 #else
goetz@6512 231 const Register tos = R15_esp;
goetz@6512 232 #endif
goetz@6495 233 __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
goetz@6458 234
goetz@6458 235 // initialize call_stub locals (step 2)
goetz@6495 236 // now save tos as arguments_tos_address
goetz@6495 237 __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
goetz@6458 238
goetz@6458 239 // load argument registers for call
goetz@6458 240 __ mr(R19_method, r_arg_method);
goetz@6458 241 __ mr(R16_thread, r_arg_thread);
goetz@6495 242 assert(tos != r_arg_method, "trashed r_arg_method");
goetz@6495 243 assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
goetz@6458 244
goetz@6458 245 // Set R15_prev_state to 0 for simplifying checks in callee.
goetz@6512 246 #ifdef CC_INTERP
goetz@6458 247 __ li(R15_prev_state, 0);
goetz@6512 248 #else
goetz@6512 249 __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);
goetz@6512 250 #endif
goetz@6458 251 // Stack on entry to frame manager / native entry:
goetz@6458 252 //
goetz@6458 253 // F0 [TOP_IJAVA_FRAME_ABI]
goetz@6458 254 // alignment (optional)
goetz@6458 255 // [outgoing Java arguments]
goetz@6458 256 // [ENTRY_FRAME_LOCALS]
goetz@6458 257 // F1 [C_FRAME]
goetz@6458 258 // ...
goetz@6458 259 //
goetz@6458 260
goetz@6458 261 // global toc register
goetz@6458 262 __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
goetz@6458 263
goetz@6458 264 // Load narrow oop base.
goetz@6458 265 __ reinit_heapbase(R30, R11_scratch1);
goetz@6458 266
goetz@6458 267 // Remember the senderSP so the interpreter can pop c2i arguments off the stack
goetz@6458 268 // when called via a c2i adapter.
goetz@6458 269
goetz@6458 270 // Pass initial_caller_sp to framemanager.
goetz@6458 271 __ mr(R21_tmp1, R1_SP);
goetz@6458 272
goetz@6458 273 // Do a light-weight C-call here, r_new_arg_entry holds the address
goetz@6458 274 // of the interpreter entry point (frame manager or native entry)
goetz@6458 275 // and save runtime-value of LR in return_address.
goetz@6495 276 assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
goetz@6458 277 "trashed r_new_arg_entry");
goetz@6458 278 return_address = __ call_stub(r_new_arg_entry);
goetz@6458 279 }
goetz@6458 280
goetz@6458 281 {
goetz@6458 282 BLOCK_COMMENT("Returned from frame manager or native entry.");
goetz@6458 283 // Returned from frame manager or native entry.
goetz@6458 284 // Now pop frame, process result, and return to caller.
goetz@6458 285
goetz@6458 286 // Stack on exit from frame manager / native entry:
goetz@6458 287 //
goetz@6458 288 // F0 [ABI]
goetz@6458 289 // ...
goetz@6458 290 // [ENTRY_FRAME_LOCALS]
goetz@6458 291 // F1 [C_FRAME]
goetz@6458 292 // ...
goetz@6458 293 //
goetz@6458 294 // Just pop the topmost frame ...
goetz@6458 295 //
goetz@6458 296
goetz@6458 297 Label ret_is_object;
goetz@6458 298 Label ret_is_long;
goetz@6458 299 Label ret_is_float;
goetz@6458 300 Label ret_is_double;
goetz@6458 301
goetz@6458 302 Register r_entryframe_fp = R30;
goetz@6458 303 Register r_lr = R7_ARG5;
goetz@6458 304 Register r_cr = R8_ARG6;
goetz@6458 305
goetz@6458 306 // Reload some volatile registers which we've spilled before the call
goetz@6458 307 // to frame manager / native entry.
goetz@6458 308 // Access all locals via frame pointer, because we know nothing about
goetz@6458 309 // the topmost frame's size.
goetz@6458 310 __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
goetz@6458 311 assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
goetz@6458 312 __ ld(r_arg_result_addr,
goetz@6458 313 _entry_frame_locals_neg(result_address), r_entryframe_fp);
goetz@6458 314 __ ld(r_arg_result_type,
goetz@6458 315 _entry_frame_locals_neg(result_type), r_entryframe_fp);
goetz@6458 316 __ ld(r_cr, _abi(cr), r_entryframe_fp);
goetz@6458 317 __ ld(r_lr, _abi(lr), r_entryframe_fp);
goetz@6458 318
goetz@6458 319 // pop frame and restore non-volatiles, LR and CR
goetz@6458 320 __ mr(R1_SP, r_entryframe_fp);
goetz@6458 321 __ mtcr(r_cr);
goetz@6458 322 __ mtlr(r_lr);
goetz@6458 323
goetz@6458 324 // Store result depending on type. Everything that is not
goetz@6458 325 // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
goetz@6458 326 __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
goetz@6458 327 __ cmpwi(CCR1, r_arg_result_type, T_LONG);
goetz@6495 328 __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
goetz@6495 329 __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
goetz@6458 330
goetz@6458 331 // restore non-volatile registers
goetz@6458 332 __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
goetz@6458 333
goetz@6458 334
goetz@6458 335 // Stack on exit from call_stub:
goetz@6458 336 //
goetz@6458 337 // 0 [C_FRAME]
goetz@6458 338 // ...
goetz@6458 339 //
goetz@6458 340 // no call_stub frames left.
goetz@6458 341
goetz@6458 342 // All non-volatiles have been restored at this point!!
goetz@6458 343 assert(R3_RET == R3, "R3_RET should be R3");
goetz@6458 344
goetz@6458 345 __ beq(CCR0, ret_is_object);
goetz@6458 346 __ beq(CCR1, ret_is_long);
goetz@6495 347 __ beq(CCR5, ret_is_float);
goetz@6495 348 __ beq(CCR6, ret_is_double);
goetz@6458 349
goetz@6458 350 // default:
goetz@6458 351 __ stw(R3_RET, 0, r_arg_result_addr);
goetz@6458 352 __ blr(); // return to caller
goetz@6458 353
goetz@6458 354 // case T_OBJECT:
goetz@6458 355 __ bind(ret_is_object);
goetz@6458 356 __ std(R3_RET, 0, r_arg_result_addr);
goetz@6458 357 __ blr(); // return to caller
goetz@6458 358
goetz@6458 359 // case T_LONG:
goetz@6458 360 __ bind(ret_is_long);
goetz@6458 361 __ std(R3_RET, 0, r_arg_result_addr);
goetz@6458 362 __ blr(); // return to caller
goetz@6458 363
goetz@6458 364 // case T_FLOAT:
goetz@6458 365 __ bind(ret_is_float);
goetz@6458 366 __ stfs(F1_RET, 0, r_arg_result_addr);
goetz@6458 367 __ blr(); // return to caller
goetz@6458 368
goetz@6458 369 // case T_DOUBLE:
goetz@6458 370 __ bind(ret_is_double);
goetz@6458 371 __ stfd(F1_RET, 0, r_arg_result_addr);
goetz@6458 372 __ blr(); // return to caller
goetz@6458 373 }
goetz@6458 374
goetz@6458 375 return start;
goetz@6458 376 }
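// Illustrative sketch (hypothetical helper, neither generated nor called): the
// C equivalent of the result handling at the end of the call stub above.
// Everything that is not T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is stored as int.
static void store_call_stub_result_sketch(intptr_t* result_addr, BasicType type,
                                          intptr_t gpr_ret, double fpr_ret) {
  switch (type) {
    case T_OBJECT:
    case T_LONG:   *result_addr = gpr_ret;                   break; // 64-bit store (std)
    case T_FLOAT:  *(jfloat*)result_addr = (jfloat)fpr_ret;  break; // stfs
    case T_DOUBLE: *(jdouble*)result_addr = fpr_ret;         break; // stfd
    default:       *(jint*)result_addr = (jint)gpr_ret;      break; // treated as T_INT (stw)
  }
}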
goetz@6458 377
goetz@6458 378 // Return point for a Java call if there's an exception thrown in
goetz@6458 379 // Java code. The exception is caught and transformed into a
goetz@6458 380 // pending exception stored in JavaThread that can be tested from
goetz@6458 381 // within the VM.
goetz@6458 382 //
goetz@6458 383 address generate_catch_exception() {
goetz@6458 384 StubCodeMark mark(this, "StubRoutines", "catch_exception");
goetz@6458 385
goetz@6458 386 address start = __ pc();
goetz@6458 387
goetz@6458 388 // Registers alive
goetz@6458 389 //
goetz@6458 390 // R16_thread
goetz@6458 391 // R3_ARG1 - address of pending exception
goetz@6458 392 // R4_ARG2 - return address in call stub
goetz@6458 393
goetz@6458 394 const Register exception_file = R21_tmp1;
goetz@6458 395 const Register exception_line = R22_tmp2;
goetz@6458 396
goetz@6458 397 __ load_const(exception_file, (void*)__FILE__);
goetz@6458 398 __ load_const(exception_line, (void*)__LINE__);
goetz@6458 399
goetz@6458 400 __ std(R3_ARG1, thread_(pending_exception));
goetz@6458 401 // store into `char *'
goetz@6458 402 __ std(exception_file, thread_(exception_file));
goetz@6458 403 // store into `int'
goetz@6458 404 __ stw(exception_line, thread_(exception_line));
goetz@6458 405
goetz@6458 406 // complete return to VM
goetz@6458 407 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
goetz@6458 408
goetz@6458 409 __ mtlr(R4_ARG2);
goetz@6458 410 // continue in call stub
goetz@6458 411 __ blr();
goetz@6458 412
goetz@6458 413 return start;
goetz@6458 414 }
goetz@6458 415
goetz@6458 416 // Continuation point for runtime calls returning with a pending
goetz@6458 417 // exception. The pending exception check happened in the runtime
goetz@6458 418 // or native call stub. The pending exception in Thread is
goetz@6458 419 // converted into a Java-level exception.
goetz@6458 420 //
goetz@6458 421 address generate_forward_exception() {
goetz@6458 422 StubCodeMark mark(this, "StubRoutines", "forward_exception");
goetz@6458 423 address start = __ pc();
goetz@6458 424
goetz@6458 425 #if !defined(PRODUCT)
goetz@6458 426 if (VerifyOops) {
goetz@6458 427 // Get pending exception oop.
goetz@6458 428 __ ld(R3_ARG1,
goetz@6458 429 in_bytes(Thread::pending_exception_offset()),
goetz@6458 430 R16_thread);
goetz@6458 431 // Make sure that this code is only executed if there is a pending exception.
goetz@6458 432 {
goetz@6458 433 Label L;
goetz@6458 434 __ cmpdi(CCR0, R3_ARG1, 0);
goetz@6458 435 __ bne(CCR0, L);
goetz@6458 436 __ stop("StubRoutines::forward exception: no pending exception (1)");
goetz@6458 437 __ bind(L);
goetz@6458 438 }
goetz@6458 439 __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
goetz@6458 440 }
goetz@6458 441 #endif
goetz@6458 442
goetz@6458 443 // Save LR/CR and copy exception pc (LR) into R4_ARG2.
goetz@6458 444 __ save_LR_CR(R4_ARG2);
goetz@6511 445 __ push_frame_reg_args(0, R0);
goetz@6458 446 // Find exception handler.
goetz@6458 447 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
goetz@6458 448 SharedRuntime::exception_handler_for_return_address),
goetz@6458 449 R16_thread,
goetz@6458 450 R4_ARG2);
goetz@6458 451 // Copy handler's address.
goetz@6458 452 __ mtctr(R3_RET);
goetz@6458 453 __ pop_frame();
goetz@6458 454 __ restore_LR_CR(R0);
goetz@6458 455
goetz@6458 456 // Set up the arguments for the exception handler:
goetz@6458 457 // - R3_ARG1: exception oop
goetz@6458 458 // - R4_ARG2: exception pc.
goetz@6458 459
goetz@6458 460 // Load pending exception oop.
goetz@6458 461 __ ld(R3_ARG1,
goetz@6458 462 in_bytes(Thread::pending_exception_offset()),
goetz@6458 463 R16_thread);
goetz@6458 464
goetz@6458 465 // The exception pc is the return address in the caller.
goetz@6458 466 // Must load it into R4_ARG2.
goetz@6458 467 __ mflr(R4_ARG2);
goetz@6458 468
goetz@6458 469 #ifdef ASSERT
goetz@6458 470 // Make sure exception is set.
goetz@6458 471 {
goetz@6458 472 Label L;
goetz@6458 473 __ cmpdi(CCR0, R3_ARG1, 0);
goetz@6458 474 __ bne(CCR0, L);
goetz@6458 475 __ stop("StubRoutines::forward exception: no pending exception (2)");
goetz@6458 476 __ bind(L);
goetz@6458 477 }
goetz@6458 478 #endif
goetz@6458 479
goetz@6458 480 // Clear the pending exception.
goetz@6458 481 __ li(R0, 0);
goetz@6458 482 __ std(R0,
goetz@6458 483 in_bytes(Thread::pending_exception_offset()),
goetz@6458 484 R16_thread);
goetz@6458 485 // Jump to exception handler.
goetz@6458 486 __ bctr();
goetz@6458 487
goetz@6458 488 return start;
goetz@6458 489 }
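// Illustrative summary of the stub above (not code that is emitted):
//   handler  = SharedRuntime::exception_handler_for_return_address(thread, return_pc);
//   R3_ARG1  = thread->pending_exception();   // exception oop
//   thread->pending_exception() = NULL;       // cleared via std of 0
//   R4_ARG2  = return_pc;                     // exception pc
//   jump to handler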
goetz@6458 490
goetz@6458 491 #undef __
goetz@6458 492 #define __ masm->
goetz@6458 493 // Continuation point for throwing of implicit exceptions that are
goetz@6458 494 // not handled in the current activation. Fabricates an exception
goetz@6458 495 // oop and initiates normal exception dispatching in this
goetz@6458 496 // frame. Only callee-saved registers are preserved (through the
goetz@6458 497 // normal register window / RegisterMap handling). If the compiler
goetz@6458 498 // needs all registers to be preserved between the fault point and
goetz@6458 499 // the exception handler then it must assume responsibility for that
goetz@6458 500 // in AbstractCompiler::continuation_for_implicit_null_exception or
goetz@6458 501 // continuation_for_implicit_division_by_zero_exception. All other
goetz@6458 502 // implicit exceptions (e.g., NullPointerException or
goetz@6458 503 // AbstractMethodError on entry) are either at call sites or
goetz@6458 504 // otherwise assume that stack unwinding will be initiated, so
goetz@6458 505 // caller saved registers were assumed volatile in the compiler.
goetz@6458 506 //
goetz@6458 507 // Note that we generate only this stub into a RuntimeStub, because
goetz@6458 508 // it needs to be properly traversed and ignored during GC, so we
goetz@6458 509 // change the meaning of the "__" macro within this method.
goetz@6458 510 //
goetz@6458 511 // Note: the routine set_pc_not_at_call_for_caller in
goetz@6458 512 // SharedRuntime.cpp requires that this code be generated into a
goetz@6458 513 // RuntimeStub.
goetz@6458 514 address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
goetz@6458 515 Register arg1 = noreg, Register arg2 = noreg) {
goetz@6458 516 CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
goetz@6458 517 MacroAssembler* masm = new MacroAssembler(&code);
goetz@6458 518
goetz@6458 519 OopMapSet* oop_maps = new OopMapSet();
goetz@6511 520 int frame_size_in_bytes = frame::abi_reg_args_size;
goetz@6458 521 OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
goetz@6458 522
goetz@6458 523 StubCodeMark mark(this, "StubRoutines", "throw_exception");
goetz@6458 524
goetz@6458 525 address start = __ pc();
goetz@6458 526
goetz@6458 527 __ save_LR_CR(R11_scratch1);
goetz@6458 528
goetz@6458 529 // Push a frame.
goetz@6511 530 __ push_frame_reg_args(0, R11_scratch1);
goetz@6458 531
goetz@6458 532 address frame_complete_pc = __ pc();
goetz@6458 533
goetz@6458 534 if (restore_saved_exception_pc) {
goetz@6458 535 __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
goetz@6458 536 }
goetz@6458 537
goetz@6458 538 // Note that we always have a runtime stub frame on the top of
goetz@6458 539 // stack by this point. Remember the offset of the instruction
goetz@6458 540 // whose address will be moved to R11_scratch1.
goetz@6458 541 address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
goetz@6458 542
goetz@6458 543 __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
goetz@6458 544
goetz@6458 545 __ mr(R3_ARG1, R16_thread);
goetz@6458 546 if (arg1 != noreg) {
goetz@6458 547 __ mr(R4_ARG2, arg1);
goetz@6458 548 }
goetz@6458 549 if (arg2 != noreg) {
goetz@6458 550 __ mr(R5_ARG3, arg2);
goetz@6458 551 }
goetz@6511 552 #if defined(ABI_ELFv2)
goetz@6511 553 __ call_c(runtime_entry, relocInfo::none);
goetz@6511 554 #else
goetz@6511 555 __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
goetz@6511 556 #endif
goetz@6458 557
goetz@6458 558 // Set an oopmap for the call site.
goetz@6458 559 oop_maps->add_gc_map((int)(gc_map_pc - start), map);
goetz@6458 560
goetz@6458 561 __ reset_last_Java_frame();
goetz@6458 562
goetz@6458 563 #ifdef ASSERT
goetz@6458 564 // Make sure that this code is only executed if there is a pending
goetz@6458 565 // exception.
goetz@6458 566 {
goetz@6458 567 Label L;
goetz@6458 568 __ ld(R0,
goetz@6458 569 in_bytes(Thread::pending_exception_offset()),
goetz@6458 570 R16_thread);
goetz@6458 571 __ cmpdi(CCR0, R0, 0);
goetz@6458 572 __ bne(CCR0, L);
goetz@6458 573 __ stop("StubRoutines::throw_exception: no pending exception");
goetz@6458 574 __ bind(L);
goetz@6458 575 }
goetz@6458 576 #endif
goetz@6458 577
goetz@6458 578 // Pop frame.
goetz@6458 579 __ pop_frame();
goetz@6458 580
goetz@6458 581 __ restore_LR_CR(R11_scratch1);
goetz@6458 582
goetz@6458 583 __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
goetz@6458 584 __ mtctr(R11_scratch1);
goetz@6458 585 __ bctr();
goetz@6458 586
goetz@6458 587 // Create runtime stub with OopMap.
goetz@6458 588 RuntimeStub* stub =
goetz@6458 589 RuntimeStub::new_runtime_stub(name, &code,
goetz@6458 590 /*frame_complete=*/ (int)(frame_complete_pc - start),
goetz@6458 591 frame_size_in_bytes/wordSize,
goetz@6458 592 oop_maps,
goetz@6458 593 false);
goetz@6458 594 return stub->entry_point();
goetz@6458 595 }
goetz@6458 596 #undef __
goetz@6458 597 #define __ _masm->
goetz@6458 598
goetz@6458 599 // Generate G1 pre-write barrier for array.
goetz@6458 600 //
goetz@6458 601 // Input:
goetz@6458 602 // from - register containing src address (only needed for spilling)
goetz@6458 603 // to - register containing starting address
goetz@6458 604 // count - register containing element count
goetz@6458 605 // tmp - scratch register
goetz@6458 606 //
goetz@6458 607 // Kills:
goetz@6458 608 // nothing
goetz@6458 609 //
goetz@6458 610 void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
goetz@6458 611 BarrierSet* const bs = Universe::heap()->barrier_set();
goetz@6458 612 switch (bs->kind()) {
goetz@6458 613 case BarrierSet::G1SATBCT:
goetz@6458 614 case BarrierSet::G1SATBCTLogging:
goetz@6458 615 // With G1, don't generate the call if we statically know that the target is uninitialized.
goetz@6458 616 if (!dest_uninitialized) {
goetz@6458 617 const int spill_slots = 4 * wordSize;
goetz@6511 618 const int frame_size = frame::abi_reg_args_size + spill_slots;
goetz@6495 619 Label filtered;
goetz@6495 620
goetz@6495 621 // Is marking active?
goetz@6495 622 if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
goetz@6495 623 __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
goetz@6495 624 } else {
goetz@6495 625 guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
goetz@6495 626 __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
goetz@6495 627 }
goetz@6495 628 __ cmpdi(CCR0, Rtmp1, 0);
goetz@6495 629 __ beq(CCR0, filtered);
goetz@6458 630
goetz@6458 631 __ save_LR_CR(R0);
goetz@6511 632 __ push_frame_reg_args(spill_slots, R0);
goetz@6458 633 __ std(from, frame_size - 1 * wordSize, R1_SP);
goetz@6458 634 __ std(to, frame_size - 2 * wordSize, R1_SP);
goetz@6458 635 __ std(count, frame_size - 3 * wordSize, R1_SP);
goetz@6458 636
goetz@6458 637 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
goetz@6458 638
goetz@6458 639 __ ld(from, frame_size - 1 * wordSize, R1_SP);
goetz@6458 640 __ ld(to, frame_size - 2 * wordSize, R1_SP);
goetz@6458 641 __ ld(count, frame_size - 3 * wordSize, R1_SP);
goetz@6458 642 __ pop_frame();
goetz@6458 643 __ restore_LR_CR(R0);
goetz@6495 644
goetz@6495 645 __ bind(filtered);
goetz@6458 646 }
goetz@6458 647 break;
goetz@6458 648 case BarrierSet::CardTableModRef:
goetz@6458 649 case BarrierSet::CardTableExtension:
goetz@6458 650 case BarrierSet::ModRef:
goetz@6458 651 break;
goetz@6458 652 default:
goetz@6458 653 ShouldNotReachHere();
goetz@6458 654 }
goetz@6458 655 }
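// Conceptual sketch of the filter implemented above (hypothetical helper, not
// part of the generated stub): the SATB pre-barrier runtime call is made only
// when concurrent marking is active and the destination is not known to be
// uninitialized.
static bool needs_g1_pre_barrier_call_sketch(bool dest_uninitialized, int marking_active) {
  if (dest_uninitialized) return false; // nothing old to remember
  return marking_active != 0;           // per-thread SATB "active" flag, read as byte or word
}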
goetz@6458 656
goetz@6458 657 // Generate CMS/G1 post-write barrier for array.
goetz@6458 658 //
goetz@6458 659 // Input:
goetz@6458 660 // addr - register containing starting address
goetz@6458 661 // count - register containing element count
goetz@6458 662 // tmp - scratch register
goetz@6458 663 //
goetz@6458 664 // The input registers and R0 are overwritten.
goetz@6458 665 //
goetz@6495 666 void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) {
goetz@6458 667 BarrierSet* const bs = Universe::heap()->barrier_set();
goetz@6458 668
goetz@6458 669 switch (bs->kind()) {
goetz@6458 670 case BarrierSet::G1SATBCT:
goetz@6458 671 case BarrierSet::G1SATBCTLogging:
goetz@6458 672 {
goetz@6495 673 if (branchToEnd) {
goetz@6495 674 __ save_LR_CR(R0);
goetz@6495 675 // We need this frame only to spill LR.
goetz@6511 676 __ push_frame_reg_args(0, R0);
goetz@6495 677 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
goetz@6495 678 __ pop_frame();
goetz@6495 679 __ restore_LR_CR(R0);
goetz@6495 680 } else {
goetz@6495 681 // Tail call: fake call from stub caller by branching without linking.
goetz@6495 682 address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
goetz@6495 683 __ mr_if_needed(R3_ARG1, addr);
goetz@6495 684 __ mr_if_needed(R4_ARG2, count);
goetz@6495 685 __ load_const(R11, entry_point, R0);
goetz@6495 686 __ call_c_and_return_to_caller(R11);
goetz@6495 687 }
goetz@6458 688 }
goetz@6458 689 break;
goetz@6458 690 case BarrierSet::CardTableModRef:
goetz@6458 691 case BarrierSet::CardTableExtension:
goetz@6458 692 {
goetz@6458 693 Label Lskip_loop, Lstore_loop;
goetz@6458 694 if (UseConcMarkSweepGC) {
goetz@6458 695 // TODO PPC port: contribute optimization / requires shared changes
goetz@6458 696 __ release();
goetz@6458 697 }
goetz@6458 698
goetz@6458 699 CardTableModRefBS* const ct = (CardTableModRefBS*)bs;
goetz@6458 700 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
goetz@6458 701 assert_different_registers(addr, count, tmp);
goetz@6458 702
goetz@6458 703 __ sldi(count, count, LogBytesPerHeapOop);
goetz@6458 704 __ addi(count, count, -BytesPerHeapOop);
goetz@6458 705 __ add(count, addr, count);
goetz@6458 706 // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
goetz@6458 707 __ srdi(addr, addr, CardTableModRefBS::card_shift);
goetz@6458 708 __ srdi(count, count, CardTableModRefBS::card_shift);
goetz@6458 709 __ subf(count, addr, count);
goetz@6458 710 assert_different_registers(R0, addr, count, tmp);
goetz@6458 711 __ load_const(tmp, (address)ct->byte_map_base);
goetz@6458 712 __ addic_(count, count, 1);
goetz@6458 713 __ beq(CCR0, Lskip_loop);
goetz@6458 714 __ li(R0, 0);
goetz@6458 715 __ mtctr(count);
goetz@6458 716 // Byte store loop
goetz@6458 717 __ bind(Lstore_loop);
goetz@6458 718 __ stbx(R0, tmp, addr);
goetz@6458 719 __ addi(addr, addr, 1);
goetz@6458 720 __ bdnz(Lstore_loop);
goetz@6458 721 __ bind(Lskip_loop);
goetz@6495 722
goetz@6495 723 if (!branchToEnd) __ blr();
goetz@6458 724 }
goetz@6458 725 break;
goetz@6458 726 case BarrierSet::ModRef:
goetz@6495 727 if (!branchToEnd) __ blr();
goetz@6458 728 break;
goetz@6458 729 default:
goetz@6458 730 ShouldNotReachHere();
goetz@6458 731 }
goetz@6458 732 }
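// Illustrative C sketch of the card table loop above (hypothetical helper;
// byte_map_base and card_shift mirror CardTableModRefBS, and 0 is the value
// the stub stores to mark a card dirty).
static void dirty_cards_sketch(jbyte* byte_map_base, uintptr_t first_byte,
                               uintptr_t last_byte, int card_shift) {
  for (uintptr_t card = first_byte >> card_shift; card <= (last_byte >> card_shift); ++card) {
    byte_map_base[card] = 0; // dirty the card covering this address range
  }
}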
goetz@6458 733
goetz@6458 734 // Support for void zero_words_aligned8(HeapWord* to, size_t count)
goetz@6458 735 //
goetz@6458 736 // Arguments:
goetz@6458 737 // to: R3_ARG1, start address (must be 8-byte aligned)
goetz@6458 738 // count: R4_ARG2, number of 8-byte words to clear
goetz@6458 739 //
goetz@6458 740 // Destroys: R3_ARG1-R7_ARG5, CTR, CCR0, CCR1
goetz@6458 741 //
goetz@6458 742 address generate_zero_words_aligned8() {
goetz@6458 743 StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
goetz@6458 744
goetz@6458 745 // Implemented as in ClearArray.
goetz@6511 746 address start = __ function_entry();
goetz@6458 747
goetz@6458 748 Register base_ptr_reg = R3_ARG1; // tohw (needs to be 8b aligned)
goetz@6458 749 Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
goetz@6458 750 Register tmp1_reg = R5_ARG3;
goetz@6458 751 Register tmp2_reg = R6_ARG4;
goetz@6458 752 Register zero_reg = R7_ARG5;
goetz@6458 753
goetz@6458 754 // Procedure for large arrays (uses data cache block zero instruction).
goetz@6458 755 Label dwloop, fast, fastloop, restloop, lastdword, done;
goetz@6458 756 int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
goetz@6458 757 int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
goetz@6458 758
goetz@6458 759 // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
goetz@6458 760 __ dcbtst(base_ptr_reg); // Indicate write access to first cache line ...
goetz@6458 761 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if number of dwords is even.
goetz@6458 762 __ srdi_(tmp1_reg, cnt_dwords_reg, 1); // number of double dwords
goetz@6458 763 __ load_const_optimized(zero_reg, 0L); // Use as zero register.
goetz@6458 764
goetz@6458 765 __ cmpdi(CCR1, tmp2_reg, 0); // cnt_dwords even?
goetz@6458 766 __ beq(CCR0, lastdword); // size <= 1
goetz@6458 767 __ mtctr(tmp1_reg); // Speculatively preload counter for rest loop (>0).
goetz@6458 768 __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
goetz@6458 769 __ neg(tmp1_reg, base_ptr_reg); // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
goetz@6458 770
goetz@6458 771 __ blt(CCR0, restloop); // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
goetz@6458 772 __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
goetz@6458 773
goetz@6458 774 __ beq(CCR0, fast); // already 128byte aligned
goetz@6458 775 __ mtctr(tmp1_reg); // Set ctr to hit 128byte boundary (0<ctr<cnt).
goetz@6458 776 __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
goetz@6458 777
goetz@6458 778 // Clear in first cache line dword-by-dword if not already 128byte aligned.
goetz@6458 779 __ bind(dwloop);
goetz@6458 780 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
goetz@6458 781 __ addi(base_ptr_reg, base_ptr_reg, 8);
goetz@6458 782 __ bdnz(dwloop);
goetz@6458 783
goetz@6458 784 // clear 128byte blocks
goetz@6458 785 __ bind(fast);
goetz@6458 786 __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
goetz@6458 787 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if rest even
goetz@6458 788
goetz@6458 789 __ mtctr(tmp1_reg); // load counter
goetz@6458 790 __ cmpdi(CCR1, tmp2_reg, 0); // rest even?
goetz@6458 791 __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
goetz@6458 792
goetz@6458 793 __ bind(fastloop);
goetz@6458 794 __ dcbz(base_ptr_reg); // Clear 128byte aligned block.
goetz@6458 795 __ addi(base_ptr_reg, base_ptr_reg, cl_size);
goetz@6458 796 __ bdnz(fastloop);
goetz@6458 797
goetz@6458 798 //__ dcbtst(base_ptr_reg); // Indicate write access to last cache line.
goetz@6458 799 __ beq(CCR0, lastdword); // rest<=1
goetz@6458 800 __ mtctr(tmp1_reg); // load counter
goetz@6458 801
goetz@6458 802 // Clear rest.
goetz@6458 803 __ bind(restloop);
goetz@6458 804 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
goetz@6458 805 __ std(zero_reg, 8, base_ptr_reg); // Clear 8byte aligned block.
goetz@6458 806 __ addi(base_ptr_reg, base_ptr_reg, 16);
goetz@6458 807 __ bdnz(restloop);
goetz@6458 808
goetz@6458 809 __ bind(lastdword);
goetz@6458 810 __ beq(CCR1, done);
goetz@6458 811 __ std(zero_reg, 0, base_ptr_reg);
goetz@6458 812 __ bind(done);
goetz@6458 813 __ blr(); // return
goetz@6458 814
goetz@6458 815 return start;
goetz@6458 816 }
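// Conceptual sketch of the clearing strategy above (hypothetical helper; it
// omits the size thresholds and uses plain stores where the stub uses dcbz).
static void zero_dwords_sketch(julong* to, size_t cnt_dwords, size_t cl_dwords) {
  size_t cl_bytes = cl_dwords * 8;
  // Head: dword-by-dword up to the next cache-line boundary.
  while (cnt_dwords > 0 && ((uintptr_t)to % cl_bytes) != 0) { *to++ = 0; --cnt_dwords; }
  // Body: whole cache lines (the stub clears each with a single dcbz).
  while (cnt_dwords >= cl_dwords) {
    for (size_t i = 0; i < cl_dwords; i++) to[i] = 0;
    to += cl_dwords; cnt_dwords -= cl_dwords;
  }
  // Tail: remaining dwords.
  while (cnt_dwords > 0) { *to++ = 0; --cnt_dwords; }
}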
goetz@6458 817
goetz@6458 818 // The following routine generates a subroutine to throw an asynchronous
goetz@6458 819 // UnknownError when an unsafe access gets a fault that could not be
goetz@6458 820 // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.)
goetz@6458 821 //
goetz@6458 822 address generate_handler_for_unsafe_access() {
goetz@6458 823 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
goetz@6511 824 address start = __ function_entry();
goetz@6458 825 __ unimplemented("StubRoutines::handler_for_unsafe_access", 93);
goetz@6458 826 return start;
goetz@6458 827 }
goetz@6458 828
goetz@6458 829 #if !defined(PRODUCT)
goetz@6458 830 // Wrapper which calls oopDesc::is_oop_or_null()
goetz@6458 831 // Only called by MacroAssembler::verify_oop
goetz@6458 832 static void verify_oop_helper(const char* message, oop o) {
goetz@6458 833 if (!o->is_oop_or_null()) {
goetz@6458 834 fatal(message);
goetz@6458 835 }
goetz@6458 836 ++ StubRoutines::_verify_oop_count;
goetz@6458 837 }
goetz@6458 838 #endif
goetz@6458 839
goetz@6458 840 // Return address of code to be called from code generated by
goetz@6458 841 // MacroAssembler::verify_oop.
goetz@6458 842 //
goetz@6458 843 // Don't generate, rather use C++ code.
goetz@6458 844 address generate_verify_oop() {
goetz@6458 845 StubCodeMark mark(this, "StubRoutines", "verify_oop");
goetz@6458 846
goetz@6458 847 // this is actually a `FunctionDescriptor*'.
goetz@6458 848 address start = 0;
goetz@6458 849
goetz@6458 850 #if !defined(PRODUCT)
goetz@6458 851 start = CAST_FROM_FN_PTR(address, verify_oop_helper);
goetz@6458 852 #endif
goetz@6458 853
goetz@6458 854 return start;
goetz@6458 855 }
goetz@6458 856
goetz@6458 857 // Fairer handling of safepoints for native methods.
goetz@6458 858 //
goetz@6458 859 // Generate code which reads from the polling page. This special handling is needed as the
goetz@6458 860 // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
goetz@6458 861 // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
goetz@6458 862 // to read from the safepoint polling page.
goetz@6458 863 address generate_load_from_poll() {
goetz@6458 864 StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
goetz@6511 865 address start = __ function_entry();
goetz@6458 866 __ unimplemented("StubRoutines::verify_oop", 95); // TODO PPC port
goetz@6458 867 return start;
goetz@6458 868 }
goetz@6458 869
goetz@6458 870 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
goetz@6458 871 //
goetz@6458 872 // The code is implemented (ported from SPARC) as we believe it benefits JVM98; however,
goetz@6458 873 // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
goetz@6458 874 //
goetz@6495 875 // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
goetz@6458 876 // for turning on loop predication optimization, and hence the behavior of "array range check"
goetz@6458 877 // and "loop invariant check" could be influenced, which potentially boosted JVM98.
goetz@6458 878 //
goetz@6495 879 // Generate stub for disjoint array fill (byte, short or int elements). If "aligned" is true, the
goetz@6495 880 // "to" address is assumed to be heapword aligned.
goetz@6458 881 //
goetz@6458 882 // Arguments for generated stub:
goetz@6495 883 // to: R3_ARG1
goetz@6495 884 // value: R4_ARG2
goetz@6495 885 // count: R5_ARG3 treated as signed
goetz@6458 886 //
goetz@6458 887 address generate_fill(BasicType t, bool aligned, const char* name) {
goetz@6458 888 StubCodeMark mark(this, "StubRoutines", name);
goetz@6511 889 address start = __ function_entry();
goetz@6458 890
goetz@6495 891 const Register to = R3_ARG1; // destination array address
goetz@6495 892 const Register value = R4_ARG2; // fill value
goetz@6495 893 const Register count = R5_ARG3; // elements count
goetz@6495 894 const Register temp = R6_ARG4; // temp register
goetz@6458 895
goetz@6495 896 //assert_clean_int(count, O3); // Make sure 'count' is clean int.
goetz@6458 897
goetz@6458 898 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
goetz@6458 899 Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
goetz@6458 900
goetz@6458 901 int shift = -1;
goetz@6458 902 switch (t) {
goetz@6458 903 case T_BYTE:
goetz@6458 904 shift = 2;
goetz@6495 905 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
goetz@6458 906 __ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
goetz@6495 907 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
goetz@6458 908 __ blt(CCR0, L_fill_elements);
goetz@6458 909 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
goetz@6458 910 break;
goetz@6458 911 case T_SHORT:
goetz@6458 912 shift = 1;
goetz@6495 913 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
goetz@6458 914 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
goetz@6495 915 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
goetz@6458 916 __ blt(CCR0, L_fill_elements);
goetz@6458 917 break;
goetz@6458 918 case T_INT:
goetz@6458 919 shift = 0;
goetz@6495 920 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
goetz@6458 921 __ blt(CCR0, L_fill_4_bytes);
goetz@6458 922 break;
goetz@6458 923 default: ShouldNotReachHere();
goetz@6458 924 }
goetz@6458 925
goetz@6458 926 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
goetz@6495 927 // Align source address at 4 bytes address boundary.
goetz@6458 928 if (t == T_BYTE) {
goetz@6495 929 // One byte misalignment happens only for byte arrays.
goetz@6458 930 __ andi_(temp, to, 1);
goetz@6458 931 __ beq(CCR0, L_skip_align1);
goetz@6458 932 __ stb(value, 0, to);
goetz@6458 933 __ addi(to, to, 1);
goetz@6458 934 __ addi(count, count, -1);
goetz@6458 935 __ bind(L_skip_align1);
goetz@6458 936 }
goetz@6458 937 // Two bytes misalignment happens only for byte and short (char) arrays.
goetz@6458 938 __ andi_(temp, to, 2);
goetz@6458 939 __ beq(CCR0, L_skip_align2);
goetz@6458 940 __ sth(value, 0, to);
goetz@6458 941 __ addi(to, to, 2);
goetz@6458 942 __ addi(count, count, -(1 << (shift - 1)));
goetz@6458 943 __ bind(L_skip_align2);
goetz@6458 944 }
goetz@6458 945
goetz@6458 946 if (!aligned) {
goetz@6458 947 // Align to 8 bytes, we know we are 4 byte aligned to start.
goetz@6458 948 __ andi_(temp, to, 7);
goetz@6458 949 __ beq(CCR0, L_fill_32_bytes);
goetz@6458 950 __ stw(value, 0, to);
goetz@6458 951 __ addi(to, to, 4);
goetz@6458 952 __ addi(count, count, -(1 << shift));
goetz@6458 953 __ bind(L_fill_32_bytes);
goetz@6458 954 }
goetz@6458 955
goetz@6495 956 __ li(temp, 8<<shift); // Prepare for 32 byte loop.
goetz@6495 957 // Clone bytes int->long as above.
goetz@6495 958 __ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
goetz@6458 959
goetz@6458 960 Label L_check_fill_8_bytes;
goetz@6495 961 // Fill 32-byte chunks.
goetz@6458 962 __ subf_(count, temp, count);
goetz@6458 963 __ blt(CCR0, L_check_fill_8_bytes);
goetz@6458 964
goetz@6458 965 Label L_fill_32_bytes_loop;
goetz@6458 966 __ align(32);
goetz@6458 967 __ bind(L_fill_32_bytes_loop);
goetz@6458 968
goetz@6458 969 __ std(value, 0, to);
goetz@6458 970 __ std(value, 8, to);
goetz@6495 971 __ subf_(count, temp, count); // Update count.
goetz@6458 972 __ std(value, 16, to);
goetz@6458 973 __ std(value, 24, to);
goetz@6458 974
goetz@6458 975 __ addi(to, to, 32);
goetz@6458 976 __ bge(CCR0, L_fill_32_bytes_loop);
goetz@6458 977
goetz@6458 978 __ bind(L_check_fill_8_bytes);
goetz@6458 979 __ add_(count, temp, count);
goetz@6458 980 __ beq(CCR0, L_exit);
goetz@6458 981 __ addic_(count, count, -(2 << shift));
goetz@6458 982 __ blt(CCR0, L_fill_4_bytes);
goetz@6458 983
goetz@6458 984 //
goetz@6458 985 // Length is too short, just fill 8 bytes at a time.
goetz@6458 986 //
goetz@6458 987 Label L_fill_8_bytes_loop;
goetz@6458 988 __ bind(L_fill_8_bytes_loop);
goetz@6458 989 __ std(value, 0, to);
goetz@6458 990 __ addic_(count, count, -(2 << shift));
goetz@6458 991 __ addi(to, to, 8);
goetz@6458 992 __ bge(CCR0, L_fill_8_bytes_loop);
goetz@6458 993
goetz@6495 994 // Fill trailing 4 bytes.
goetz@6458 995 __ bind(L_fill_4_bytes);
goetz@6458 996 __ andi_(temp, count, 1<<shift);
goetz@6458 997 __ beq(CCR0, L_fill_2_bytes);
goetz@6458 998
goetz@6458 999 __ stw(value, 0, to);
goetz@6458 1000 if (t == T_BYTE || t == T_SHORT) {
goetz@6458 1001 __ addi(to, to, 4);
goetz@6495 1002 // Fill trailing 2 bytes.
goetz@6458 1003 __ bind(L_fill_2_bytes);
goetz@6458 1004 __ andi_(temp, count, 1<<(shift-1));
goetz@6458 1005 __ beq(CCR0, L_fill_byte);
goetz@6458 1006 __ sth(value, 0, to);
goetz@6458 1007 if (t == T_BYTE) {
goetz@6458 1008 __ addi(to, to, 2);
goetz@6495 1009 // Fill trailing byte.
goetz@6458 1010 __ bind(L_fill_byte);
goetz@6458 1011 __ andi_(count, count, 1);
goetz@6458 1012 __ beq(CCR0, L_exit);
goetz@6458 1013 __ stb(value, 0, to);
goetz@6458 1014 } else {
goetz@6458 1015 __ bind(L_fill_byte);
goetz@6458 1016 }
goetz@6458 1017 } else {
goetz@6458 1018 __ bind(L_fill_2_bytes);
goetz@6458 1019 }
goetz@6458 1020 __ bind(L_exit);
goetz@6458 1021 __ blr();
goetz@6458 1022
goetz@6495 1023 // Handle fills of less than 8 bytes. Int is handled elsewhere.
goetz@6458 1024 if (t == T_BYTE) {
goetz@6458 1025 __ bind(L_fill_elements);
goetz@6458 1026 Label L_fill_2, L_fill_4;
goetz@6458 1027 __ andi_(temp, count, 1);
goetz@6458 1028 __ beq(CCR0, L_fill_2);
goetz@6458 1029 __ stb(value, 0, to);
goetz@6458 1030 __ addi(to, to, 1);
goetz@6458 1031 __ bind(L_fill_2);
goetz@6458 1032 __ andi_(temp, count, 2);
goetz@6458 1033 __ beq(CCR0, L_fill_4);
goetz@6458 1034 __ stb(value, 0, to);
goetz@6458 1035 __ stb(value, 1, to);
goetz@6458 1036 __ addi(to, to, 2);
goetz@6458 1037 __ bind(L_fill_4);
goetz@6458 1038 __ andi_(temp, count, 4);
goetz@6458 1039 __ beq(CCR0, L_exit);
goetz@6458 1040 __ stb(value, 0, to);
goetz@6458 1041 __ stb(value, 1, to);
goetz@6458 1042 __ stb(value, 2, to);
goetz@6458 1043 __ stb(value, 3, to);
goetz@6458 1044 __ blr();
goetz@6458 1045 }
goetz@6458 1046
goetz@6458 1047 if (t == T_SHORT) {
goetz@6458 1048 Label L_fill_2;
goetz@6458 1049 __ bind(L_fill_elements);
goetz@6458 1050 __ andi_(temp, count, 1);
goetz@6458 1051 __ beq(CCR0, L_fill_2);
goetz@6458 1052 __ sth(value, 0, to);
goetz@6458 1053 __ addi(to, to, 2);
goetz@6458 1054 __ bind(L_fill_2);
goetz@6458 1055 __ andi_(temp, count, 2);
goetz@6458 1056 __ beq(CCR0, L_exit);
goetz@6458 1057 __ sth(value, 0, to);
goetz@6458 1058 __ sth(value, 2, to);
goetz@6458 1059 __ blr();
goetz@6458 1060 }
goetz@6458 1061 return start;
goetz@6458 1062 }
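// Illustrative sketch of the value replication performed with rldimi above
// (hypothetical helper): the 8/16/32-bit fill value is cloned into a 64-bit
// pattern so that the main loops can use 8-byte stores.
static julong replicate_fill_value_sketch(julong value, BasicType t) {
  if (t == T_BYTE) { value &= 0xff;   value |= value << 8;  } // 8  -> 16 bit
  if (t != T_INT)  { value &= 0xffff; value |= value << 16; } // 16 -> 32 bit
  value &= 0xffffffff; value |= value << 32;                  // 32 -> 64 bit
  return value;
}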
goetz@6458 1063
goetz@6458 1064
goetz@6495 1065 // Generate overlap test for array copy stubs.
goetz@6458 1066 //
goetz@6458 1067 // Input:
goetz@6458 1068 // R3_ARG1 - from
goetz@6458 1069 // R4_ARG2 - to
goetz@6458 1070 // R5_ARG3 - element count
goetz@6458 1071 //
goetz@6458 1072 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
goetz@6458 1073 Register tmp1 = R6_ARG4;
goetz@6458 1074 Register tmp2 = R7_ARG5;
goetz@6458 1075
goetz@6458 1076 Label l_overlap;
goetz@6458 1077 #ifdef ASSERT
goetz@6458 1078 __ srdi_(tmp2, R5_ARG3, 31);
goetz@6458 1079 __ asm_assert_eq("missing zero extend", 0xAFFE);
goetz@6458 1080 #endif
goetz@6458 1081
goetz@6458 1082 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
goetz@6458 1083 __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
goetz@6458 1084 __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
goetz@6458 1085 __ cmpld(CCR1, tmp1, tmp2);
goetz@6458 1086 __ crand(/*CCR0 lt*/0, /*CCR1 lt*/4+0, /*CCR0 lt*/0);
goetz@6458 1087 __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
goetz@6458 1088
goetz@6458 1089 // need to copy forwards
goetz@6458 1090 if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
goetz@6458 1091 __ b(no_overlap_target);
goetz@6458 1092 } else {
goetz@6458 1093 __ load_const(tmp1, no_overlap_target, tmp2);
goetz@6458 1094 __ mtctr(tmp1);
goetz@6458 1095 __ bctr();
goetz@6458 1096 }
goetz@6458 1097
goetz@6458 1098 __ bind(l_overlap);
goetz@6458 1099 // need to copy backwards
goetz@6458 1100 }
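// C sketch of the overlap test above (hypothetical helper): a backward copy is
// required only if the source starts below the destination and the distance
// between them is smaller than the copy size in bytes.
static bool needs_backward_copy_sketch(address from, address to, size_t count, int log2_elem_size) {
  size_t byte_count = count << log2_elem_size;
  return from < to && (size_t)(to - from) < byte_count;
}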
goetz@6458 1101
goetz@6458 1102 // The guideline in the implementations of generate_disjoint_xxx_copy
goetz@6458 1103 // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
goetz@6458 1104 // single instructions, but to avoid alignment interrupts (see subsequent
goetz@6458 1105 // comment). Furthermore, we try to minimize misaligned accesses, even
goetz@6458 1106 // when they cause no alignment interrupt.
goetz@6458 1107 //
goetz@6458 1108 // In Big-Endian mode, the PowerPC architecture requires implementations to
goetz@6458 1109 // handle automatically misaligned integer halfword and word accesses,
goetz@6458 1110 // word-aligned integer doubleword accesses, and word-aligned floating-point
goetz@6458 1111 // accesses. Other accesses may or may not generate an Alignment interrupt
goetz@6458 1112 // depending on the implementation.
goetz@6458 1113 // Alignment interrupt handling may require on the order of hundreds of cycles,
goetz@6458 1114 // so every effort should be made to avoid misaligned memory values.
goetz@6458 1115 //
goetz@6458 1116 //
goetz@6458 1117 // Generate stub for disjoint byte copy. If "aligned" is true, the
goetz@6458 1118 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 1119 //
goetz@6458 1120 // Arguments for generated stub:
goetz@6458 1121 // from: R3_ARG1
goetz@6458 1122 // to: R4_ARG2
goetz@6458 1123 // count: R5_ARG3 treated as signed
goetz@6458 1124 //
goetz@6458 1125 address generate_disjoint_byte_copy(bool aligned, const char * name) {
goetz@6458 1126 StubCodeMark mark(this, "StubRoutines", name);
goetz@6511 1127 address start = __ function_entry();
goetz@6458 1128
goetz@6458 1129 Register tmp1 = R6_ARG4;
goetz@6458 1130 Register tmp2 = R7_ARG5;
goetz@6458 1131 Register tmp3 = R8_ARG6;
goetz@6458 1132 Register tmp4 = R9_ARG7;
goetz@6458 1133
mhorie@9680 1134 VectorSRegister tmp_vsr1 = VSR1;
mhorie@9680 1135 VectorSRegister tmp_vsr2 = VSR2;
mhorie@9680 1136
mhorie@9680 1137 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
mhorie@9680 1138
goetz@6458 1139 // Don't try anything fancy if arrays don't have many elements.
goetz@6458 1140 __ li(tmp3, 0);
goetz@6458 1141 __ cmpwi(CCR0, R5_ARG3, 17);
goetz@6458 1142 __ ble(CCR0, l_6); // copy 4 at a time
goetz@6458 1143
goetz@6458 1144 if (!aligned) {
goetz@6458 1145 __ xorr(tmp1, R3_ARG1, R4_ARG2);
goetz@6458 1146 __ andi_(tmp1, tmp1, 3);
goetz@6458 1147 __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
goetz@6458 1148
goetz@6458 1149 // Copy elements if necessary to align to 4 bytes.
goetz@6458 1150 __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
goetz@6458 1151 __ andi_(tmp1, tmp1, 3);
goetz@6458 1152 __ beq(CCR0, l_2);
goetz@6458 1153
goetz@6458 1154 __ subf(R5_ARG3, tmp1, R5_ARG3);
goetz@6458 1155 __ bind(l_9);
goetz@6458 1156 __ lbz(tmp2, 0, R3_ARG1);
goetz@6458 1157 __ addic_(tmp1, tmp1, -1);
goetz@6458 1158 __ stb(tmp2, 0, R4_ARG2);
goetz@6458 1159 __ addi(R3_ARG1, R3_ARG1, 1);
goetz@6458 1160 __ addi(R4_ARG2, R4_ARG2, 1);
goetz@6458 1161 __ bne(CCR0, l_9);
goetz@6458 1162
goetz@6458 1163 __ bind(l_2);
goetz@6458 1164 }
goetz@6458 1165
goetz@6458 1166 // copy 8 elements at a time
goetz@6458 1167 __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
goetz@6458 1168 __ andi_(tmp1, tmp2, 7);
goetz@6458 1169 __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
goetz@6458 1170
goetz@6458 1171 // copy a 2-element word if necessary to align to 8 bytes
goetz@6458 1172 __ andi_(R0, R3_ARG1, 7);
goetz@6458 1173 __ beq(CCR0, l_7);
goetz@6458 1174
goetz@6458 1175 __ lwzx(tmp2, R3_ARG1, tmp3);
goetz@6458 1176 __ addi(R5_ARG3, R5_ARG3, -4);
goetz@6458 1177 __ stwx(tmp2, R4_ARG2, tmp3);
goetz@6458 1178 { // FasterArrayCopy
goetz@6458 1179 __ addi(R3_ARG1, R3_ARG1, 4);
goetz@6458 1180 __ addi(R4_ARG2, R4_ARG2, 4);
goetz@6458 1181 }
goetz@6458 1182 __ bind(l_7);
goetz@6458 1183
goetz@6458 1184 { // FasterArrayCopy
goetz@6458 1185 __ cmpwi(CCR0, R5_ARG3, 31);
goetz@6458 1186 __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
goetz@6458 1187
goetz@6458 1188 __ srdi(tmp1, R5_ARG3, 5);
goetz@6458 1189 __ andi_(R5_ARG3, R5_ARG3, 31);
goetz@6458 1190 __ mtctr(tmp1);
goetz@6458 1191
mhorie@9680 1192 if (!VM_Version::has_vsx()) {
mhorie@9680 1193
goetz@6458 1194 __ bind(l_8);
goetz@6458 1195 // Use unrolled version for mass copying (copy 32 elements a time)
goetz@6458 1196 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6458 1197 // Therefore, the following sequence is made for the good of both.
goetz@6458 1198 __ ld(tmp1, 0, R3_ARG1);
goetz@6458 1199 __ ld(tmp2, 8, R3_ARG1);
goetz@6458 1200 __ ld(tmp3, 16, R3_ARG1);
goetz@6458 1201 __ ld(tmp4, 24, R3_ARG1);
goetz@6458 1202 __ std(tmp1, 0, R4_ARG2);
goetz@6458 1203 __ std(tmp2, 8, R4_ARG2);
goetz@6458 1204 __ std(tmp3, 16, R4_ARG2);
goetz@6458 1205 __ std(tmp4, 24, R4_ARG2);
goetz@6458 1206 __ addi(R3_ARG1, R3_ARG1, 32);
goetz@6458 1207 __ addi(R4_ARG2, R4_ARG2, 32);
goetz@6458 1208 __ bdnz(l_8);
mhorie@9680 1209
mhorie@9680 1210 } else { // Processor supports VSX, so use it to mass copy.
mhorie@9680 1211
mhorie@9680 1212 // Prefetch the data into the L2 cache.
mhorie@9680 1213 __ dcbt(R3_ARG1, 0);
mhorie@9680 1214
mhorie@9680 1215 // If supported set DSCR pre-fetch to deepest.
mhorie@9680 1216 if (VM_Version::has_mfdscr()) {
mhorie@9680 1217 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
mhorie@9680 1218 __ mtdscr(tmp2);
mhorie@9680 1219 }
mhorie@9680 1220
mhorie@9680 1221 __ li(tmp1, 16);
mhorie@9680 1222
mhorie@9680 1223 // Backbranch target aligned to 32 bytes. 16-byte alignment is not
mhorie@9680 1224 // sufficient, as the loop contains < 8 instructions that should fit
mhorie@9680 1225 // inside a single 32-byte i-cache sector.
mhorie@9680 1226 __ align(32);
mhorie@9680 1227
mhorie@9680 1228 __ bind(l_10);
mhorie@9680 1229 // Use loop with VSX load/store instructions to
mhorie@9680 1230 // copy 32 elements a time.
gromero@9684 1231 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
gromero@9684 1232 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
mhorie@9680 1233 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
mhorie@9680 1234 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
mhorie@9680 1235 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
mhorie@9680 1236 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
mhorie@9680 1237 __ bdnz(l_10); // Dec CTR and loop if not zero.
mhorie@9680 1238
mhorie@9680 1239 // Restore DSCR pre-fetch value.
mhorie@9680 1240 if (VM_Version::has_mfdscr()) {
mhorie@9680 1241 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
mhorie@9680 1242 __ mtdscr(tmp2);
mhorie@9680 1243 }
mhorie@9680 1244
mhorie@9680 1245 } // VSX
mhorie@9680 1246 } // FasterArrayCopy
goetz@6458 1247
goetz@6458 1248 __ bind(l_6);
goetz@6458 1249
goetz@6458 1250 // copy 4 elements at a time
goetz@6458 1251 __ cmpwi(CCR0, R5_ARG3, 4);
goetz@6458 1252 __ blt(CCR0, l_1);
goetz@6458 1253 __ srdi(tmp1, R5_ARG3, 2);
goetz@6458 1254 __ mtctr(tmp1); // is > 0
goetz@6458 1255 __ andi_(R5_ARG3, R5_ARG3, 3);
goetz@6458 1256
goetz@6458 1257 { // FasterArrayCopy
goetz@6458 1258 __ addi(R3_ARG1, R3_ARG1, -4);
goetz@6458 1259 __ addi(R4_ARG2, R4_ARG2, -4);
goetz@6458 1260 __ bind(l_3);
goetz@6458 1261 __ lwzu(tmp2, 4, R3_ARG1);
goetz@6458 1262 __ stwu(tmp2, 4, R4_ARG2);
goetz@6458 1263 __ bdnz(l_3);
goetz@6458 1264 __ addi(R3_ARG1, R3_ARG1, 4);
goetz@6458 1265 __ addi(R4_ARG2, R4_ARG2, 4);
goetz@6458 1266 }
goetz@6458 1267
goetz@6458 1268 // do single element copy
goetz@6458 1269 __ bind(l_1);
goetz@6458 1270 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6458 1271 __ beq(CCR0, l_4);
goetz@6458 1272
goetz@6458 1273 { // FasterArrayCopy
goetz@6458 1274 __ mtctr(R5_ARG3);
goetz@6458 1275 __ addi(R3_ARG1, R3_ARG1, -1);
goetz@6458 1276 __ addi(R4_ARG2, R4_ARG2, -1);
goetz@6458 1277
goetz@6458 1278 __ bind(l_5);
goetz@6458 1279 __ lbzu(tmp2, 1, R3_ARG1);
goetz@6458 1280 __ stbu(tmp2, 1, R4_ARG2);
goetz@6458 1281 __ bdnz(l_5);
goetz@6458 1282 }
goetz@6458 1283
goetz@6458 1284 __ bind(l_4);
goetz@6458 1285 __ blr();
goetz@6458 1286
goetz@6458 1287 return start;
goetz@6458 1288 }
goetz@6458 1289
goetz@6458 1290 // Generate stub for conjoint byte copy. If "aligned" is true, the
goetz@6458 1291 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 1292 //
goetz@6458 1293 // Arguments for generated stub:
goetz@6458 1294 // from: R3_ARG1
goetz@6458 1295 // to: R4_ARG2
goetz@6458 1296 // count: R5_ARG3 treated as signed
goetz@6458 1297 //
goetz@6458 1298 address generate_conjoint_byte_copy(bool aligned, const char * name) {
goetz@6458 1299 StubCodeMark mark(this, "StubRoutines", name);
goetz@6511 1300 address start = __ function_entry();
goetz@6458 1301
goetz@6458 1302 Register tmp1 = R6_ARG4;
goetz@6458 1303 Register tmp2 = R7_ARG5;
goetz@6458 1304 Register tmp3 = R8_ARG6;
goetz@6458 1305
goetz@6511 1306 #if defined(ABI_ELFv2)
goetz@6511 1307 address nooverlap_target = aligned ?
goetz@6511 1308 StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
goetz@6511 1309 StubRoutines::jbyte_disjoint_arraycopy();
goetz@6511 1310 #else
goetz@6458 1311 address nooverlap_target = aligned ?
goetz@6458 1312 ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
goetz@6458 1313 ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
goetz@6511 1314 #endif
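    // Under the ELFv1 ABI a StubRoutines address is a function descriptor,
    // so the code address must be read from it; under ELFv2 the stub
    // address is already the code entry point.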
goetz@6458 1315
goetz@6458 1316 array_overlap_test(nooverlap_target, 0);
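    // (The second argument is log2 of the element size; 0 for jbyte.)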
goetz@6458 1317 // Do reverse copy. We assume the case of actual overlap is rare enough
goetz@6458 1318 // that we don't have to optimize it.
goetz@6458 1319 Label l_1, l_2;
goetz@6458 1320
goetz@6458 1321 __ b(l_2);
goetz@6458 1322 __ bind(l_1);
goetz@6458 1323 __ stbx(tmp1, R4_ARG2, R5_ARG3);
goetz@6458 1324 __ bind(l_2);
goetz@6458 1325 __ addic_(R5_ARG3, R5_ARG3, -1);
goetz@6458 1326 __ lbzx(tmp1, R3_ARG1, R5_ARG3);
goetz@6458 1327 __ bge(CCR0, l_1);
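    // In effect: while (--count >= 0) to[count] = from[count];
    // The final pass also loads the byte at from[-1] (its value is never
    // stored); for the Java heap arrays this stub copies, that address
    // still lies within the source object, so the access should be harmless.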
goetz@6458 1328
goetz@6458 1329 __ blr();
goetz@6458 1330
goetz@6458 1331 return start;
goetz@6458 1332 }
goetz@6458 1333
goetz@6458 1334 // Generate stub for disjoint short copy. If "aligned" is true, the
goetz@6458 1335 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 1336 //
goetz@6458 1337 // Arguments for generated stub:
goetz@6458 1338 // from: R3_ARG1
goetz@6458 1339 // to: R4_ARG2
goetz@6458 1340 // elm.count: R5_ARG3 treated as signed
goetz@6458 1341 //
goetz@6458 1342 // Strategy for aligned==true:
goetz@6458 1343 //
goetz@6458 1344 // If length <= 9:
goetz@6458 1345 // 1. copy 2 elements at a time (l_6)
goetz@6458 1346 // 2. copy last element if original element count was odd (l_1)
goetz@6458 1347 //
goetz@6458 1348 // If length > 9:
goetz@6458 1349 // 1. copy 16 elements at a time until fewer than 16 elements are left (l_7)
goetz@6458 1350 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
goetz@6458 1351 // 3. copy last element if one was left in step 2. (l_1)
goetz@6458 1352 //
goetz@6458 1353 //
goetz@6458 1354 // Strategy for aligned==false:
goetz@6458 1355 //
goetz@6458 1356 // If length <= 9: same as aligned==true case, but NOTE: load/stores
goetz@6458 1357 // can be unaligned (see comment below)
goetz@6458 1358 //
goetz@6458 1359 // If length > 9:
goetz@6458 1360 // 1. continue with step 6 if the alignment of from and to mod 4
goetz@6458 1361 // is different.
goetz@6458 1362 // 2. align from and to to 4 bytes by copying 1 element if necessary
goetz@6458 1363 // 3. at l_2, from and to are 4 byte aligned; continue with
goetz@6458 1364 // step 5 if they cannot be aligned to 8 bytes because their
goetz@6458 1365 // alignment mod 8 differs.
goetz@6458 1366 // 4. at this point we know that both from and to have the same
goetz@6458 1367 // alignment mod 8; now copy one element if necessary to get
goetz@6458 1368 // 8 byte alignment of from and to.
goetz@6458 1369 // 5. copy 16 elements at a time until fewer than 16 elements are
goetz@6458 1370 // left; depending on step 3, either all loads and stores are
goetz@6458 1371 // aligned, or all loads (or all stores) are unaligned.
goetz@6458 1372 // 6. copy 2 elements at a time until less than 2 elements are
goetz@6458 1373 // left (l_6); arriving here from step 1, there is a chance
goetz@6458 1374 // that all accesses are unaligned.
goetz@6458 1375 // 7. copy last element if one was left in step 6. (l_1)
goetz@6458 1376 //
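    // Example (illustrative): for aligned==false, count == 20, and
    // from % 8 == to % 8 == 2, step 2 copies one element (both pointers
    // now 4 mod 8, 19 elements left), step 4 copies one 4-byte word
    // (both pointers now 8-byte aligned, 17 elements left), and step 5
    // then runs with fully aligned accesses.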
goetz@6458 1377 // There are unaligned data accesses using integer load/store
goetz@6458 1378 // instructions in this stub. POWER allows such accesses.
goetz@6458 1379 //
goetz@6458 1380 // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
goetz@6458 1381 // Chapter 2: Effect of Operand Placement on Performance) unaligned
goetz@6458 1382 // integer load/stores have good performance. Only unaligned
goetz@6458 1383 // floating point load/stores can have poor performance.
goetz@6458 1384 //
goetz@6458 1385 // TODO:
goetz@6458 1386 //
goetz@6458 1387 // 1. check if aligning the backbranch target of loops is beneficial
goetz@6458 1388 //
goetz@6458 1389 address generate_disjoint_short_copy(bool aligned, const char * name) {
goetz@6458 1390 StubCodeMark mark(this, "StubRoutines", name);
goetz@6458 1391
goetz@6458 1392 Register tmp1 = R6_ARG4;
goetz@6458 1393 Register tmp2 = R7_ARG5;
goetz@6458 1394 Register tmp3 = R8_ARG6;
goetz@6458 1395 Register tmp4 = R9_ARG7;
goetz@6458 1396
gromero@9662 1397 VectorSRegister tmp_vsr1 = VSR1;
gromero@9662 1398 VectorSRegister tmp_vsr2 = VSR2;
gromero@9662 1399
goetz@6511 1400 address start = __ function_entry();
goetz@6458 1401
gromero@9662 1402 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
gromero@9662 1403
goetz@6458 1404 // don't try anything fancy if arrays don't have many elements
goetz@6458 1405 __ li(tmp3, 0);
goetz@6458 1406 __ cmpwi(CCR0, R5_ARG3, 9);
goetz@6458 1407 __ ble(CCR0, l_6); // copy 2 at a time
goetz@6458 1408
goetz@6458 1409 if (!aligned) {
goetz@6458 1410 __ xorr(tmp1, R3_ARG1, R4_ARG2);
goetz@6458 1411 __ andi_(tmp1, tmp1, 3);
goetz@6458 1412 __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
goetz@6458 1413
goetz@6458 1414 // At this point it is guaranteed that both from and to have the same alignment mod 4.
goetz@6458 1415
goetz@6458 1416 // Copy 1 element if necessary to align to 4 bytes.
goetz@6458 1417 __ andi_(tmp1, R3_ARG1, 3);
goetz@6458 1418 __ beq(CCR0, l_2);
goetz@6458 1419
goetz@6458 1420 __ lhz(tmp2, 0, R3_ARG1);
goetz@6458 1421 __ addi(R3_ARG1, R3_ARG1, 2);
goetz@6458 1422 __ sth(tmp2, 0, R4_ARG2);
goetz@6458 1423 __ addi(R4_ARG2, R4_ARG2, 2);
goetz@6458 1424 __ addi(R5_ARG3, R5_ARG3, -1);
goetz@6458 1425 __ bind(l_2);
goetz@6458 1426
goetz@6458 1427 // At this point both from and to are at least 4 byte aligned.
goetz@6458 1428
goetz@6458 1429 // Copy 4 elements at a time.
goetz@6458 1430 // Align to 8 bytes, but only if both from and to have the same alignment mod 8.
goetz@6458 1431 __ xorr(tmp2, R3_ARG1, R4_ARG2);
goetz@6458 1432 __ andi_(tmp1, tmp2, 7);
goetz@6458 1433 __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
goetz@6458 1434
goetz@6458 1435 // Copy a 2-element word if necessary to align to 8 bytes.
goetz@6458 1436 __ andi_(R0, R3_ARG1, 7);
goetz@6458 1437 __ beq(CCR0, l_7);
goetz@6458 1438
goetz@6458 1439 __ lwzx(tmp2, R3_ARG1, tmp3);
goetz@6458 1440 __ addi(R5_ARG3, R5_ARG3, -2);
goetz@6458 1441 __ stwx(tmp2, R4_ARG2, tmp3);
goetz@6458 1442 { // FasterArrayCopy
goetz@6458 1443 __ addi(R3_ARG1, R3_ARG1, 4);
goetz@6458 1444 __ addi(R4_ARG2, R4_ARG2, 4);
goetz@6458 1445 }
goetz@6458 1446 }
goetz@6458 1447
goetz@6458 1448 __ bind(l_7);
goetz@6458 1449
goetz@6458 1450 // Copy 4 elements at a time; either the loads or the stores can
goetz@6458 1451 // be unaligned if aligned == false.
goetz@6458 1452
goetz@6458 1453 { // FasterArrayCopy
goetz@6458 1454 __ cmpwi(CCR0, R5_ARG3, 15);
goetz@6458 1455 __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
goetz@6458 1456
goetz@6458 1457 __ srdi(tmp1, R5_ARG3, 4);
goetz@6458 1458 __ andi_(R5_ARG3, R5_ARG3, 15);
goetz@6458 1459 __ mtctr(tmp1);
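      // tmp1 = number of 16-element (32-byte) chunks; R5_ARG3 keeps the
      // remainder (count % 16) for the tail loops below.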
goetz@6458 1460
gromero@9662 1461 if (!VM_Version::has_vsx()) {
gromero@9662 1462
gromero@9662 1463 __ bind(l_8);
gromero@9662 1464 // Use unrolled version for mass copying (copy 16 elements at a time).
gromero@9662 1465 // Load feeding store gets zero latency on Power6, however not on Power5.
gromero@9662 1466 // Therefore, the following sequence is made for the good of both.
gromero@9662 1467 __ ld(tmp1, 0, R3_ARG1);
gromero@9662 1468 __ ld(tmp2, 8, R3_ARG1);
gromero@9662 1469 __ ld(tmp3, 16, R3_ARG1);
gromero@9662 1470 __ ld(tmp4, 24, R3_ARG1);
gromero@9662 1471 __ std(tmp1, 0, R4_ARG2);
gromero@9662 1472 __ std(tmp2, 8, R4_ARG2);
gromero@9662 1473 __ std(tmp3, 16, R4_ARG2);
gromero@9662 1474 __ std(tmp4, 24, R4_ARG2);
gromero@9662 1475 __ addi(R3_ARG1, R3_ARG1, 32);
gromero@9662 1476 __ addi(R4_ARG2, R4_ARG2, 32);
gromero@9662 1477 __ bdnz(l_8);
gromero@9662 1478
gromero@9662 1479 } else { // Processor supports VSX, so use it to mass copy.
gromero@9662 1480
gromero@9662 1481 // Prefetch src data into L2 cache.
gromero@9662 1482 __ dcbt(R3_ARG1, 0);
gromero@9662 1483
gromero@9662 1484 // If supported set DSCR pre-fetch to deepest.
gromero@9662 1485 if (VM_Version::has_mfdscr()) {
gromero@9662 1486 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
gromero@9662 1487 __ mtdscr(tmp2);
gromero@9662 1488 }
gromero@9662 1489 __ li(tmp1, 16);
gromero@9662 1490
gromero@9662 1491 // Backbranch target aligned to 32 bytes rather than 16 so that the
gromero@9662 1492 // loop, which contains < 8 instructions, fits entirely inside a
gromero@9662 1493 // single i-cache sector.
gromero@9662 1494 __ align(32);
gromero@9662 1495
gromero@9662 1496 __ bind(l_9);
gromero@9662 1497 // Use loop with VSX load/store instructions to
gromero@9662 1498 // copy 16 elements at a time.
gromero@9684 1499 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
gromero@9684 1500 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
gromero@9662 1501 __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
gromero@9662 1502 __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
gromero@9662 1503 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
gromero@9662 1504 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32.
gromero@9662 1505 __ bdnz(l_9); // Dec CTR and loop if not zero.
gromero@9662 1506
gromero@9662 1507 // Restore DSCR pre-fetch value.
gromero@9662 1508 if (VM_Version::has_mfdscr()) {
gromero@9662 1509 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
gromero@9662 1510 __ mtdscr(tmp2);
gromero@9662 1511 }
gromero@9662 1512
gromero@9662 1513 }
gromero@9662 1514 } // FasterArrayCopy
goetz@6458 1515 __ bind(l_6);
goetz@6458 1516
goetz@6458 1517 // copy 2 elements at a time
goetz@6458 1518 { // FasterArrayCopy
goetz@6458 1519 __ cmpwi(CCR0, R5_ARG3, 2);
goetz@6458 1520 __ blt(CCR0, l_1);
goetz@6458 1521 __ srdi(tmp1, R5_ARG3, 1);
goetz@6458 1522 __ andi_(R5_ARG3, R5_ARG3, 1);
goetz@6458 1523
goetz@6458 1524 __ addi(R3_ARG1, R3_ARG1, -4);
goetz@6458 1525 __ addi(R4_ARG2, R4_ARG2, -4);
goetz@6458 1526 __ mtctr(tmp1);
goetz@6458 1527
goetz@6458 1528 __ bind(l_3);
goetz@6458 1529 __ lwzu(tmp2, 4, R3_ARG1);
goetz@6458 1530 __ stwu(tmp2, 4, R4_ARG2);
goetz@6458 1531 __ bdnz(l_3);
goetz@6458 1532
goetz@6458 1533 __ addi(R3_ARG1, R3_ARG1, 4);
goetz@6458 1534 __ addi(R4_ARG2, R4_ARG2, 4);
goetz@6458 1535 }
goetz@6458 1536
goetz@6458 1537 // do single element copy
goetz@6458 1538 __ bind(l_1);
goetz@6458 1539 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6458 1540 __ beq(CCR0, l_4);
goetz@6458 1541
goetz@6458 1542 { // FasterArrayCopy
goetz@6458 1543 __ mtctr(R5_ARG3);
goetz@6458 1544 __ addi(R3_ARG1, R3_ARG1, -2);
goetz@6458 1545 __ addi(R4_ARG2, R4_ARG2, -2);
goetz@6458 1546
goetz@6458 1547 __ bind(l_5);
goetz@6458 1548 __ lhzu(tmp2, 2, R3_ARG1);
goetz@6458 1549 __ sthu(tmp2, 2, R4_ARG2);
goetz@6458 1550 __ bdnz(l_5);
goetz@6458 1551 }
goetz@6458 1552 __ bind(l_4);
goetz@6458 1553 __ blr();
goetz@6458 1554
goetz@6458 1555 return start;
goetz@6458 1556 }
goetz@6458 1557
goetz@6458 1558 // Generate stub for conjoint short copy. If "aligned" is true, the
goetz@6458 1559 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 1560 //
goetz@6458 1561 // Arguments for generated stub:
goetz@6458 1562 // from: R3_ARG1
goetz@6458 1563 // to: R4_ARG2
goetz@6458 1564 // count: R5_ARG3 treated as signed
goetz@6458 1565 //
goetz@6458 1566 address generate_conjoint_short_copy(bool aligned, const char * name) {
goetz@6458 1567 StubCodeMark mark(this, "StubRoutines", name);
goetz@6511 1568 address start = __ function_entry();
goetz@6458 1569
goetz@6458 1570 Register tmp1 = R6_ARG4;
goetz@6458 1571 Register tmp2 = R7_ARG5;
goetz@6458 1572 Register tmp3 = R8_ARG6;
goetz@6458 1573
goetz@6511 1574 #if defined(ABI_ELFv2)
goetz@6511 1575 address nooverlap_target = aligned ?
goetz@6511 1576 StubRoutines::arrayof_jshort_disjoint_arraycopy() :
goetz@6511 1577 StubRoutines::jshort_disjoint_arraycopy();
goetz@6511 1578 #else
goetz@6458 1579 address nooverlap_target = aligned ?
goetz@6458 1580 ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
goetz@6458 1581 ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
goetz@6511 1582 #endif
goetz@6458 1583
goetz@6458 1584 array_overlap_test(nooverlap_target, 1);
goetz@6458 1585
goetz@6458 1586 Label l_1, l_2;
goetz@6458 1587 __ sldi(tmp1, R5_ARG3, 1);
goetz@6458 1588 __ b(l_2);
goetz@6458 1589 __ bind(l_1);
goetz@6458 1590 __ sthx(tmp2, R4_ARG2, tmp1);
goetz@6458 1591 __ bind(l_2);
goetz@6458 1592 __ addic_(tmp1, tmp1, -2);
goetz@6458 1593 __ lhzx(tmp2, R3_ARG1, tmp1);
goetz@6458 1594 __ bge(CCR0, l_1);
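    // In effect: for (int i = count - 1; i >= 0; i--) to[i] = from[i];
    // with tmp1 holding the byte offset (2 * i) of the current element.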
goetz@6458 1595
goetz@6458 1596 __ blr();
goetz@6458 1597
goetz@6458 1598 return start;
goetz@6458 1599 }
goetz@6458 1600
goetz@6458 1601 // Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned"
goetz@6458 1602 // is true, the "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 1603 //
goetz@6458 1604 // Arguments:
goetz@6458 1605 // from: R3_ARG1
goetz@6458 1606 // to: R4_ARG2
goetz@6458 1607 // count: R5_ARG3 treated as signed
goetz@6458 1608 //
goetz@6458 1609 void generate_disjoint_int_copy_core(bool aligned) {
goetz@6458 1610 Register tmp1 = R6_ARG4;
goetz@6458 1611 Register tmp2 = R7_ARG5;
goetz@6458 1612 Register tmp3 = R8_ARG6;
goetz@6458 1613 Register tmp4 = R0;
goetz@6458 1614
mhorie@9680 1615 VectorSRegister tmp_vsr1 = VSR1;
mhorie@9680 1616 VectorSRegister tmp_vsr2 = VSR2;
mhorie@9680 1617
mhorie@9680 1618 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
mhorie@9680 1619
goetz@6458 1620 // for short arrays, just do single element copy
goetz@6458 1621 __ li(tmp3, 0);
goetz@6458 1622 __ cmpwi(CCR0, R5_ARG3, 5);
goetz@6458 1623 __ ble(CCR0, l_2);
goetz@6458 1624
goetz@6458 1625 if (!aligned) {
goetz@6458 1626 // check if arrays have same alignment mod 8.
goetz@6458 1627 __ xorr(tmp1, R3_ARG1, R4_ARG2);
goetz@6458 1628 __ andi_(R0, tmp1, 7);
goetz@6458 1629 // Not the same alignment, but ld and std just need to be 4 byte aligned.
goetz@6458 1630 __ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
goetz@6458 1631
goetz@6458 1632 // copy 1 element to align to and from on an 8 byte boundary
goetz@6458 1633 __ andi_(R0, R3_ARG1, 7);
goetz@6458 1634 __ beq(CCR0, l_4);
goetz@6458 1635
goetz@6458 1636 __ lwzx(tmp2, R3_ARG1, tmp3);
goetz@6458 1637 __ addi(R5_ARG3, R5_ARG3, -1);
goetz@6458 1638 __ stwx(tmp2, R4_ARG2, tmp3);
goetz@6458 1639 { // FasterArrayCopy
goetz@6458 1640 __ addi(R3_ARG1, R3_ARG1, 4);
goetz@6458 1641 __ addi(R4_ARG2, R4_ARG2, 4);
goetz@6458 1642 }
goetz@6458 1643 __ bind(l_4);
goetz@6458 1644 }
goetz@6458 1645
goetz@6458 1646 { // FasterArrayCopy
goetz@6458 1647 __ cmpwi(CCR0, R5_ARG3, 7);
goetz@6458 1648 __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
goetz@6458 1649
goetz@6458 1650 __ srdi(tmp1, R5_ARG3, 3);
goetz@6458 1651 __ andi_(R5_ARG3, R5_ARG3, 7);
goetz@6458 1652 __ mtctr(tmp1);
goetz@6458 1653
mhorie@9680 1654 if (!VM_Version::has_vsx()) {
mhorie@9680 1655
goetz@6458 1656 __ bind(l_6);
goetz@6458 1657 // Use unrolled version for mass copying (copy 8 elements at a time).
goetz@6458 1658 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6458 1659 // Therefore, the following sequence is made for the good of both.
goetz@6458 1660 __ ld(tmp1, 0, R3_ARG1);
goetz@6458 1661 __ ld(tmp2, 8, R3_ARG1);
goetz@6458 1662 __ ld(tmp3, 16, R3_ARG1);
goetz@6458 1663 __ ld(tmp4, 24, R3_ARG1);
goetz@6458 1664 __ std(tmp1, 0, R4_ARG2);
goetz@6458 1665 __ std(tmp2, 8, R4_ARG2);
goetz@6458 1666 __ std(tmp3, 16, R4_ARG2);
goetz@6458 1667 __ std(tmp4, 24, R4_ARG2);
goetz@6458 1668 __ addi(R3_ARG1, R3_ARG1, 32);
goetz@6458 1669 __ addi(R4_ARG2, R4_ARG2, 32);
goetz@6458 1670 __ bdnz(l_6);
mhorie@9680 1671
mhorie@9680 1672 } else { // Processor supports VSX, so use it to mass copy.
mhorie@9680 1673
mhorie@9680 1674 // Prefetch the data into the L2 cache.
mhorie@9680 1675 __ dcbt(R3_ARG1, 0);
mhorie@9680 1676
mhorie@9680 1677 // If supported set DSCR pre-fetch to deepest.
mhorie@9680 1678 if (VM_Version::has_mfdscr()) {
mhorie@9680 1679 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
mhorie@9680 1680 __ mtdscr(tmp2);
mhorie@9680 1681 }
mhorie@9680 1682
mhorie@9680 1683 __ li(tmp1, 16);
mhorie@9680 1684
mhorie@9680 1685 // Backbranch target aligned to 32 bytes rather than 16 so that the
mhorie@9680 1686 // loop, which contains < 8 instructions, fits entirely inside a
mhorie@9680 1687 // single i-cache sector.
mhorie@9680 1688 __ align(32);
mhorie@9680 1689
mhorie@9680 1690 __ bind(l_7);
mhorie@9680 1691 // Use loop with VSX load/store instructions to
mhorie@9680 1692 // copy 8 elements at a time.
gromero@9684 1693 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
gromero@9684 1694 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
mhorie@9680 1695 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
mhorie@9680 1696 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
mhorie@9680 1697 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
mhorie@9680 1698 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
mhorie@9680 1699 __ bdnz(l_7); // Dec CTR and loop if not zero.
mhorie@9680 1700
mhorie@9680 1701 // Restore DSCR pre-fetch value.
mhorie@9680 1702 if (VM_Version::has_mfdscr()) {
mhorie@9680 1703 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
mhorie@9680 1704 __ mtdscr(tmp2);
mhorie@9680 1705 }
mhorie@9680 1706
mhorie@9680 1707 } // VSX
mhorie@9680 1708 } // FasterArrayCopy
goetz@6458 1709
goetz@6458 1710 // copy 1 element at a time
goetz@6458 1711 __ bind(l_2);
goetz@6458 1712 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6458 1713 __ beq(CCR0, l_1);
goetz@6458 1714
goetz@6458 1715 { // FasterArrayCopy
goetz@6458 1716 __ mtctr(R5_ARG3);
goetz@6458 1717 __ addi(R3_ARG1, R3_ARG1, -4);
goetz@6458 1718 __ addi(R4_ARG2, R4_ARG2, -4);
goetz@6458 1719
goetz@6458 1720 __ bind(l_3);
goetz@6458 1721 __ lwzu(tmp2, 4, R3_ARG1);
goetz@6458 1722 __ stwu(tmp2, 4, R4_ARG2);
goetz@6458 1723 __ bdnz(l_3);
goetz@6458 1724 }
goetz@6458 1725
goetz@6458 1726 __ bind(l_1);
goetz@6458 1727 return;
goetz@6458 1728 }
goetz@6458 1729
goetz@6458 1730 // Generate stub for disjoint int copy. If "aligned" is true, the
goetz@6458 1731 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 1732 //
goetz@6458 1733 // Arguments for generated stub:
goetz@6458 1734 // from: R3_ARG1
goetz@6458 1735 // to: R4_ARG2
goetz@6458 1736 // count: R5_ARG3 treated as signed
goetz@6458 1737 //
goetz@6458 1738 address generate_disjoint_int_copy(bool aligned, const char * name) {
goetz@6458 1739 StubCodeMark mark(this, "StubRoutines", name);
goetz@6511 1740 address start = __ function_entry();
goetz@6458 1741 generate_disjoint_int_copy_core(aligned);
goetz@6458 1742 __ blr();
goetz@6458 1743 return start;
goetz@6458 1744 }
goetz@6458 1745
goetz@6458 1746 // Generate core code for conjoint int copy (and oop copy on
goetz@6458 1747 // 32-bit). If "aligned" is true, the "from" and "to" addresses
goetz@6458 1748 // are assumed to be heapword aligned.
goetz@6458 1749 //
goetz@6458 1750 // Arguments:
goetz@6458 1751 // from: R3_ARG1
goetz@6458 1752 // to: R4_ARG2
goetz@6458 1753 // count: R5_ARG3 treated as signed
goetz@6458 1754 //
goetz@6458 1755 void generate_conjoint_int_copy_core(bool aligned) {
goetz@6458 1756 // Do reverse copy. We assume the case of actual overlap is rare enough
goetz@6458 1757 // that we don't have to optimize it.
goetz@6458 1758
gromero@9684 1759 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
goetz@6458 1760
goetz@6458 1761 Register tmp1 = R6_ARG4;
goetz@6458 1762 Register tmp2 = R7_ARG5;
goetz@6458 1763 Register tmp3 = R8_ARG6;
goetz@6458 1764 Register tmp4 = R0;
goetz@6458 1765
gromero@9684 1766 VectorSRegister tmp_vsr1 = VSR1;
gromero@9684 1767 VectorSRegister tmp_vsr2 = VSR2;
gromero@9684 1768
goetz@6458 1769 { // FasterArrayCopy
goetz@6458 1770 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6458 1771 __ beq(CCR0, l_6);
goetz@6458 1772
goetz@6458 1773 __ sldi(R5_ARG3, R5_ARG3, 2);
goetz@6458 1774 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
goetz@6458 1775 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
goetz@6458 1776 __ srdi(R5_ARG3, R5_ARG3, 2);
goetz@6458 1777
gromero@9684 1778 if (!aligned) {
gromero@9684 1779 // check if arrays have same alignment mod 8.
gromero@9684 1780 __ xorr(tmp1, R3_ARG1, R4_ARG2);
gromero@9684 1781 __ andi_(R0, tmp1, 7);
gromero@9684 1782 // Not the same alignment, but ld and std just need to be 4 byte aligned.
gromero@9684 1783 __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
gromero@9684 1784
gromero@9684 1785 // copy 1 element to align to and from on an 8 byte boundary
gromero@9684 1786 __ andi_(R0, R3_ARG1, 7);
gromero@9684 1787 __ beq(CCR0, l_7);
gromero@9684 1788
gromero@9684 1789 __ addi(R3_ARG1, R3_ARG1, -4);
gromero@9684 1790 __ addi(R4_ARG2, R4_ARG2, -4);
gromero@9684 1791 __ addi(R5_ARG3, R5_ARG3, -1);
gromero@9684 1792 __ lwzx(tmp2, R3_ARG1);
gromero@9684 1793 __ stwx(tmp2, R4_ARG2);
gromero@9684 1794 __ bind(l_7);
gromero@9684 1795 }
gromero@9684 1796
goetz@6458 1797 __ cmpwi(CCR0, R5_ARG3, 7);
goetz@6458 1798 __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
goetz@6458 1799
goetz@6458 1800 __ srdi(tmp1, R5_ARG3, 3);
goetz@6458 1801 __ andi(R5_ARG3, R5_ARG3, 7);
goetz@6458 1802 __ mtctr(tmp1);
goetz@6458 1803
gromero@9684 1804 if (!VM_Version::has_vsx()) {
goetz@6458 1805 __ bind(l_4);
goetz@6458 1806 // Use unrolled version for mass copying (copy 8 elements at a time).
goetz@6458 1807 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6458 1808 // Therefore, the following sequence is made for the good of both.
goetz@6458 1809 __ addi(R3_ARG1, R3_ARG1, -32);
goetz@6458 1810 __ addi(R4_ARG2, R4_ARG2, -32);
goetz@6458 1811 __ ld(tmp4, 24, R3_ARG1);
goetz@6458 1812 __ ld(tmp3, 16, R3_ARG1);
goetz@6458 1813 __ ld(tmp2, 8, R3_ARG1);
goetz@6458 1814 __ ld(tmp1, 0, R3_ARG1);
goetz@6458 1815 __ std(tmp4, 24, R4_ARG2);
goetz@6458 1816 __ std(tmp3, 16, R4_ARG2);
goetz@6458 1817 __ std(tmp2, 8, R4_ARG2);
goetz@6458 1818 __ std(tmp1, 0, R4_ARG2);
goetz@6458 1819 __ bdnz(l_4);
gromero@9684 1820 } else { // Processor supports VSX, so use it to mass copy.
gromero@9684 1821 // Prefetch the data into the L2 cache.
gromero@9684 1822 __ dcbt(R3_ARG1, 0);
gromero@9684 1823
gromero@9684 1824 // If supported set DSCR pre-fetch to deepest.
gromero@9684 1825 if (VM_Version::has_mfdscr()) {
gromero@9684 1826 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
gromero@9684 1827 __ mtdscr(tmp2);
gromero@9684 1828 }
gromero@9684 1829
gromero@9684 1830 __ li(tmp1, 16);
gromero@9684 1831
gromero@9684 1832 // Backbranch target aligned to 32 bytes rather than 16 so that the
gromero@9684 1833 // loop, which contains < 8 instructions, fits entirely inside a
gromero@9684 1834 // single i-cache sector.
gromero@9684 1835 __ align(32);
gromero@9684 1836
gromero@9684 1837 __ bind(l_4);
gromero@9684 1838 // Use loop with VSX load/store instructions to
gromero@9684 1839 // copy 8 elements at a time.
gromero@9684 1840 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
gromero@9684 1841 __ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
gromero@9684 1842 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
gromero@9684 1843 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
gromero@9684 1844 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
gromero@9684 1845 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
gromero@9684 1846 __ bdnz(l_4);
gromero@9684 1847
gromero@9684 1848 // Restore DSCR pre-fetch value.
gromero@9684 1849 if (VM_Version::has_mfdscr()) {
gromero@9684 1850 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
gromero@9684 1851 __ mtdscr(tmp2);
gromero@9684 1852 }
gromero@9684 1853 }
goetz@6458 1854
goetz@6458 1855 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6458 1856 __ beq(CCR0, l_6);
goetz@6458 1857
goetz@6458 1858 __ bind(l_5);
goetz@6458 1859 __ mtctr(R5_ARG3);
goetz@6458 1860 __ bind(l_3);
goetz@6458 1861 __ lwz(R0, -4, R3_ARG1);
goetz@6458 1862 __ stw(R0, -4, R4_ARG2);
goetz@6458 1863 __ addi(R3_ARG1, R3_ARG1, -4);
goetz@6458 1864 __ addi(R4_ARG2, R4_ARG2, -4);
goetz@6458 1865 __ bdnz(l_3);
goetz@6458 1866
goetz@6458 1867 __ bind(l_6);
goetz@6458 1868 }
goetz@6458 1869 }
goetz@6458 1870
goetz@6458 1871 // Generate stub for conjoint int copy. If "aligned" is true, the
goetz@6458 1872 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 1873 //
goetz@6458 1874 // Arguments for generated stub:
goetz@6458 1875 // from: R3_ARG1
goetz@6458 1876 // to: R4_ARG2
goetz@6458 1877 // count: R5_ARG3 treated as signed
goetz@6458 1878 //
goetz@6458 1879 address generate_conjoint_int_copy(bool aligned, const char * name) {
goetz@6458 1880 StubCodeMark mark(this, "StubRoutines", name);
goetz@6511 1881 address start = __ function_entry();
goetz@6458 1882
goetz@6511 1883 #if defined(ABI_ELFv2)
goetz@6511 1884 address nooverlap_target = aligned ?
goetz@6511 1885 StubRoutines::arrayof_jint_disjoint_arraycopy() :
goetz@6511 1886 StubRoutines::jint_disjoint_arraycopy();
goetz@6511 1887 #else
goetz@6458 1888 address nooverlap_target = aligned ?
goetz@6458 1889 ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
goetz@6458 1890 ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
goetz@6511 1891 #endif
goetz@6458 1892
goetz@6458 1893 array_overlap_test(nooverlap_target, 2);
goetz@6458 1894
goetz@6458 1895 generate_conjoint_int_copy_core(aligned);
goetz@6458 1896
goetz@6458 1897 __ blr();
goetz@6458 1898
goetz@6458 1899 return start;
goetz@6458 1900 }
goetz@6458 1901
goetz@6458 1902 // Generate core code for disjoint long copy (and oop copy on
goetz@6458 1903 // 64-bit). If "aligned" is true, the "from" and "to" addresses
goetz@6458 1904 // are assumed to be heapword aligned.
goetz@6458 1905 //
goetz@6458 1906 // Arguments:
goetz@6458 1907 // from: R3_ARG1
goetz@6458 1908 // to: R4_ARG2
goetz@6458 1909 // count: R5_ARG3 treated as signed
goetz@6458 1910 //
goetz@6458 1911 void generate_disjoint_long_copy_core(bool aligned) {
goetz@6458 1912 Register tmp1 = R6_ARG4;
goetz@6458 1913 Register tmp2 = R7_ARG5;
goetz@6458 1914 Register tmp3 = R8_ARG6;
goetz@6458 1915 Register tmp4 = R0;
goetz@6458 1916
mhorie@9680 1917 Label l_1, l_2, l_3, l_4, l_5;
mhorie@9680 1918
mhorie@9680 1919 VectorSRegister tmp_vsr1 = VSR1;
mhorie@9680 1920 VectorSRegister tmp_vsr2 = VSR2;
goetz@6458 1921
goetz@6458 1922 { // FasterArrayCopy
goetz@6458 1923 __ cmpwi(CCR0, R5_ARG3, 3);
goetz@6458 1924 __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
goetz@6458 1925
goetz@6458 1926 __ srdi(tmp1, R5_ARG3, 2);
goetz@6458 1927 __ andi_(R5_ARG3, R5_ARG3, 3);
goetz@6458 1928 __ mtctr(tmp1);
goetz@6458 1929
mhorie@9680 1930 if (!VM_Version::has_vsx()) {
goetz@6458 1931 __ bind(l_4);
goetz@6458 1932 // Use unrolled version for mass copying (copy 4 elements at a time).
goetz@6458 1933 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6458 1934 // Therefore, the following sequence is made for the good of both.
goetz@6458 1935 __ ld(tmp1, 0, R3_ARG1);
goetz@6458 1936 __ ld(tmp2, 8, R3_ARG1);
goetz@6458 1937 __ ld(tmp3, 16, R3_ARG1);
goetz@6458 1938 __ ld(tmp4, 24, R3_ARG1);
goetz@6458 1939 __ std(tmp1, 0, R4_ARG2);
goetz@6458 1940 __ std(tmp2, 8, R4_ARG2);
goetz@6458 1941 __ std(tmp3, 16, R4_ARG2);
goetz@6458 1942 __ std(tmp4, 24, R4_ARG2);
goetz@6458 1943 __ addi(R3_ARG1, R3_ARG1, 32);
goetz@6458 1944 __ addi(R4_ARG2, R4_ARG2, 32);
goetz@6458 1945 __ bdnz(l_4);
mhorie@9680 1946
mhorie@9680 1947 } else { // Processor supports VSX, so use it to mass copy.
mhorie@9680 1948
mhorie@9680 1949 // Prefetch the data into the L2 cache.
mhorie@9680 1950 __ dcbt(R3_ARG1, 0);
mhorie@9680 1951
mhorie@9680 1952 // If supported set DSCR pre-fetch to deepest.
mhorie@9680 1953 if (VM_Version::has_mfdscr()) {
mhorie@9680 1954 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
mhorie@9680 1955 __ mtdscr(tmp2);
mhorie@9680 1956 }
mhorie@9680 1957
mhorie@9680 1958 __ li(tmp1, 16);
mhorie@9680 1959
mhorie@9680 1960 // Backbranch target aligned to 32 bytes rather than 16 so that the
mhorie@9680 1961 // loop, which contains < 8 instructions, fits entirely inside a
mhorie@9680 1962 // single i-cache sector.
mhorie@9680 1963 __ align(32);
mhorie@9680 1964
mhorie@9680 1965 __ bind(l_5);
mhorie@9680 1966 // Use loop with VSX load/store instructions to
mhorie@9680 1967 // copy 4 elements at a time.
gromero@9684 1968 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
gromero@9684 1969 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
mhorie@9680 1970 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
mhorie@9680 1971 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
mhorie@9680 1972 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
mhorie@9680 1973 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
mhorie@9680 1974 __ bdnz(l_5); // Dec CTR and loop if not zero.
mhorie@9680 1975
mhorie@9680 1976 // Restore DSCR pre-fetch value.
mhorie@9680 1977 if (VM_Version::has_mfdscr()) {
mhorie@9680 1978 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
mhorie@9680 1979 __ mtdscr(tmp2);
mhorie@9680 1980 }
mhorie@9680 1981
mhorie@9680 1982 } // VSX
mhorie@9680 1983 } // FasterArrayCopy
goetz@6458 1984
goetz@6458 1985 // copy 1 element at a time
goetz@6458 1986 __ bind(l_3);
goetz@6458 1987 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6458 1988 __ beq(CCR0, l_1);
goetz@6458 1989
goetz@6458 1990 { // FasterArrayCopy
goetz@6458 1991 __ mtctr(R5_ARG3);
goetz@6458 1992 __ addi(R3_ARG1, R3_ARG1, -8);
goetz@6458 1993 __ addi(R4_ARG2, R4_ARG2, -8);
goetz@6458 1994
goetz@6458 1995 __ bind(l_2);
goetz@6458 1996 __ ldu(R0, 8, R3_ARG1);
goetz@6458 1997 __ stdu(R0, 8, R4_ARG2);
goetz@6458 1998 __ bdnz(l_2);
goetz@6458 1999
goetz@6458 2000 }
goetz@6458 2001 __ bind(l_1);
goetz@6458 2002 }
goetz@6458 2003
goetz@6458 2004 // Generate stub for disjoint long copy. If "aligned" is true, the
goetz@6458 2005 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 2006 //
goetz@6458 2007 // Arguments for generated stub:
goetz@6458 2008 // from: R3_ARG1
goetz@6458 2009 // to: R4_ARG2
goetz@6458 2010 // count: R5_ARG3 treated as signed
goetz@6458 2011 //
goetz@6458 2012 address generate_disjoint_long_copy(bool aligned, const char * name) {
goetz@6458 2013 StubCodeMark mark(this, "StubRoutines", name);
goetz@6511 2014 address start = __ function_entry();
goetz@6458 2015 generate_disjoint_long_copy_core(aligned);
goetz@6458 2016 __ blr();
goetz@6458 2017
goetz@6458 2018 return start;
goetz@6458 2019 }
goetz@6458 2020
goetz@6458 2021 // Generate core code for conjoint long copy (and oop copy on
goetz@6458 2022 // 64-bit). If "aligned" is true, the "from" and "to" addresses
goetz@6458 2023 // are assumed to be heapword aligned.
goetz@6458 2024 //
goetz@6458 2025 // Arguments:
goetz@6458 2026 // from: R3_ARG1
goetz@6458 2027 // to: R4_ARG2
goetz@6458 2028 // count: R5_ARG3 treated as signed
goetz@6458 2029 //
goetz@6458 2030 void generate_conjoint_long_copy_core(bool aligned) {
goetz@6458 2031 Register tmp1 = R6_ARG4;
goetz@6458 2032 Register tmp2 = R7_ARG5;
goetz@6458 2033 Register tmp3 = R8_ARG6;
goetz@6458 2034 Register tmp4 = R0;
goetz@6458 2035
gromero@9684 2036 VectorSRegister tmp_vsr1 = VSR1;
gromero@9684 2037 VectorSRegister tmp_vsr2 = VSR2;
gromero@9684 2038
goetz@6458 2039 Label l_1, l_2, l_3, l_4, l_5;
goetz@6458 2040
goetz@6458 2041 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6458 2042 __ beq(CCR0, l_1);
goetz@6458 2043
goetz@6458 2044 { // FasterArrayCopy
goetz@6458 2045 __ sldi(R5_ARG3, R5_ARG3, 3);
goetz@6458 2046 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
goetz@6458 2047 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
goetz@6458 2048 __ srdi(R5_ARG3, R5_ARG3, 3);
goetz@6458 2049
goetz@6458 2050 __ cmpwi(CCR0, R5_ARG3, 3);
goetz@6458 2051 __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
goetz@6458 2052
goetz@6458 2053 __ srdi(tmp1, R5_ARG3, 2);
goetz@6458 2054 __ andi(R5_ARG3, R5_ARG3, 3);
goetz@6458 2055 __ mtctr(tmp1);
goetz@6458 2056
gromero@9684 2057 if (!VM_Version::has_vsx()) {
goetz@6458 2058 __ bind(l_4);
goetz@6458 2059 // Use unrolled version for mass copying (copy 4 elements at a time).
goetz@6458 2060 // Load feeding store gets zero latency on Power6, however not on Power5.
goetz@6458 2061 // Therefore, the following sequence is made for the good of both.
goetz@6458 2062 __ addi(R3_ARG1, R3_ARG1, -32);
goetz@6458 2063 __ addi(R4_ARG2, R4_ARG2, -32);
goetz@6458 2064 __ ld(tmp4, 24, R3_ARG1);
goetz@6458 2065 __ ld(tmp3, 16, R3_ARG1);
goetz@6458 2066 __ ld(tmp2, 8, R3_ARG1);
goetz@6458 2067 __ ld(tmp1, 0, R3_ARG1);
goetz@6458 2068 __ std(tmp4, 24, R4_ARG2);
goetz@6458 2069 __ std(tmp3, 16, R4_ARG2);
goetz@6458 2070 __ std(tmp2, 8, R4_ARG2);
goetz@6458 2071 __ std(tmp1, 0, R4_ARG2);
goetz@6458 2072 __ bdnz(l_4);
gromero@9684 2073 } else { // Processor supports VSX, so use it to mass copy.
gromero@9684 2074 // Prefetch the data into the L2 cache.
gromero@9684 2075 __ dcbt(R3_ARG1, 0);
gromero@9684 2076
gromero@9684 2077 // If supported set DSCR pre-fetch to deepest.
gromero@9684 2078 if (VM_Version::has_mfdscr()) {
gromero@9684 2079 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
gromero@9684 2080 __ mtdscr(tmp2);
gromero@9684 2081 }
gromero@9684 2082
gromero@9684 2083 __ li(tmp1, 16);
gromero@9684 2084
gromero@9684 2085 // Backbranch target aligned to 32 bytes rather than 16 so that the
gromero@9684 2086 // loop, which contains < 8 instructions, fits entirely inside a
gromero@9684 2087 // single i-cache sector.
gromero@9684 2088 __ align(32);
gromero@9684 2089
gromero@9684 2090 __ bind(l_4);
gromero@9684 2091 // Use loop with VSX load/store instructions to
gromero@9684 2092 // copy 4 elements at a time.
gromero@9684 2093 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
gromero@9684 2094 __ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
gromero@9684 2095 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
gromero@9684 2096 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
gromero@9684 2097 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
gromero@9684 2098 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
gromero@9684 2099 __ bdnz(l_4);
gromero@9684 2100
gromero@9684 2101 // Restore DSCR pre-fetch value.
gromero@9684 2102 if (VM_Version::has_mfdscr()) {
gromero@9684 2103 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
gromero@9684 2104 __ mtdscr(tmp2);
gromero@9684 2105 }
gromero@9684 2106 }
goetz@6458 2107
goetz@6458 2108 __ cmpwi(CCR0, R5_ARG3, 0);
goetz@6458 2109 __ beq(CCR0, l_1);
goetz@6458 2110
goetz@6458 2111 __ bind(l_5);
goetz@6458 2112 __ mtctr(R5_ARG3);
goetz@6458 2113 __ bind(l_3);
goetz@6458 2114 __ ld(R0, -8, R3_ARG1);
goetz@6458 2115 __ std(R0, -8, R4_ARG2);
goetz@6458 2116 __ addi(R3_ARG1, R3_ARG1, -8);
goetz@6458 2117 __ addi(R4_ARG2, R4_ARG2, -8);
goetz@6458 2118 __ bdnz(l_3);
goetz@6458 2119
goetz@6458 2120 }
goetz@6458 2121 __ bind(l_1);
goetz@6458 2122 }
goetz@6458 2123
goetz@6458 2124 // Generate stub for conjoint long copy. If "aligned" is true, the
goetz@6458 2125 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 2126 //
goetz@6458 2127 // Arguments for generated stub:
goetz@6458 2128 // from: R3_ARG1
goetz@6458 2129 // to: R4_ARG2
goetz@6458 2130 // count: R5_ARG3 treated as signed
goetz@6458 2131 //
goetz@6458 2132 address generate_conjoint_long_copy(bool aligned, const char * name) {
goetz@6458 2133 StubCodeMark mark(this, "StubRoutines", name);
goetz@6511 2134 address start = __ function_entry();
goetz@6458 2135
goetz@6511 2136 #if defined(ABI_ELFv2)
goetz@6511 2137 address nooverlap_target = aligned ?
goetz@6511 2138 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
goetz@6511 2139 StubRoutines::jlong_disjoint_arraycopy();
goetz@6511 2140 #else
goetz@6458 2141 address nooverlap_target = aligned ?
goetz@6458 2142 ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
goetz@6458 2143 ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
goetz@6511 2144 #endif
goetz@6458 2145
goetz@6458 2146 array_overlap_test(nooverlap_target, 3);
goetz@6458 2147 generate_conjoint_long_copy_core(aligned);
goetz@6458 2148
goetz@6458 2149 __ blr();
goetz@6458 2150
goetz@6458 2151 return start;
goetz@6458 2152 }
goetz@6458 2153
goetz@6458 2154 // Generate stub for conjoint oop copy. If "aligned" is true, the
goetz@6458 2155 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 2156 //
goetz@6458 2157 // Arguments for generated stub:
goetz@6458 2158 // from: R3_ARG1
goetz@6458 2159 // to: R4_ARG2
goetz@6458 2160 // count: R5_ARG3 treated as signed
goetz@6458 2161 // dest_uninitialized: G1 support
goetz@6458 2162 //
goetz@6458 2163 address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
goetz@6458 2164 StubCodeMark mark(this, "StubRoutines", name);
goetz@6458 2165
goetz@6511 2166 address start = __ function_entry();
goetz@6458 2167
goetz@6511 2168 #if defined(ABI_ELFv2)
goetz@6511 2169 address nooverlap_target = aligned ?
goetz@6511 2170 StubRoutines::arrayof_oop_disjoint_arraycopy() :
goetz@6511 2171 StubRoutines::oop_disjoint_arraycopy();
goetz@6511 2172 #else
goetz@6458 2173 address nooverlap_target = aligned ?
goetz@6458 2174 ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
goetz@6458 2175 ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
goetz@6511 2176 #endif
goetz@6458 2177
goetz@6458 2178 gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
goetz@6458 2179
goetz@6458 2180 // Save arguments.
goetz@6458 2181 __ mr(R9_ARG7, R4_ARG2);
goetz@6458 2182 __ mr(R10_ARG8, R5_ARG3);
goetz@6458 2183
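    // With compressed oops each element is a 4-byte narrowOop, so the int
    // copy core can be reused; otherwise elements are 8-byte oops and the
    // long copy core is used.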
goetz@6458 2184 if (UseCompressedOops) {
goetz@6458 2185 array_overlap_test(nooverlap_target, 2);
goetz@6458 2186 generate_conjoint_int_copy_core(aligned);
goetz@6458 2187 } else {
goetz@6458 2188 array_overlap_test(nooverlap_target, 3);
goetz@6458 2189 generate_conjoint_long_copy_core(aligned);
goetz@6458 2190 }
goetz@6458 2191
goetz@6495 2192 gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
goetz@6458 2193 return start;
goetz@6458 2194 }
goetz@6458 2195
goetz@6458 2196 // Generate stub for disjoint oop copy. If "aligned" is true, the
goetz@6458 2197 // "from" and "to" addresses are assumed to be heapword aligned.
goetz@6458 2198 //
goetz@6458 2199 // Arguments for generated stub:
goetz@6458 2200 // from: R3_ARG1
goetz@6458 2201 // to: R4_ARG2
goetz@6458 2202 // count: R5_ARG3 treated as signed
goetz@6458 2203 // dest_uninitialized: G1 support
goetz@6458 2204 //
goetz@6458 2205 address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
goetz@6458 2206 StubCodeMark mark(this, "StubRoutines", name);
goetz@6511 2207 address start = __ function_entry();
goetz@6458 2208
goetz@6458 2209 gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
goetz@6458 2210
goetz@6458 2211 // save some arguments, disjoint_long_copy_core destroys them.
goetz@6458 2212 // needed for post barrier
goetz@6458 2213 __ mr(R9_ARG7, R4_ARG2);
goetz@6458 2214 __ mr(R10_ARG8, R5_ARG3);
goetz@6458 2215
goetz@6458 2216 if (UseCompressedOops) {
goetz@6458 2217 generate_disjoint_int_copy_core(aligned);
goetz@6458 2218 } else {
goetz@6458 2219 generate_disjoint_long_copy_core(aligned);
goetz@6458 2220 }
goetz@6458 2221
goetz@6495 2222 gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
goetz@6458 2223
goetz@6458 2224 return start;
goetz@6458 2225 }
goetz@6458 2226
mdoerr@9730 2227 // Arguments for generated stub:
simonis@8608 2228 // R3_ARG1 - source byte array address
simonis@8608 2229 // R4_ARG2 - destination byte array address
simonis@8608 2230 // R5_ARG3 - round key array
simonis@8608 2231 address generate_aescrypt_encryptBlock() {
simonis@8608 2232 assert(UseAES, "need AES instructions");
simonis@8608 2233 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
simonis@8608 2234
simonis@8608 2235 address start = __ function_entry();
simonis@8608 2236
simonis@8608 2237 Label L_doLast;
simonis@8608 2238
simonis@8608 2239 Register from = R3_ARG1; // source array address
simonis@8608 2240 Register to = R4_ARG2; // destination array address
simonis@8608 2241 Register key = R5_ARG3; // round key array
simonis@8608 2242
simonis@8608 2243 Register keylen = R8;
simonis@8608 2244 Register temp = R9;
simonis@8608 2245 Register keypos = R10;
simonis@8608 2246 Register fifteen = R12;
simonis@8608 2247
simonis@8608 2248 VectorRegister vRet = VR0;
simonis@8608 2249
simonis@8608 2250 VectorRegister vKey1 = VR1;
simonis@8608 2251 VectorRegister vKey2 = VR2;
simonis@8608 2252 VectorRegister vKey3 = VR3;
simonis@8608 2253 VectorRegister vKey4 = VR4;
simonis@8608 2254
simonis@8608 2255 VectorRegister fromPerm = VR5;
simonis@8608 2256 VectorRegister keyPerm = VR6;
simonis@8608 2257 VectorRegister toPerm = VR7;
simonis@8608 2258 VectorRegister fSplt = VR8;
simonis@8608 2259
simonis@8608 2260 VectorRegister vTmp1 = VR9;
simonis@8608 2261 VectorRegister vTmp2 = VR10;
simonis@8608 2262 VectorRegister vTmp3 = VR11;
simonis@8608 2263 VectorRegister vTmp4 = VR12;
simonis@8608 2264
simonis@8608 2265 __ li (fifteen, 15);
simonis@8608 2266
simonis@8608 2267 // load unaligned from[0-15] into vRet
simonis@8608 2268 __ lvx (vRet, from);
simonis@8608 2269 __ lvx (vTmp1, fifteen, from);
simonis@8608 2270 __ lvsl (fromPerm, from);
mdoerr@9730 2271 #ifdef VM_LITTLE_ENDIAN
mdoerr@9730 2272 __ vspltisb (fSplt, 0x0f);
simonis@8608 2273 __ vxor (fromPerm, fromPerm, fSplt);
mdoerr@9730 2274 #endif
simonis@8608 2275 __ vperm (vRet, vRet, vTmp1, fromPerm);
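    // The two lvx loads above fetch the 16-byte aligned quadwords covering
    // from[0..15]; lvsl builds a permute control from the low address bits
    // (adjusted for little-endian byte order) and vperm extracts the 16
    // source bytes into vRet.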
simonis@8608 2276
simonis@8608 2277 // load keylen (44 or 52 or 60)
simonis@8608 2278 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
simonis@8608 2279
simonis@8608 2280 // set up the permute vector used to load the round keys
mdoerr@9730 2281 __ load_perm (keyPerm, key);
mdoerr@9730 2282 #ifdef VM_LITTLE_ENDIAN
simonis@8608 2283 __ vspltisb (vTmp2, -16);
simonis@8608 2284 __ vrld (keyPerm, keyPerm, vTmp2);
simonis@8608 2285 __ vrld (keyPerm, keyPerm, vTmp2);
mdoerr@9603 2286 __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
mdoerr@9730 2287 #endif
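    // Each round key below is assembled from two 16-byte aligned lvx loads
    // merged with vec_perm through keyPerm, so a key array that is not
    // 16-byte aligned is still handled correctly.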
mdoerr@9730 2288
mdoerr@9730 2289 // load the 1st round key to vTmp1
mdoerr@9730 2290 __ lvx (vTmp1, key);
mdoerr@9730 2291 __ li (keypos, 16);
simonis@8608 2292 __ lvx (vKey1, keypos, key);
mdoerr@9730 2293 __ vec_perm (vTmp1, vKey1, keyPerm);
mdoerr@9730 2294
mdoerr@9730 2295 // 1st round
mdoerr@9730 2296 __ vxor (vRet, vRet, vTmp1);
mdoerr@9730 2297
mdoerr@9730 2298 // load the 2nd round key to vKey1
mdoerr@9730 2299 __ li (keypos, 32);
mdoerr@9730 2300 __ lvx (vKey2, keypos, key);
mdoerr@9730 2301 __ vec_perm (vKey1, vKey2, keyPerm);
mdoerr@9730 2302
mdoerr@9730 2303 // load the 3rd round key to vKey2
mdoerr@9730 2304 __ li (keypos, 48);
mdoerr@9730 2305 __ lvx (vKey3, keypos, key);
mdoerr@9730 2306 __ vec_perm (vKey2, vKey3, keyPerm);
mdoerr@9730 2307
mdoerr@9730 2308 // load the 4th round key to vKey3
mdoerr@9730 2309 __ li (keypos, 64);
mdoerr@9730 2310 __ lvx (vKey4, keypos, key);
mdoerr@9730 2311 __ vec_perm (vKey3, vKey4, keyPerm);
mdoerr@9730 2312
mdoerr@9730 2313 // load the 5th round key to vKey4
mdoerr@9730 2314 __ li (keypos, 80);
simonis@8608 2315 __ lvx (vTmp1, keypos, key);
mdoerr@9730 2316 __ vec_perm (vKey4, vTmp1, keyPerm);
mdoerr@9730 2317
mdoerr@9730 2318 // 2nd - 5th rounds
mdoerr@9730 2319 __ vcipher (vRet, vRet, vKey1);
mdoerr@9730 2320 __ vcipher (vRet, vRet, vKey2);
mdoerr@9730 2321 __ vcipher (vRet, vRet, vKey3);
mdoerr@9730 2322 __ vcipher (vRet, vRet, vKey4);
mdoerr@9730 2323
mdoerr@9730 2324 // load the 6th round key to vKey1
mdoerr@9730 2325 __ li (keypos, 96);
mdoerr@9730 2326 __ lvx (vKey2, keypos, key);
mdoerr@9730 2327 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
mdoerr@9730 2328
mdoerr@9730 2329 // load the 7th round key to vKey2
mdoerr@9730 2330 __ li (keypos, 112);
mdoerr@9730 2331 __ lvx (vKey3, keypos, key);
mdoerr@9730 2332 __ vec_perm (vKey2, vKey3, keyPerm);
mdoerr@9730 2333
mdoerr@9730 2334 // load the 8th round key to vKey3
mdoerr@9730 2335 __ li (keypos, 128);
mdoerr@9730 2336 __ lvx (vKey4, keypos, key);
mdoerr@9730 2337 __ vec_perm (vKey3, vKey4, keyPerm);
mdoerr@9730 2338
mdoerr@9730 2339 // load the 9th round key to vKey4
mdoerr@9730 2340 __ li (keypos, 144);
simonis@8608 2341 __ lvx (vTmp1, keypos, key);
mdoerr@9730 2342 __ vec_perm (vKey4, vTmp1, keyPerm);
mdoerr@9730 2343
mdoerr@9730 2344 // 6th - 9th rounds
mdoerr@9730 2345 __ vcipher (vRet, vRet, vKey1);
mdoerr@9730 2346 __ vcipher (vRet, vRet, vKey2);
mdoerr@9730 2347 __ vcipher (vRet, vRet, vKey3);
mdoerr@9730 2348 __ vcipher (vRet, vRet, vKey4);
mdoerr@9730 2349
mdoerr@9730 2350 // load the 10th round key to vKey1
mdoerr@9730 2351 __ li (keypos, 160);
mdoerr@9730 2352 __ lvx (vKey2, keypos, key);
mdoerr@9730 2353 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
mdoerr@9730 2354
mdoerr@9730 2355 // load the 11th round key to vKey2
mdoerr@9730 2356 __ li (keypos, 176);
simonis@8608 2357 __ lvx (vTmp1, keypos, key);
mdoerr@9730 2358 __ vec_perm (vKey2, vTmp1, keyPerm);
simonis@8608 2359
simonis@8608 2360 // if all round keys are loaded, skip next 4 rounds
simonis@8608 2361 __ cmpwi (CCR0, keylen, 44);
simonis@8608 2362 __ beq (CCR0, L_doLast);
simonis@8608 2363
simonis@8608 2364 // 10th - 11th rounds
mdoerr@9730 2365 __ vcipher (vRet, vRet, vKey1);
mdoerr@9730 2366 __ vcipher (vRet, vRet, vKey2);
simonis@8608 2367
simonis@8608 2368 // load the 12th round key to vKey1
mdoerr@9730 2369 __ li (keypos, 192);
mdoerr@9730 2370 __ lvx (vKey2, keypos, key);
mdoerr@9730 2371 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
simonis@8608 2372
simonis@8608 2373 // load the 13th round key to vKey2
mdoerr@9730 2374 __ li (keypos, 208);
simonis@8608 2375 __ lvx (vTmp1, keypos, key);
mdoerr@9730 2376 __ vec_perm (vKey2, vTmp1, keyPerm);
simonis@8608 2377
simonis@8608 2378 // if all round keys are loaded, skip next 2 rounds
simonis@8608 2379 __ cmpwi (CCR0, keylen, 52);
simonis@8608 2380 __ beq (CCR0, L_doLast);
simonis@8608 2381
simonis@8608 2382 // 12th - 13th rounds
mdoerr@9730 2383 __ vcipher (vRet, vRet, vKey1);
mdoerr@9730 2384 __ vcipher (vRet, vRet, vKey2);
simonis@8608 2385
simonis@8608 2386 // load the 14th round key to vKey1
mdoerr@9730 2387 __ li (keypos, 224);
mdoerr@9730 2388 __ lvx (vKey2, keypos, key);
mdoerr@9730 2389 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
simonis@8608 2390
simonis@8608 2391 // load the 15th round key to vKey2
mdoerr@9730 2392 __ li (keypos, 240);
simonis@8608 2393 __ lvx (vTmp1, keypos, key);
mdoerr@9730 2394 __ vec_perm (vKey2, vTmp1, keyPerm);
simonis@8608 2395
simonis@8608 2396 __ bind(L_doLast);
simonis@8608 2397
simonis@8608 2398 // last two rounds
mdoerr@9730 2399 __ vcipher (vRet, vRet, vKey1);
mdoerr@9730 2400 __ vcipherlast (vRet, vRet, vKey2);
mdoerr@9730 2401
mdoerr@9730 2402 // store result (unaligned)
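    // Read-modify-write: load the two quadwords covering to[0..15], rotate
    // vRet into position, merge it in with vsel using the generated mask,
    // and store both quadwords back so bytes outside the 16-byte
    // destination window are preserved.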
mdoerr@9730 2403 #ifdef VM_LITTLE_ENDIAN
mdoerr@9730 2404 __ lvsl (toPerm, to);
mdoerr@9730 2405 #else
mdoerr@9730 2406 __ lvsr (toPerm, to);
mdoerr@9730 2407 #endif
mdoerr@9730 2408 __ vspltisb (vTmp3, -1);
mdoerr@9730 2409 __ vspltisb (vTmp4, 0);
simonis@8608 2410 __ lvx (vTmp1, to);
mdoerr@9730 2411 __ lvx (vTmp2, fifteen, to);
mdoerr@9730 2412 #ifdef VM_LITTLE_ENDIAN
mdoerr@9730 2413 __ vperm (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
mdoerr@9730 2414 __ vxor (toPerm, toPerm, fSplt); // swap bytes
mdoerr@9730 2415 #else
mdoerr@9730 2416 __ vperm (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
mdoerr@9730 2417 #endif
mdoerr@9730 2418 __ vperm (vTmp4, vRet, vRet, toPerm); // rotate data
mdoerr@9730 2419 __ vsel (vTmp2, vTmp4, vTmp2, vTmp3);
mdoerr@9730 2420 __ vsel (vTmp1, vTmp1, vTmp4, vTmp3);
mdoerr@9730 2421 __ stvx (vTmp2, fifteen, to); // store this one first (may alias)
simonis@8608 2422 __ stvx (vTmp1, to);
simonis@8608 2423
simonis@8608 2424 __ blr();
simonis@8608 2425 return start;
simonis@8608 2426 }
simonis@8608 2427
mdoerr@9730 2428 // Arguments for generated stub:
simonis@8608 2429 // R3_ARG1 - source byte array address
simonis@8608 2430 // R4_ARG2 - destination byte array address
simonis@8608 2431 // R5_ARG3 - K (key) in little endian int array
simonis@8608 2432 address generate_aescrypt_decryptBlock() {
simonis@8608 2433 assert(UseAES, "need AES instructions");
simonis@8608 2434 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
simonis@8608 2435
simonis@8608 2436 address start = __ function_entry();
simonis@8608 2437
simonis@8608 2438 Label L_doLast;
simonis@8608 2439 Label L_do44;
simonis@8608 2440 Label L_do52;
simonis@8608 2441 Label L_do60;
simonis@8608 2442
simonis@8608 2443 Register from = R3_ARG1; // source array address
simonis@8608 2444 Register to = R4_ARG2; // destination array address
simonis@8608 2445 Register key = R5_ARG3; // round key array
simonis@8608 2446
simonis@8608 2447 Register keylen = R8;
simonis@8608 2448 Register temp = R9;
simonis@8608 2449 Register keypos = R10;
simonis@8608 2450 Register fifteen = R12;
simonis@8608 2451
simonis@8608 2452 VectorRegister vRet = VR0;
simonis@8608 2453
simonis@8608 2454 VectorRegister vKey1 = VR1;
simonis@8608 2455 VectorRegister vKey2 = VR2;
simonis@8608 2456 VectorRegister vKey3 = VR3;
simonis@8608 2457 VectorRegister vKey4 = VR4;
simonis@8608 2458 VectorRegister vKey5 = VR5;
simonis@8608 2459
simonis@8608 2460 VectorRegister fromPerm = VR6;
simonis@8608 2461 VectorRegister keyPerm = VR7;
simonis@8608 2462 VectorRegister toPerm = VR8;
simonis@8608 2463 VectorRegister fSplt = VR9;
simonis@8608 2464
simonis@8608 2465 VectorRegister vTmp1 = VR10;
simonis@8608 2466 VectorRegister vTmp2 = VR11;
simonis@8608 2467 VectorRegister vTmp3 = VR12;
simonis@8608 2468 VectorRegister vTmp4 = VR13;
simonis@8608 2469
simonis@8608 2470 __ li (fifteen, 15);
simonis@8608 2471
simonis@8608 2472 // load unaligned from[0-15] into vRet
simonis@8608 2473 __ lvx (vRet, from);
simonis@8608 2474 __ lvx (vTmp1, fifteen, from);
simonis@8608 2475 __ lvsl (fromPerm, from);
mdoerr@9730 2476 #ifdef VM_LITTLE_ENDIAN
mdoerr@9730 2477 __ vspltisb (fSplt, 0x0f);
simonis@8608 2478 __ vxor (fromPerm, fromPerm, fSplt);
mdoerr@9730 2479 #endif
simonis@8608 2480 __ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
simonis@8608 2481
simonis@8608 2482 // load keylen (44 or 52 or 60)
simonis@8608 2483 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
simonis@8608 2484
simonis@8608 2485 // set up the permute vector used to load the round keys
mdoerr@9730 2486 __ load_perm (keyPerm, key);
mdoerr@9730 2487 #ifdef VM_LITTLE_ENDIAN
simonis@8608 2488 __ vxor (vTmp2, vTmp2, vTmp2);
simonis@8608 2489 __ vspltisb (vTmp2, -16);
simonis@8608 2490 __ vrld (keyPerm, keyPerm, vTmp2);
simonis@8608 2491 __ vrld (keyPerm, keyPerm, vTmp2);
mdoerr@9603 2492 __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
mdoerr@9730 2493 #endif
simonis@8608 2494
simonis@8608 2495 __ cmpwi (CCR0, keylen, 44);
simonis@8608 2496 __ beq (CCR0, L_do44);
simonis@8608 2497
simonis@8608 2498 __ cmpwi (CCR0, keylen, 52);
simonis@8608 2499 __ beq (CCR0, L_do52);
simonis@8608 2500
mdoerr@9730 2501 // load the 15th round key to vKey1
simonis@8608 2502 __ li (keypos, 240);
mdoerr@9730 2503 __ lvx (vKey1, keypos, key);
mdoerr@9730 2504 __ li (keypos, 224);
mdoerr@9730 2505 __ lvx (vKey2, keypos, key);
mdoerr@9730 2506 __ vec_perm (vKey1, vKey2, vKey1, keyPerm);
mdoerr@9730 2507
mdoerr@9730 2508 // load the 14th round key to vKey2
mdoerr@9730 2509 __ li (keypos, 208);
mdoerr@9730 2510 __ lvx (vKey3, keypos, key);
mdoerr@9730 2511 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
mdoerr@9730 2512
mdoerr@9730 2513 // load the 13th round key to vKey3
mdoerr@9730 2514 __ li (keypos, 192);
mdoerr@9730 2515 __ lvx (vKey4, keypos, key);
mdoerr@9730 2516 __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
mdoerr@9730 2517
mdoerr@9730 2518 // load the 12th round key to vKey4
mdoerr@9730 2519 __ li (keypos, 176);
mdoerr@9730 2520 __ lvx (vKey5, keypos, key);
mdoerr@9730 2521 __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
mdoerr@9730 2522
mdoerr@9730 2523 // load the 11th round key to vKey5
mdoerr@9730 2524 __ li (keypos, 160);
simonis@8608 2525 __ lvx (vTmp1, keypos, key);
mdoerr@9730 2526 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
simonis@8608 2527
simonis@8608 2528 // 1st - 5th rounds
simonis@8608 2529 __ vxor (vRet, vRet, vKey1);
simonis@8608 2530 __ vncipher (vRet, vRet, vKey2);
simonis@8608 2531 __ vncipher (vRet, vRet, vKey3);
simonis@8608 2532 __ vncipher (vRet, vRet, vKey4);
simonis@8608 2533 __ vncipher (vRet, vRet, vKey5);
simonis@8608 2534
simonis@8608 2535 __ b (L_doLast);
simonis@8608 2536
simonis@8608 2537 __ bind (L_do52);
simonis@8608 2538
mdoerr@9730 2539 // load the 13th round key to vKey1
simonis@8608 2540 __ li (keypos, 208);
mdoerr@9730 2541 __ lvx (vKey1, keypos, key);
mdoerr@9730 2542 __ li (keypos, 192);
mdoerr@9730 2543 __ lvx (vKey2, keypos, key);
mdoerr@9730 2544 __ vec_perm (vKey1, vKey2, vKey1, keyPerm);
mdoerr@9730 2545
mdoerr@9730 2546 // load the 12th round key to vKey2
mdoerr@9730 2547 __ li (keypos, 176);
mdoerr@9730 2548 __ lvx (vKey3, keypos, key);
mdoerr@9730 2549 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
mdoerr@9730 2550
mdoerr@9730 2551 // load the 11th round key to vKey3
mdoerr@9730 2552 __ li (keypos, 160);
simonis@8608 2553 __ lvx (vTmp1, keypos, key);
mdoerr@9730 2554 __ vec_perm (vKey3, vTmp1, vKey3, keyPerm);
simonis@8608 2555
simonis@8608 2556 // 1st - 3rd rounds
simonis@8608 2557 __ vxor (vRet, vRet, vKey1);
simonis@8608 2558 __ vncipher (vRet, vRet, vKey2);
simonis@8608 2559 __ vncipher (vRet, vRet, vKey3);
simonis@8608 2560
simonis@8608 2561 __ b (L_doLast);
simonis@8608 2562
simonis@8608 2563 __ bind (L_do44);
simonis@8608 2564
mdoerr@9730 2565 // load the 11th round key to vKey1
simonis@8608 2566 __ li (keypos, 176);
mdoerr@9730 2567 __ lvx (vKey1, keypos, key);
mdoerr@9730 2568 __ li (keypos, 160);
simonis@8608 2569 __ lvx (vTmp1, keypos, key);
mdoerr@9730 2570 __ vec_perm (vKey1, vTmp1, vKey1, keyPerm);
simonis@8608 2571
simonis@8608 2572 // 1st round
simonis@8608 2573 __ vxor (vRet, vRet, vKey1);
simonis@8608 2574
simonis@8608 2575 __ bind (L_doLast);
simonis@8608 2576
mdoerr@9730 2577 // load the 10th round key to vKey1
mdoerr@9730 2578 __ li (keypos, 144);
mdoerr@9730 2579 __ lvx (vKey2, keypos, key);
mdoerr@9730 2580 __ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
mdoerr@9730 2581
mdoerr@9730 2582 // load the 9th round key to vKey2
mdoerr@9730 2583 __ li (keypos, 128);
mdoerr@9730 2584 __ lvx (vKey3, keypos, key);
mdoerr@9730 2585 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
mdoerr@9730 2586
mdoerr@9730 2587 // load the 8th round key to vKey3
mdoerr@9730 2588 __ li (keypos, 112);
mdoerr@9730 2589 __ lvx (vKey4, keypos, key);
mdoerr@9730 2590 __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
mdoerr@9730 2591
mdoerr@9730 2592 // load the 7th round key to vKey4
mdoerr@9730 2593 __ li (keypos, 96);
mdoerr@9730 2594 __ lvx (vKey5, keypos, key);
mdoerr@9730 2595 __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
mdoerr@9730 2596
mdoerr@9730 2597 // load the 6th round key to vKey5
mdoerr@9730 2598 __ li (keypos, 80);
simonis@8608 2599 __ lvx (vTmp1, keypos, key);
mdoerr@9730 2600 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
simonis@8608 2601
simonis@8608 2602 // last 10th - 6th rounds
simonis@8608 2603 __ vncipher (vRet, vRet, vKey1);
simonis@8608 2604 __ vncipher (vRet, vRet, vKey2);
simonis@8608 2605 __ vncipher (vRet, vRet, vKey3);
simonis@8608 2606 __ vncipher (vRet, vRet, vKey4);
simonis@8608 2607 __ vncipher (vRet, vRet, vKey5);
simonis@8608 2608
mdoerr@9730 2609 // load the 5th round key to vKey1
mdoerr@9730 2610 __ li (keypos, 64);
mdoerr@9730 2611 __ lvx (vKey2, keypos, key);
mdoerr@9730 2612 __ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
mdoerr@9730 2613
mdoerr@9730 2614 // load the 4th round key to vKey2
mdoerr@9730 2615 __ li (keypos, 48);
mdoerr@9730 2616 __ lvx (vKey3, keypos, key);
mdoerr@9730 2617 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
mdoerr@9730 2618
mdoerr@9730 2619 // load the 3rd round key to vKey3
mdoerr@9730 2620 __ li (keypos, 32);
mdoerr@9730 2621 __ lvx (vKey4, keypos, key);
mdoerr@9730 2622 __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
mdoerr@9730 2623
mdoerr@9730 2624 // load the 2nd round key to vKey4
mdoerr@9730 2625 __ li (keypos, 16);
mdoerr@9730 2626 __ lvx (vKey5, keypos, key);
mdoerr@9730 2627 __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
mdoerr@9730 2628
mdoerr@9730 2629 // load the 1st round key to vKey5
mdoerr@9730 2630 __ lvx (vTmp1, key);
mdoerr@9730 2631 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
simonis@8608 2632
simonis@8608 2633     // last 5th - 1st rounds
simonis@8608 2634 __ vncipher (vRet, vRet, vKey1);
simonis@8608 2635 __ vncipher (vRet, vRet, vKey2);
simonis@8608 2636 __ vncipher (vRet, vRet, vKey3);
simonis@8608 2637 __ vncipher (vRet, vRet, vKey4);
simonis@8608 2638 __ vncipherlast (vRet, vRet, vKey5);
simonis@8608 2639
mdoerr@9730 2640 // store result (unaligned)
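    // 'to' may not be 16-byte aligned: lvsl/lvsr turn its low address bits
    // into a permute vector, vspltisb(-1)/vspltisb(0) plus vperm build a
    // byte-select mask, the result is rotated into position, and vsel merges
    // it into the two quadwords covering [to, to+15] before both are stored.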
mdoerr@9730 2641 #ifdef VM_LITTLE_ENDIAN
mdoerr@9730 2642 __ lvsl (toPerm, to);
mdoerr@9730 2643 #else
mdoerr@9730 2644 __ lvsr (toPerm, to);
mdoerr@9730 2645 #endif
mdoerr@9730 2646 __ vspltisb (vTmp3, -1);
mdoerr@9730 2647 __ vspltisb (vTmp4, 0);
simonis@8608 2648 __ lvx (vTmp1, to);
mdoerr@9730 2649 __ lvx (vTmp2, fifteen, to);
mdoerr@9730 2650 #ifdef VM_LITTLE_ENDIAN
mdoerr@9730 2651 __ vperm (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
mdoerr@9730 2652 __ vxor (toPerm, toPerm, fSplt); // swap bytes
mdoerr@9730 2653 #else
mdoerr@9730 2654 __ vperm (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
mdoerr@9730 2655 #endif
mdoerr@9730 2656 __ vperm (vTmp4, vRet, vRet, toPerm); // rotate data
mdoerr@9730 2657 __ vsel (vTmp2, vTmp4, vTmp2, vTmp3);
mdoerr@9730 2658 __ vsel (vTmp1, vTmp1, vTmp4, vTmp3);
mdoerr@9730 2659 __ stvx (vTmp2, fifteen, to); // store this one first (may alias)
simonis@8608 2660 __ stvx (vTmp1, to);
simonis@8608 2661
simonis@8608 2662 __ blr();
simonis@8608 2663 return start;
simonis@8608 2664 }
simonis@8608 2665
ogatak@9713 2666 address generate_sha256_implCompress(bool multi_block, const char *name) {
ogatak@9713 2667 assert(UseSHA, "need SHA instructions");
ogatak@9713 2668 StubCodeMark mark(this, "StubRoutines", name);
ogatak@9713 2669 address start = __ function_entry();
ogatak@9713 2670
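    // multi_block selects between the single-block implCompress entry and
    // the multi-block implCompressMB entry (see the wiring in generate_all()
    // below); the SHA-256 rounds themselves are emitted by MacroAssembler::sha256.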
ogatak@9713 2671 __ sha256 (multi_block);
ogatak@9713 2672
ogatak@9713 2673 __ blr();
ogatak@9713 2674 return start;
ogatak@9713 2675 }
ogatak@9713 2676
ogatak@9713 2677 address generate_sha512_implCompress(bool multi_block, const char *name) {
ogatak@9713 2678 assert(UseSHA, "need SHA instructions");
ogatak@9713 2679 StubCodeMark mark(this, "StubRoutines", name);
ogatak@9713 2680 address start = __ function_entry();
ogatak@9713 2681
ogatak@9713 2682 __ sha512 (multi_block);
ogatak@9713 2683
ogatak@9713 2684 __ blr();
ogatak@9713 2685 return start;
ogatak@9713 2686 }
ogatak@9713 2687
goetz@6458 2688 void generate_arraycopy_stubs() {
goetz@6458 2689 // Note: the disjoint stubs must be generated first, some of
goetz@6458 2690 // the conjoint stubs use them.
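    // (Each conjoint stub tests whether the source and destination ranges
    //  overlap in a way that forces a backward copy and otherwise branches
    //  to the corresponding disjoint entry, so that entry must already exist.)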
goetz@6458 2691
goetz@6458 2692 // non-aligned disjoint versions
goetz@6458 2693 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
goetz@6458 2694 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
goetz@6458 2695 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
goetz@6458 2696 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
goetz@6458 2697 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
goetz@6458 2698 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
goetz@6458 2699
goetz@6458 2700 // aligned disjoint versions
goetz@6458 2701 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
goetz@6458 2702 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
goetz@6458 2703 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
goetz@6458 2704 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
goetz@6458 2705 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
goetz@6458 2706     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
goetz@6458 2707
goetz@6458 2708 // non-aligned conjoint versions
goetz@6458 2709 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
goetz@6458 2710 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
goetz@6458 2711 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, "jint_arraycopy");
goetz@6458 2712 StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, "jlong_arraycopy");
goetz@6458 2713 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
goetz@6458 2714 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
goetz@6458 2715
goetz@6458 2716 // aligned conjoint versions
goetz@6458 2717 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
goetz@6458 2718 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
goetz@6458 2719 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
goetz@6458 2720 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
goetz@6458 2721 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
goetz@6458 2722     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
goetz@6458 2723
goetz@6458 2724 // fill routines
goetz@6458 2725 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
goetz@6458 2726 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
goetz@6458 2727 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
goetz@6458 2728 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
goetz@6458 2729 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
goetz@6458 2730 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
goetz@6458 2731 }
goetz@6458 2732
goetz@6458 2733 // Safefetch stubs.
goetz@6458 2734 void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
goetz@6458 2735 // safefetch signatures:
goetz@6458 2736 // int SafeFetch32(int* adr, int errValue);
goetz@6458 2737 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
goetz@6458 2738 //
goetz@6458 2739 // arguments:
goetz@6458 2740 // R3_ARG1 = adr
goetz@6458 2741 // R4_ARG2 = errValue
goetz@6458 2742 //
goetz@6458 2743 // result:
goetz@6458 2744 // R3_RET = *adr or errValue
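    //
    // Illustrative use (call site not generated here):
    //   int v = SafeFetch32((int*) addr, -1);
    //   // v == *addr if the load succeeded; if it faulted, the VM's signal
    //   // handler resumes at *continuation_pc, R4_ARG2 still holds errValue,
    //   // and that value (-1 here) is returned instead.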
goetz@6458 2745
goetz@6458 2746 StubCodeMark mark(this, "StubRoutines", name);
goetz@6458 2747
goetz@6458 2748 // Entry point, pc or function descriptor.
goetz@6511 2749 *entry = __ function_entry();
goetz@6458 2750
goetz@6458 2751 // Load *adr into R4_ARG2, may fault.
goetz@6458 2752 *fault_pc = __ pc();
goetz@6458 2753 switch (size) {
goetz@6458 2754 case 4:
goetz@6458 2755         // int32_t, sign-extended
goetz@6458 2756 __ lwa(R4_ARG2, 0, R3_ARG1);
goetz@6458 2757 break;
goetz@6458 2758 case 8:
goetz@6458 2759 // int64_t
goetz@6458 2760 __ ld(R4_ARG2, 0, R3_ARG1);
goetz@6458 2761 break;
goetz@6458 2762 default:
goetz@6458 2763 ShouldNotReachHere();
goetz@6458 2764 }
goetz@6458 2765
goetz@6458 2766 // return errValue or *adr
goetz@6458 2767 *continuation_pc = __ pc();
goetz@6458 2768 __ mr(R3_RET, R4_ARG2);
goetz@6458 2769 __ blr();
goetz@6458 2770 }
goetz@6458 2771
gromero@9496 2772 /**
gromero@9496 2773 * Arguments:
gromero@9496 2774 *
gromero@9496 2775 * Inputs:
gromero@9496 2776 * R3_ARG1 - int crc
gromero@9496 2777 * R4_ARG2 - byte* buf
gromero@9496 2778 * R5_ARG3 - int length (of buffer)
gromero@9496 2779 *
gromero@9496 2780 * scratch:
mdoerr@9497 2781 * R2, R6-R12
gromero@9496 2782 *
gromero@9496 2783    * Output:
gromero@9496 2784 * R3_RET - int crc result
gromero@9496 2785 */
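  // For reference only (not emitted here): a minimal byte-at-a-time CRC-32
  // over the same reflected polynomial (0xEDB88320). The generated kernels
  // compute the same polynomial word-at-a-time or with vector carry-less
  // multiplies; whether the ~crc pre/post inversion happens in the kernel or
  // in the caller follows the kernel implementation.
  //
  //   uint32_t crc32_reference(uint32_t crc, const uint8_t* buf, int len) {
  //     crc = ~crc;
  //     while (len-- > 0) {
  //       crc ^= *buf++;
  //       for (int i = 0; i < 8; i++) {
  //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
  //       }
  //     }
  //     return ~crc;
  //   }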
gromero@9496 2786   // Compute the CRC32 checksum of a byte buffer.
gromero@9496 2787 address generate_CRC32_updateBytes(const char* name) {
gromero@9496 2788 __ align(CodeEntryAlignment);
gromero@9496 2789 StubCodeMark mark(this, "StubRoutines", name);
gromero@9496 2790 address start = __ function_entry(); // Remember stub start address (is rtn value).
gromero@9496 2791
gromero@9496 2792 // arguments to kernel_crc32:
mdoerr@9497 2793 const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
mdoerr@9497 2794 const Register data = R4_ARG2; // source byte array
mdoerr@9497 2795 const Register dataLen = R5_ARG3; // #bytes to process
mdoerr@9497 2796
mdoerr@9497 2797 const Register table = R6; // crc table address
mdoerr@9497 2798
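    // Two implementations: little-endian CPUs with vpmsumb support get a
    // vectorized kernel that folds the CRC with carry-less multiplies and a
    // final Barrett reduction (using the constant tables loaded below); all
    // other configurations fall back to the table-driven one-word-at-a-time
    // kernel.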
mdoerr@9497 2799 #ifdef VM_LITTLE_ENDIAN
mdoerr@9497 2800 if (VM_Version::has_vpmsumb()) {
mdoerr@9497 2801 const Register constants = R2; // constants address
mdoerr@9497 2802       const Register bconstants = R8;   // Barrett reduction constants address
mdoerr@9497 2803
mdoerr@9497 2804 const Register t0 = R9;
mdoerr@9497 2805 const Register t1 = R10;
mdoerr@9497 2806 const Register t2 = R11;
mdoerr@9497 2807 const Register t3 = R12;
mdoerr@9497 2808 const Register t4 = R7;
mdoerr@9497 2809
mdoerr@9497 2810 BLOCK_COMMENT("Stub body {");
mdoerr@9497 2811 assert_different_registers(crc, data, dataLen, table);
mdoerr@9497 2812
mdoerr@9497 2813 StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
mdoerr@9497 2814 StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
mdoerr@9497 2815 StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
mdoerr@9497 2816
mdoerr@9497 2817 __ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4);
mdoerr@9497 2818
mdoerr@9497 2819 BLOCK_COMMENT("return");
mdoerr@9497 2820 __ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
mdoerr@9497 2821 __ blr();
mdoerr@9497 2822
mdoerr@9497 2823 BLOCK_COMMENT("} Stub body");
mdoerr@9497 2824 } else
mdoerr@9497 2825 #endif
mdoerr@9497 2826 {
mdoerr@9497 2827 const Register t0 = R2;
mdoerr@9497 2828 const Register t1 = R7;
mdoerr@9497 2829 const Register t2 = R8;
mdoerr@9497 2830 const Register t3 = R9;
mdoerr@9497 2831 const Register tc0 = R10;
mdoerr@9497 2832 const Register tc1 = R11;
mdoerr@9497 2833 const Register tc2 = R12;
mdoerr@9497 2834
mdoerr@9497 2835 BLOCK_COMMENT("Stub body {");
mdoerr@9497 2836 assert_different_registers(crc, data, dataLen, table);
mdoerr@9497 2837
mdoerr@9497 2838 StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
mdoerr@9497 2839
mdoerr@9497 2840 __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
mdoerr@9497 2841
mdoerr@9497 2842 BLOCK_COMMENT("return");
mdoerr@9497 2843 __ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
mdoerr@9497 2844 __ blr();
mdoerr@9497 2845
mdoerr@9497 2846 BLOCK_COMMENT("} Stub body");
mdoerr@9497 2847 }
mdoerr@9497 2848
gromero@9496 2849 return start;
gromero@9496 2850 }
gromero@9496 2851
goetz@6458 2852 // Initialization
goetz@6458 2853 void generate_initial() {
goetz@6458 2854     // Generates the initial stubs and initializes their entry points
goetz@6458 2855
goetz@6458 2856 // Entry points that exist in all platforms.
goetz@6458 2857 // Note: This is code that could be shared among different platforms - however the
goetz@6458 2858 // benefit seems to be smaller than the disadvantage of having a
goetz@6458 2859 // much more complicated generator structure. See also comment in
goetz@6458 2860 // stubRoutines.hpp.
goetz@6458 2861
goetz@6458 2862 StubRoutines::_forward_exception_entry = generate_forward_exception();
goetz@6458 2863 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
goetz@6458 2864 StubRoutines::_catch_exception_entry = generate_catch_exception();
goetz@6501 2865
goetz@6501 2866 // Build this early so it's available for the interpreter.
goetz@6501 2867 StubRoutines::_throw_StackOverflowError_entry =
goetz@6501 2868 generate_throw_exception("StackOverflowError throw_exception",
goetz@6501 2869 CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
gromero@9496 2870
gromero@9496 2871 // CRC32 Intrinsics.
gromero@9496 2872 if (UseCRC32Intrinsics) {
gromero@9496 2873 StubRoutines::_crc_table_adr = (address)StubRoutines::ppc64::_crc_table;
gromero@9496 2874 StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
gromero@9496 2875 }
goetz@6458 2876 }
goetz@6458 2877
goetz@6458 2878 void generate_all() {
goetz@6458 2879 // Generates all stubs and initializes the entry points
goetz@6458 2880
goetz@6458 2881 // These entry points require SharedInfo::stack0 to be set up in
goetz@6458 2882 // non-core builds
goetz@6458 2883 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
goetz@6458 2884 // Handle IncompatibleClassChangeError in itable stubs.
goetz@6458 2885 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
goetz@6458 2886 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
goetz@6458 2887
goetz@6458 2888 StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
goetz@6458 2889
goetz@6458 2890 // support for verify_oop (must happen after universe_init)
goetz@6458 2891 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
goetz@6458 2892
goetz@6458 2893 // arraycopy stubs used by compilers
goetz@6458 2894 generate_arraycopy_stubs();
goetz@6458 2895
goetz@6512 2896 // Safefetch stubs.
goetz@6458 2897 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
goetz@6458 2898 &StubRoutines::_safefetch32_fault_pc,
goetz@6458 2899 &StubRoutines::_safefetch32_continuation_pc);
goetz@6458 2900 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
goetz@6458 2901 &StubRoutines::_safefetchN_fault_pc,
goetz@6458 2902 &StubRoutines::_safefetchN_continuation_pc);
simonis@8608 2903
simonis@8608 2904 if (UseAESIntrinsics) {
simonis@8608 2905 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
simonis@8608 2906 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
simonis@8608 2907 }
simonis@8608 2908
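    // Montgomery multiply/square are not generated as assembly stubs here;
    // they dispatch to the shared C++ implementations in SharedRuntime.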
mdoerr@8903 2909 if (UseMontgomeryMultiplyIntrinsic) {
mdoerr@8903 2910 StubRoutines::_montgomeryMultiply
mdoerr@8903 2911 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
mdoerr@8903 2912 }
mdoerr@8903 2913 if (UseMontgomerySquareIntrinsic) {
mdoerr@8903 2914 StubRoutines::_montgomerySquare
mdoerr@8903 2915 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
mdoerr@8903 2916 }
ogatak@9713 2917
ogatak@9713 2918 if (UseSHA256Intrinsics) {
ogatak@9713 2919 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
ogatak@9713 2920 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
ogatak@9713 2921 }
ogatak@9713 2922 if (UseSHA512Intrinsics) {
ogatak@9713 2923 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
ogatak@9713 2924 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
ogatak@9713 2925 }
goetz@6458 2926 }
goetz@6458 2927
goetz@6458 2928 public:
goetz@6458 2929 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
goetz@6458 2930 // replace the standard masm with a special one:
goetz@6458 2931 _masm = new MacroAssembler(code);
goetz@6458 2932 if (all) {
goetz@6458 2933 generate_all();
goetz@6458 2934 } else {
goetz@6458 2935 generate_initial();
goetz@6458 2936 }
goetz@6458 2937 }
goetz@6458 2938 };
goetz@6458 2939
goetz@6458 2940 void StubGenerator_generate(CodeBuffer* code, bool all) {
goetz@6458 2941 StubGenerator g(code, all);
goetz@6458 2942 }
