Mon, 24 Feb 2014 15:12:26 -0800
8033805: Move Fast_Lock/Fast_Unlock code from .ad files to macroassembler
Summary: Consolidated C2 x86 locking code in one place in macroAssembler_x86.cpp.
Reviewed-by: roland
1 /*
2 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
25 #include "precompiled.hpp"
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "compiler/disassembler.hpp"
29 #include "gc_interface/collectedHeap.inline.hpp"
30 #include "interpreter/interpreter.hpp"
31 #include "memory/cardTableModRefBS.hpp"
32 #include "memory/resourceArea.hpp"
33 #include "memory/universe.hpp"
34 #include "prims/methodHandles.hpp"
35 #include "runtime/biasedLocking.hpp"
36 #include "runtime/interfaceSupport.hpp"
37 #include "runtime/objectMonitor.hpp"
38 #include "runtime/os.hpp"
39 #include "runtime/sharedRuntime.hpp"
40 #include "runtime/stubRoutines.hpp"
41 #include "utilities/macros.hpp"
42 #if INCLUDE_ALL_GCS
43 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
44 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
45 #include "gc_implementation/g1/heapRegion.hpp"
46 #endif // INCLUDE_ALL_GCS
// Debug-build annotations: in PRODUCT builds the comment hooks compile away.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
// NOTE: expands to TWO statements — only use STOP where a multi-statement
// expansion is safe (not as the sole body of an unbraced if/else).
#define STOP(error) block_comment(error); stop(error)
#endif

// Bind a label and (in debug builds) record its name as a block comment.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
#ifdef ASSERT
// x86 participates in AbstractAssembler's instruction-mark checking (debug only).
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif
// Table mapping each x86 condition code to its logical negation,
// indexed by the Assembler::Condition encoding (0x0 - 0xf).
static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf, */

};
84 // Implementation of MacroAssembler
86 // First all the versions that have distinct versions depending on 32/64 bit
87 // Unless the difference is trivial (1 line or so).
89 #ifndef _LP64
91 // 32bit versions
// 32-bit: an AddressLiteral is just an absolute address plus reloc info.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

// 32-bit: fold the array base directly into the Address.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}
// Call a leaf (no oop-map) VM runtime routine; the caller has already
// pushed the arguments, which are popped here after the call returns.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}
// Compare a memory word against a Metadata* immediate (32-bit, relocated).
void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Compare a register against a Metadata* immediate (32-bit, relocated).
void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Compare a memory word against a jobject immediate (32-bit, relocated).
void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Compare a register against a jobject immediate (32-bit, relocated).
void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
// Sign-extend lo into hi (prepare rdx:rax for 64-bit divide on 32-bit VMs).
void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    // cdq is the short form, preferred on P6+ when the registers line up
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);
  }
}
// Jump to L if FPU condition flag C2 is set; FPU status is transferred
// to EFLAGS through rax (tmp preserves the caller's rax).
void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

// Jump to L if FPU condition flag C2 is clear (inverse of jC2).
void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}
// Note: y_lo will be destroyed
// Leaves -1, 0 or +1 in x_hi depending on x <, ==, > y.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);     // unsigned compare of the low halves
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);      // result = +1
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);     // result = -1

  bind(done);
}
// 32-bit: load the literal's target address (with relocation) into dst.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

// 32-bit: store the literal's target address into a memory word.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}
// Tear down the current frame: restore rsp from rbp, then pop saved rbp.
void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}
// 64-bit multiply of two longs spilled on the stack (32-bit VM).
// Clobbers rax, rbx, rcx and rdx; result is left in rdx:rax.
void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset |
  //          [ y_lo ] /  (in bytes)   | x_rsp_offset
  //          [ y_hi ]                 | (in bytes)
  //            ....                   |
  //          [ x_lo ] /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}
// Two's-complement negate of the 64-bit value hi:lo (32-bit VM).
void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);   // propagate borrow from the low word
  negl(hi);
}
// 64-bit left shift of hi:lo by rcx (32-bit VM).
void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}
// 64-bit right shift (arithmetic if sign_extension, else logical) of hi:lo
// by rcx (32-bit VM).
void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}
// Load a jobject immediate into a register (32-bit, oop-relocated).
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Store a jobject immediate into memory (32-bit, oop-relocated).
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Load a Metadata* immediate into a register (32-bit, metadata-relocated).
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Store a Metadata* immediate into memory (32-bit, metadata-relocated).
void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}
// Load a pointer-sized value: an lval literal is materialized as a
// relocated immediate, otherwise the value is loaded from memory.
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}
// Restore the registers saved by push_callee_saved_registers (reverse order).
void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

// Pop a double from the machine stack onto the FPU register stack.
void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);
}

// Save rsi, rdi, rdx, rcx on the stack (pair of pop_callee_saved_registers).
void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}

// Push the FPU top-of-stack double onto the machine stack (popping the FPU).
void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
}
// Push a jobject immediate (32-bit, oop-relocated).
void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Push a Metadata* immediate (32-bit, metadata-relocated).
void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Push either the literal's address (lval) or the word it points to.
void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}
// dst := 1 if ZF is clear, else 0 (materializes a boolean from flags).
void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}
// 32-bit C calling convention passes arguments on the stack, so each
// pass_argN simply pushes (callers push in reverse argument order).
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
388 #ifndef PRODUCT
389 extern "C" void findpc(intptr_t x);
390 #endif
392 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
393 // In order to get locks to work, we need to fake a in_VM state
394 JavaThread* thread = JavaThread::current();
395 JavaThreadState saved_state = thread->thread_state();
396 thread->set_thread_state(_thread_in_vm);
397 if (ShowMessageBoxOnError) {
398 JavaThread* thread = JavaThread::current();
399 JavaThreadState saved_state = thread->thread_state();
400 thread->set_thread_state(_thread_in_vm);
401 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
402 ttyLocker ttyl;
403 BytecodeCounter::print();
404 }
405 // To see where a verify_oop failed, get $ebx+40/X for this frame.
406 // This is the value of eip which points to where verify_oop will return.
407 if (os::message_box(msg, "Execution stopped, print registers?")) {
408 print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
409 BREAKPOINT;
410 }
411 } else {
412 ttyLocker ttyl;
413 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
414 }
415 // Don't assert holding the ttyLock
416 assert(false, err_msg("DEBUG MESSAGE: %s", msg));
417 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
418 }
// Dump the saved registers, nearby stack words and disassembly around eip
// to the tty. Called from debug32/print_state with pusha()-saved values.
void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}
// Emit code that halts execution: pushes msg and the current eip, saves
// all registers, and calls debug32 (which never returns normally).
void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}
// Emit code that prints a warning via the VM 'warning' routine, preserving
// all CPU state around the call (execution continues afterwards).
void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
// Emit code that dumps register/stack state via print_state32 and resumes.
void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);   // discard the pushed eip
}
495 #else // _LP64
497 // 64 bit versions
// 64-bit: encode an AddressLiteral as a pc-relative Address (must be
// reachable with a 32-bit displacement from the current pc).
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}
// 64-bit: materialize the array base in rscratch1 and return an indexed
// Address over it (clobbers rscratch1).
Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}
// 64-bit leaf VM call: arguments are already in registers; keeps rsp
// 16-byte aligned around the call and reserves the Win64 shadow area.
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for it's register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);         // pad to a 16-byte boundary for the call
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);              // already aligned
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}
// 64-bit compare of src1 against the value stored at src2; goes through
// rscratch1 when the literal is not rip-reachable.
void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "should use cmpptr");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
}
// Emit a Java-correct 64-bit divide (handles min_long / -1 overflow case).
// Returns the code offset of the idivq for implicit-exception bookkeeping.
int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
// Subtract 'value' from a 64-bit register using the shortest encoding
// (decq for 1 when UseIncDec, subq otherwise; negative values forward
// to incrementq; min_jint cannot be negated so it falls through to subq).
void MacroAssembler::decrementq(Register reg, int value) {
  if (value == min_jint) { subq(reg, value); return; }
  if (value <  0) { incrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(reg) ; return; }
  /* else */      { subq(reg, value)       ; return; }
}

// Memory-operand flavor of the above.
void MacroAssembler::decrementq(Address dst, int value) {
  if (value == min_jint) { subq(dst, value); return; }
  if (value <  0) { incrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(dst) ; return; }
  /* else */      { subq(dst, value)       ; return; }
}
616 void MacroAssembler::incrementq(Register reg, int value) {
617 if (value == min_jint) { addq(reg, value); return; }
618 if (value < 0) { decrementq(reg, -value); return; }
619 if (value == 0) { ; return; }
620 if (value == 1 && UseIncDec) { incq(reg) ; return; }
621 /* else */ { addq(reg, value) ; return; }
622 }
624 void MacroAssembler::incrementq(Address dst, int value) {
625 if (value == min_jint) { addq(dst, value); return; }
626 if (value < 0) { decrementq(dst, -value); return; }
627 if (value == 0) { ; return; }
628 if (value == 1 && UseIncDec) { incq(dst) ; return; }
629 /* else */ { addq(dst, value) ; return; }
630 }
// 64-bit case-table jump: the table base is materialized in rscratch1
// and patched into the index Address, then an indirect jmp dispatches.
// (32-bit could do this in one instruction, but the base is no longer
// allowed to be installed in the Address class.)
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}
// Unused on 64-bit (longs live in single registers); kept for interface parity.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}
// 64-bit: load the literal's target address (with relocation) into dst.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

// 64-bit: store the literal's target address into memory via rscratch1.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}
// Tear down the current frame with the single-byte LEAVE instruction.
void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}
// Unused on 64-bit (longs live in single registers); kept for interface parity.
void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}
// Load a jobject immediate into a register (64-bit, oop-relocated).
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

// Store a jobject immediate into memory via rscratch1.
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

// Load a Metadata* immediate into a register (64-bit, metadata-relocated).
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

// Store a Metadata* immediate into memory via rscratch1.
void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}
// Load a pointer-sized value: lval literals become relocated immediates;
// otherwise load from memory, indirecting through rscratch1 when the
// address is not rip-reachable.
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(rscratch1, src);
      movq(dst, Address(rscratch1,0));
    }
  }
}
// Store a register into an array slot (base materialized in rscratch1).
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

// Load an array slot into a register.
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}
// These are mostly for initializing NULL
// (sign-extended 32-bit stores/loads of small constants).
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}
// Push a jobject immediate via rscratch1 (64-bit immediates can't be pushed directly).
void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

// Push a Metadata* immediate via rscratch1.
void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}
// Push either the literal's address (lval) or the word it points to,
// always staging the address in rscratch1.
void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}
// Clear the thread's last-Java-frame anchor (sp always; fp/pc on request)
// so the stack walker no longer sees a Java frame.
void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}
// Record the last Java frame in the thread anchor. sp defaults to rsp if
// invalid; fp and pc are recorded only when supplied. sp is written last
// since a non-zero last_Java_sp is what publishes the anchor.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
779 static void pass_arg0(MacroAssembler* masm, Register arg) {
780 if (c_rarg0 != arg ) {
781 masm->mov(c_rarg0, arg);
782 }
783 }
785 static void pass_arg1(MacroAssembler* masm, Register arg) {
786 if (c_rarg1 != arg ) {
787 masm->mov(c_rarg1, arg);
788 }
789 }
791 static void pass_arg2(MacroAssembler* masm, Register arg) {
792 if (c_rarg2 != arg ) {
793 masm->mov(c_rarg2, arg);
794 }
795 }
797 static void pass_arg3(MacroAssembler* masm, Register arg) {
798 if (c_rarg3 != arg ) {
799 masm->mov(c_rarg3, arg);
800 }
801 }
// Emit code that halts execution: saves all registers, passes msg, the
// faulting rip and a pointer to the saved registers to debug64, then hlt.
void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}
// Emit code that prints a warning via the VM 'warning' routine; builds a
// temporary frame in rbp so rsp can be realigned and restored afterwards.
void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}
// Emit code that dumps register/stack state via print_state64 and resumes:
// saves all registers, passes rip and a pointer to the pusha() area.
void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}
844 #ifndef PRODUCT
845 extern "C" void findpc(intptr_t x);
846 #endif
// Runtime entry reached from MacroAssembler::stop() on 64-bit. regs[]
// points at the pusha()-saved registers (layout decoded by print_state64).
// Fakes an in-VM thread state so the tty lock / message box work.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
      assert(false, "start up GDB");
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  }
}
// Dump the saved registers, nearby stack words and disassembly around pc
// to the tty. regs[] is the pusha() area: regs[15] = rax ... regs[0] = r15.
void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  PRINT_REG(rsp, regs[11]);
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near top of stack.
  int64_t* rsp = (int64_t*) regs[11];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}
925 #endif // _LP64
927 // Now versions that are common to 32/64 bit
// Pointer-sized add: addq on 64-bit, addl on 32-bit.
void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}

void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}
// Scalar double add from an address literal; indirect through rscratch1
// when the literal is not reachable with a rip-relative displacement.
void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addsd(dst, Address(rscratch1, 0));
  }
}
950 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
951 if (reachable(src)) {
952 addss(dst, as_Address(src));
953 } else {
954 lea(rscratch1, src);
955 addss(dst, Address(rscratch1, 0));
956 }
957 }
959 void MacroAssembler::align(int modulus) {
960 if (offset() % modulus != 0) {
961 nop(modulus - (offset() % modulus));
962 }
963 }
// Packed-double AND from an address literal (used for sign masking).
void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::andpd(dst, Address(rscratch1, 0));
  }
}

// Packed-single AND from an address literal (used for sign masking).
void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::andps(dst, Address(rscratch1, 0));
  }
}
// Pointer-sized AND with an immediate: andq on 64-bit, andl on 32-bit.
void MacroAssembler::andptr(Register dst, int32_t imm32) {
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}
// Atomically increment the 32-bit counter at counter_addr (lock prefix on
// MP systems); EFLAGS are preserved via pushf/popf.
void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
  pushf();
  if (reachable(counter_addr)) {
    if (os::is_MP())
      lock();
    incrementl(as_Address(counter_addr));
  } else {
    lea(rscratch1, counter_addr);
    if (os::is_MP())
      lock();
    incrementl(Address(rscratch1, 0));
  }
  popf();
}
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 1; i <= StackShadowPages; i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}
// Emit the biased-locking fast path for monitorenter.
//
// lock_reg  - the on-stack BasicLock (box); on x86_32 it may also be
//             borrowed as a scratch register (see need_tmp_reg below)
// obj_reg   - the object being locked
// swap_reg  - must be rax (the implicit cmpxchg comparand); holds the
//             mark word during the sequence
// tmp_reg   - scratch register; may be noreg on x86_32 only
// swap_reg_contains_mark - true if the caller has already loaded the
//             object's mark word into swap_reg
// done      - jumped to when the lock has been acquired via biasing
// slow_case - if non-NULL, jumped to when a failed CAS requires the
//             runtime to revoke or rebias
// counters  - optional biased-locking statistics to update
//
// Returns the code offset of the first instruction that may fault on a
// null obj_reg (for implicit null-check bookkeeping).  Falls through to
// cas_label (bound at the end of the emitted code) when biasing does not
// apply and the caller should continue with CAS-based locking.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); )
  bool need_tmp_reg = false;
  if (tmp_reg == noreg) {
    // x86_32 caller supplied no scratch register: borrow lock_reg and
    // save/restore its value (push/pop) around every use below.
    need_tmp_reg = true;
    tmp_reg = lock_reg;
    assert_different_registers(lock_reg, obj_reg, swap_reg);
  } else {
    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  }
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  if (PrintBiasedLockingStatistics && counters == NULL) {
    counters = BiasedLocking::counters();
  }
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movptr(swap_reg, mark_addr);
  }
  if (need_tmp_reg) {
    push(tmp_reg);   // preserve lock_reg while it serves as scratch
  }
  movptr(tmp_reg, swap_reg);
  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
#ifndef _LP64
  // Note that because there is no current thread register on x86_32 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movptr(saved_mark_addr, swap_reg);
#endif
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  // header_reg = prototype | thread, XORed against the mark word:
  // zero means bias owner and epoch (and all other bits) match.
  orptr(tmp_reg, r15_thread);
  xorptr(tmp_reg, swap_reg);
  Register header_reg = tmp_reg;
#else
  xorptr(tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorptr(swap_reg, tmp_reg);
  Register header_reg = swap_reg;
#endif
  // Ignore the age bits when comparing - they may legitimately differ.
  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
  jccb(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testptr(header_reg, markOopDesc::epoch_mask_in_place);
  jccb(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  andptr(swap_reg,
         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
#ifdef _LP64
  movptr(tmp_reg, swap_reg);
  orptr(tmp_reg, r15_thread);
#else
  get_thread(tmp_reg);
  orptr(tmp_reg, swap_reg);
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
#else
  get_thread(swap_reg);
  orptr(tmp_reg, swap_reg);
  movptr(swap_reg, saved_mark_addr);
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  load_prototype_header(tmp_reg, obj_reg);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}
// Emit the biased-locking fast path for monitorexit: if the object's mark
// word still carries the biased-lock pattern, unlocking is a no-op and
// control jumps to 'done'; otherwise execution falls through to the
// caller's normal unlock sequence.  temp_reg is clobbered.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::equal, done);
}
1276 #ifdef COMPILER2
1277 // Fast_Lock and Fast_Unlock used by C2
1279 // Because the transitions from emitted code to the runtime
1280 // monitorenter/exit helper stubs are so slow it's critical that
1281 // we inline both the stack-locking fast-path and the inflated fast path.
1282 //
1283 // See also: cmpFastLock and cmpFastUnlock.
1284 //
1285 // What follows is a specialized inline transliteration of the code
1286 // in slow_enter() and slow_exit(). If we're concerned about I$ bloat
1287 // another option would be to emit TrySlowEnter and TrySlowExit methods
1288 // at startup-time. These methods would accept arguments as
1289 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1290 // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
1291 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1292 // In practice, however, the # of lock sites is bounded and is usually small.
1293 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
1297 //
1298 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
1299 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1300 // to those specialized methods. That'd give us a mostly platform-independent
1301 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
1303 // to park() or unpark() threads. We'd also need a few more unsafe operators
1304 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1305 // (b) explicit barriers or fence operations.
1306 //
1307 // TODO:
1308 //
1309 // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1310 // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1311 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
1312 // the lock operators would typically be faster than reifying Self.
1313 //
1314 // * Ideally I'd define the primitives as:
1315 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1316 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1317 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
1319 // Furthermore the register assignments are overconstrained, possibly resulting in
1320 // sub-optimal code near the synchronization site.
1321 //
1322 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
1323 // Alternately, use a better sp-proximity test.
1324 //
1325 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1326 // Either one is sufficient to uniquely identify a thread.
1327 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1328 //
1329 // * Intrinsify notify() and notifyAll() for the common cases where the
1330 // object is locked by the calling thread but the waitlist is empty.
1331 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
1332 //
1333 // * use jccb and jmpb instead of jcc and jmp to improve code density.
1334 // But beware of excessive branch density on AMD Opterons.
1335 //
1336 // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1337 // or failure of the fast-path. If the fast-path fails then we pass
1338 // control to the slow-path, typically in C. In Fast_Lock and
1339 // Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1340 // will emit a conditional branch immediately after the node.
1341 // So we have branches to branches and lots of ICC.ZF games.
1342 // Instead, it might be better to have C2 pass a "FailureLabel"
1343 // into Fast_Lock and Fast_Unlock. In the case of success, control
1344 // will drop through the node. ICC.ZF is undefined at exit.
1345 // In the case of failure, the node will branch directly to the
1346 // FailureLabel
// Fast_Lock: emitted by C2 at monitorenter sites (cmpFastLock node).
// On exit ICC.ZF == 1 indicates success (lock acquired) and ZF == 0
// indicates failure, in which case the compiled code branches to the
// slow path.  EmitSync bits select diagnostic/alternative encodings.
//
// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, BiasedLockingCounters* counters) {
  // Ensure the register assignments are disjoint
  guarantee (objReg != boxReg, "");
  guarantee (objReg != tmpReg, "");
  guarantee (objReg != scrReg, "");
  guarantee (boxReg != tmpReg, "");
  guarantee (boxReg != scrReg, "");
  guarantee (tmpReg == rax, "");

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()));
  }
  if (EmitSync & 1) {
      // Diagnostic mode: set box->dhw = unused_mark (3)
      // Force all sync thru slow-path: slow_enter() and slow_exit()
      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
      cmpptr (rsp, (int32_t)NULL_WORD);   // rsp is never null, so ZF=0 forces the slow path
  } else
  if (EmitSync & 2) {
      // Diagnostic mode: stack-locking only, no inflated fast path.
      Label DONE_LABEL ;
      if (UseBiasedLocking) {
         // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
         biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
      }

      movptr(tmpReg, Address(objReg, 0));           // fetch markword
      orptr (tmpReg, 0x1);                          // set the "unlocked" bit for the expected value
      movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
      if (os::is_MP()) {
        lock();
      }
      cmpxchgptr(boxReg, Address(objReg, 0));       // Updates tmpReg
      jccb(Assembler::equal, DONE_LABEL);
      // Recursive locking: test whether the old markword points into our
      // own stack frame; if so, store 0 in the box (recursive stack-lock).
      subptr(tmpReg, rsp);
      andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
      movptr(Address(boxReg, 0), tmpReg);
      bind(DONE_LABEL);
  } else {
    // Possible cases that we'll encounter in fast_lock
    // ------------------------------------------------
    // * Inflated
    //    -- unlocked
    //    -- Locked
    //       = by self
    //       = by other
    // * biased
    //    -- by Self
    //    -- by other
    // * neutral
    // * stack-locked
    //    -- by self
    //       = sp-proximity test hits
    //       = sp-proximity test generates false-negative
    //    -- by other
    //

    Label IsInflated, DONE_LABEL;

    // it's stack-locked, biased or neutral
    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
    // order to reduce the number of conditional branches in the most common cases.
    // Beware -- there's a subtle invariant that fetch of the markword
    // at [FETCH], below, will never observe a biased encoding (*101b).
    // If this invariant is not held we risk exclusion (safety) failure.
    if (UseBiasedLocking && !UseOptoBiasInlining) {
      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
    }

    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
    testl (tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
    jccb  (Assembler::notZero, IsInflated);

    // Attempt stack-locking ...
    orptr (tmpReg, 0x1);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    if (os::is_MP()) {
      lock();
    }
    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
    if (counters != NULL) {
      cond_inc32(Assembler::equal,
                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
    }
    jccb(Assembler::equal, DONE_LABEL);

    // Recursive locking.  Distance check: a markword that is "near" rsp
    // must be a pointer into our own stack, i.e. a recursive stack-lock.
    subptr(tmpReg, rsp);
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);          // 0 => recursive; sets ZF accordingly
    if (counters != NULL) {
      cond_inc32(Assembler::equal,
                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
    }
    jmpb(DONE_LABEL);

    bind(IsInflated);
#ifndef _LP64
    // The object is inflated.
    //
    // TODO-FIXME: eliminate the ugly use of manifest constants:
    //   Use markOopDesc::monitor_value instead of "2".
    //   use markOop::unused_mark() instead of "3".
    // The tmpReg value is an objectMonitor reference ORed with
    // markOopDesc::monitor_value (2).  We can either convert tmpReg to an
    // objectmonitor pointer by masking off the "2" bit or we can just
    // use tmpReg as an objectmonitor pointer but bias the objectmonitor
    // field offsets with "-2" to compensate for and annul the low-order tag bit.
    //
    // I use the latter as it avoids AGI stalls.
    // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
    // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
    //
    #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)

    // boxReg refers to the on-stack BasicLock in the current frame.
    // We'd like to write:
    //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
    // This is convenient but results a ST-before-CAS penalty.  The following CAS suffers
    // additional latency as we have another ST in the store buffer that must drain.

    if (EmitSync & 8192) {
       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
       get_thread (scrReg);
       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
       movptr(tmpReg, NULL_WORD);                // consider: xor vs mov
       if (os::is_MP()) {
         lock();
       }
       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
    } else
    if ((EmitSync & 128) == 0) {                 // avoid ST-before-CAS
       movptr(scrReg, boxReg);
       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
          // prefetchw [eax + Offset(_owner)-2]
          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
       }

       if ((EmitSync & 64) == 0) {
         // Optimistic form: consider XORL tmpReg,tmpReg
         movptr(tmpReg, NULL_WORD);
       } else {
         // Can suffer RTS->RTO upgrades on shared or cold $ lines
         // Test-And-CAS instead of CAS
         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
         testptr(tmpReg, tmpReg);                   // Locked ?
         jccb  (Assembler::notZero, DONE_LABEL);
       }

       // Appears unlocked - try to swing _owner from null to non-null.
       // Ideally, I'd manifest "Self" with get_thread and then attempt
       // to CAS the register containing Self into m->Owner.
       // But we don't have enough registers, so instead we can either try to CAS
       // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
       // we later store "Self" into m->Owner.  Transiently storing a stack address
       // (rsp or the address of the box) into m->owner is harmless.
       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
       if (os::is_MP()) {
         lock();
       }
       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
       movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
       jccb  (Assembler::notZero, DONE_LABEL);
       get_thread (scrReg);                    // beware: clobbers ICCs
       movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg);
       xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

       // If the CAS fails we can either retry or pass control to the slow-path.
       // We use the latter tactic.
       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
       // If the CAS was successful ...
       //   Self has acquired the lock
       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
       // Intentional fall-through into DONE_LABEL ...
    } else {
       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
       movptr(boxReg, tmpReg);

       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
          // prefetchw [eax + Offset(_owner)-2]
          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
       }

       if ((EmitSync & 64) == 0) {
         // Optimistic form
         xorptr  (tmpReg, tmpReg);
       } else {
         // Can suffer RTS->RTO upgrades on shared or cold $ lines
         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
         testptr(tmpReg, tmpReg);                   // Locked ?
         jccb  (Assembler::notZero, DONE_LABEL);
       }

       // Appears unlocked - try to swing _owner from null to non-null.
       // Use either "Self" (in scr) or rsp as thread identity in _owner.
       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
       get_thread (scrReg);
       if (os::is_MP()) {
         lock();
       }
       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));

       // If the CAS fails we can either retry or pass control to the slow-path.
       // We use the latter tactic.
       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
       // If the CAS was successful ...
       //   Self has acquired the lock
       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
       // Intentional fall-through into DONE_LABEL ...
    }
#else // _LP64
    // It's inflated

    // TODO: someday avoid the ST-before-CAS penalty by
    // relocating (deferring) the following ST.
    // We should also think about trying a CAS without having
    // fetched _owner.  If the CAS is successful we may
    // avoid an RTO->RTS upgrade on the $line.

    // Without cast to int32_t a movptr will destroy r10 which is typically obj
    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));

    mov    (boxReg, tmpReg);
    movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
    testptr(tmpReg, tmpReg);
    jccb   (Assembler::notZero, DONE_LABEL);

    // It's inflated and appears unlocked
    if (os::is_MP()) {
      lock();
    }
    cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
    // Intentional fall-through into DONE_LABEL ...

#endif

    // DONE_LABEL is a hot target - we'd really like to place it at the
    // start of cache line by padding with NOPs.
    // See the AMD and Intel software optimization manuals for the
    // most efficient "long" NOP encodings.
    // Unfortunately none of our alignment mechanisms suffice.
    bind(DONE_LABEL);

    // At DONE_LABEL the icc ZFlag is set as follows ...
    // Fast_Unlock uses the same protocol.
    // ZFlag == 1 -> Success
    // ZFlag == 0 -> Failure - force control through the slow-path
  }
}
1607 // obj: object to unlock
1608 // box: box address (displaced header location), killed. Must be EAX.
1609 // tmp: killed, cannot be obj nor box.
1610 //
1611 // Some commentary on balanced locking:
1612 //
1613 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1614 // Methods that don't have provably balanced locking are forced to run in the
1615 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1616 // The interpreter provides two properties:
1617 // I1: At return-time the interpreter automatically and quietly unlocks any
1618 // objects acquired the current activation (frame). Recall that the
1619 // interpreter maintains an on-stack list of locks currently held by
1620 // a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame the interpreter throws IMSX.
1623 //
1624 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
1625 // B() doesn't have provably balanced locking so it runs in the interpreter.
1626 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
1627 // is still locked by A().
1628 //
1629 // The only other source of unbalanced locking would be JNI. The "Java Native Interface:
1630 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1631 // should not be unlocked by "normal" java-level locking and vice-versa. The specification
1632 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1634 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
1635 guarantee (objReg != boxReg, "");
1636 guarantee (objReg != tmpReg, "");
1637 guarantee (boxReg != tmpReg, "");
1638 guarantee (boxReg == rax, "");
1640 if (EmitSync & 4) {
1641 // Disable - inhibit all inlining. Force control through the slow-path
1642 cmpptr (rsp, 0);
1643 } else
1644 if (EmitSync & 8) {
1645 Label DONE_LABEL;
1646 if (UseBiasedLocking) {
1647 biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1648 }
1649 // Classic stack-locking code ...
1650 // Check whether the displaced header is 0
1651 //(=> recursive unlock)
1652 movptr(tmpReg, Address(boxReg, 0));
1653 testptr(tmpReg, tmpReg);
1654 jccb(Assembler::zero, DONE_LABEL);
1655 // If not recursive lock, reset the header to displaced header
1656 if (os::is_MP()) {
1657 lock();
1658 }
1659 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
1660 bind(DONE_LABEL);
1661 } else {
1662 Label DONE_LABEL, Stacked, CheckSucc;
1664 // Critically, the biased locking test must have precedence over
1665 // and appear before the (box->dhw == 0) recursive stack-lock test.
1666 if (UseBiasedLocking && !UseOptoBiasInlining) {
1667 biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1668 }
1670 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1671 movptr(tmpReg, Address(objReg, 0)); // Examine the object's markword
1672 jccb (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock
1674 testptr(tmpReg, 0x02); // Inflated?
1675 jccb (Assembler::zero, Stacked);
1677 // It's inflated.
1678 // Despite our balanced locking property we still check that m->_owner == Self
1679 // as java routines or native JNI code called by this thread might
1680 // have released the lock.
1681 // Refer to the comments in synchronizer.cpp for how we might encode extra
1682 // state in _succ so we can avoid fetching EntryList|cxq.
1683 //
1684 // I'd like to add more cases in fast_lock() and fast_unlock() --
1685 // such as recursive enter and exit -- but we have to be wary of
1686 // I$ bloat, T$ effects and BP$ effects.
1687 //
1688 // If there's no contention try a 1-0 exit. That is, exit without
1689 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
1690 // we detect and recover from the race that the 1-0 exit admits.
1691 //
1692 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1693 // before it STs null into _owner, releasing the lock. Updates
1694 // to data protected by the critical section must be visible before
1695 // we drop the lock (and thus before any other thread could acquire
1696 // the lock and observe the fields protected by the lock).
1697 // IA32's memory-model is SPO, so STs are ordered with respect to
1698 // each other and there's no need for an explicit barrier (fence).
1699 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
1700 #ifndef _LP64
1701 get_thread (boxReg);
1702 if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1703 // prefetchw [ebx + Offset(_owner)-2]
1704 prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1705 }
1707 // Note that we could employ various encoding schemes to reduce
1708 // the number of loads below (currently 4) to just 2 or 3.
1709 // Refer to the comments in synchronizer.cpp.
1710 // In practice the chain of fetches doesn't seem to impact performance, however.
1711 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
1712 // Attempt to reduce branch density - AMD's branch predictor.
1713 xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1714 orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
1715 orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
1716 orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
1717 jccb (Assembler::notZero, DONE_LABEL);
1718 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
1719 jmpb (DONE_LABEL);
1720 } else {
1721 xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1722 orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
1723 jccb (Assembler::notZero, DONE_LABEL);
1724 movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
1725 orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
1726 jccb (Assembler::notZero, CheckSucc);
1727 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
1728 jmpb (DONE_LABEL);
1729 }
1731 // The Following code fragment (EmitSync & 65536) improves the performance of
1732 // contended applications and contended synchronization microbenchmarks.
1733 // Unfortunately the emission of the code - even though not executed - causes regressions
1734 // in scimark and jetstream, evidently because of $ effects. Replacing the code
1735 // with an equal number of never-executed NOPs results in the same regression.
1736 // We leave it off by default.
1738 if ((EmitSync & 65536) != 0) {
1739 Label LSuccess, LGoSlowPath ;
1741 bind (CheckSucc);
1743 // Optional pre-test ... it's safe to elide this
1744 if ((EmitSync & 16) == 0) {
1745 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
1746 jccb (Assembler::zero, LGoSlowPath);
1747 }
1749 // We have a classic Dekker-style idiom:
1750 // ST m->_owner = 0 ; MEMBAR; LD m->_succ
1751 // There are a number of ways to implement the barrier:
1752 // (1) lock:andl &m->_owner, 0
1753 // is fast, but mask doesn't currently support the "ANDL M,IMM32" form.
1754 // LOCK: ANDL [ebx+Offset(_Owner)-2], 0
1755 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
1756 // (2) If supported, an explicit MFENCE is appealing.
1757 // In older IA32 processors MFENCE is slower than lock:add or xchg
1758 // particularly if the write-buffer is full as might be the case if
1759 // if stores closely precede the fence or fence-equivalent instruction.
1760 // In more modern implementations MFENCE appears faster, however.
1761 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
1762 // The $lines underlying the top-of-stack should be in M-state.
1763 // The locked add instruction is serializing, of course.
1764 // (4) Use xchg, which is serializing
1765 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
1766 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
1767 // The integer condition codes will tell us if succ was 0.
1768 // Since _succ and _owner should reside in the same $line and
1769 // we just stored into _owner, it's likely that the $line
1770 // remains in M-state for the lock:orl.
1771 //
1772 // We currently use (3), although it's likely that switching to (2)
1773 // is correct for the future.
1775 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
1776 if (os::is_MP()) {
1777 if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
1778 mfence();
1779 } else {
1780 lock (); addptr(Address(rsp, 0), 0);
1781 }
1782 }
1783 // Ratify _succ remains non-null
1784 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0);
1785 jccb (Assembler::notZero, LSuccess);
1787 xorptr(boxReg, boxReg); // box is really EAX
1788 if (os::is_MP()) { lock(); }
1789 cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1790 jccb (Assembler::notEqual, LSuccess);
1791 // Since we're low on registers we installed rsp as a placeholding in _owner.
1792 // Now install Self over rsp. This is safe as we're transitioning from
// non-null to non-null
1794 get_thread (boxReg);
1795 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
1796 // Intentional fall-through into LGoSlowPath ...
1798 bind (LGoSlowPath);
1799 orptr(boxReg, 1); // set ICC.ZF=0 to indicate failure
1800 jmpb (DONE_LABEL);
1802 bind (LSuccess);
1803 xorptr(boxReg, boxReg); // set ICC.ZF=1 to indicate success
1804 jmpb (DONE_LABEL);
1805 }
1807 bind (Stacked);
1808 // It's not inflated and it's not recursively stack-locked and it's not biased.
1809 // It must be stack-locked.
1810 // Try to reset the header to displaced header.
1811 // The "box" value on the stack is stable, so we can reload
1812 // and be assured we observe the same value as above.
1813 movptr(tmpReg, Address(boxReg, 0));
1814 if (os::is_MP()) {
1815 lock();
1816 }
1817 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
// Intentional fall-through into DONE_LABEL
1820 // DONE_LABEL is a hot target - we'd really like to place it at the
1821 // start of cache line by padding with NOPs.
1822 // See the AMD and Intel software optimization manuals for the
1823 // most efficient "long" NOP encodings.
1824 // Unfortunately none of our alignment mechanisms suffice.
1825 if ((EmitSync & 65536) == 0) {
1826 bind (CheckSucc);
1827 }
1828 #else // _LP64
1829 // It's inflated
1830 movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1831 xorptr(boxReg, r15_thread);
1832 orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
1833 jccb (Assembler::notZero, DONE_LABEL);
1834 movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
1835 orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
1836 jccb (Assembler::notZero, CheckSucc);
1837 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
1838 jmpb (DONE_LABEL);
1840 if ((EmitSync & 65536) == 0) {
1841 Label LSuccess, LGoSlowPath ;
1842 bind (CheckSucc);
1843 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
1844 jccb (Assembler::zero, LGoSlowPath);
1846 // I'd much rather use lock:andl m->_owner, 0 as it's faster than the
1847 // the explicit ST;MEMBAR combination, but masm doesn't currently support
1848 // "ANDQ M,IMM". Don't use MFENCE here. lock:add to TOS, xchg, etc
1849 // are all faster when the write buffer is populated.
1850 movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
1851 if (os::is_MP()) {
1852 lock (); addl (Address(rsp, 0), 0);
1853 }
1854 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
1855 jccb (Assembler::notZero, LSuccess);
1857 movptr (boxReg, (int32_t)NULL_WORD); // box is really EAX
1858 if (os::is_MP()) { lock(); }
1859 cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1860 jccb (Assembler::notEqual, LSuccess);
1861 // Intentional fall-through into slow-path
1863 bind (LGoSlowPath);
1864 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure
1865 jmpb (DONE_LABEL);
1867 bind (LSuccess);
1868 testl (boxReg, 0); // set ICC.ZF=1 to indicate success
1869 jmpb (DONE_LABEL);
1870 }
1872 bind (Stacked);
1873 movptr(tmpReg, Address (boxReg, 0)); // re-fetch
1874 if (os::is_MP()) { lock(); }
1875 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
1877 if (EmitSync & 65536) {
1878 bind (CheckSucc);
1879 }
1880 #endif
1881 bind(DONE_LABEL);
1882 // Avoid branch to branch on AMD processors
1883 if (EmitSync & 32768) {
1884 nop();
1885 }
1886 }
1887 }
1888 #endif // COMPILER2
// Normalize a C-style boolean in register x to a canonical 0/1 value.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  // since C-style booleans are stored in one byte
  // only! (was bug)
  andl(x, 0xFF);                  // also sets ZF from the low byte
  setb(Assembler::notZero, x);    // x = (low byte != 0) ? 1 : 0
}
1899 // Wouldn't need if AddressLiteral version had new name
// Emit a call to a local label with the given relocation type.
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}
// Emit an indirect call through the given register.
void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}
1908 void MacroAssembler::call(AddressLiteral entry) {
1909 if (reachable(entry)) {
1910 Assembler::call_literal(entry.target(), entry.rspec());
1911 } else {
1912 lea(rscratch1, entry);
1913 Assembler::call(rscratch1);
1914 }
1915 }
// Emit an inline-cache call: loads the non-oop cache sentinel into rax
// (the IC register) and calls entry with a virtual_call relocation.
void MacroAssembler::ic_call(address entry) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc());
  movptr(rax, (intptr_t)Universe::non_oop_word());
  call(AddressLiteral(entry, rh));
}
1923 // Implementation of call_VM versions
1925 void MacroAssembler::call_VM(Register oop_result,
1926 address entry_point,
1927 bool check_exceptions) {
1928 Label C, E;
1929 call(C, relocInfo::none);
1930 jmp(E);
1932 bind(C);
1933 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1934 ret(0);
1936 bind(E);
1937 }
// Call into the VM runtime with one Java argument.
// Uses the same near call/jmp trick as the 0-arg variant so that
// call_VM_helper can recover last_Java_pc from the stack.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}
// Call into the VM runtime with two Java arguments.
// Arguments are passed in reverse order so that an earlier pass_argN
// cannot clobber a register still holding a later argument.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}
// Call into the VM runtime with three Java arguments.
// Arguments are passed in reverse order; the LP64 asserts verify no
// still-pending argument lives in a register about to be overwritten.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}
// Call into the VM runtime with an explicit last_java_sp and a
// pre-passed argument count; thread register is implicit (r15 on LP64,
// resolved inside call_VM_base on 32-bit).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
// One-argument variant of call_VM with explicit last_java_sp.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
// Two-argument variant of call_VM with explicit last_java_sp.
// Arguments are passed in reverse order to avoid clobbering.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
// Three-argument variant of call_VM with explicit last_java_sp.
// Arguments are passed in reverse order to avoid clobbering.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
// Like call_VM but dispatches to MacroAssembler::call_VM_base directly,
// bypassing any virtual override of call_VM_base in a subclass.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
// One-argument variant of super_call_VM.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
// Two-argument variant of super_call_VM; arguments passed in reverse
// order to avoid clobbering.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
// Three-argument variant of super_call_VM; arguments passed in reverse
// order to avoid clobbering.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
// Common implementation behind all call_VM variants: resolves the
// thread and last_java_sp registers, records the last Java frame,
// calls entry_point in the VM with the thread as implicit first
// argument, then restores state, checks for pending exceptions and
// fetches any oop result back from the thread.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address entry_point,
                                  int number_of_arguments,
                                  bool check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call

  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    // Verify the callee-saved assumption actually held across the call.
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true, false);

#ifndef CC_INTERP
   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);
#endif /* CC_INTERP */

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}
// Compute last_Java_sp for the enclosing call_VM and forward to
// call_VM_base.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM only can use register args
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.

#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);

}
// Leaf call into the VM (no Java frame bookkeeping), n pre-passed args.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
// One-argument leaf call into the VM.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}
// Two-argument leaf call into the VM; arguments passed in reverse
// order to avoid clobbering.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}
// Three-argument leaf call into the VM; arguments passed in reverse
// order to avoid clobbering.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}
// One-argument leaf call that bypasses any call_VM_leaf_base override.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
// Two-argument variant; arguments passed in reverse order.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
// Three-argument variant; arguments passed in reverse order.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
// Four-argument variant; arguments passed in reverse order.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
// Fetch the oop result deposited by the VM into the thread, clear the
// slot (so GC doesn't see a stale oop), and verify the fetched oop.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
  verify_oop(oop_result, "broken oop in call_VM_base");
}
// Fetch the metadata result deposited by the VM into the thread and
// clear the slot. No oop verification: the value is not an oop.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
}
// Intentionally empty here; subclasses that need early-return
// processing after a VM call override this hook.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
}
// Intentionally empty here; subclasses that need pop-frame
// processing after a VM call override this hook.
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
}
2299 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2300 if (reachable(src1)) {
2301 cmpl(as_Address(src1), imm);
2302 } else {
2303 lea(rscratch1, src1);
2304 cmpl(Address(rscratch1, 0), imm);
2305 }
2306 }
2308 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2309 assert(!src2.is_lval(), "use cmpptr");
2310 if (reachable(src2)) {
2311 cmpl(src1, as_Address(src2));
2312 } else {
2313 lea(rscratch1, src2);
2314 cmpl(src1, Address(rscratch1, 0));
2315 }
2316 }
// Compare a register against a 32-bit immediate.
void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}
// Compare a register against a 32-bit value in memory.
void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}
// Compare two doubles and materialize the result as -1/0/+1 in dst.
// unordered_is_less selects which extreme an unordered (NaN) compare
// maps to. The jcc chain relies on ucomisd's PF/CF/ZF encoding.
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);   // unordered -> keep -1
    jcc(Assembler::below , L);   // opr1 < opr2 -> keep -1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal -> keep 0
    increment(dst);              // otherwise greater -> +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);   // unordered -> keep +1
    jcc(Assembler::above , L);   // opr1 > opr2 -> keep +1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal -> keep 0
    decrementl(dst);             // otherwise less -> -1
  }
  bind(L);
}
// Compare two floats and materialize the result as -1/0/+1 in dst.
// Single-precision twin of cmpsd2int; same flag-driven jcc chain.
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);   // unordered -> keep -1
    jcc(Assembler::below , L);   // opr1 < opr2 -> keep -1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal -> keep 0
    increment(dst);              // otherwise greater -> +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);   // unordered -> keep +1
    jcc(Assembler::above , L);   // opr1 > opr2 -> keep +1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal -> keep 0
    decrementl(dst);             // otherwise less -> -1
  }
  bind(L);
}
2371 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2372 if (reachable(src1)) {
2373 cmpb(as_Address(src1), imm);
2374 } else {
2375 lea(rscratch1, src1);
2376 cmpb(Address(rscratch1, 0), imm);
2377 }
2378 }
// Pointer-sized compare of a register against an address literal.
// On LP64 an lval (the literal address itself is the operand) is
// materialized in rscratch1 first; otherwise the memory contents are
// compared, through rscratch1 if out of reach.
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
#ifdef _LP64
  if (src2.is_lval()) {
    movptr(rscratch1, src2);
    Assembler::cmpq(src1, rscratch1);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
#else
  if (src2.is_lval()) {
    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
  } else {
    cmpl(src1, as_Address(src2));
  }
#endif // _LP64
}
// Compare a memory word against a literal address value (must be an
// lval: the literal itself, not the memory it points at).
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
  assert(src2.is_lval(), "not a mem-mem compare");
#ifdef _LP64
  // moves src2's literal address
  movptr(rscratch1, src2);
  Assembler::cmpq(src1, rscratch1);
#else
  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
#endif // _LP64
}
2411 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2412 if (reachable(adr)) {
2413 if (os::is_MP())
2414 lock();
2415 cmpxchgptr(reg, as_Address(adr));
2416 } else {
2417 lea(rscratch1, adr);
2418 if (os::is_MP())
2419 lock();
2420 cmpxchgptr(reg, Address(rscratch1, 0));
2421 }
2422 }
// Pointer-width cmpxchg: 64-bit on LP64, 32-bit otherwise.
void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
}
2428 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2429 if (reachable(src)) {
2430 Assembler::comisd(dst, as_Address(src));
2431 } else {
2432 lea(rscratch1, src);
2433 Assembler::comisd(dst, Address(rscratch1, 0));
2434 }
2435 }
2437 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2438 if (reachable(src)) {
2439 Assembler::comiss(dst, as_Address(src));
2440 } else {
2441 lea(rscratch1, src);
2442 Assembler::comiss(dst, Address(rscratch1, 0));
2443 }
2444 }
// Increment the 32-bit counter at counter_addr iff 'cond' holds,
// implemented by branching around the increment on the negated
// condition.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  atomic_incl(counter_addr);
  bind(L);
}
int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271.
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)       min_int
  //         rdx: remainder (= rax, irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case: min_int / -1 would overflow idivl
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdql();  // sign-extend rax into rdx:rax before the division
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}
2493 void MacroAssembler::decrementl(Register reg, int value) {
2494 if (value == min_jint) {subl(reg, value) ; return; }
2495 if (value < 0) { incrementl(reg, -value); return; }
2496 if (value == 0) { ; return; }
2497 if (value == 1 && UseIncDec) { decl(reg) ; return; }
2498 /* else */ { subl(reg, value) ; return; }
2499 }
2501 void MacroAssembler::decrementl(Address dst, int value) {
2502 if (value == min_jint) {subl(dst, value) ; return; }
2503 if (value < 0) { incrementl(dst, -value); return; }
2504 if (value == 0) { ; return; }
2505 if (value == 1 && UseIncDec) { decl(dst) ; return; }
2506 /* else */ { subl(dst, value) ; return; }
2507 }
// Signed division by a power of two via arithmetic shift.
// Negative dividends are biased by (2^shift - 1) first so the shift
// rounds toward zero like idiv, not toward negative infinity.
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}
2526 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2527 if (reachable(src)) {
2528 Assembler::divsd(dst, as_Address(src));
2529 } else {
2530 lea(rscratch1, src);
2531 Assembler::divsd(dst, Address(rscratch1, 0));
2532 }
2533 }
2535 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2536 if (reachable(src)) {
2537 Assembler::divss(dst, as_Address(src));
2538 } else {
2539 lea(rscratch1, src);
2540 Assembler::divss(dst, Address(rscratch1, 0));
2541 }
2542 }
// !defined(COMPILER2) is because of stupid core builds
#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
// Clear all eight x87 FPU stack registers: EMMS does it in one
// instruction when MMX is available, otherwise free each slot.
void MacroAssembler::empty_FPU_stack() {
  if (VM_Version::supports_mmx()) {
    emms();
  } else {
    for (int i = 8; i-- > 0; ) ffree(i);
  }
}
#endif // !LP64 || C1 || !C2
// Defines obj, preserves var_size_in_bytes
// Inline (lock-free CAS) allocation from the eden space.
// obj must be rax (cmpxchg's implicit operand); t1 is clobbered as
// the end-of-object scratch; branches to slow_case when inline
// allocation is unsupported or the space is exhausted.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  assert(obj == rax, "obj must be in rax, for cmpxchg");
  assert_different_registers(obj, var_size_in_bytes, t1);
  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    jmp(slow_case);
  } else {
    Register end = t1;
    Label retry;
    bind(retry);
    ExternalAddress heap_top((address) Universe::heap()->top_addr());
    movptr(obj, heap_top);
    if (var_size_in_bytes == noreg) {
      lea(end, Address(obj, con_size_in_bytes));
    } else {
      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
    }
    // if end < obj then we wrapped around => object too long => slow case
    cmpptr(end, obj);
    jcc(Assembler::below, slow_case);
    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
    jcc(Assembler::above, slow_case);
    // Compare obj with the top addr, and if still equal, store the new top addr in
    // end at the address of the top addr pointer. Sets ZF if was equal, and clears
    // it otherwise. Use lock prefix for atomicity on MPs.
    locked_cmpxchgptr(end, heap_top);
    jcc(Assembler::notEqual, retry);
  }
}
// Standard frame prologue: save caller's frame pointer, establish ours.
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}
// A 5 byte nop that is safe for patching (see patch_verified_entry)
void MacroAssembler::fat_nop() {
  if (UseAddressNop) {
    addr_nop_5();
  } else {
    // Fall back to four segment-override prefixes plus a plain nop;
    // still a single 5-byte instruction.
    emit_int8(0x26); // es:
    emit_int8(0x2e); // cs:
    emit_int8(0x64); // fs:
    emit_int8(0x65); // gs:
    emit_int8((unsigned char)0x90);
  }
}
// Convenience form: compare ST(0) with ST(1), popping both.
void MacroAssembler::fcmp(Register tmp) {
  fcmp(tmp, 1, true, true);
}
// x87 floating-point compare of ST(0) with ST(index), leaving the
// result in EFLAGS. Uses FUCOMI(P) directly when CMOV/FCOMI is
// available; otherwise goes through the FPU status word and SAHF,
// which requires a temp register to preserve rax.
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}
// Convenience form: compare ST(0) with ST(1), popping both, and
// materialize -1/0/+1 in dst.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
  fcmp2int(dst, unordered_is_less, 1, true, true);
}
// x87 compare of ST(0) with ST(index), materializing -1/0/+1 in dst.
// unordered_is_less selects which extreme a NaN compare maps to;
// the jcc chain consumes the EFLAGS produced by fcmp above.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);   // unordered -> keep -1
    jcc(Assembler::below , L);   // less -> keep -1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal -> keep 0
    increment(dst);              // greater -> +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);   // unordered -> keep +1
    jcc(Assembler::above , L);   // greater -> keep +1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal -> keep 0
    decrementl(dst);             // less -> -1
  }
  bind(L);
}
// Push the double at the literal address onto the x87 stack.
// NOTE(review): uses as_Address with no reachability fallback —
// presumably callers guarantee the literal is in range.
void MacroAssembler::fld_d(AddressLiteral src) {
  fld_d(as_Address(src));
}
// Push the float at the literal address onto the x87 stack.
// NOTE(review): no reachability fallback, as with fld_d.
void MacroAssembler::fld_s(AddressLiteral src) {
  fld_s(as_Address(src));
}
// Push the extended-precision value at the literal address onto the
// x87 stack. NOTE(review): no reachability fallback.
void MacroAssembler::fld_x(AddressLiteral src) {
  Assembler::fld_x(as_Address(src));
}
// Load the x87 control word from the literal address.
// NOTE(review): no reachability fallback.
void MacroAssembler::fldcw(AddressLiteral src) {
  Assembler::fldcw(as_Address(src));
}
// Given X in ST(0), compute 2^X in ST(0) by splitting X into integer
// and fractional parts (f2xm1 only handles |X| <= 1) and scaling by a
// hand-built power-of-two double. Out-of-range exponents produce NaN.
void MacroAssembler::pow_exp_core_encoding() {
  // kills rax, rcx, rdx
  subptr(rsp,sizeof(jdouble));
  // computes 2^X. Stack: X ...
  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
  // keep it on the thread's stack to compute 2^int(X) later
  // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
  fld_s(0);                 // Stack: X X ...
  frndint();                // Stack: int(X) X ...
  fsuba(1);                 // Stack: int(X) X-int(X) ...
  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
  faddp(1);                 // Stack: 2^(X-int(X))
  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
  // shift int(X)+1023 to exponent position.
  // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
  // values so detect them and set result to NaN.
  movl(rax,Address(rsp,0));
  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
  addl(rax, 1023);
  movl(rdx,rax);
  shll(rax,20);
  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
  addl(rdx,1);
  // Check that 1 < int(X)+1023+1 < 2048
  // in 3 steps:
  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
  // 2- (int(X)+1023+1)&-2048 != 0
  // 3- (int(X)+1023+1)&-2048 != 1
  // Do 2- first because addl just updated the flags.
  cmov32(Assembler::equal,rax,rcx);
  cmpl(rdx,1);
  cmov32(Assembler::equal,rax,rcx);
  testl(rdx,rcx);
  cmov32(Assembler::notEqual,rax,rcx);
  movl(Address(rsp,4),rax);   // high word of the 2^int(X) double
  movl(Address(rsp,0),0);     // low word is zero
  fmul_d(Address(rsp,0));     // Stack: 2^X ...
  addptr(rsp,sizeof(jdouble));
}
// Raise the x87 precision control field to 64-bit significand
// (OR 0x300 into the control word) for the intermediate steps of
// pow/exp. The original control word is left at (rsp, 0) so that
// restore_precision() can reload it. Kills rax.
void MacroAssembler::increase_precision() {
  subptr(rsp, BytesPerWord);
  fnstcw(Address(rsp, 0));
  movl(rax, Address(rsp, 0));
  orl(rax, 0x300);
  push(rax);
  fldcw(Address(rsp, 0));
  pop(rax);
}

// Reload the control word saved by increase_precision() and release
// the stack slot holding it. Must be paired with increase_precision().
void MacroAssembler::restore_precision() {
  fldcw(Address(rsp, 0));
  addptr(rsp, BytesPerWord);
}
// On entry ST0 = X, ST1 = Y; fyl2x computes ST1*log2(ST0) and pops.
void MacroAssembler::fast_pow() {
  // computes X^Y = 2^(Y * log2(X))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  // increase precision for intermediate steps of the computation
  increase_precision();
  fyl2x();                 // Stack: (Y*log2(X)) ...
  pow_exp_core_encoding(); // Stack: 2^(Y*log2(X)) = X^Y ...
  restore_precision();
}

// On entry ST0 = X.
void MacroAssembler::fast_exp() {
  // computes exp(X) = 2^(X * log2(e))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  // increase precision for intermediate steps of the computation
  increase_precision();
  fldl2e();                // Stack: log2(e) X ...
  fmulp(1);                // Stack: (X*log2(e)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
  restore_precision();
}
// Shared body of the pow and exp intrinsics. Tries the fast x87 path
// (fast_pow/fast_exp); if that yields NaN, or if X < 0 with a
// non-integer Y, falls back to the SharedRuntime dpow/dexp call.
// On entry the x87 stack holds X (exp) or X Y (pow); on exit the
// result is in F-TOS.
void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
  // kills rax, rcx, rdx
  // pow and exp needs 2 extra registers on the fpu stack.
  Label slow_case, done;
  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rdx,
    tmp = rdx;
  }
  Register tmp2 = rax;
  Register tmp3 = rcx;

  if (is_exp) {
    // Stack: X
    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
    fast_exp();                 // Stack: exp(X) X
    fcmp(tmp, 0, false, false); // Stack: exp(X) X
    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate argument. Stack: exp(X)
    if (num_fpu_regs_in_use > 0) {
      fxch();
      fpop();
    } else {
      ffree(1);
    }
    jmp(done);
  } else {
    // Stack: X Y
    Label x_negative, y_odd;

    fldz();                     // Stack: 0 X Y
    fcmp(tmp, 1, true, false);  // Stack: X Y
    jcc(Assembler::above, x_negative);

    // X >= 0

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fast_pow();                 // Stack: X^Y X Y
    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
    // X^Y not equal to itself: X^Y is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }
    jmp(done);

    // X <= 0
    bind(x_negative);

    fld_s(1);                   // Stack: Y X Y
    frndint();                  // Stack: int(Y) X Y
    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
    jcc(Assembler::notEqual, slow_case);

    subptr(rsp, 8);

    // For X^Y, when X < 0, Y has to be an integer and the final
    // result depends on whether it's odd or even. We just checked
    // that int(Y) == Y. We move int(Y) to gp registers as a 64 bit
    // integer to test its parity. If int(Y) is huge and doesn't fit
    // in the 64 bit integer range, the integer indefinite value will
    // end up in the gp registers. Huge numbers are all even, the
    // integer indefinite number is even so it's fine.

#ifdef ASSERT
    // Let's check we don't end up with an integer indefinite number
    // when not expected. First test for huge numbers: check whether
    // int(Y)+1 == int(Y) which is true for very large numbers and
    // those are all even. A 64 bit integer is guaranteed to not
    // overflow for numbers where y+1 != y (when precision is set to
    // double precision).
    Label y_not_huge;

    fld1();                     // Stack: 1 int(Y) X Y
    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y

#ifdef _LP64
    // trip to memory to force the precision down from double extended
    // precision
    fstp_d(Address(rsp, 0));
    fld_d(Address(rsp, 0));
#endif

    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
#endif

    // move int(Y) as 64 bit integer to thread's stack
    fistp_d(Address(rsp,0));    // Stack: X Y

#ifdef ASSERT
    jcc(Assembler::notEqual, y_not_huge);

    // Y is huge so we know it's even. It may not fit in a 64 bit
    // integer and we don't want the debug code below to see the
    // integer indefinite value so overwrite int(Y) on the thread's
    // stack with 0.
    movl(Address(rsp, 0), 0);
    movl(Address(rsp, 4), 0);

    bind(y_not_huge);
#endif

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fabs();                     // Stack: abs(X) Y X Y
    fast_pow();                 // Stack: abs(X)^Y X Y
    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.

    pop(tmp2);
    NOT_LP64(pop(tmp3));
    jcc(Assembler::parity, slow_case);

#ifdef ASSERT
    // Check that int(Y) is not integer indefinite value (int
    // overflow). Shouldn't happen because for values that would
    // overflow, 1+int(Y)==Y which was tested earlier.
#ifndef _LP64
    {
      Label integer;
      testl(tmp2, tmp2);
      jcc(Assembler::notZero, integer);
      cmpl(tmp3, 0x80000000);
      jcc(Assembler::notZero, integer);
      STOP("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#else
    {
      Label integer;
      mov(tmp3, tmp2); // preserve tmp2 for parity check below
      shlq(tmp3, 1);
      jcc(Assembler::carryClear, integer);
      jcc(Assembler::notZero, integer);
      STOP("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#endif
#endif

    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }

    testl(tmp2, 1);
    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
    // X <= 0, Y odd: X^Y = -abs(X)^Y
    fchs();                     // Stack: -abs(X)^Y Y
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);

  fpop();                      // pop incorrect result or int(Y)

  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
                      is_exp ? 1 : 2, num_fpu_regs_in_use);

  // Come here with result in F-TOS
  bind(done);
}
// Pop the x87 stack: mark ST0 free and advance the top-of-stack
// pointer (ffree alone would leave the pointer in place).
void MacroAssembler::fpop() {
  ffree();
  fincstp();
}

// Compute the x87 partial remainder of ST0 by ST1. fprem only reduces
// the operand partially per iteration, so loop until the status word
// reports completion (the C2 flag, 0x400, is clear; on 32-bit sahf
// maps it into the parity flag). tmp lets save_rax/restore_rax
// preserve rax across the fnstsw_ax status reads.
void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
    bind(L);
    fprem();
    fwait(); fnstsw_ax();
#ifdef _LP64
    testl(rax, 0x400);
    jcc(Assembler::notEqual, L);
#else
    sahf();
    jcc(Assembler::parity, L);
#endif // _LP64
  }
  restore_rax(tmp);
  // Result is in ST0.
  // Note: fxch & fpop to get rid of ST1
  // (otherwise FPU stack could overflow eventually)
  fxch(1);
  fpop();
}
2975 void MacroAssembler::incrementl(AddressLiteral dst) {
2976 if (reachable(dst)) {
2977 incrementl(as_Address(dst));
2978 } else {
2979 lea(rscratch1, dst);
2980 incrementl(Address(rscratch1, 0));
2981 }
2982 }
2984 void MacroAssembler::incrementl(ArrayAddress dst) {
2985 incrementl(as_Address(dst));
2986 }
2988 void MacroAssembler::incrementl(Register reg, int value) {
2989 if (value == min_jint) {addl(reg, value) ; return; }
2990 if (value < 0) { decrementl(reg, -value); return; }
2991 if (value == 0) { ; return; }
2992 if (value == 1 && UseIncDec) { incl(reg) ; return; }
2993 /* else */ { addl(reg, value) ; return; }
2994 }
2996 void MacroAssembler::incrementl(Address dst, int value) {
2997 if (value == min_jint) {addl(dst, value) ; return; }
2998 if (value < 0) { decrementl(dst, -value); return; }
2999 if (value == 0) { ; return; }
3000 if (value == 1 && UseIncDec) { incl(dst) ; return; }
3001 /* else */ { addl(dst, value) ; return; }
3002 }
3004 void MacroAssembler::jump(AddressLiteral dst) {
3005 if (reachable(dst)) {
3006 jmp_literal(dst.target(), dst.rspec());
3007 } else {
3008 lea(rscratch1, dst);
3009 jmp(rscratch1);
3010 }
3011 }
// Conditional jump to an arbitrary address. When the target is
// directly reachable the jcc is encoded by hand: the 2-byte short
// form if the 8-bit displacement fits and no relocation is required,
// else the 6-byte long form. Otherwise the condition is reversed
// around an indirect jump through rscratch1.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_int8(0x70 | cc);
      emit_int8((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_int8(0x0F);
      emit_int8((unsigned char)(0x80 | cc));
      emit_int32(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    // Far target: skip the indirect jump when the (reversed)
    // condition holds.
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch1, dst);
    Assembler::jmp(rscratch1);
    bind(skip);
  }
}
3042 void MacroAssembler::ldmxcsr(AddressLiteral src) {
3043 if (reachable(src)) {
3044 Assembler::ldmxcsr(as_Address(src));
3045 } else {
3046 lea(rscratch1, src);
3047 Assembler::ldmxcsr(Address(rscratch1, 0));
3048 }
3049 }
// Load a byte from src into dst, sign-extending to 32 bits. Returns
// the code-buffer offset of the instruction that touches memory.
// The shift-pair fallback is a pre-P6 tuning — presumably movsx was
// slow on those CPUs; see the AP-526 note on load_unsigned_byte.
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    off = offset();
    movsbl(dst, src); // movsxb
  } else {
    off = load_unsigned_byte(dst, src);
    shll(dst, 24);
    sarl(dst, 24);
  }
  return off;
}
// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.

// Load 16 bits from src into dst, sign-extending to 32 bits. Returns
// the code-buffer offset of the instruction that touches memory.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version but this is what 64bit has always done. This seems to imply
    // that users are only using 32bits worth.
    off = offset();
    movswl(dst, src); // movsxw
  } else {
    off = load_unsigned_short(dst, src);
    shll(dst, 16);
    sarl(dst, 16);
  }
  return off;
}
// Load a byte from src into dst, zero-extending to 32 bits. Returns
// the code-buffer offset of the instruction that touches memory.
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzbl(dst, src); // movzxb
  } else {
    // xor+movb fallback; cannot be used when src addresses through
    // dst, since the xor would clobber the address first.
    xorl(dst, dst);
    off = offset();
    movb(dst, src);
  }
  return off;
}
// Note: load_unsigned_short used to be called load_unsigned_word.

// Load 16 bits from src into dst, zero-extending to 32 bits. Returns
// the code-buffer offset of the instruction that touches memory.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzwl(dst, src); // movzxw
  } else {
    // xor+movw fallback; cannot be used when src addresses through
    // dst, since the xor would clobber the address first.
    xorl(dst, dst);
    off = offset();
    movw(dst, src);
  }
  return off;
}
// Load a value of size_in_bytes (1, 2, 4 or 8) from src into dst,
// sign- or zero-extending sub-word sizes per is_signed. On 32-bit an
// 8-byte value is split across dst (low word) and dst2 (high word).
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(dst2 != noreg, "second dest register required");
    movl(dst,  src);
    movl(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}
// Store a value of size_in_bytes (1, 2, 4 or 8) from src into dst.
// On 32-bit an 8-byte value is supplied as src (low word) and src2
// (high word).
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(src2 != noreg, "second source register required");
    movl(dst,                        src);
    movl(dst.plus_disp(BytesPerInt), src2);
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  movw(dst, src); break;
  case  1:  movb(dst, src); break;
  default:  ShouldNotReachHere();
  }
}
3151 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3152 if (reachable(dst)) {
3153 movl(as_Address(dst), src);
3154 } else {
3155 lea(rscratch1, dst);
3156 movl(Address(rscratch1, 0), src);
3157 }
3158 }
3160 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3161 if (reachable(src)) {
3162 movl(dst, as_Address(src));
3163 } else {
3164 lea(rscratch1, src);
3165 movl(dst, Address(rscratch1, 0));
3166 }
3167 }
3169 // C++ bool manipulation
3171 void MacroAssembler::movbool(Register dst, Address src) {
3172 if(sizeof(bool) == 1)
3173 movb(dst, src);
3174 else if(sizeof(bool) == 2)
3175 movw(dst, src);
3176 else if(sizeof(bool) == 4)
3177 movl(dst, src);
3178 else
3179 // unsupported
3180 ShouldNotReachHere();
3181 }
3183 void MacroAssembler::movbool(Address dst, bool boolconst) {
3184 if(sizeof(bool) == 1)
3185 movb(dst, (int) boolconst);
3186 else if(sizeof(bool) == 2)
3187 movw(dst, (int) boolconst);
3188 else if(sizeof(bool) == 4)
3189 movl(dst, (int) boolconst);
3190 else
3191 // unsupported
3192 ShouldNotReachHere();
3193 }
3195 void MacroAssembler::movbool(Address dst, Register src) {
3196 if(sizeof(bool) == 1)
3197 movb(dst, src);
3198 else if(sizeof(bool) == 2)
3199 movw(dst, src);
3200 else if(sizeof(bool) == 4)
3201 movl(dst, src);
3202 else
3203 // unsupported
3204 ShouldNotReachHere();
3205 }
// Store the immediate byte src at the array address dst.
void MacroAssembler::movbyte(ArrayAddress dst, int src) {
  movb(as_Address(dst), src);
}
3211 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3212 if (reachable(src)) {
3213 movdl(dst, as_Address(src));
3214 } else {
3215 lea(rscratch1, src);
3216 movdl(dst, Address(rscratch1, 0));
3217 }
3218 }
3220 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3221 if (reachable(src)) {
3222 movq(dst, as_Address(src));
3223 } else {
3224 lea(rscratch1, src);
3225 movq(dst, Address(rscratch1, 0));
3226 }
3227 }
3229 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3230 if (reachable(src)) {
3231 if (UseXmmLoadAndClearUpper) {
3232 movsd (dst, as_Address(src));
3233 } else {
3234 movlpd(dst, as_Address(src));
3235 }
3236 } else {
3237 lea(rscratch1, src);
3238 if (UseXmmLoadAndClearUpper) {
3239 movsd (dst, Address(rscratch1, 0));
3240 } else {
3241 movlpd(dst, Address(rscratch1, 0));
3242 }
3243 }
3244 }
3246 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3247 if (reachable(src)) {
3248 movss(dst, as_Address(src));
3249 } else {
3250 lea(rscratch1, src);
3251 movss(dst, Address(rscratch1, 0));
3252 }
3253 }
// Pointer-width register-to-register move: movq on 64-bit, movl on 32-bit.
void MacroAssembler::movptr(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

// Pointer-width load from memory.
void MacroAssembler::movptr(Register dst, Address src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Register dst, intptr_t src) {
  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
}

// Pointer-width store to memory.
void MacroAssembler::movptr(Address dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
3272 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3273 if (reachable(src)) {
3274 Assembler::movdqu(dst, as_Address(src));
3275 } else {
3276 lea(rscratch1, src);
3277 Assembler::movdqu(dst, Address(rscratch1, 0));
3278 }
3279 }
3281 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3282 if (reachable(src)) {
3283 Assembler::movdqa(dst, as_Address(src));
3284 } else {
3285 lea(rscratch1, src);
3286 Assembler::movdqa(dst, Address(rscratch1, 0));
3287 }
3288 }
3290 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3291 if (reachable(src)) {
3292 Assembler::movsd(dst, as_Address(src));
3293 } else {
3294 lea(rscratch1, src);
3295 Assembler::movsd(dst, Address(rscratch1, 0));
3296 }
3297 }
3299 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3300 if (reachable(src)) {
3301 Assembler::movss(dst, as_Address(src));
3302 } else {
3303 lea(rscratch1, src);
3304 Assembler::movss(dst, Address(rscratch1, 0));
3305 }
3306 }
3308 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3309 if (reachable(src)) {
3310 Assembler::mulsd(dst, as_Address(src));
3311 } else {
3312 lea(rscratch1, src);
3313 Assembler::mulsd(dst, Address(rscratch1, 0));
3314 }
3315 }
3317 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3318 if (reachable(src)) {
3319 Assembler::mulss(dst, as_Address(src));
3320 } else {
3321 lea(rscratch1, src);
3322 Assembler::mulss(dst, Address(rscratch1, 0));
3323 }
3324 }
// Explicit null check: touch M[reg] so the OS raises a fault when
// reg is NULL. Only emits code when needs_explicit_null_check(offset)
// reports that a later access at reg+offset would not itself fault
// on a NULL reg.
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any (non-CC) registers
    // NOTE: cmpl is plenty here to provoke a segv
    cmpptr(rax, Address(reg, 0));
    // Note: should probably use testl(rax, Address(reg, 0));
    //       may be shorter code (however, this version of
    //       testl needs to be implemented first)
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}
// Emit a call into the VM's breakpoint routine rather than an inline
// trap instruction.
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
// Restore everything saved by push_CPU_state: FPU state first since
// it was pushed last.
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}

// Restore FPU/SSE state saved by push_FPU_state and release its
// stack area (frstor matches fnsave, fxrstor matches fxsave).
void MacroAssembler::pop_FPU_state() {
  NOT_LP64(frstor(Address(rsp, 0));)
  LP64_ONLY(fxrstor(Address(rsp, 0));)
  addptr(rsp, FPUStateSizeInWords * wordSize);
}

// Restore integer registers and flags; must mirror push_IU_state,
// including the 8-byte alignment slot on 64-bit.
void MacroAssembler::pop_IU_state() {
  popa();
  LP64_ONLY(addq(rsp, 8));
  popf();
}

// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
void MacroAssembler::push_CPU_state() {
  push_IU_state();
  push_FPU_state();
}

// Save FPU/SSE state: fnsave (plus fwait) on 32-bit, fxsave on 64-bit.
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
  fnsave(Address(rsp, 0));
  fwait();
#else
  fxsave(Address(rsp, 0));
#endif // LP64
}

// Save integer registers and flags.
void MacroAssembler::push_IU_state() {
  // Push flags first because pusha kills them
  pushf();
  // Make sure rsp stays 16-byte aligned
  LP64_ONLY(subq(rsp, 8));
  pusha();
}
// Clear the last-Java-frame anchor in the given thread (loaded via
// get_thread when the register is not supplied). Zeroing
// last_Java_sp is what marks the frame as cleared; fp and pc are
// zeroed only on request.
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // we must set sp to zero to clear frame
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  if (clear_fp) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc)
    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);

}
// Restore rax from where save_rax() put it: the stack when tmp is
// noreg, otherwise from tmp (no-op when tmp is rax itself).
void MacroAssembler::restore_rax(Register tmp) {
  if (tmp == noreg) pop(rax);
  else if (tmp != rax) mov(rax, tmp);
}

// Round reg up to the next multiple of modulus. The add/and pair is
// only correct for power-of-two moduli.
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}

// Preserve rax, either on the stack (tmp == noreg) or in tmp.
// Paired with restore_rax().
void MacroAssembler::save_rax(Register tmp) {
  if (tmp == noreg) push(rax);
  else if (tmp != rax) mov(tmp, rax);
}
// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  // tmp := (thread >> page_shift) masked into the page, so different
  // threads tend to write different words of the serialization page.
  movl(tmp, thread);
  shrl(tmp, os::get_serialize_page_shift_count());
  andl(tmp, (os::vm_page_size() - sizeof(int)));

  Address index(noreg, tmp, Address::times_1);
  ExternalAddress page(os::get_memory_serialize_page());

  // Size of store must match masking code above
  movl(as_Address(ArrayAddress(page, index)), tmp);
}
// Calls to C land
//
// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register java_thread,
                                         Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional

  if (last_java_fp->is_valid()) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }

  // last_java_pc is optional

  if (last_java_pc != NULL) {
    lea(Address(java_thread,
                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
        InternalAddress(last_java_pc));

  }
  // sp is stored last: reset_last_Java_frame treats a zero sp as
  // "no frame", so fp and pc must already be in place.
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
// Pointer-width shift left: shlq on 64-bit, shll on 32-bit.
void MacroAssembler::shlptr(Register dst, int imm8) {
  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
}

// Pointer-width logical shift right.
void MacroAssembler::shrptr(Register dst, int imm8) {
  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
}

// Sign-extend the low byte of reg in place. On 32-bit movsbl also
// requires the register to have a byte form (has_byte_register());
// otherwise fall back to a shift pair.
void MacroAssembler::sign_extend_byte(Register reg) {
  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
    movsbl(reg, reg); // movsxb
  } else {
    shll(reg, 24);
    sarl(reg, 24);
  }
}

// Sign-extend the low 16 bits of reg in place.
void MacroAssembler::sign_extend_short(Register reg) {
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    movswl(reg, reg); // movsxw
  } else {
    shll(reg, 16);
    sarl(reg, 16);
  }
}
// Unlike most AddressLiteral wrappers this one has no scratch-register
// fallback: callers must pass a reachable address (asserted).
void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}
3504 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3505 if (reachable(src)) {
3506 Assembler::sqrtsd(dst, as_Address(src));
3507 } else {
3508 lea(rscratch1, src);
3509 Assembler::sqrtsd(dst, Address(rscratch1, 0));
3510 }
3511 }
3513 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3514 if (reachable(src)) {
3515 Assembler::sqrtss(dst, as_Address(src));
3516 } else {
3517 lea(rscratch1, src);
3518 Assembler::sqrtss(dst, Address(rscratch1, 0));
3519 }
3520 }
3522 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3523 if (reachable(src)) {
3524 Assembler::subsd(dst, as_Address(src));
3525 } else {
3526 lea(rscratch1, src);
3527 Assembler::subsd(dst, Address(rscratch1, 0));
3528 }
3529 }
3531 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3532 if (reachable(src)) {
3533 Assembler::subss(dst, as_Address(src));
3534 } else {
3535 lea(rscratch1, src);
3536 Assembler::subss(dst, Address(rscratch1, 0));
3537 }
3538 }
3540 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3541 if (reachable(src)) {
3542 Assembler::ucomisd(dst, as_Address(src));
3543 } else {
3544 lea(rscratch1, src);
3545 Assembler::ucomisd(dst, Address(rscratch1, 0));
3546 }
3547 }
3549 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3550 if (reachable(src)) {
3551 Assembler::ucomiss(dst, as_Address(src));
3552 } else {
3553 lea(rscratch1, src);
3554 Assembler::ucomiss(dst, Address(rscratch1, 0));
3555 }
3556 }
3558 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
3559 // Used in sign-bit flipping with aligned address.
3560 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3561 if (reachable(src)) {
3562 Assembler::xorpd(dst, as_Address(src));
3563 } else {
3564 lea(rscratch1, src);
3565 Assembler::xorpd(dst, Address(rscratch1, 0));
3566 }
3567 }
3569 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
3570 // Used in sign-bit flipping with aligned address.
3571 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3572 if (reachable(src)) {
3573 Assembler::xorps(dst, as_Address(src));
3574 } else {
3575 lea(rscratch1, src);
3576 Assembler::xorps(dst, Address(rscratch1, 0));
3577 }
3578 }
3580 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3581 // Used in sign-bit flipping with aligned address.
3582 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3583 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3584 if (reachable(src)) {
3585 Assembler::pshufb(dst, as_Address(src));
3586 } else {
3587 lea(rscratch1, src);
3588 Assembler::pshufb(dst, Address(rscratch1, 0));
3589 }
3590 }
3592 // AVX 3-operands instructions
3594 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3595 if (reachable(src)) {
3596 vaddsd(dst, nds, as_Address(src));
3597 } else {
3598 lea(rscratch1, src);
3599 vaddsd(dst, nds, Address(rscratch1, 0));
3600 }
3601 }
3603 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3604 if (reachable(src)) {
3605 vaddss(dst, nds, as_Address(src));
3606 } else {
3607 lea(rscratch1, src);
3608 vaddss(dst, nds, Address(rscratch1, 0));
3609 }
3610 }
3612 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
3613 if (reachable(src)) {
3614 vandpd(dst, nds, as_Address(src), vector256);
3615 } else {
3616 lea(rscratch1, src);
3617 vandpd(dst, nds, Address(rscratch1, 0), vector256);
3618 }
3619 }
3621 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
3622 if (reachable(src)) {
3623 vandps(dst, nds, as_Address(src), vector256);
3624 } else {
3625 lea(rscratch1, src);
3626 vandps(dst, nds, Address(rscratch1, 0), vector256);
3627 }
3628 }
3630 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3631 if (reachable(src)) {
3632 vdivsd(dst, nds, as_Address(src));
3633 } else {
3634 lea(rscratch1, src);
3635 vdivsd(dst, nds, Address(rscratch1, 0));
3636 }
3637 }
3639 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3640 if (reachable(src)) {
3641 vdivss(dst, nds, as_Address(src));
3642 } else {
3643 lea(rscratch1, src);
3644 vdivss(dst, nds, Address(rscratch1, 0));
3645 }
3646 }
3648 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3649 if (reachable(src)) {
3650 vmulsd(dst, nds, as_Address(src));
3651 } else {
3652 lea(rscratch1, src);
3653 vmulsd(dst, nds, Address(rscratch1, 0));
3654 }
3655 }
3657 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3658 if (reachable(src)) {
3659 vmulss(dst, nds, as_Address(src));
3660 } else {
3661 lea(rscratch1, src);
3662 vmulss(dst, nds, Address(rscratch1, 0));
3663 }
3664 }
3666 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3667 if (reachable(src)) {
3668 vsubsd(dst, nds, as_Address(src));
3669 } else {
3670 lea(rscratch1, src);
3671 vsubsd(dst, nds, Address(rscratch1, 0));
3672 }
3673 }
3675 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3676 if (reachable(src)) {
3677 vsubss(dst, nds, as_Address(src));
3678 } else {
3679 lea(rscratch1, src);
3680 vsubss(dst, nds, Address(rscratch1, 0));
3681 }
3682 }
3684 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
3685 if (reachable(src)) {
3686 vxorpd(dst, nds, as_Address(src), vector256);
3687 } else {
3688 lea(rscratch1, src);
3689 vxorpd(dst, nds, Address(rscratch1, 0), vector256);
3690 }
3691 }
3693 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
3694 if (reachable(src)) {
3695 vxorps(dst, nds, as_Address(src), vector256);
3696 } else {
3697 lea(rscratch1, src);
3698 vxorps(dst, nds, Address(rscratch1, 0), vector256);
3699 }
3700 }
3703 //////////////////////////////////////////////////////////////////////////////////
3704 #if INCLUDE_ALL_GCS
// G1 SATB pre-barrier: record the previous value (pre_val, loaded
// from Address(obj, 0) when obj != noreg) in the thread-local SATB
// mark queue while concurrent marking is active. Falls into a
// runtime call when the queue buffer is full.
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {

  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Label done;
  Label runtime;

  assert(pre_val != noreg, "check this code");

  if (obj != noreg) {
    assert_different_registers(obj, pre_val, tmp);
    assert(pre_val != rax, "check this code");
  }

  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_active()));
  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));


  // Is marking active?
  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
    cmpl(in_progress, 0);
  } else {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    cmpb(in_progress, 0);
  }
  jcc(Assembler::equal, done);

  // Do we need to load the previous value?
  if (obj != noreg) {
    load_heap_oop(pre_val, Address(obj, 0));
  }

  // Is the previous value null?
  cmpptr(pre_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)

  movptr(tmp, index);                   // tmp := *index_adr
  cmpptr(tmp, 0);                       // tmp == 0?
  jcc(Assembler::equal, runtime);       // If yes, goto runtime

  subptr(tmp, wordSize);                // tmp := tmp - wordSize
  movptr(index, tmp);                   // *index_adr := tmp
  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr

  // Record the previous value
  movptr(Address(tmp, 0), pre_val);
  jmp(done);

  bind(runtime);
  // save the live input values
  if(tosca_live) push(rax);

  if (obj != noreg && obj != rax)
    push(obj);

  if (pre_val != rax)
    push(pre_val);

  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we are generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then ebp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have have a full interpreter frame on the stack
  // expand_call should be passed true.

  NOT_LP64( push(thread); )

  if (expand_call) {
    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
    pass_arg1(this, thread);
    pass_arg0(this, pre_val);
    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  } else {
    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  }

  NOT_LP64( pop(thread); )

  // restore the live input values
  if (pre_val != rax)
    pop(pre_val);

  if (obj != noreg && obj != rax)
    pop(obj);

  if(tosca_live) pop(rax);

  bind(done);
}
// G1 post-barrier: after a store of new_val into *store_addr, dirty the
// card covering store_addr and enqueue that card on the thread's
// dirty-card queue, calling into the runtime when the queue is full.
// Filters out same-region stores, NULL stores, young-gen cards, and
// already-dirty cards before touching the queue.
// Clobbers tmp and tmp2 (and rscratch1 on LP64).
void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                           Register new_val,
                                           Register thread,
                                           Register tmp,
                                           Register tmp2) {
#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                  PtrQueue::byte_offset_of_buf()));

  BarrierSet* bs = Universe::heap()->barrier_set();
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  Label done;
  Label runtime;

  // Does store cross heap regions?
  // (xor leaves a non-zero value above LogOfHRGrainBytes iff the two
  // addresses lie in different regions)
  movptr(tmp, store_addr);
  xorptr(tmp, new_val);
  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
  jcc(Assembler::equal, done);

  // crosses regions, storing NULL?
  cmpptr(new_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // storing region crossing non-NULL, is card already dirty?
  const Register card_addr = tmp;
  const Register cardtable = tmp2;

  movptr(card_addr, store_addr);
  shrptr(card_addr, CardTableModRefBS::card_shift);
  // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
  // a valid address and therefore is not properly handled by the relocation code.
  movptr(cardtable, (intptr_t)ct->byte_map_base);
  addptr(card_addr, cardtable);

  // Young-gen cards never need to be dirtied/logged.
  cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val());
  jcc(Assembler::equal, done);

  // StoreLoad fence before re-reading the card, then re-check for dirty.
  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
  cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
  jcc(Assembler::equal, done);

  // storing a region crossing, non-NULL oop, card is clean.
  // dirty card and log.
  movb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());

  // Try to enqueue card_addr in the thread-local buffer; index == 0
  // means the buffer is full, so fall into the runtime call instead.
  cmpl(queue_index, 0);
  jcc(Assembler::equal, runtime);
  subl(queue_index, wordSize);
  movptr(tmp2, buffer);
#ifdef _LP64
  movslq(rscratch1, queue_index);
  addq(tmp2, rscratch1);
  movq(Address(tmp2, 0), card_addr);
#else
  addl(tmp2, queue_index);
  movl(Address(tmp2, 0), card_addr);
#endif
  jmp(done);

  bind(runtime);
  // save the live input values
  push(store_addr);
  push(new_val);
#ifdef _LP64
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
#else
  push(thread);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  pop(thread);
#endif
  pop(new_val);
  pop(store_addr);

  bind(done);
}
3909 #endif // INCLUDE_ALL_GCS
3910 //////////////////////////////////////////////////////////////////////////////////
// Does a store check for the oop in register obj. The content of
// register obj is destroyed afterwards (it ends up holding the
// card-table index, see store_check_part_1/_2).
void MacroAssembler::store_check(Register obj) {
  store_check_part_1(obj);
  store_check_part_2(obj);
}
// Store-check variant taking the destination address; dst is currently
// unused -- the check is based solely on obj.
void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}
// split the store check operation so that other instructions can be scheduled inbetween
// Part 1: convert obj (an address) into its card-table index by
// shifting right by card_shift; obj is clobbered.
void MacroAssembler::store_check_part_1(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  shrptr(obj, CardTableModRefBS::card_shift);
}
// Part 2: mark the card byte_map_base[obj] dirty (stores 0).
// Expects obj to already hold the card index produced by part 1.
void MacroAssembler::store_check_part_2(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  // The calculation for byte_map_base is as follows:
  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
  // So this essentially converts an address to a displacement and it will
  // never need to be relocated. On 64bit however the value may be too
  // large for a 32bit displacement.
  intptr_t disp = (intptr_t) ct->byte_map_base;
  if (is_simm32(disp)) {
    // Fast path: byte_map_base fits in a 32-bit displacement.
    Address cardtable(noreg, obj, Address::times_1, disp);
    movb(cardtable, 0);
  } else {
    // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
    // displacement and done in a single instruction given favorable mapping and a
    // smarter version of as_Address. However, 'ExternalAddress' generates a relocation
    // entry and that entry is not properly handled by the relocation code.
    AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
    Address index(noreg, obj, Address::times_1);
    movb(as_Address(ArrayAddress(cardtable, index)), 0);
  }
}
3958 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3959 LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3960 }
3962 // Force generation of a 4 byte immediate value even if it fits into 8bit
3963 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3964 LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3965 }
3967 void MacroAssembler::subptr(Register dst, Register src) {
3968 LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3969 }
3971 // C++ bool manipulation
3972 void MacroAssembler::testbool(Register dst) {
3973 if(sizeof(bool) == 1)
3974 testb(dst, 0xff);
3975 else if(sizeof(bool) == 2) {
3976 // testw implementation needed for two byte bools
3977 ShouldNotReachHere();
3978 } else if(sizeof(bool) == 4)
3979 testl(dst, dst);
3980 else
3981 // unsupported
3982 ShouldNotReachHere();
3983 }
3985 void MacroAssembler::testptr(Register dst, Register src) {
3986 LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3987 }
3989 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3990 void MacroAssembler::tlab_allocate(Register obj,
3991 Register var_size_in_bytes,
3992 int con_size_in_bytes,
3993 Register t1,
3994 Register t2,
3995 Label& slow_case) {
3996 assert_different_registers(obj, t1, t2);
3997 assert_different_registers(obj, var_size_in_bytes, t1);
3998 Register end = t2;
3999 Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
4001 verify_tlab();
4003 NOT_LP64(get_thread(thread));
4005 movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
4006 if (var_size_in_bytes == noreg) {
4007 lea(end, Address(obj, con_size_in_bytes));
4008 } else {
4009 lea(end, Address(obj, var_size_in_bytes, Address::times_1));
4010 }
4011 cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
4012 jcc(Assembler::above, slow_case);
4014 // update the tlab top pointer
4015 movptr(Address(thread, JavaThread::tlab_top_offset()), end);
4017 // recover var_size_in_bytes if necessary
4018 if (var_size_in_bytes == end) {
4019 subptr(var_size_in_bytes, obj);
4020 }
4021 verify_tlab();
4022 }
// Slow-path TLAB refill: either retain the current TLAB (when the free
// space left is too large to waste) and jump to try_eden for a shared
// allocation, or retire it -- filling the leftover space with a dummy
// int array so the heap stays parsable -- and allocate a fresh TLAB
// from eden, then jump to retry.  Jumps to slow_case when inline
// contiguous allocation is not possible.
// Preserves rbx, and rdx.
// Returns the register holding the thread pointer, for use by the caller.
Register MacroAssembler::tlab_refill(Label& retry,
                                     Label& try_eden,
                                     Label& slow_case) {
  Register top = rax;
  Register t1 = rcx;
  Register t2 = rsi;
  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
  Label do_refill, discard_tlab;

  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    // No allocation in the shared eden.
    jmp(slow_case);
  }

  NOT_LP64(get_thread(thread_reg));

  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));

  // calculate amount of free space (in heap words)
  subptr(t1, top);
  shrptr(t1, LogHeapWordSize);

  // Retain tlab and allocate object in shared space if
  // the amount free in the tlab is too large to discard.
  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  jcc(Assembler::lessEqual, discard_tlab);

  // Retain
  // %%% yuck as movptr...
  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
  if (TLABStats) {
    // increment number of slow_allocations
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
  }
  jmp(try_eden);

  bind(discard_tlab);
  if (TLABStats) {
    // increment number of refills
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
    // accumulate wastage -- t1 is amount free in tlab
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
  }

  // if tlab is currently allocated (top or end != null) then
  // fill [top, end + alignment_reserve) with array object
  testptr(top, top);
  jcc(Assembler::zero, do_refill);

  // set up the mark word
  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
  // set the length to the remaining space
  subptr(t1, typeArrayOopDesc::header_size(T_INT));
  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));  // words -> jint element count
  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
  // set klass to intArrayKlass
  // dubious reloc why not an oop reloc?
  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
  // store klass last. concurrent gcs assumes klass length is valid if
  // klass field is not null.
  store_klass(top, t1);

  // account the retired TLAB's bytes (top - tlab_start) as allocated
  movptr(t1, top);
  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
  incr_allocated_bytes(thread_reg, t1, 0);

  // refill the tlab with an eden allocation
  bind(do_refill);
  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
  shlptr(t1, LogHeapWordSize);
  // allocate new tlab, address returned in top
  eden_allocate(top, t1, 0, t2, slow_case);

  // Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
  if (UseTLAB) {
    Label ok;
    Register tsize = rsi;
    assert_different_registers(tsize, thread_reg, t1);
    push(tsize);
    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
    shlptr(tsize, LogHeapWordSize);
    cmpptr(t1, tsize);
    jcc(Assembler::equal, ok);
    STOP("assert(t1 != tlab size)");
    should_not_reach_here();

    bind(ok);
    pop(tsize);
  }
#endif
  // install the new TLAB: start = top = allocation, end = top + size - reserve
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
  addptr(top, t1);
  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
  verify_tlab();
  jmp(retry);

  return thread_reg; // for use by caller
}
// Bump the thread-local allocated-bytes counter by var_size_in_bytes
// when that register is valid, otherwise by the constant
// con_size_in_bytes.  When no thread register is supplied, r15_thread
// is used on LP64; on 32-bit, t1 is required and clobbered to hold the
// thread, and the 64-bit counter is updated with an add/adc pair.
void MacroAssembler::incr_allocated_bytes(Register thread,
                                          Register var_size_in_bytes,
                                          int con_size_in_bytes,
                                          Register t1) {
  if (!thread->is_valid()) {
#ifdef _LP64
    thread = r15_thread;
#else
    assert(t1->is_valid(), "need temp reg");
    thread = t1;
    get_thread(thread);
#endif
  }

#ifdef _LP64
  if (var_size_in_bytes->is_valid()) {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
#else
  if (var_size_in_bytes->is_valid()) {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
  // propagate the carry into the high 32 bits of the 64-bit counter
  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
#endif
}
// Call a floating-point runtime routine (e.g. SharedRuntime::dsin) with
// nb_args double arguments taken from the x87 FPU stack, returning the
// result in F-TOS.  Saves and restores all GPRs (pusha/popa), the live
// XMM state appropriate to the current UseSSE/AVX settings, and -- when
// num_fpu_regs_in_use > 1 -- the remainder of the x87 register stack.
void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
  pusha();

  // if we are coming from c1, xmm registers may be live
  int off = 0;
  if (UseSSE == 1)  {
    subptr(rsp, sizeof(jdouble)*8);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
  } else if (UseSSE >= 2)  {
#ifdef COMPILER2
    if (MaxVectorSize > 16) {
      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
      // Save upper half of YMM registers
      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
      vextractf128h(Address(rsp,  0),xmm0);
      vextractf128h(Address(rsp, 16),xmm1);
      vextractf128h(Address(rsp, 32),xmm2);
      vextractf128h(Address(rsp, 48),xmm3);
      vextractf128h(Address(rsp, 64),xmm4);
      vextractf128h(Address(rsp, 80),xmm5);
      vextractf128h(Address(rsp, 96),xmm6);
      vextractf128h(Address(rsp,112),xmm7);
#ifdef _LP64
      vextractf128h(Address(rsp,128),xmm8);
      vextractf128h(Address(rsp,144),xmm9);
      vextractf128h(Address(rsp,160),xmm10);
      vextractf128h(Address(rsp,176),xmm11);
      vextractf128h(Address(rsp,192),xmm12);
      vextractf128h(Address(rsp,208),xmm13);
      vextractf128h(Address(rsp,224),xmm14);
      vextractf128h(Address(rsp,240),xmm15);
#endif
    }
#endif
    // Save whole 128bit (16 bytes) XMM registers
    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
    movdqu(Address(rsp,off++*16),xmm0);
    movdqu(Address(rsp,off++*16),xmm1);
    movdqu(Address(rsp,off++*16),xmm2);
    movdqu(Address(rsp,off++*16),xmm3);
    movdqu(Address(rsp,off++*16),xmm4);
    movdqu(Address(rsp,off++*16),xmm5);
    movdqu(Address(rsp,off++*16),xmm6);
    movdqu(Address(rsp,off++*16),xmm7);
#ifdef _LP64
    movdqu(Address(rsp,off++*16),xmm8);
    movdqu(Address(rsp,off++*16),xmm9);
    movdqu(Address(rsp,off++*16),xmm10);
    movdqu(Address(rsp,off++*16),xmm11);
    movdqu(Address(rsp,off++*16),xmm12);
    movdqu(Address(rsp,off++*16),xmm13);
    movdqu(Address(rsp,off++*16),xmm14);
    movdqu(Address(rsp,off++*16),xmm15);
#endif
  }

  // Preserve registers across runtime call
  int incoming_argument_and_return_value_offset = -1;
  if (num_fpu_regs_in_use > 1) {
    // Must preserve all other FPU regs (could alternatively convert
    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
    // FPU state, but can not trust C compiler)
    NEEDS_CLEANUP;
    // NOTE that in this case we also push the incoming argument(s) to
    // the stack and restore it later; we also use this stack slot to
    // hold the return value from dsin, dcos etc.
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      subptr(rsp, sizeof(jdouble));
      fstp_d(Address(rsp, 0));
    }
    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
    // reload the arguments on top of the x87 stack for the call below
    for (int i = nb_args-1; i >= 0; i--) {
      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
    }
  }

  // spill the nb_args arguments from the x87 stack to memory
  subptr(rsp, nb_args*sizeof(jdouble));
  for (int i = 0; i < nb_args; i++) {
    fstp_d(Address(rsp, i*sizeof(jdouble)));
  }

#ifdef _LP64
  // 64-bit ABI passes FP args in xmm0/xmm1
  if (nb_args > 0) {
    movdbl(xmm0, Address(rsp, 0));
  }
  if (nb_args > 1) {
    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
  }
  assert(nb_args <= 2, "unsupported number of args");
#endif // _LP64

  // NOTE: we must not use call_VM_leaf here because that requires a
  // complete interpreter frame in debug mode -- same bug as 4387334
  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
  // do proper 64bit abi

  NEEDS_CLEANUP;
  // Need to add stack banging before this runtime call if it needs to
  // be taken; however, there is no generic stack banging routine at
  // the MacroAssembler level

  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);

#ifdef _LP64
  // move the xmm0 result back onto the x87 stack (F-TOS)
  movsd(Address(rsp, 0), xmm0);
  fld_d(Address(rsp, 0));
#endif // _LP64
  addptr(rsp, sizeof(jdouble) * nb_args);
  if (num_fpu_regs_in_use > 1) {
    // Must save return value to stack and then restore entire FPU
    // stack except incoming arguments
    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
      fld_d(Address(rsp, 0));
      addptr(rsp, sizeof(jdouble));
    }
    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
    addptr(rsp, sizeof(jdouble) * nb_args);
  }

  off = 0;
  if (UseSSE == 1)  {
    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
    addptr(rsp, sizeof(jdouble)*8);
  } else if (UseSSE >= 2)  {
    // Restore whole 128bit (16 bytes) XMM registers
    movdqu(xmm0, Address(rsp,off++*16));
    movdqu(xmm1, Address(rsp,off++*16));
    movdqu(xmm2, Address(rsp,off++*16));
    movdqu(xmm3, Address(rsp,off++*16));
    movdqu(xmm4, Address(rsp,off++*16));
    movdqu(xmm5, Address(rsp,off++*16));
    movdqu(xmm6, Address(rsp,off++*16));
    movdqu(xmm7, Address(rsp,off++*16));
#ifdef _LP64
    movdqu(xmm8, Address(rsp,off++*16));
    movdqu(xmm9, Address(rsp,off++*16));
    movdqu(xmm10, Address(rsp,off++*16));
    movdqu(xmm11, Address(rsp,off++*16));
    movdqu(xmm12, Address(rsp,off++*16));
    movdqu(xmm13, Address(rsp,off++*16));
    movdqu(xmm14, Address(rsp,off++*16));
    movdqu(xmm15, Address(rsp,off++*16));
#endif
    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
#ifdef COMPILER2
    if (MaxVectorSize > 16) {
      // Restore upper half of YMM registers.
      vinsertf128h(xmm0, Address(rsp,  0));
      vinsertf128h(xmm1, Address(rsp, 16));
      vinsertf128h(xmm2, Address(rsp, 32));
      vinsertf128h(xmm3, Address(rsp, 48));
      vinsertf128h(xmm4, Address(rsp, 64));
      vinsertf128h(xmm5, Address(rsp, 80));
      vinsertf128h(xmm6, Address(rsp, 96));
      vinsertf128h(xmm7, Address(rsp,112));
#ifdef _LP64
      vinsertf128h(xmm8, Address(rsp,128));
      vinsertf128h(xmm9, Address(rsp,144));
      vinsertf128h(xmm10, Address(rsp,160));
      vinsertf128h(xmm11, Address(rsp,176));
      vinsertf128h(xmm12, Address(rsp,192));
      vinsertf128h(xmm13, Address(rsp,208));
      vinsertf128h(xmm14, Address(rsp,224));
      vinsertf128h(xmm15, Address(rsp,240));
#endif
      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
    }
#endif
  }
  popa();
}
4348 static const double pi_4 = 0.7853981633974483;
// Emit a trigonometric intrinsic ('s' = sin, 'c' = cos, 't' = tan) for
// the value in F-TOS, leaving the result in F-TOS.  Arguments with
// |x| <= pi/4 use the corresponding x87 instruction directly; anything
// larger (or an unreachable pi_4 constant) goes through
// fp_runtime_fallback to the SharedRuntime implementation.
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  // was attempted in this code; unfortunately it appears that the
  // switch to 80-bit precision and back causes this to be
  // unprofitable compared with simply performing a runtime call if
  // the argument is out of the (-pi/4, pi/4) range.

  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rbx,
    tmp = rbx;
    push(tmp);
  }

  Label slow_case, done;

  ExternalAddress pi4_adr = (address)&pi_4;
  if (reachable(pi4_adr)) {
    // x ?<= pi/4
    fld_d(pi4_adr);
    fld_s(1);                // Stack:  X  PI/4  X
    fabs();                  // Stack: |X| PI/4  X
    fcmp(tmp);
    jcc(Assembler::above, slow_case);

    // fastest case: -pi/4 <= x <= pi/4
    switch(trig) {
    case 's':
      fsin();
      break;
    case 'c':
      fcos();
      break;
    case 't':
      ftan();
      break;
    default:
      assert(false, "bad intrinsic");
      break;
    }
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);

  switch(trig) {
  case 's':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
    }
    break;
  case 'c':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
    }
    break;
  case 't':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
    }
    break;
  default:
    assert(false, "bad intrinsic");
    break;
  }

  // Come here with result in F-TOS
  bind(done);

  if (tmp != noreg) {
    pop(tmp);
  }
}
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
// Clobbers recv_klass and scan_temp.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  // scan_temp := vtable length (positive movl does right thing on LP64)
  movl(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // scan_temp := address of the first itableOffsetEntry
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for InstanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // Loop peeled once: the first iteration falls through to the scan
  // loop on a miss; subsequent iterations branch back to 'search'.
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
}
4501 // virtual method calling
4502 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4503 RegisterOrConstant vtable_index,
4504 Register method_result) {
4505 const int base = InstanceKlass::vtable_start_offset() * wordSize;
4506 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4507 Address vtable_entry_addr(recv_klass,
4508 vtable_index, Address::times_ptr,
4509 base + vtableEntry::method_offset_in_bytes());
4510 movptr(method_result, vtable_entry_addr);
4511 }
// Full subtype check: jumps to L_success if sub_klass is a subtype of
// super_klass, and falls through otherwise.  Emits both the fast-path
// and the slow-path (secondary supers scan) checks.  Clobbers temp_reg.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}
// Fast-path subtype check: identity test plus the supertype-display /
// super-cache probe.  Each of L_success / L_failure / L_slow_path may
// be NULL, meaning "fall through" for that outcome (at most one may be
// NULL).  When super_check_offset is not supplied it is loaded from
// super_klass (requiring temp_reg).  A slow-path outcome must be
// resolved by check_klass_subtype_slow_path.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb.  If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    local_jcc(Assembler::equal, *L_success);
    // Offset equal to sc_offset means the display probe was actually
    // the cache probe, so a miss is inconclusive -> slow path.
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
// Slow path of the subtype check: linearly scan sub_klass's secondary-supers
// array for super_klass. On a hit the super is cached in the secondary-super
// cache and control transfers to L_success; on a miss, to L_failure. Either
// label may be NULL, meaning "fall through". The repne_scan requires the
// fixed registers rax/rcx/rdi, which are spilled/restored unless they are
// one of the designated temps.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr( sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax || UseCompressedOops) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Bump SharedRuntime::_partial_subtype_ctr for diagnostics.
  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64( incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length. (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.

  testptr(rax,rax); // Set Z = 0
  repne_scan();

  // Unspill the temp. registers:
  // (pops are the exact reverse of the pushes above)
  if (pushed_rdi) pop(rdi);
  if (pushed_rcx) pop(rcx);
  if (pushed_rax) pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files: rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-NULL");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  // Miss: ZF clear after the scan (or after testptr if the array was empty).
  if (L_failure == &L_fallthrough)
    jccb(Assembler::notEqual, *L_failure);
  else jcc(Assembler::notEqual, *L_failure);

  // Success. Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
4721 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4722 if (VM_Version::supports_cmov()) {
4723 cmovl(cc, dst, src);
4724 } else {
4725 Label L;
4726 jccb(negate_condition(cc), L);
4727 movl(dst, src);
4728 bind(L);
4729 }
4730 }
4732 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4733 if (VM_Version::supports_cmov()) {
4734 cmovl(cc, dst, src);
4735 } else {
4736 Label L;
4737 jccb(negate_condition(cc), L);
4738 movl(dst, src);
4739 bind(L);
4740 }
4741 }
// Emit a call to the verify_oop stub for the oop in 'reg', tagged with
// message 's'. No-op unless -XX:+VerifyOops. The push order (rscratch1 on
// LP64, then rax, then the oop, then the message) is the stub's calling
// convention; the stub pops the arguments and restores rax/r10 itself.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");
#ifdef _LP64
  push(rscratch1); // save r10, trashed by movptr()
#endif
  push(rax); // save rax,
  push(reg); // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
  BLOCK_COMMENT("} verify_oop");
}
// Return the value at *delayed_value_addr (+ offset) as a constant if it is
// already known at code-generation time; otherwise emit code that loads it
// at run time into 'tmp' and return tmp. Solves the generation-ordering
// problem where the value is filled in after this code is generated.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  movptr(tmp, ExternalAddress((address) delayed_value_addr));

#ifdef ASSERT
  // In debug builds, trap at run time if the delayed value is still zero.
  { Label L;
    testptr(tmp, tmp);
    if (WizardMode) {
      const char* buf = NULL;
      {
        ResourceMark rm;
        stringStream ss;
        ss.print("DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
        buf = code_string(ss.as_string());
      }
      jcc(Assembler::notZero, L);
      STOP(buf);
    } else {
      jccb(Assembler::notZero, L);
      hlt();
    }
    bind(L);
  }
#endif

  if (offset != 0)
    addptr(tmp, offset);

  return RegisterOrConstant(tmp);
}
4811 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4812 int extra_slot_offset) {
4813 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4814 int stackElementSize = Interpreter::stackElementSize;
4815 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4816 #ifdef ASSERT
4817 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4818 assert(offset1 - offset == stackElementSize, "correct arithmetic");
4819 #endif
4820 Register scale_reg = noreg;
4821 Address::ScaleFactor scale_factor = Address::no_scale;
4822 if (arg_slot.is_constant()) {
4823 offset += arg_slot.as_constant() * stackElementSize;
4824 } else {
4825 scale_reg = arg_slot.as_register();
4826 scale_factor = Address::times(stackElementSize);
4827 }
4828 offset += wordSize; // return PC is on stack
4829 return Address(rsp, scale_reg, scale_factor, offset);
4830 }
// Like verify_oop, but for an oop located at a stack/memory address rather
// than in a register. No-op unless -XX:+VerifyOops.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
#ifdef _LP64
  push(rscratch1); // save r10, trashed by movptr()
#endif
  push(rax); // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  ExternalAddress buffer((address) b);
  // pass msg argument
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
// Debug-only: emit code checking the current thread's TLAB invariant
// start <= top <= end, stopping the VM on violation. Compiled away in
// product builds and a no-op unless UseTLAB and VerifyOops are both set.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    // On 32-bit the thread must be materialized into rbx; on 64-bit it
    // lives permanently in r15.
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}
// Debug view of the x87 FPU control word; the low 16 bits of _value hold
// the hardware control word (bit layout per the Intel SDM, Vol. 1,
// "x87 FPU Control Word").
class ControlWord {
 public:
  int32_t _value;

  int rounding_control() const { return (_value >> 10) & 3 ; } // 2-bit RC field
  int precision_control() const { return (_value >> 8) & 3 ; } // 2-bit PC field
  bool precision() const { return ((_value >> 5) & 1) != 0; }
  bool underflow() const { return ((_value >> 4) & 1) != 0; }
  bool overflow() const { return ((_value >> 3) & 1) != 0; }
  bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
  bool denormalized() const { return ((_value >> 1) & 1) != 0; }
  bool invalid() const { return ((_value >> 0) & 1) != 0; }

  // Print the control word as hex plus decoded mask flags, rounding mode
  // and precision mode (no trailing newline).
  void print() const {
    // rounding control. The masked value covers all four cases; rc/pc are
    // still initialized so the compiler cannot see a maybe-uninitialized use.
    const char* rc = "?";
    switch (rounding_control()) {
      case 0: rc = "round near"; break;
      case 1: rc = "round down"; break;
      case 2: rc = "round up "; break;
      case 3: rc = "chop "; break;
    }
    // precision control
    const char* pc = "?";
    switch (precision_control()) {
      case 0: pc = "24 bits "; break;
      case 1: pc = "reserved"; break;
      case 2: pc = "53 bits "; break;
      case 3: pc = "64 bits "; break;
    }
    // flags: upper-case letter means the exception mask bit is set
    char f[9];
    f[0] = ' ';
    f[1] = ' ';
    f[2] = (precision ()) ? 'P' : 'p';
    f[3] = (underflow ()) ? 'U' : 'u';
    f[4] = (overflow ()) ? 'O' : 'o';
    f[5] = (zero_divide ()) ? 'Z' : 'z';
    f[6] = (denormalized()) ? 'D' : 'd';
    f[7] = (invalid ()) ? 'I' : 'i';
    f[8] = '\x0';
    // output
    printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }

};
// Debug view of the x87 FPU status word; the low 16 bits of _value hold
// the hardware status word.
class StatusWord {
 public:
  int32_t _value;

  bool busy()         const { return ((_value >> 15) & 1) != 0; }
  bool C3()           const { return ((_value >> 14) & 1) != 0; }
  int  top()          const { return  (_value >> 11) & 7;       }
  bool C2()           const { return ((_value >> 10) & 1) != 0; }
  bool C1()           const { return ((_value >>  9) & 1) != 0; }
  bool C0()           const { return ((_value >>  8) & 1) != 0; }
  bool error_status() const { return ((_value >>  7) & 1) != 0; }
  bool stack_fault()  const { return ((_value >>  6) & 1) != 0; }
  bool precision()    const { return ((_value >>  5) & 1) != 0; }
  bool underflow()    const { return ((_value >>  4) & 1) != 0; }
  bool overflow()     const { return ((_value >>  3) & 1) != 0; }
  bool zero_divide()  const { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const { return ((_value >>  1) & 1) != 0; }
  bool invalid()      const { return ((_value >>  0) & 1) != 0; }

  // Print the status word as hex plus decoded exception flags, condition
  // codes and stack top (no trailing newline).
  void print() const {
    // condition code nibble: digit when set, '-' when clear
    char cc[5];
    cc[0] = C3() ? '3' : '-';
    cc[1] = C2() ? '2' : '-';
    cc[2] = C1() ? '1' : '-';
    cc[3] = C0() ? '0' : '-';
    cc[4] = '\x0';
    // exception/status flags, one letter each
    char flags[9];
    flags[0] = error_status() ? 'E' : '-';
    flags[1] = stack_fault()  ? 'S' : '-';
    flags[2] = precision()    ? 'P' : '-';
    flags[3] = underflow()    ? 'U' : '-';
    flags[4] = overflow()     ? 'O' : '-';
    flags[5] = zero_divide()  ? 'Z' : '-';
    flags[6] = denormalized() ? 'D' : '-';
    flags[7] = invalid()      ? 'I' : '-';
    flags[8] = '\x0';
    // output
    printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, flags, cc, top());
  }

};
// Debug view of the x87 FPU tag word: eight 2-bit tags, one per physical
// register (0 = valid, 1 = zero, 2 = special, 3 = empty).
class TagWord {
 public:
  int32_t _value;

  // Tag for physical register i.
  int tag_at(int i) const { return (_value >> (2 * i)) & 3; }

  // Print the raw 16-bit tag word in hex (no trailing newline).
  void print() const {
    printf("%04x", _value & 0xFFFF);
  }

};
// Debug view of one 80-bit x87 register: 64-bit mantissa in _m0/_m1 plus
// sign-and-exponent halfword in _ex.
class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  // True for the x87 "real indefinite" QNaN bit pattern.
  bool is_indefinite() const {
    return _m0 == 0 && _m1 == (int32_t)0xC0000000 && _ex == -1;
  }

  // Print sign, exponent and mantissa in hex, tagging NaN encodings
  // (no trailing newline).
  void print() const {
    char sign = '+';
    if (_ex < 0) sign = '-';
    bool is_nan = (_ex == 0x7FFF) || (_ex == (int16_t)-1);
    printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, is_nan ? "NaN" : " ");
  }

};
// Debug view of the full x87 FPU state as laid out by push_CPU_state:
// control/status/tag words, fault bookkeeping, and the eight 80-bit
// registers as raw bytes.
class FPU_State {
 public:
  enum {
    register_size = 10, // bytes per 80-bit register image
    number_of_registers = 8,
    register_mask = 7
  };

  ControlWord _control_word;
  StatusWord _status_word;
  TagWord _tag_word;
  int32_t _error_offset;
  int32_t _error_selector;
  int32_t _data_offset;
  int32_t _data_selector;
  int8_t _register[register_size * number_of_registers];

  // Tag of stack slot ST(i): translate the logical index through the
  // status word's top-of-stack pointer to a physical register.
  int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  // Raw (unchecked) view of physical register i.
  FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }

  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return NULL;
  }

  // Dump all registers (physical order, '*' marking the stack top) and the
  // control/status/tag words.
  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        int j = (i - t) & register_mask; // logical ST index of physical reg i
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word .print(); printf("\n");
  }

};
// Debug view of the x86 EFLAGS register (arithmetic/status bits only).
class Flag_Register {
 public:
  int32_t _value;

  bool overflow()        const { return ((_value >> 11) & 1) != 0; }
  bool direction()       const { return ((_value >> 10) & 1) != 0; }
  bool sign()            const { return ((_value >>  7) & 1) != 0; }
  bool zero()            const { return ((_value >>  6) & 1) != 0; }
  bool auxiliary_carry() const { return ((_value >>  4) & 1) != 0; }
  bool parity()          const { return ((_value >>  2) & 1) != 0; }
  bool carry()           const { return ((_value >>  0) & 1) != 0; }

  // Print the raw value in hex plus one letter per set flag
  // (no trailing newline).
  void print() const {
    // flags: letter when set, '-' when clear
    char flags[8];
    flags[0] = overflow()        ? 'O' : '-';
    flags[1] = direction()       ? 'D' : '-';
    flags[2] = sign()            ? 'S' : '-';
    flags[3] = zero()            ? 'Z' : '-';
    flags[4] = auxiliary_carry() ? 'A' : '-';
    flags[5] = parity()          ? 'P' : '-';
    flags[6] = carry()           ? 'C' : '-';
    flags[7] = '\x0';
    // output
    printf("%08x flags = %s", _value, flags);
  }

};
// Debug view of one 32-bit integer-unit register.
class IU_Register {
 public:
  int32_t _value;

  // Print the register as hex and as signed decimal (no trailing newline).
  void print() const {
    printf("%08x %11d", _value, _value);
  }

};
// Debug view of the integer-unit register file, in the order push_CPU_state
// leaves it on the stack (pusha order: flags first, then rdi..rax).
class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register _rdi;
  IU_Register _rsi;
  IU_Register _rbp;
  IU_Register _rsp;
  IU_Register _rbx;
  IU_Register _rdx;
  IU_Register _rcx;
  IU_Register _rax;

  // Dump all general registers followed by the flags, one per line.
  void print() const {
    // computation registers
    printf("rax, = "); _rax.print(); printf("\n");
    printf("rbx, = "); _rbx.print(); printf("\n");
    printf("rcx = "); _rcx.print(); printf("\n");
    printf("rdx = "); _rdx.print(); printf("\n");
    printf("rdi = "); _rdi.print(); printf("\n");
    printf("rsi = "); _rsi.print(); printf("\n");
    printf("rbp, = "); _rbp.print(); printf("\n");
    printf("rsp = "); _rsp.print(); printf("\n");
    printf("\n");
    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};
// Combined FPU + integer-unit snapshot, matching the layout produced by
// push_CPU_state and consumed by the debug helpers below.
class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State _iu_state;

  // Dump the whole snapshot between separator lines.
  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }

};
// Runtime trampoline called from generated code (see print_CPU_state):
// plain C function so it can be reached via a RuntimeAddress call.
static void _print_CPU_state(CPU_State* state) {
  state->print();
};
// Emit code that snapshots the CPU state onto the stack, prints it via
// _print_CPU_state, then restores the state. Debug aid only.
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp); // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize); // discard argument
  pop_CPU_state();
}
// Runtime helper called from generated code (see verify_FPU below): checks
// that the x87 register stack in *state is contiguous and has the expected
// depth. Prints the state and asserts (returning false) on any violation.
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  static int counter = 0;
  FPU_State* fs = &state->_fpu_state;
  counter++;
  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
  if( stack_depth < 0 ) {
    if( fs->tag_for_st(7) != 3 ) { // 3 == empty tag
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    }
    return true; // All other stack states do not matter
  }

  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
         "bad FPU control word");

  // compute stack depth: count non-empty slots from the top, then verify
  // everything below them is empty
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++;
  int d = i;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  }
  // check if computed stack depth corresponds to expected stack depth
  if (stack_depth < 0) {
    // NOTE(review): this branch appears unreachable -- stack_depth < 0 was
    // already handled (and returned) by the leaf-call check above; confirm
    // before relying on it.
    // expected stack depth is -stack_depth or less
    if (d > -stack_depth) {
      // too many elements on the stack
      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  } else {
    // expected stack depth is stack_depth
    if (d != stack_depth) {
      // wrong stack depth
      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  }
  // everything is cool
  return true;
}
// Emit code that calls _verify_FPU (above) with the current CPU state,
// message 's' and the expected stack depth, and breaks (int3) if the check
// fails. No-op unless -XX:+VerifyFPU.
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp); // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth); // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize); // discard arguments
  // check for error
  { Label L;
    testl(rax, rax); // _verify_FPU's boolean result
    jcc(Assembler::notZero, L);
    int3(); // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
// Emit cleanup code to run after returning from a JNI call: restore (or,
// with -Xcheck:jni, verify) MXCSR, clear upper YMM bits on AVX hardware,
// and on 32-bit restore/verify the x87 control word.
void MacroAssembler::restore_cpu_control_state_after_jni() {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  if (VM_Version::supports_sse()) {
    if (RestoreMXCSROnJNICalls) {
      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
    } else if (CheckJNICalls) {
      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
    }
  }
  if (VM_Version::supports_avx()) {
    // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
    vzeroupper();
  }

#ifndef _LP64
  // Either restore the x87 floating pointer control word after returning
  // from the JNI call or verify that it wasn't changed.
  if (CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
  }
#endif // _LP64
}
5275 void MacroAssembler::load_klass(Register dst, Register src) {
5276 #ifdef _LP64
5277 if (UseCompressedClassPointers) {
5278 movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5279 decode_klass_not_null(dst);
5280 } else
5281 #endif
5282 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5283 }
// Load the prototype mark word of src's klass into dst (clobbers dst as a
// temporary for the klass pointer).
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  movptr(dst, Address(dst, Klass::prototype_header_offset()));
}
5290 void MacroAssembler::store_klass(Register dst, Register src) {
5291 #ifdef _LP64
5292 if (UseCompressedClassPointers) {
5293 encode_klass_not_null(src);
5294 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5295 } else
5296 #endif
5297 movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5298 }
5300 void MacroAssembler::load_heap_oop(Register dst, Address src) {
5301 #ifdef _LP64
5302 // FIXME: Must change all places where we try to load the klass.
5303 if (UseCompressedOops) {
5304 movl(dst, src);
5305 decode_heap_oop(dst);
5306 } else
5307 #endif
5308 movptr(dst, src);
5309 }
// Load a known-non-null heap oop from src into dst.
// Doesn't do verification, generates fixed size code
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    // Non-null decode skips the null check of the general decode.
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, src);
}
// Store the heap oop in src to dst, encoding it in place first (clobbering
// src) when compressed oops are in use.
void MacroAssembler::store_heap_oop(Address dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    // dst must not be addressed via src, since src is about to be encoded.
    assert(!dst.uses(src), "not enough registers");
    encode_heap_oop(src);
    movl(dst, src);
  } else
#endif
    movptr(dst, src);
}
// Compare the (uncompressed) oop in src1 with the heap oop stored at src2.
// With compressed oops the memory operand must be decoded first, which
// needs a temp register; if none is supplied, rax is spilled and used.
void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
  assert_different_registers(src1, tmp);
#ifdef _LP64
  if (UseCompressedOops) {
    bool did_push = false;
    if (tmp == noreg) {
      tmp = rax;
      push(tmp);
      did_push = true;
      // The push moved rsp, so src2 must not be rsp-relative in this case.
      assert(!src2.uses(rsp), "can't push");
    }
    load_heap_oop(tmp, src2);
    cmpptr(src1, tmp);
    if (did_push) pop(tmp);
  } else
#endif
    cmpptr(src1, src2);
}
// Used for storing NULLs.
// Writes a null oop to dst without needing a register holding null.
void MacroAssembler::store_heap_oop_null(Address dst) {
#ifdef _LP64
  if (UseCompressedOops) {
    // Narrow oop field: a 32-bit zero is the compressed null.
    movl(dst, (int32_t)NULL_WORD);
  } else {
    // Full-width field: sign-extending the 32-bit zero writes all 64 bits.
    movslq(dst, (int32_t)NULL_WORD);
  }
#else
  movl(dst, (int32_t)NULL_WORD);
#endif
}
5365 #ifdef _LP64
5366 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5367 if (UseCompressedClassPointers) {
5368 // Store to klass gap in destination
5369 movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5370 }
5371 }
5373 #ifdef ASSERT
// Debug-only: emit a check that r12 still holds the compressed-oops heap
// base, stopping the VM with 'msg' if it has been corrupted.
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    pop(rscratch1);
  }
}
5387 #endif
// Algorithm must match oop.inline.hpp encode_heap_oop.
// Null-preserving encode of the oop in r: subtract the heap base (if any)
// and shift right by the object alignment.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(r, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based heap: encoding is at most a shift.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // Replace a null oop with the heap base so the subtraction below yields
  // a compressed null (0).
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
// Encode the known-non-null oop in r in place: subtract the heap base
// (if any) and shift. Cheaper than encode_heap_oop (no null check/cmov).
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    // Catch callers that violate the non-null precondition.
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    subq(r, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
// Two-register form: encode the known-non-null oop in src into dst,
// leaving src unchanged when dst != src.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    // Catch callers that violate the non-null precondition.
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (Universe::narrow_oop_base() != NULL) {
    subq(dst, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
// Null-preserving decode of the compressed oop in r: shift left and add the
// heap base (if any), leaving a compressed null (0) as a null oop.
void MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based heap: decoding is at most a shift.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    // The jccb relies on shlq setting ZF from its result so that a
    // compressed null skips the heap-base add and stays null.
    shlq(r, LogMinObjAlignmentInBytes);
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop(r, "broken oop in decode_heap_oop");
}
// Decode the known-non-null compressed oop in r in place (shift, then add
// the heap base if any). Kept at a fixed instruction shape because callers
// count the generated instructions.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (Universe::narrow_oop_base() != NULL) {
      addq(r, r12_heapbase);
    }
  } else {
    // No shift implies a zero base as well.
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}
// Two-register form: decode the known-non-null compressed oop in src into
// dst. Uses a single leaq (base + src*8) when the shift equals the scale
// factor; otherwise falls back to move/shift/add.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // Single instruction: dst = r12_heapbase + src * 8.
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (Universe::narrow_oop_base() != NULL) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    // No shift implies a zero base: a plain move suffices.
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
// Encode the Klass* in r in place: subtract the klass base (if any) and
// shift. Temporarily borrows r12 for the base and restores it afterwards
// via reinit_heapbase().
void MacroAssembler::encode_klass_not_null(Register r) {
  if (Universe::narrow_klass_base() != NULL) {
    // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
    assert(r != r12_heapbase, "Encoding a klass in r12");
    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
    subq(r, r12_heapbase);
  }
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    shrq(r, LogKlassAlignmentInBytes);
  }
  if (Universe::narrow_klass_base() != NULL) {
    // Put the compressed-oops heap base back into r12.
    reinit_heapbase();
  }
}
// Two-register form: encode the Klass* in src into dst. Computes
// dst = src - base as dst = -base + src, so r12 is never touched.
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (dst == src) {
    encode_klass_not_null(src);
  } else {
    if (Universe::narrow_klass_base() != NULL) {
      mov64(dst, (int64_t)Universe::narrow_klass_base());
      negq(dst);
      addq(dst, src);
    } else {
      movptr(dst, src);
    }
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      shrq(dst, LogKlassAlignmentInBytes);
    }
  }
}
5552 // Function instr_size_for_decode_klass_not_null() counts the instructions
5553 // generated by decode_klass_not_null(register r) and reinit_heapbase(),
5554 // when (Universe::heap() != NULL). Hence, if the instructions they
5555 // generate change, then this method needs to be updated.
5556 int MacroAssembler::instr_size_for_decode_klass_not_null() {
5557 assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5558 if (Universe::narrow_klass_base() != NULL) {
5559 // mov64 + addq + shlq? + mov64 (for reinit_heapbase()).
5560 return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
5561 } else {
5562 // longest load decode klass function, mov64, leaq
5563 return 16;
5564 }
5565 }
// !!! If the instructions that get generated here change then function
// instr_size_for_decode_klass_not_null() needs to get updated.
// Decode the narrow klass in r in place: shift, then add the klass base
// loaded temporarily into r12 (restored via reinit_heapbase()).
void MacroAssembler::decode_klass_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert(r != r12_heapbase, "Decoding a klass in r12");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    shlq(r, LogKlassAlignmentInBytes);
  }
  // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
  if (Universe::narrow_klass_base() != NULL) {
    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
    addq(r, r12_heapbase);
    reinit_heapbase();
  }
}
// Two-register form: decode the narrow klass in src into dst. Loads the
// klass base into dst first, so r12 is never disturbed.
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  if (dst == src) {
    decode_klass_not_null(dst);
  } else {
    // Cannot assert, unverified entry point counts instructions (see .ad file)
    // vtableStubs also counts instructions in pd_code_size_limit.
    // Also do not verify_oop as this is called by verify_oop.
    mov64(dst, (int64_t)Universe::narrow_klass_base());
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
      // Single instruction: dst = base + src * 8.
      leaq(dst, Address(dst, src, Address::times_8, 0));
    } else {
      addq(dst, src);
    }
  }
}
5608 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5609 assert (UseCompressedOops, "should only be used for compressed headers");
5610 assert (Universe::heap() != NULL, "java heap should be initialized");
5611 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5612 int oop_index = oop_recorder()->find_index(obj);
5613 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5614 mov_narrow_oop(dst, oop_index, rspec);
5615 }
// Store the compressed form of 'obj' to memory at dst, recording an oop
// relocation so the GC can patch the embedded value.
void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
// Materialize the compressed form of klass 'k' in dst, recording a
// metadata relocation for the embedded value.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
}
// Store the compressed form of klass 'k' to memory at dst, recording a
// metadata relocation for the embedded value.
void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
}
5642 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5643 assert (UseCompressedOops, "should only be used for compressed headers");
5644 assert (Universe::heap() != NULL, "java heap should be initialized");
5645 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5646 int oop_index = oop_recorder()->find_index(obj);
5647 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5648 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5649 }
5651 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5652 assert (UseCompressedOops, "should only be used for compressed headers");
5653 assert (Universe::heap() != NULL, "java heap should be initialized");
5654 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5655 int oop_index = oop_recorder()->find_index(obj);
5656 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5657 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5658 }
5660 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5661 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5662 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5663 int klass_index = oop_recorder()->find_index(k);
5664 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5665 Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
5666 }
5668 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5669 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5670 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5671 int klass_index = oop_recorder()->find_index(k);
5672 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5673 Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
5674 }
5676 void MacroAssembler::reinit_heapbase() {
5677 if (UseCompressedOops || UseCompressedClassPointers) {
5678 if (Universe::heap() != NULL) {
5679 if (Universe::narrow_oop_base() == NULL) {
5680 MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5681 } else {
5682 mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
5683 }
5684 } else {
5685 movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
5686 }
5687 }
5688 }
5690 #endif // _LP64
// C2 compiled method's prolog code.
// Emits the verified-entry sequence: optional stack-bang overflow check,
// rbp save, frame allocation of 'framesize' bytes, an optional stack-depth
// cookie, and (32-bit only) FPU mode setup.
void MacroAssembler::verified_entry(int framesize, bool stack_bang, bool fp_mode_24b) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr (already pushed by the caller's call).
  framesize -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang) {
    generate_stack_overflow_check(framesize);

    // We always push rbp, so that on return to interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value so the
    // first instruction is >= 5 bytes and remains patchable — see WARNING).
    subptr_imm32(rsp, framesize);

    // Save RBP register now (stored into the frame instead of pushed,
    // since the frame was allocated in one step above).
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  // Debug-only runtime check that rsp is StackAlignmentInBytes-aligned
  // (modulo the return-address word) at this point.
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

}
5765 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
5766 // cnt - number of qwords (8-byte words).
5767 // base - start address, qword aligned.
5768 assert(base==rdi, "base register must be edi for rep stos");
5769 assert(tmp==rax, "tmp register must be eax for rep stos");
5770 assert(cnt==rcx, "cnt register must be ecx for rep stos");
5772 xorptr(tmp, tmp);
5773 if (UseFastStosb) {
5774 shlptr(cnt,3); // convert to number of bytes
5775 rep_stosb();
5776 } else {
5777 NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
5778 rep_stos();
5779 }
5780 }
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
// On exit 'result' holds the match index (in chars) or -1 if not found.
void MacroAssembler::string_indexofC8(Register str1, Register str2,
                                      Register cnt1, Register cnt2,
                                      int int_cnt2,  Register result,
                                      XMMRegister vec, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 is required");

  // This method uses the pcmpestri instruction with bound registers:
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= 8, "this code isused only for cnt2 >= 8 chars");

  // Load first 8 chars of the substring into vec.
  movdqu(vec, Address(str2, 0));
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > 8) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movdqu(vec, Address(str2, 0));
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);    // advance string pointer by one char

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1: partial/full match found
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == 8) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1: match at index 0
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR); // head matched, check the tail
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == 8) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring: fall into not-found.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(EXIT);

  if (int_cnt2 > 8) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), 0x0d);
    // Reload only string if does not match
    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0: head no longer matches

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched. Use cnt2 as a negative offset
    // counting up from -(int_cnt2 - 8) toward 0.
    negptr(cnt2);
    addptr(cnt2, 8);

    bind(SCAN_SUBSTR);
    subl(cnt1, 8);
    cmpl(cnt2, -8); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, 8);
    movl(cnt2, 8);  negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, 8);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset: byte distance from string start, then
  // shift right by 1 to convert to a char index.
  subptr(result, str1);
  shrl(result, 1); // index
  bind(EXIT);

} // string_indexofC8
// Small strings are loaded through stack if they cross page boundary.
// General indexOf: handles small constant substrings (0 < int_cnt2 < 8)
// and non-constant substrings (int_cnt2 == -1, length in cnt2).
// On exit 'result' holds the match index (in chars) or -1 if not found.
void MacroAssembler::string_indexof(Register str1, Register str2,
                                    Register cnt1, Register cnt2,
                                    int int_cnt2,  Register result,
                                    XMMRegister vec, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 is required");
  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");

  // This method uses the pcmpestri instruction with bound registers:
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == 1) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (int_cnt2 == 2) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (int_cnt2 == 4) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 }
        // Load 16 bytes ending at the substring's last char, then shift
        // the leading garbage out. Safe because the array header precedes
        // the data: header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        movdqu(vec, Address(str2, (int_cnt2*2)-16));
        psrldq(vec, 16-(int_cnt2*2));
      }
    } else { // not constant substring
      cmpl(cnt2, 8);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-2;
      push(cnt2);

      // Copy substring char-by-char onto the stack (backwards, cnt2..1).
      bind(COPY_SUBSTR);
      load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
      movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, 8);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -2;
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    // Copy the scanned string char-by-char onto the stack.
    bind(COPY_STR);
    load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
    movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      movdqu(vec, Address(str2, 0));
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // Stack layout at this point (pushed above):
    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    movdqu(vec, Address(str2, 0));
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    shrl(str1, 1);
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, 8); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, Address::times_2, -16));
  movl(cnt1, 8);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring: fall into not-found.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, 8-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, 8);
    jccb(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, 8);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), 0x0d);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, 8);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    addptr(str2, 16);
    subl(cnt1, 8);
    cmpl(cnt2, 8); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
    lea(str2, Address(str2, cnt2, Address::times_2, -16));
    lea(str1, Address(str1, cnt2, Address::times_2, -16));
    subl(cnt1, cnt2);
    movl(cnt2, 8);
    addl(cnt1, 8);
    bind(CONT_SCAN_SUBSTR);
    movdqu(vec, Address(str2, 0));
    jmpb(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    // Restore the original string address (saved on the stack above).
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset: byte distance from string start, then
  // shift right by 1 to convert to a char index.
  subptr(result, str1);
  shrl(result, 1); // index

  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
// Compare strings.
// Lexicographically compares cnt1 chars at str1 with cnt2 chars at str2.
// On exit 'result' holds the difference of the first mismatching chars,
// or the length difference (cnt1 - cnt2) if one string is a prefix of
// the other. All input registers are clobbered.
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2, Register result,
                                    XMMRegister vec1) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;

  // Compute the minimum of the string lengths (into cnt2) and the
  // difference of the string lengths (saved on the stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result);

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);

  // Compare first characters
  load_unsigned_short(result, Address(str1, 0));
  load_unsigned_short(cnt1, Address(str2, 0));
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location.
  cmpptr(str1, str2);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  Address::ScaleFactor scale = Address::times_2;
  int stride = 8;

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_TAIL_LONG;
    int pcmpmask = 0x19;

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    int stride2 = 16;
    int adr_stride  = stride  << scale;
    int adr_stride2 = stride2 << scale;

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    movdqu(vec1, Address(str1, 0));
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    movdqu(vec1, Address(str1, adr_stride));
    pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride); // mismatch is in the second 8-char vector

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_unsigned_short(result, Address(str1, cnt1, scale));
    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    subl(result, stride2);
    subl(cnt2, stride2);
    jccb(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);
    vmovdqu(vec1, Address(str1, result, scale));
    vpxor(vec1, Address(str2, result, scale));
    vptest(vec1, vec1);
    jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vzeroupper();

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jccb(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmpb(COMPARE_WIDE_VECTORS_LOOP);

    // Identifies the mismatching (higher or lower) 16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vzeroupper();
    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jccb(Assembler::less, COMPARE_SMALL_STR);

    movdqu(vec1, Address(str1, 0));
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    jccb(Assembler::zero, COMPARE_TAIL);

    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1 - substring
    //     rax  - negative string length (elements count)
    //     mem  - scanned string
    //     rdx  - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //                + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jccb(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_unsigned_short(result, Address(str1, cnt1, scale));
    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  lea(str1, Address(str1, cnt2, scale));
  lea(str2, Address(str2, cnt2, scale));
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements one char at a time
  bind(WHILE_HEAD_LABEL);
  load_unsigned_short(result, Address(str1, cnt2, scale, 0));
  load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  jmpb(DONE_LABEL);

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
}
// Compare char[] arrays aligned to 4 bytes or substrings.
// is_array_equ == true: ary1/ary2 are array oops — null and length checks
// are emitted and the data pointers computed from the array base offset.
// Otherwise ary1/ary2 already point at the data and 'limit' holds the
// char count. On exit 'result' is 1 (equal) or 0 (not equal).
void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                        Register limit, Register result, Register chr,
                                        XMMRegister vec1, XMMRegister vec2) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);

  // Check the input args: identical pointers are trivially equal.
  cmpptr(ary1, ary2);
  jcc(Assembler::equal, TRUE_LABEL);

  if (is_array_equ) {
    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array data address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  shll(limit, 1);      // byte count != 0 (chars -> bytes)
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001e);  //   tail count (in bytes)
    andl(limit, 0xffffffe0);   // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_TAIL);

    // Point past the vectorized region and count up through negative limit.
    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 32);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);

    // Compare the (overlapping) last 32 bytes to cover the tail.
    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000e);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);

    // Compare the (overlapping) last 16 bytes to cover the tail.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, TRUE_LABEL);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vzeroupper();
  }
}
// Fill a primitive array with a replicated 'value'.
//
//   t       element type: T_BYTE, T_SHORT or T_INT (anything else asserts)
//   aligned true if 'to' is known to be at least 4-byte aligned
//   to      destination address; advanced as filling proceeds
//   value   value to store; for byte/short it is replicated to 32 bits below
//   count   number of elements to fill (clobbered)
//   rtmp    general-purpose scratch register
//   xtmp    XMM scratch register used by the SSE2/AVX2 wide-store paths
//
// 'shift' below is log2 of the number of elements per 32-bit word, so
// (n << shift) converts "n 32-bit words" into an element count for 'count'.
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
  Label L_fill_2_bytes, L_fill_4_bytes;

  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;   // 4 byte elements per 32-bit word
      break;
    case T_SHORT:
      shift = 1;   // 2 short elements per 32-bit word
      break;
    case T_INT:
      shift = 0;   // 1 int element per 32-bit word
      break;
    default: ShouldNotReachHere();
  }

  // Replicate a sub-int 'value' across all 32 bits so that the wide stores
  // below write the correct pattern.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);      // byte now replicated in bits 0..15
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);      // replicate low 16 bits into high 16 bits
  }

  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subl(count, 1<<(shift-1));
    BIND(L_skip_align2);
  }
  if (UseSSE < 2) {
    // No SSE2: fill with 32-bit integer stores only.
    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
    // Fill 32-byte chunks
    subl(count, 8 << shift);
    jcc(Assembler::less, L_check_fill_8_bytes);
    align(16);

    BIND(L_fill_32_bytes_loop);

    for (int i = 0; i < 32; i += 4) {
      movl(Address(to, i), value);
    }

    addptr(to, 32);
    subl(count, 8 << shift);
    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
    BIND(L_check_fill_8_bytes);
    addl(count, 8 << shift);   // undo the last pre-decrement
    jccb(Assembler::zero, L_exit);
    jmpb(L_fill_8_bytes);

    //
    // length is too short, just fill qwords
    //
    BIND(L_fill_8_bytes_loop);
    movl(Address(to, 0), value);
    movl(Address(to, 4), value);
    addptr(to, 8);
    BIND(L_fill_8_bytes);
    subl(count, 1 << (shift + 1));
    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    // fall through to fill 4 bytes
  } else {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subl(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      assert( UseSSE >= 2, "supported cpu only" );
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      movdl(xtmp, value);
      if (UseAVX >= 2 && UseUnalignedLoadStores) {
        // Fill 64-byte chunks with 256-bit stores.
        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
        vpbroadcastd(xtmp, xtmp);   // replicate value across the whole YMM register

        subl(count, 16 << shift);   // 16 words = 64 bytes per iteration
        jcc(Assembler::less, L_check_fill_32_bytes);
        align(16);

        BIND(L_fill_64_bytes_loop);
        vmovdqu(Address(to, 0), xtmp);
        vmovdqu(Address(to, 32), xtmp);
        addptr(to, 64);
        subl(count, 16 << shift);
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

        BIND(L_check_fill_32_bytes);
        addl(count, 8 << shift);    // at least 32 bytes left?
        jccb(Assembler::less, L_check_fill_8_bytes);
        vmovdqu(Address(to, 0), xtmp);
        addptr(to, 32);
        subl(count, 8 << shift);

        BIND(L_check_fill_8_bytes);
        // clean upper bits of YMM registers
        vzeroupper();
      } else {
        // Fill 32-byte chunks with 128-bit (or 64-bit, if unaligned stores
        // are not usable) stores.
        pshufd(xtmp, xtmp, 0);      // replicate value across the XMM register

        subl(count, 8 << shift);
        jcc(Assembler::less, L_check_fill_8_bytes);
        align(16);

        BIND(L_fill_32_bytes_loop);

        if (UseUnalignedLoadStores) {
          movdqu(Address(to, 0), xtmp);
          movdqu(Address(to, 16), xtmp);
        } else {
          movq(Address(to, 0), xtmp);
          movq(Address(to, 8), xtmp);
          movq(Address(to, 16), xtmp);
          movq(Address(to, 24), xtmp);
        }

        addptr(to, 32);
        subl(count, 8 << shift);
        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);

        BIND(L_check_fill_8_bytes);
      }
      addl(count, 8 << shift);   // undo the last pre-decrement
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subl(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }
  // fill trailing 4 bytes
  BIND(L_fill_4_bytes);
  testl(count, 1<<shift);
  jccb(Assembler::zero, L_fill_2_bytes);
  movl(Address(to, 0), value);
  if (t == T_BYTE || t == T_SHORT) {
    addptr(to, 4);
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
// Encode char[] to byte[] in ISO_8859_1, i.e. copy chars that fit in one
// byte (<= 0xFF) and stop at the first char that does not.  'result'
// receives the number of characters successfully encoded.
//
//   src     source char array address (clobbered)
//   dst     destination byte array address (clobbered)
//   len     number of characters to encode (clobbered)
//   tmp1Reg..tmp4Reg  XMM scratch registers for the vectorized paths
//   tmp5    general-purpose scratch register
//   result  receives the count of encoded characters
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                      Register tmp5, Register result) {
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(src, dst, len, tmp5, result);
  Label L_done, L_copy_1_char, L_copy_1_char_exit;

  // set result
  xorl(result, result);
  // check for zero length
  testl(len, len);
  jcc(Assembler::zero, L_done);

  movl(result, len);

  // Setup pointers: point src/dst one-past-the-end so that a negative
  // 'len' can be used as the running index.
  lea(src, Address(src, len, Address::times_2)); // char[]
  lea(dst, Address(dst, len, Address::times_1)); // byte[]
  negptr(len);

  if (UseSSE42Intrinsics || UseAVX >= 2) {
    Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;

    if (UseAVX >= 2) {
      // 32 chars at a time with 256-bit vectors.
      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
      movdl(tmp1Reg, tmp5);
      vpbroadcastd(tmp1Reg, tmp1Reg);
      jmpb(L_chars_32_check);

      bind(L_copy_32_chars);
      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
      vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
      jccb(Assembler::notZero, L_copy_32_chars_exit);
      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector256 */ true);  // fix 128-bit lane order after pack
      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);

      bind(L_chars_32_check);
      addptr(len, 32);
      jccb(Assembler::lessEqual, L_copy_32_chars);

      bind(L_copy_32_chars_exit);
      subptr(len, 16);   // bias len for the 16-char loop below
      jccb(Assembler::greater, L_copy_16_chars_exit);

    } else if (UseSSE42Intrinsics) {
      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
      jmpb(L_chars_16_check);
    }

    // 16 chars at a time.
    bind(L_copy_16_chars);
    if (UseAVX >= 2) {
      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
      vptest(tmp2Reg, tmp1Reg);
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector256 */ true);
      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector256 */ true);
    } else {
      if (UseAVX > 0) {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
      } else {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        por(tmp2Reg, tmp3Reg);
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        por(tmp2Reg, tmp4Reg);
      }
      ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      packuswb(tmp3Reg, tmp4Reg);
    }
    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);

    bind(L_chars_16_check);
    addptr(len, 16);
    jccb(Assembler::lessEqual, L_copy_16_chars);

    bind(L_copy_16_chars_exit);
    if (UseAVX >= 2) {
      // clean upper bits of YMM registers
      vzeroupper();
    }
    subptr(len, 8);    // bias len for the 8-char loop below
    jccb(Assembler::greater, L_copy_8_chars_exit);

    // 8 chars at a time.
    bind(L_copy_8_chars);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
    ptest(tmp3Reg, tmp1Reg);
    jccb(Assembler::notZero, L_copy_8_chars_exit);
    packuswb(tmp3Reg, tmp1Reg);
    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
    addptr(len, 8);
    jccb(Assembler::lessEqual, L_copy_8_chars);

    bind(L_copy_8_chars_exit);
    subptr(len, 8);    // remove the bias added before the 8-char loop
    jccb(Assembler::zero, L_done);
  }

  // Scalar tail: one char at a time until done or a non-Latin-1 char.
  bind(L_copy_1_char);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
  testl(tmp5, 0xff00);      // check if Unicode char
  jccb(Assembler::notZero, L_copy_1_char_exit);
  movb(Address(dst, len, Address::times_1, 0), tmp5);
  addptr(len, 1);
  jccb(Assembler::less, L_copy_1_char);

  bind(L_copy_1_char_exit);
  addptr(result, len); // len is negative count of not processed elements
  bind(L_done);
}
/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out]crc  Register containing the crc.
 * @param [in]val      Register containing the byte to fold into the CRC; clobbered.
 * @param [in]table    Register containing the table of crc constants.
 *
 * Emitted code computes:
 *   uint32_t crc;
 *   val = crc_table[(val ^ crc) & 0xFF];
 *   crc = val ^ (crc >> 8);
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  xorl(val, crc);
  andl(val, 0xFF);               // table index = low byte of (val ^ crc)
  shrl(crc, 8); // unsigned shift
  xorl(crc, Address(table, val, Address::times_4, 0));
}
/**
 * Fold 128-bit data chunk.
 *
 * Carry-less-multiplies the 128-bit CRC state in xcrc by the fold constants
 * in xK (high and low 64-bit halves separately), then xors in the next
 * 16 input bytes at buf+offset.  xtmp is scratch.
 */
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
  vpclmulhdq(xtmp, xK, xcrc); // [123:64]
  vpclmulldq(xcrc, xK, xcrc); // [63:0]
  vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
  pxor(xcrc, xtmp);           // combine the two partial products
}
// Fold a 128-bit data chunk held in register xbuf into the CRC state in
// xcrc using the fold constants in xK.  xtmp is scratch.
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
  vpclmulhdq(xtmp, xK, xcrc);   // high 64-bit half product
  vpclmulldq(xcrc, xK, xcrc);   // low 64-bit half product
  pxor(xcrc, xbuf);
  pxor(xcrc, xtmp);
}
/**
 * 8-bit fold to compute 32-bit CRC (XMM state variant).
 *
 * Emitted code computes:
 *   uint64_t xcrc;
 *   xcrc = timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
 *
 * tmp (GPR) and xtmp (XMM) are scratch.
 */
void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
  movdl(tmp, xcrc);                                     // low 32 bits of state
  andl(tmp, 0xFF);                                      // table index = low byte
  movdl(xtmp, Address(table, tmp, Address::times_4, 0));
  psrldq(xcrc, 1); // unsigned shift one byte
  pxor(xcrc, xtmp);
}
/**
 * 8-bit fold to compute 32-bit CRC (GPR state variant).
 *
 * Emitted code computes:
 *   uint32_t crc;
 *   crc = timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 *
 * tmp is scratch.
 */
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  movl(tmp, crc);
  andl(tmp, 0xFF);    // table index = low byte
  shrl(crc, 8);
  xorl(crc, Address(table, tmp, Address::times_4, 0));
}
/**
 * Emit a CRC-32 kernel: carry-less multiplication (PCLMULQDQ) folds
 * 128 bits at a time on aligned data, with byte-at-a-time table lookups
 * for the unaligned head and the short tail.
 *
 * @param crc   register containing existing CRC (32-bit); updated in place
 * @param buf   register pointing to input byte buffer (byte*); clobbered
 * @param len   register containing number of bytes; clobbered
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 *
 * The emitted code also clobbers rax and xmm0-xmm5.
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
  assert_different_registers(crc, buf, len, table, tmp, rax);

  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;

  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
  notl(crc); // ~crc — inverted on entry; the matching invert is at L_exit
  cmpl(len, 16);
  jcc(Assembler::less, L_tail);   // too short for the vector path

  // Align buffer to 16 bytes
  movl(tmp, buf);
  andl(tmp, 0xF);
  jccb(Assembler::zero, L_aligned);
  subl(tmp, 16);       // tmp = -(bytes needed to reach alignment)
  addl(len, tmp);

  align(4);
  BIND(L_align_loop);
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
  update_byte_crc32(crc, rax, table);
  increment(buf);
  incrementl(tmp);     // counts up from negative toward zero
  jccb(Assembler::less, L_align_loop);

  BIND(L_aligned);
  movl(tmp, len); // save
  shrl(len, 4);        // len = number of 16-byte chunks
  jcc(Assembler::zero, L_tail_restore);

  // Fold crc into first bytes of vector
  movdqa(xmm1, Address(buf, 0));
  movdl(rax, xmm1);
  xorl(crc, rax);
  pinsrd(xmm1, crc, 0);
  addptr(buf, 16);
  subl(len, 4); // len > 0
  jcc(Assembler::less, L_fold_tail);

  // Prime three more 128-bit streams for 4-way parallel folding.
  movdqa(xmm2, Address(buf, 0));
  movdqa(xmm3, Address(buf, 16));
  movdqa(xmm4, Address(buf, 32));
  addptr(buf, 48);
  subl(len, 3);
  jcc(Assembler::lessEqual, L_fold_512b);

  // Fold total 512 bits of polynomial on each iteration,
  // 128 bits per each of 4 parallel streams.
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));

  align(32);
  BIND(L_fold_512b_loop);
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
  addptr(buf, 64);
  subl(len, 4);
  jcc(Assembler::greater, L_fold_512b_loop);

  // Fold 512 bits to 128 bits.
  BIND(L_fold_512b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);

  // Fold the rest of 128 bits data chunks
  BIND(L_fold_tail);
  addl(len, 3);
  jccb(Assembler::lessEqual, L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));

  BIND(L_fold_tail_loop);
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
  addptr(buf, 16);
  decrementl(len);
  jccb(Assembler::greater, L_fold_tail_loop);

  // Fold 128 bits in xmm1 down into 32 bits in crc register.
  BIND(L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
  vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
  vpand(xmm3, xmm0, xmm2, false /* vector256 */);
  vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
  psrldq(xmm1, 8);
  psrldq(xmm2, 4);
  pxor(xmm0, xmm1);
  pxor(xmm0, xmm2);

  // 8 8-bit folds to compute 32-bit CRC.
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(xmm0, table, xmm1, rax);
  }
  movdl(crc, xmm0); // mov 32 bits to general register
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(crc, table, rax);
  }

  BIND(L_tail_restore);
  movl(len, tmp); // restore
  BIND(L_tail);
  andl(len, 0xf);      // remaining bytes after the 16-byte chunks
  jccb(Assembler::zero, L_exit);

  // Fold the rest of bytes
  align(4);
  BIND(L_tail_loop);
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
  update_byte_crc32(crc, rax, table);
  increment(buf);
  decrementl(len);
  jccb(Assembler::greater, L_tail_loop);

  BIND(L_exit);
  notl(crc); // ~c — undo the inversion applied on entry
}
7076 #undef BIND
7077 #undef BLOCK_COMMENT
7080 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
7081 switch (cond) {
7082 // Note some conditions are synonyms for others
7083 case Assembler::zero: return Assembler::notZero;
7084 case Assembler::notZero: return Assembler::zero;
7085 case Assembler::less: return Assembler::greaterEqual;
7086 case Assembler::lessEqual: return Assembler::greater;
7087 case Assembler::greater: return Assembler::lessEqual;
7088 case Assembler::greaterEqual: return Assembler::less;
7089 case Assembler::below: return Assembler::aboveEqual;
7090 case Assembler::belowEqual: return Assembler::above;
7091 case Assembler::above: return Assembler::belowEqual;
7092 case Assembler::aboveEqual: return Assembler::below;
7093 case Assembler::overflow: return Assembler::noOverflow;
7094 case Assembler::noOverflow: return Assembler::overflow;
7095 case Assembler::negative: return Assembler::positive;
7096 case Assembler::positive: return Assembler::negative;
7097 case Assembler::parity: return Assembler::noParity;
7098 case Assembler::noParity: return Assembler::parity;
7099 }
7100 ShouldNotReachHere(); return Assembler::overflow;
7101 }
7103 SkipIfEqual::SkipIfEqual(
7104 MacroAssembler* masm, const bool* flag_addr, bool value) {
7105 _masm = masm;
7106 _masm->cmp8(ExternalAddress((address)flag_addr), value);
7107 _masm->jcc(Assembler::equal, _label);
7108 }
SkipIfEqual::~SkipIfEqual() {
  // Bind the target of the conditional jump emitted by the constructor,
  // ending the skipped code region.
  _masm->bind(_label);
}