src/cpu/x86/vm/x86_32.ad

changeset 435:a61af66fc99e    child 506:3d62cb85208d
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/cpu/x86/vm/x86_32.ad	Sat Dec 01 00:00:00 2007 +0000
     1.3 @@ -0,0 +1,12778 @@
     1.4 +//
     1.5 +// Copyright 1997-2007 Sun Microsystems, Inc.  All Rights Reserved.
     1.6 +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.7 +//
     1.8 +// This code is free software; you can redistribute it and/or modify it
     1.9 +// under the terms of the GNU General Public License version 2 only, as
    1.10 +// published by the Free Software Foundation.
    1.11 +//
    1.12 +// This code is distributed in the hope that it will be useful, but WITHOUT
    1.13 +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    1.14 +// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    1.15 +// version 2 for more details (a copy is included in the LICENSE file that
    1.16 +// accompanied this code).
    1.17 +//
    1.18 +// You should have received a copy of the GNU General Public License version
    1.19 +// 2 along with this work; if not, write to the Free Software Foundation,
    1.20 +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    1.21 +//
    1.22 +// Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
    1.23 +// CA 95054 USA or visit www.sun.com if you need additional information or
    1.24 +// have any questions.
    1.25 +//
    1.26 +//
    1.27 +
    1.28 +// X86 Architecture Description File
    1.29 +
    1.30 +//----------REGISTER DEFINITION BLOCK------------------------------------------
    1.31 +// This information is used by the matcher and the register allocator to
    1.32 +// describe individual registers and classes of registers within the target
     1.33 +// architecture.
    1.34 +
    1.35 +register %{
    1.36 +//----------Architecture Description Register Definitions----------------------
    1.37 +// General Registers
    1.38 +// "reg_def"  name ( register save type, C convention save type,
    1.39 +//                   ideal register type, encoding );
    1.40 +// Register Save Types:
    1.41 +//
    1.42 +// NS  = No-Save:       The register allocator assumes that these registers
    1.43 +//                      can be used without saving upon entry to the method, &
    1.44 +//                      that they do not need to be saved at call sites.
    1.45 +//
    1.46 +// SOC = Save-On-Call:  The register allocator assumes that these registers
    1.47 +//                      can be used without saving upon entry to the method,
    1.48 +//                      but that they must be saved at call sites.
    1.49 +//
    1.50 +// SOE = Save-On-Entry: The register allocator assumes that these registers
    1.51 +//                      must be saved before using them upon entry to the
    1.52 +//                      method, but they do not need to be saved at call
    1.53 +//                      sites.
    1.54 +//
    1.55 +// AS  = Always-Save:   The register allocator assumes that these registers
    1.56 +//                      must be saved before using them upon entry to the
    1.57 +//                      method, & that they must be saved at call sites.
    1.58 +//
    1.59 +// Ideal Register Type is used to determine how to save & restore a
    1.60 +// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
    1.61 +// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
    1.62 +//
    1.63 +// The encoding number is the actual bit-pattern placed into the opcodes.
    1.64 +
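          +// Worked example of reading a reg_def entry (illustrative): the line
          +//   reg_def EBX(SOC, SOE, Op_RegI, 3, rbx->as_VMReg());
          +// below declares EBX as Save-On-Call from the compiled-Java point of view,
          +// Save-On-Entry under the C calling convention, spilled and filled as an
          +// integer (LoadI/StoreI), with hardware encoding 3 -- the bit pattern the
          +// x86 ModRM byte uses for EBX.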
    1.65 +// General Registers
     1.66 +// Previously, EBX, ESI, and EDI were set as save-on-entry for java code.
     1.67 +// SOE was turned off in java code due to frequent use of uncommon-traps.
     1.68 +// Now that the allocator is better, ESI and EDI are turned back on as SOE registers.
    1.69 +
    1.70 +reg_def EBX(SOC, SOE, Op_RegI, 3, rbx->as_VMReg());
    1.71 +reg_def ECX(SOC, SOC, Op_RegI, 1, rcx->as_VMReg());
    1.72 +reg_def ESI(SOC, SOE, Op_RegI, 6, rsi->as_VMReg());
    1.73 +reg_def EDI(SOC, SOE, Op_RegI, 7, rdi->as_VMReg());
     1.74 +// Now that adapter frames are gone, EBP is always saved and restored by the prolog/epilog code.
    1.75 +reg_def EBP(NS, SOE, Op_RegI, 5, rbp->as_VMReg());
    1.76 +reg_def EDX(SOC, SOC, Op_RegI, 2, rdx->as_VMReg());
    1.77 +reg_def EAX(SOC, SOC, Op_RegI, 0, rax->as_VMReg());
    1.78 +reg_def ESP( NS,  NS, Op_RegI, 4, rsp->as_VMReg());
    1.79 +
    1.80 +// Special Registers
    1.81 +reg_def EFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
    1.82 +
    1.83 +// Float registers.  We treat TOS/FPR0 special.  It is invisible to the
    1.84 +// allocator, and only shows up in the encodings.
    1.85 +reg_def FPR0L( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad());
    1.86 +reg_def FPR0H( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad());
     1.87 +// Ok, so here's the trick: FPR1 is really st(0), except in the midst
     1.88 +// of emission of assembly for a machnode. During the emission the FPU stack
     1.89 +// is pushed, making FPR1 == st(1) temporarily. However, at any safepoint
     1.90 +// the stack will not have this element, so FPR1 == st(0) from the
     1.91 +// oopMap viewpoint. This same weirdness with numbering causes
     1.92 +// the instruction encoding to have to play games with the register
     1.93 +// encode to correct for this 0/1 issue. For an example, see the flt->flt
     1.94 +// moves in MachSpillCopyNode::implementation.
    1.95 +//
    1.96 +reg_def FPR1L( SOC, SOC, Op_RegF, 1, as_FloatRegister(0)->as_VMReg());
    1.97 +reg_def FPR1H( SOC, SOC, Op_RegF, 1, as_FloatRegister(0)->as_VMReg()->next());
    1.98 +reg_def FPR2L( SOC, SOC, Op_RegF, 2, as_FloatRegister(1)->as_VMReg());
    1.99 +reg_def FPR2H( SOC, SOC, Op_RegF, 2, as_FloatRegister(1)->as_VMReg()->next());
   1.100 +reg_def FPR3L( SOC, SOC, Op_RegF, 3, as_FloatRegister(2)->as_VMReg());
   1.101 +reg_def FPR3H( SOC, SOC, Op_RegF, 3, as_FloatRegister(2)->as_VMReg()->next());
   1.102 +reg_def FPR4L( SOC, SOC, Op_RegF, 4, as_FloatRegister(3)->as_VMReg());
   1.103 +reg_def FPR4H( SOC, SOC, Op_RegF, 4, as_FloatRegister(3)->as_VMReg()->next());
   1.104 +reg_def FPR5L( SOC, SOC, Op_RegF, 5, as_FloatRegister(4)->as_VMReg());
   1.105 +reg_def FPR5H( SOC, SOC, Op_RegF, 5, as_FloatRegister(4)->as_VMReg()->next());
   1.106 +reg_def FPR6L( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg());
   1.107 +reg_def FPR6H( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg()->next());
   1.108 +reg_def FPR7L( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg());
   1.109 +reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next());
   1.110 +
    1.111 +// XMM registers.  128-bit registers, or 4 words each, labeled a-d.
   1.112 +// Word a in each register holds a Float, words ab hold a Double.
   1.113 +// We currently do not use the SIMD capabilities, so registers cd
   1.114 +// are unused at the moment.
   1.115 +reg_def XMM0a( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
   1.116 +reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next());
   1.117 +reg_def XMM1a( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
   1.118 +reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next());
   1.119 +reg_def XMM2a( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
   1.120 +reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next());
   1.121 +reg_def XMM3a( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
   1.122 +reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next());
   1.123 +reg_def XMM4a( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
   1.124 +reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next());
   1.125 +reg_def XMM5a( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
   1.126 +reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next());
   1.127 +reg_def XMM6a( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
   1.128 +reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next());
   1.129 +reg_def XMM7a( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
   1.130 +reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next());
   1.131 +
   1.132 +// Specify priority of register selection within phases of register
   1.133 +// allocation.  Highest priority is first.  A useful heuristic is to
   1.134 +// give registers a low priority when they are required by machine
   1.135 +// instructions, like EAX and EDX.  Registers which are used as
    1.136 +// pairs must fall on an even boundary (witness the FPR#L's in this list).
   1.137 +// For the Intel integer registers, the equivalent Long pairs are
   1.138 +// EDX:EAX, EBX:ECX, and EDI:EBP.
   1.139 +alloc_class chunk0( ECX,   EBX,   EBP,   EDI,   EAX,   EDX,   ESI, ESP,
   1.140 +                    FPR0L, FPR0H, FPR1L, FPR1H, FPR2L, FPR2H,
   1.141 +                    FPR3L, FPR3H, FPR4L, FPR4H, FPR5L, FPR5H,
   1.142 +                    FPR6L, FPR6H, FPR7L, FPR7H );
   1.143 +
   1.144 +alloc_class chunk1( XMM0a, XMM0b,
   1.145 +                    XMM1a, XMM1b,
   1.146 +                    XMM2a, XMM2b,
   1.147 +                    XMM3a, XMM3b,
   1.148 +                    XMM4a, XMM4b,
   1.149 +                    XMM5a, XMM5b,
   1.150 +                    XMM6a, XMM6b,
   1.151 +                    XMM7a, XMM7b, EFLAGS);
   1.152 +
   1.153 +
   1.154 +//----------Architecture Description Register Classes--------------------------
   1.155 +// Several register classes are automatically defined based upon information in
   1.156 +// this architecture description.
   1.157 +// 1) reg_class inline_cache_reg           ( /* as def'd in frame section */ )
   1.158 +// 2) reg_class compiler_method_oop_reg    ( /* as def'd in frame section */ )
    1.159 +// 3) reg_class interpreter_method_oop_reg ( /* as def'd in frame section */ )
    1.160 +// 4) reg_class stack_slots( /* one chunk of stack-based "registers" */ )
   1.161 +//
   1.162 +// Class for all registers
   1.163 +reg_class any_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX, ESP);
   1.164 +// Class for general registers
   1.165 +reg_class e_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX);
   1.166 +// Class for general registers which may be used for implicit null checks on win95
    1.167 +// Also safe for use by tailjump. We don't want to allocate in rbp.
   1.168 +reg_class e_reg_no_rbp(EAX, EDX, EDI, ESI, ECX, EBX);
   1.169 +// Class of "X" registers
   1.170 +reg_class x_reg(EBX, ECX, EDX, EAX);
   1.171 +// Class of registers that can appear in an address with no offset.
   1.172 +// EBP and ESP require an extra instruction byte for zero offset.
   1.173 +// Used in fast-unlock
   1.174 +reg_class p_reg(EDX, EDI, ESI, EBX);
   1.175 +// Class for general registers not including ECX
   1.176 +reg_class ncx_reg(EAX, EDX, EBP, EDI, ESI, EBX);
   1.177 +// Class for general registers not including EAX
   1.178 +reg_class nax_reg(EDX, EDI, ESI, ECX, EBX);
   1.179 +// Class for general registers not including EAX or EBX.
   1.180 +reg_class nabx_reg(EDX, EDI, ESI, ECX, EBP);
   1.181 +// Class of EAX (for multiply and divide operations)
   1.182 +reg_class eax_reg(EAX);
   1.183 +// Class of EBX (for atomic add)
   1.184 +reg_class ebx_reg(EBX);
   1.185 +// Class of ECX (for shift and JCXZ operations and cmpLTMask)
   1.186 +reg_class ecx_reg(ECX);
   1.187 +// Class of EDX (for multiply and divide operations)
   1.188 +reg_class edx_reg(EDX);
   1.189 +// Class of EDI (for synchronization)
   1.190 +reg_class edi_reg(EDI);
   1.191 +// Class of ESI (for synchronization)
   1.192 +reg_class esi_reg(ESI);
   1.193 +// Singleton class for interpreter's stack pointer
   1.194 +reg_class ebp_reg(EBP);
   1.195 +// Singleton class for stack pointer
   1.196 +reg_class sp_reg(ESP);
   1.197 +// Singleton class for instruction pointer
   1.198 +// reg_class ip_reg(EIP);
   1.199 +// Singleton class for condition codes
   1.200 +reg_class int_flags(EFLAGS);
   1.201 +// Class of integer register pairs
   1.202 +reg_class long_reg( EAX,EDX, ECX,EBX, EBP,EDI );
   1.203 +// Class of integer register pairs that aligns with calling convention
   1.204 +reg_class eadx_reg( EAX,EDX );
   1.205 +reg_class ebcx_reg( ECX,EBX );
   1.206 +// Not AX or DX, used in divides
   1.207 +reg_class nadx_reg( EBX,ECX,ESI,EDI,EBP );
   1.208 +
   1.209 +// Floating point registers.  Notice FPR0 is not a choice.
   1.210 +// FPR0 is not ever allocated; we use clever encodings to fake
    1.211 +// 2-address instructions out of Intel's FP stack.
   1.212 +reg_class flt_reg( FPR1L,FPR2L,FPR3L,FPR4L,FPR5L,FPR6L,FPR7L );
   1.213 +
   1.214 +// make a register class for SSE registers
   1.215 +reg_class xmm_reg(XMM0a, XMM1a, XMM2a, XMM3a, XMM4a, XMM5a, XMM6a, XMM7a);
   1.216 +
   1.217 +// make a double register class for SSE2 registers
   1.218 +reg_class xdb_reg(XMM0a,XMM0b, XMM1a,XMM1b, XMM2a,XMM2b, XMM3a,XMM3b,
   1.219 +                  XMM4a,XMM4b, XMM5a,XMM5b, XMM6a,XMM6b, XMM7a,XMM7b );
   1.220 +
   1.221 +reg_class dbl_reg( FPR1L,FPR1H, FPR2L,FPR2H, FPR3L,FPR3H,
   1.222 +                   FPR4L,FPR4H, FPR5L,FPR5H, FPR6L,FPR6H,
   1.223 +                   FPR7L,FPR7H );
   1.224 +
   1.225 +reg_class flt_reg0( FPR1L );
   1.226 +reg_class dbl_reg0( FPR1L,FPR1H );
   1.227 +reg_class dbl_reg1( FPR2L,FPR2H );
   1.228 +reg_class dbl_notreg0( FPR2L,FPR2H, FPR3L,FPR3H, FPR4L,FPR4H,
   1.229 +                       FPR5L,FPR5H, FPR6L,FPR6H, FPR7L,FPR7H );
   1.230 +
   1.231 +// XMM6 and XMM7 could be used as temporary registers for long, float and
   1.232 +// double values for SSE2.
   1.233 +reg_class xdb_reg6( XMM6a,XMM6b );
   1.234 +reg_class xdb_reg7( XMM7a,XMM7b );
   1.235 +%}
   1.236 +
   1.237 +
   1.238 +//----------SOURCE BLOCK-------------------------------------------------------
   1.239 +// This is a block of C++ code which provides values, functions, and
   1.240 +// definitions necessary in the rest of the architecture description
   1.241 +source %{
   1.242 +#define   RELOC_IMM32    Assembler::imm32_operand
   1.243 +#define   RELOC_DISP32   Assembler::disp32_operand
   1.244 +
   1.245 +#define __ _masm.
   1.246 +
   1.247 +// How to find the high register of a Long pair, given the low register
   1.248 +#define   HIGH_FROM_LOW(x) ((x)+2)
   1.249 +
   1.250 +// These masks are used to provide 128-bit aligned bitmasks to the XMM
   1.251 +// instructions, to allow sign-masking or sign-bit flipping.  They allow
   1.252 +// fast versions of NegF/NegD and AbsF/AbsD.
   1.253 +
    1.254 +// Note: 'double' and 'long long' have 32-bit alignment on x86.
   1.255 +static jlong* double_quadword(jlong *adr, jlong lo, jlong hi) {
    1.256 +  // Use the expression (adr)&(~0xF) to provide a 128-bit aligned address
    1.257 +  // of 128-bit operands for SSE instructions.
    1.258 +  jlong *operand = (jlong*)(((uintptr_t)adr)&((uintptr_t)(~0xF)));
    1.259 +  // Store the value to a 128-bit operand.
   1.260 +  operand[0] = lo;
   1.261 +  operand[1] = hi;
   1.262 +  return operand;
   1.263 +}
   1.264 +
    1.265 +// Buffer for 128-bit masks used by SSE instructions.
   1.266 +static jlong fp_signmask_pool[(4+1)*2]; // 4*128bits(data) + 128bits(alignment)
   1.267 +
   1.268 +// Static initialization during VM startup.
   1.269 +static jlong *float_signmask_pool  = double_quadword(&fp_signmask_pool[1*2], CONST64(0x7FFFFFFF7FFFFFFF), CONST64(0x7FFFFFFF7FFFFFFF));
   1.270 +static jlong *double_signmask_pool = double_quadword(&fp_signmask_pool[2*2], CONST64(0x7FFFFFFFFFFFFFFF), CONST64(0x7FFFFFFFFFFFFFFF));
   1.271 +static jlong *float_signflip_pool  = double_quadword(&fp_signmask_pool[3*2], CONST64(0x8000000080000000), CONST64(0x8000000080000000));
   1.272 +static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000));
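          +// Sizing note (illustrative): the pool holds (4+1)*2 jlongs = 80 bytes for
          +// four 16-byte masks.  Since a static jlong array may be only 4-byte
          +// aligned on x86 (see the alignment note above), double_quadword() can
          +// round an address down by up to 12 bytes; the slots start at byte offset
          +// 16 and the last ends at or before offset 80, so every 16-byte store
          +// stays inside the pool even in the worst case.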
   1.273 +
    1.274 +// !!!!! Special hack to get all types of calls to specify the byte offset
   1.275 +//       from the start of the call to the point where the return address
   1.276 +//       will point.
   1.277 +int MachCallStaticJavaNode::ret_addr_offset() {
   1.278 +  return 5 + (Compile::current()->in_24_bit_fp_mode() ? 6 : 0);  // 5 bytes from start of call to where return address points
   1.279 +}
   1.280 +
   1.281 +int MachCallDynamicJavaNode::ret_addr_offset() {
   1.282 +  return 10 + (Compile::current()->in_24_bit_fp_mode() ? 6 : 0);  // 10 bytes from start of call to where return address points
   1.283 +}
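          +// Byte-count sketch (illustrative, standard IA-32 encodings): a direct
          +// CALL rel32 is 0xE8 plus a 4-byte displacement = 5 bytes; the dynamic
          +// call is preceded by a 5-byte MOV EAX,imm32 (0xB8 + imm32) loading the
          +// inline-cache oop, giving 10; and the optional FLDCW of the 24-bit
          +// control word (0xD9 /5 with a 32-bit absolute address) adds 6 bytes.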
   1.284 +
   1.285 +static int sizeof_FFree_Float_Stack_All = -1;
   1.286 +
   1.287 +int MachCallRuntimeNode::ret_addr_offset() {
   1.288 +  assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already");
   1.289 +  return sizeof_FFree_Float_Stack_All + 5 + (Compile::current()->in_24_bit_fp_mode() ? 6 : 0);
   1.290 +}
   1.291 +
   1.292 +// Indicate if the safepoint node needs the polling page as an input.
   1.293 +// Since x86 does have absolute addressing, it doesn't.
   1.294 +bool SafePointNode::needs_polling_address_input() {
   1.295 +  return false;
   1.296 +}
   1.297 +
   1.298 +//
   1.299 +// Compute padding required for nodes which need alignment
   1.300 +//
   1.301 +
   1.302 +// The address of the call instruction needs to be 4-byte aligned to
   1.303 +// ensure that it does not span a cache line so that it can be patched.
   1.304 +int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
   1.305 +  if (Compile::current()->in_24_bit_fp_mode())
   1.306 +    current_offset += 6;    // skip fldcw in pre_call_FPU, if any
   1.307 +  current_offset += 1;      // skip call opcode byte
   1.308 +  return round_to(current_offset, alignment_required()) - current_offset;
   1.309 +}
   1.310 +
   1.311 +// The address of the call instruction needs to be 4-byte aligned to
   1.312 +// ensure that it does not span a cache line so that it can be patched.
   1.313 +int CallDynamicJavaDirectNode::compute_padding(int current_offset) const {
   1.314 +  if (Compile::current()->in_24_bit_fp_mode())
   1.315 +    current_offset += 6;    // skip fldcw in pre_call_FPU, if any
   1.316 +  current_offset += 5;      // skip MOV instruction
   1.317 +  current_offset += 1;      // skip call opcode byte
   1.318 +  return round_to(current_offset, alignment_required()) - current_offset;
   1.319 +}
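          +// Padding example (illustrative): with no 24-bit FP mode and a static call
          +// about to be emitted at offset 10, current_offset becomes 11 after
          +// skipping the call opcode byte, and round_to(11, 4) - 11 = 1 byte of
          +// padding is returned.  The CALL opcode then lands at offset 11 and its
          +// 4-byte displacement at offsets 12..15, so the displacement can be
          +// patched without crossing an alignment boundary.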
   1.320 +
   1.321 +#ifndef PRODUCT
   1.322 +void MachBreakpointNode::format( PhaseRegAlloc *, outputStream* st ) const {
   1.323 +  st->print("INT3");
   1.324 +}
   1.325 +#endif
   1.326 +
   1.327 +// EMIT_RM()
   1.328 +void emit_rm(CodeBuffer &cbuf, int f1, int f2, int f3) {
   1.329 +  unsigned char c = (unsigned char)((f1 << 6) | (f2 << 3) | f3);
   1.330 +  *(cbuf.code_end()) = c;
   1.331 +  cbuf.set_code_end(cbuf.code_end() + 1);
   1.332 +}
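          +// Illustrative use: the prolog's frame allocation later in this file does
          +//   emit_opcode(cbuf, 0x83);            // SUB r/m32, imm8
          +//   emit_rm(cbuf, 0x3, 0x05, ESP_enc);  // (0x3<<6)|(0x05<<3)|0x4 == 0xEC
          +//   emit_d8(cbuf, framesize);
          +// producing the bytes 83 EC imm8, i.e. "SUB ESP, #framesize": mode 0x3 is
          +// register-direct addressing, 0x05 is the /5 opcode extension for SUB,
          +// and ESP_enc (4) fills the r/m field.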
   1.333 +
   1.334 +// EMIT_CC()
   1.335 +void emit_cc(CodeBuffer &cbuf, int f1, int f2) {
   1.336 +  unsigned char c = (unsigned char)( f1 | f2 );
   1.337 +  *(cbuf.code_end()) = c;
   1.338 +  cbuf.set_code_end(cbuf.code_end() + 1);
   1.339 +}
   1.340 +
   1.341 +// EMIT_OPCODE()
   1.342 +void emit_opcode(CodeBuffer &cbuf, int code) {
   1.343 +  *(cbuf.code_end()) = (unsigned char)code;
   1.344 +  cbuf.set_code_end(cbuf.code_end() + 1);
   1.345 +}
   1.346 +
   1.347 +// EMIT_OPCODE() w/ relocation information
   1.348 +void emit_opcode(CodeBuffer &cbuf, int code, relocInfo::relocType reloc, int offset = 0) {
   1.349 +  cbuf.relocate(cbuf.inst_mark() + offset, reloc);
   1.350 +  emit_opcode(cbuf, code);
   1.351 +}
   1.352 +
   1.353 +// EMIT_D8()
   1.354 +void emit_d8(CodeBuffer &cbuf, int d8) {
   1.355 +  *(cbuf.code_end()) = (unsigned char)d8;
   1.356 +  cbuf.set_code_end(cbuf.code_end() + 1);
   1.357 +}
   1.358 +
   1.359 +// EMIT_D16()
   1.360 +void emit_d16(CodeBuffer &cbuf, int d16) {
   1.361 +  *((short *)(cbuf.code_end())) = d16;
   1.362 +  cbuf.set_code_end(cbuf.code_end() + 2);
   1.363 +}
   1.364 +
   1.365 +// EMIT_D32()
   1.366 +void emit_d32(CodeBuffer &cbuf, int d32) {
   1.367 +  *((int *)(cbuf.code_end())) = d32;
   1.368 +  cbuf.set_code_end(cbuf.code_end() + 4);
   1.369 +}
   1.370 +
   1.371 +// emit 32 bit value and construct relocation entry from relocInfo::relocType
   1.372 +void emit_d32_reloc(CodeBuffer &cbuf, int d32, relocInfo::relocType reloc,
   1.373 +        int format) {
   1.374 +  cbuf.relocate(cbuf.inst_mark(), reloc, format);
   1.375 +
   1.376 +  *((int *)(cbuf.code_end())) = d32;
   1.377 +  cbuf.set_code_end(cbuf.code_end() + 4);
   1.378 +}
   1.379 +
   1.380 +// emit 32 bit value and construct relocation entry from RelocationHolder
   1.381 +void emit_d32_reloc(CodeBuffer &cbuf, int d32, RelocationHolder const& rspec,
   1.382 +        int format) {
   1.383 +#ifdef ASSERT
   1.384 +  if (rspec.reloc()->type() == relocInfo::oop_type && d32 != 0 && d32 != (int)Universe::non_oop_word()) {
   1.385 +    assert(oop(d32)->is_oop() && oop(d32)->is_perm(), "cannot embed non-perm oops in code");
   1.386 +  }
   1.387 +#endif
   1.388 +  cbuf.relocate(cbuf.inst_mark(), rspec, format);
   1.389 +
   1.390 +  *((int *)(cbuf.code_end())) = d32;
   1.391 +  cbuf.set_code_end(cbuf.code_end() + 4);
   1.392 +}
   1.393 +
   1.394 +// Access stack slot for load or store
   1.395 +void store_to_stackslot(CodeBuffer &cbuf, int opcode, int rm_field, int disp) {
   1.396 +  emit_opcode( cbuf, opcode );               // (e.g., FILD   [ESP+src])
   1.397 +  if( -128 <= disp && disp <= 127 ) {
   1.398 +    emit_rm( cbuf, 0x01, rm_field, ESP_enc );  // R/M byte
   1.399 +    emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);    // SIB byte
   1.400 +    emit_d8 (cbuf, disp);     // Displacement  // R/M byte
   1.401 +  } else {
   1.402 +    emit_rm( cbuf, 0x02, rm_field, ESP_enc );  // R/M byte
   1.403 +    emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);    // SIB byte
   1.404 +    emit_d32(cbuf, disp);     // Displacement  // R/M byte
   1.405 +  }
   1.406 +}
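          +// Illustrative encoding: store_to_stackslot(cbuf, 0xDB, 0x0, 8) would emit
          +// DB 44 24 08, i.e. "FILD DWORD PTR [ESP+8]": ModRM 0x44 is mode 0x1
          +// (8-bit displacement) with r/m = ESP, SIB 0x24 means no index with base
          +// ESP, followed by the one-byte displacement.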
   1.407 +
    1.408 +// Emit the ModRM/SIB/displacement bytes for a (reg, mem) operand pair -- emit_reg_mem
   1.409 +void encode_RegMem( CodeBuffer &cbuf, int reg_encoding, int base, int index, int scale, int displace, bool displace_is_oop ) {
   1.410 +  // There is no index & no scale, use form without SIB byte
   1.411 +  if ((index == 0x4) &&
   1.412 +      (scale == 0) && (base != ESP_enc)) {
   1.413 +    // If no displacement, mode is 0x0; unless base is [EBP]
   1.414 +    if ( (displace == 0) && (base != EBP_enc) ) {
   1.415 +      emit_rm(cbuf, 0x0, reg_encoding, base);
   1.416 +    }
   1.417 +    else {                    // If 8-bit displacement, mode 0x1
   1.418 +      if ((displace >= -128) && (displace <= 127)
   1.419 +          && !(displace_is_oop) ) {
   1.420 +        emit_rm(cbuf, 0x1, reg_encoding, base);
   1.421 +        emit_d8(cbuf, displace);
   1.422 +      }
   1.423 +      else {                  // If 32-bit displacement
   1.424 +        if (base == -1) { // Special flag for absolute address
   1.425 +          emit_rm(cbuf, 0x0, reg_encoding, 0x5);
   1.426 +          // (manual lies; no SIB needed here)
   1.427 +          if ( displace_is_oop ) {
   1.428 +            emit_d32_reloc(cbuf, displace, relocInfo::oop_type, 1);
   1.429 +          } else {
   1.430 +            emit_d32      (cbuf, displace);
   1.431 +          }
   1.432 +        }
   1.433 +        else {                // Normal base + offset
   1.434 +          emit_rm(cbuf, 0x2, reg_encoding, base);
   1.435 +          if ( displace_is_oop ) {
   1.436 +            emit_d32_reloc(cbuf, displace, relocInfo::oop_type, 1);
   1.437 +          } else {
   1.438 +            emit_d32      (cbuf, displace);
   1.439 +          }
   1.440 +        }
   1.441 +      }
   1.442 +    }
   1.443 +  }
   1.444 +  else {                      // Else, encode with the SIB byte
   1.445 +    // If no displacement, mode is 0x0; unless base is [EBP]
   1.446 +    if (displace == 0 && (base != EBP_enc)) {  // If no displacement
   1.447 +      emit_rm(cbuf, 0x0, reg_encoding, 0x4);
   1.448 +      emit_rm(cbuf, scale, index, base);
   1.449 +    }
   1.450 +    else {                    // If 8-bit displacement, mode 0x1
   1.451 +      if ((displace >= -128) && (displace <= 127)
   1.452 +          && !(displace_is_oop) ) {
   1.453 +        emit_rm(cbuf, 0x1, reg_encoding, 0x4);
   1.454 +        emit_rm(cbuf, scale, index, base);
   1.455 +        emit_d8(cbuf, displace);
   1.456 +      }
   1.457 +      else {                  // If 32-bit displacement
   1.458 +        if (base == 0x04 ) {
   1.459 +          emit_rm(cbuf, 0x2, reg_encoding, 0x4);
   1.460 +          emit_rm(cbuf, scale, index, 0x04);
   1.461 +        } else {
   1.462 +          emit_rm(cbuf, 0x2, reg_encoding, 0x4);
   1.463 +          emit_rm(cbuf, scale, index, base);
   1.464 +        }
   1.465 +        if ( displace_is_oop ) {
   1.466 +          emit_d32_reloc(cbuf, displace, relocInfo::oop_type, 1);
   1.467 +        } else {
   1.468 +          emit_d32      (cbuf, displace);
   1.469 +        }
   1.470 +      }
   1.471 +    }
   1.472 +  }
   1.473 +}
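          +// Illustrative cases: encode_RegMem(cbuf, 0, ECX_enc, 0x4, 0, 0, false)
          +// emits the single ModRM byte 0x01 for "[ECX]" (index 0x4 means "no
          +// index", so no SIB byte is needed); the same operand with an 8-bit
          +// displacement of 8 emits 0x41 0x08 instead; and base == -1 selects the
          +// disp32-only absolute form (ModRM r/m field 0x5).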
   1.474 +
   1.475 +
   1.476 +void encode_Copy( CodeBuffer &cbuf, int dst_encoding, int src_encoding ) {
   1.477 +  if( dst_encoding == src_encoding ) {
   1.478 +    // reg-reg copy, use an empty encoding
   1.479 +  } else {
   1.480 +    emit_opcode( cbuf, 0x8B );
   1.481 +    emit_rm(cbuf, 0x3, dst_encoding, src_encoding );
   1.482 +  }
   1.483 +}
   1.484 +
   1.485 +void encode_CopyXD( CodeBuffer &cbuf, int dst_encoding, int src_encoding ) {
   1.486 +  if( dst_encoding == src_encoding ) {
   1.487 +    // reg-reg copy, use an empty encoding
   1.488 +  } else {
   1.489 +    MacroAssembler _masm(&cbuf);
   1.490 +
   1.491 +    __ movdqa(as_XMMRegister(dst_encoding), as_XMMRegister(src_encoding));
   1.492 +  }
   1.493 +}
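          +// Illustrative: encode_Copy(cbuf, 0 /*EAX*/, 1 /*ECX*/) emits 8B C1,
          +// i.e. "MOV EAX,ECX" (opcode 0x8B = MOV r32,r/m32, ModRM mode 0x3 with
          +// reg = EAX and r/m = ECX), while a copy of a register to itself emits
          +// nothing at all.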
   1.494 +
   1.495 +
   1.496 +//=============================================================================
   1.497 +#ifndef PRODUCT
   1.498 +void MachPrologNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
   1.499 +  Compile* C = ra_->C;
   1.500 +  if( C->in_24_bit_fp_mode() ) {
   1.501 +    tty->print("FLDCW  24 bit fpu control word");
   1.502 +    tty->print_cr(""); tty->print("\t");
   1.503 +  }
   1.504 +
   1.505 +  int framesize = C->frame_slots() << LogBytesPerInt;
   1.506 +  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
    1.507 +  // Remove two words for return addr and rbp.
   1.508 +  framesize -= 2*wordSize;
   1.509 +
   1.510 +  // Calls to C2R adapters often do not accept exceptional returns.
    1.511 +  // We require their callers to bang the stack for them.  But be careful, because
    1.512 +  // some VM calls (such as call site linkage) can use several kilobytes of
    1.513 +  // stack.  The stack safety zone should account for that, though.
   1.514 +  // See bugs 4446381, 4468289, 4497237.
   1.515 +  if (C->need_stack_bang(framesize)) {
   1.516 +    tty->print_cr("# stack bang"); tty->print("\t");
   1.517 +  }
   1.518 +  tty->print_cr("PUSHL  EBP"); tty->print("\t");
   1.519 +
   1.520 +  if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth
   1.521 +    tty->print("PUSH   0xBADB100D\t# Majik cookie for stack depth check");
   1.522 +    tty->print_cr(""); tty->print("\t");
   1.523 +    framesize -= wordSize;
   1.524 +  }
   1.525 +
   1.526 +  if ((C->in_24_bit_fp_mode() || VerifyStackAtCalls ) && framesize < 128 ) {
   1.527 +    if (framesize) {
   1.528 +      tty->print("SUB    ESP,%d\t# Create frame",framesize);
   1.529 +    }
   1.530 +  } else {
   1.531 +    tty->print("SUB    ESP,%d\t# Create frame",framesize);
   1.532 +  }
   1.533 +}
   1.534 +#endif
   1.535 +
   1.536 +
   1.537 +void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   1.538 +  Compile* C = ra_->C;
   1.539 +
   1.540 +  if (UseSSE >= 2 && VerifyFPU) {
   1.541 +    MacroAssembler masm(&cbuf);
   1.542 +    masm.verify_FPU(0, "FPU stack must be clean on entry");
   1.543 +  }
   1.544 +
   1.545 +  // WARNING: Initial instruction MUST be 5 bytes or longer so that
   1.546 +  // NativeJump::patch_verified_entry will be able to patch out the entry
   1.547 +  // code safely. The fldcw is ok at 6 bytes, the push to verify stack
   1.548 +  // depth is ok at 5 bytes, the frame allocation can be either 3 or
   1.549 +  // 6 bytes. So if we don't do the fldcw or the push then we must
   1.550 +  // use the 6 byte frame allocation even if we have no frame. :-(
   1.551 +  // If method sets FPU control word do it now
   1.552 +  if( C->in_24_bit_fp_mode() ) {
   1.553 +    MacroAssembler masm(&cbuf);
   1.554 +    masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
   1.555 +  }
   1.556 +
   1.557 +  int framesize = C->frame_slots() << LogBytesPerInt;
   1.558 +  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
    1.559 +  // Remove two words for return addr and rbp.
   1.560 +  framesize -= 2*wordSize;
   1.561 +
   1.562 +  // Calls to C2R adapters often do not accept exceptional returns.
    1.563 +  // We require their callers to bang the stack for them.  But be careful, because
    1.564 +  // some VM calls (such as call site linkage) can use several kilobytes of
    1.565 +  // stack.  The stack safety zone should account for that, though.
   1.566 +  // See bugs 4446381, 4468289, 4497237.
   1.567 +  if (C->need_stack_bang(framesize)) {
   1.568 +    MacroAssembler masm(&cbuf);
   1.569 +    masm.generate_stack_overflow_check(framesize);
   1.570 +  }
   1.571 +
    1.572 +  // We always push rbp so that on return to the interpreter rbp will be
    1.573 +  // restored correctly and we can correct the stack.
   1.574 +  emit_opcode(cbuf, 0x50 | EBP_enc);
   1.575 +
   1.576 +  if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth
   1.577 +    emit_opcode(cbuf, 0x68); // push 0xbadb100d
   1.578 +    emit_d32(cbuf, 0xbadb100d);
   1.579 +    framesize -= wordSize;
   1.580 +  }
   1.581 +
   1.582 +  if ((C->in_24_bit_fp_mode() || VerifyStackAtCalls ) && framesize < 128 ) {
   1.583 +    if (framesize) {
   1.584 +      emit_opcode(cbuf, 0x83);   // sub  SP,#framesize
   1.585 +      emit_rm(cbuf, 0x3, 0x05, ESP_enc);
   1.586 +      emit_d8(cbuf, framesize);
   1.587 +    }
   1.588 +  } else {
   1.589 +    emit_opcode(cbuf, 0x81);   // sub  SP,#framesize
   1.590 +    emit_rm(cbuf, 0x3, 0x05, ESP_enc);
   1.591 +    emit_d32(cbuf, framesize);
   1.592 +  }
   1.593 +  C->set_frame_complete(cbuf.code_end() - cbuf.code_begin());
   1.594 +
   1.595 +#ifdef ASSERT
   1.596 +  if (VerifyStackAtCalls) {
   1.597 +    Label L;
   1.598 +    MacroAssembler masm(&cbuf);
   1.599 +    masm.pushl(rax);
   1.600 +    masm.movl(rax, rsp);
   1.601 +    masm.andl(rax, StackAlignmentInBytes-1);
   1.602 +    masm.cmpl(rax, StackAlignmentInBytes-wordSize);
   1.603 +    masm.popl(rax);
   1.604 +    masm.jcc(Assembler::equal, L);
   1.605 +    masm.stop("Stack is not properly aligned!");
   1.606 +    masm.bind(L);
   1.607 +  }
   1.608 +#endif
   1.609 +
   1.610 +}
   1.611 +
   1.612 +uint MachPrologNode::size(PhaseRegAlloc *ra_) const {
   1.613 +  return MachNode::size(ra_); // too many variables; just compute it the hard way
   1.614 +}
   1.615 +
   1.616 +int MachPrologNode::reloc() const {
   1.617 +  return 0; // a large enough number
   1.618 +}
   1.619 +
   1.620 +//=============================================================================
   1.621 +#ifndef PRODUCT
   1.622 +void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
   1.623 +  Compile *C = ra_->C;
   1.624 +  int framesize = C->frame_slots() << LogBytesPerInt;
   1.625 +  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
    1.626 +  // Remove two words for return addr and rbp.
   1.627 +  framesize -= 2*wordSize;
   1.628 +
   1.629 +  if( C->in_24_bit_fp_mode() ) {
   1.630 +    st->print("FLDCW  standard control word");
   1.631 +    st->cr(); st->print("\t");
   1.632 +  }
   1.633 +  if( framesize ) {
   1.634 +    st->print("ADD    ESP,%d\t# Destroy frame",framesize);
   1.635 +    st->cr(); st->print("\t");
   1.636 +  }
   1.637 +  st->print_cr("POPL   EBP"); st->print("\t");
   1.638 +  if( do_polling() && C->is_method_compilation() ) {
   1.639 +    st->print("TEST   PollPage,EAX\t! Poll Safepoint");
   1.640 +    st->cr(); st->print("\t");
   1.641 +  }
   1.642 +}
   1.643 +#endif
   1.644 +
   1.645 +void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   1.646 +  Compile *C = ra_->C;
   1.647 +
   1.648 +  // If method set FPU control word, restore to standard control word
   1.649 +  if( C->in_24_bit_fp_mode() ) {
   1.650 +    MacroAssembler masm(&cbuf);
   1.651 +    masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
   1.652 +  }
   1.653 +
   1.654 +  int framesize = C->frame_slots() << LogBytesPerInt;
   1.655 +  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
    1.656 +  // Remove two words for return addr and rbp.
   1.657 +  framesize -= 2*wordSize;
   1.658 +
   1.659 +  // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
   1.660 +
   1.661 +  if( framesize >= 128 ) {
   1.662 +    emit_opcode(cbuf, 0x81); // add  SP, #framesize
   1.663 +    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
   1.664 +    emit_d32(cbuf, framesize);
   1.665 +  }
   1.666 +  else if( framesize ) {
   1.667 +    emit_opcode(cbuf, 0x83); // add  SP, #framesize
   1.668 +    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
   1.669 +    emit_d8(cbuf, framesize);
   1.670 +  }
   1.671 +
   1.672 +  emit_opcode(cbuf, 0x58 | EBP_enc);
   1.673 +
   1.674 +  if( do_polling() && C->is_method_compilation() ) {
   1.675 +    cbuf.relocate(cbuf.code_end(), relocInfo::poll_return_type, 0);
   1.676 +    emit_opcode(cbuf,0x85);
   1.677 +    emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX
   1.678 +    emit_d32(cbuf, (intptr_t)os::get_polling_page());
   1.679 +  }
   1.680 +}
   1.681 +
   1.682 +uint MachEpilogNode::size(PhaseRegAlloc *ra_) const {
   1.683 +  Compile *C = ra_->C;
   1.684 +  // If method set FPU control word, restore to standard control word
   1.685 +  int size = C->in_24_bit_fp_mode() ? 6 : 0;
   1.686 +  if( do_polling() && C->is_method_compilation() ) size += 6;
   1.687 +
   1.688 +  int framesize = C->frame_slots() << LogBytesPerInt;
   1.689 +  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
    1.690 +  // Remove two words for return addr and rbp.
   1.691 +  framesize -= 2*wordSize;
   1.692 +
    1.693 +  size++; // popl rbp
   1.694 +
   1.695 +  if( framesize >= 128 ) {
   1.696 +    size += 6;
   1.697 +  } else {
   1.698 +    size += framesize ? 3 : 0;
   1.699 +  }
   1.700 +  return size;
   1.701 +}
   1.702 +
   1.703 +int MachEpilogNode::reloc() const {
   1.704 +  return 0; // a large enough number
   1.705 +}
   1.706 +
   1.707 +const Pipeline * MachEpilogNode::pipeline() const {
   1.708 +  return MachNode::pipeline_class();
   1.709 +}
   1.710 +
   1.711 +int MachEpilogNode::safepoint_offset() const { return 0; }
   1.712 +
   1.713 +//=============================================================================
   1.714 +
   1.715 +enum RC { rc_bad, rc_int, rc_float, rc_xmm, rc_stack };
   1.716 +static enum RC rc_class( OptoReg::Name reg ) {
   1.717 +
   1.718 +  if( !OptoReg::is_valid(reg)  ) return rc_bad;
   1.719 +  if (OptoReg::is_stack(reg)) return rc_stack;
   1.720 +
   1.721 +  VMReg r = OptoReg::as_VMReg(reg);
   1.722 +  if (r->is_Register()) return rc_int;
   1.723 +  if (r->is_FloatRegister()) {
   1.724 +    assert(UseSSE < 2, "shouldn't be used in SSE2+ mode");
   1.725 +    return rc_float;
   1.726 +  }
   1.727 +  assert(r->is_XMMRegister(), "must be");
   1.728 +  return rc_xmm;
   1.729 +}
   1.730 +
   1.731 +static int impl_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset, int reg, int opcode, const char *op_str, int size ) {
   1.732 +  if( cbuf ) {
   1.733 +    emit_opcode  (*cbuf, opcode );
   1.734 +    encode_RegMem(*cbuf, Matcher::_regEncode[reg], ESP_enc, 0x4, 0, offset, false);
   1.735 +#ifndef PRODUCT
   1.736 +  } else if( !do_size ) {
   1.737 +    if( size != 0 ) tty->print("\n\t");
   1.738 +    if( opcode == 0x8B || opcode == 0x89 ) { // MOV
   1.739 +      if( is_load ) tty->print("%s   %s,[ESP + #%d]",op_str,Matcher::regName[reg],offset);
   1.740 +      else          tty->print("%s   [ESP + #%d],%s",op_str,offset,Matcher::regName[reg]);
   1.741 +    } else { // FLD, FST, PUSH, POP
   1.742 +      tty->print("%s [ESP + #%d]",op_str,offset);
   1.743 +    }
   1.744 +#endif
   1.745 +  }
   1.746 +  int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
   1.747 +  return size+3+offset_size;
   1.748 +}
   1.749 +
   1.750 +// Helper for XMM registers.  Extra opcode bits, limited syntax.
   1.751 +static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load,
   1.752 +                         int offset, int reg_lo, int reg_hi, int size ) {
   1.753 +  if( cbuf ) {
   1.754 +    if( reg_lo+1 == reg_hi ) { // double move?
   1.755 +      if( is_load && !UseXmmLoadAndClearUpper )
   1.756 +        emit_opcode(*cbuf, 0x66 ); // use 'movlpd' for load
   1.757 +      else
   1.758 +        emit_opcode(*cbuf, 0xF2 ); // use 'movsd' otherwise
   1.759 +    } else {
   1.760 +      emit_opcode(*cbuf, 0xF3 );
   1.761 +    }
   1.762 +    emit_opcode(*cbuf, 0x0F );
   1.763 +    if( reg_lo+1 == reg_hi && is_load && !UseXmmLoadAndClearUpper )
   1.764 +      emit_opcode(*cbuf, 0x12 );   // use 'movlpd' for load
   1.765 +    else
   1.766 +      emit_opcode(*cbuf, is_load ? 0x10 : 0x11 );
   1.767 +    encode_RegMem(*cbuf, Matcher::_regEncode[reg_lo], ESP_enc, 0x4, 0, offset, false);
   1.768 +#ifndef PRODUCT
   1.769 +  } else if( !do_size ) {
   1.770 +    if( size != 0 ) tty->print("\n\t");
   1.771 +    if( reg_lo+1 == reg_hi ) { // double move?
   1.772 +      if( is_load ) tty->print("%s %s,[ESP + #%d]",
   1.773 +                               UseXmmLoadAndClearUpper ? "MOVSD " : "MOVLPD",
   1.774 +                               Matcher::regName[reg_lo], offset);
   1.775 +      else          tty->print("MOVSD  [ESP + #%d],%s",
   1.776 +                               offset, Matcher::regName[reg_lo]);
   1.777 +    } else {
   1.778 +      if( is_load ) tty->print("MOVSS  %s,[ESP + #%d]",
   1.779 +                               Matcher::regName[reg_lo], offset);
   1.780 +      else          tty->print("MOVSS  [ESP + #%d],%s",
   1.781 +                               offset, Matcher::regName[reg_lo]);
   1.782 +    }
   1.783 +#endif
   1.784 +  }
   1.785 +  int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
   1.786 +  return size+5+offset_size;
   1.787 +}
   1.788 +
   1.789 +
   1.790 +static int impl_movx_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
   1.791 +                            int src_hi, int dst_hi, int size ) {
   1.792 +  if( UseXmmRegToRegMoveAll ) {//Use movaps,movapd to move between xmm registers
   1.793 +    if( cbuf ) {
   1.794 +      if( (src_lo+1 == src_hi && dst_lo+1 == dst_hi) ) {
   1.795 +        emit_opcode(*cbuf, 0x66 );
   1.796 +      }
   1.797 +      emit_opcode(*cbuf, 0x0F );
   1.798 +      emit_opcode(*cbuf, 0x28 );
   1.799 +      emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
   1.800 +#ifndef PRODUCT
   1.801 +    } else if( !do_size ) {
   1.802 +      if( size != 0 ) tty->print("\n\t");
   1.803 +      if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
   1.804 +        tty->print("MOVAPD %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
   1.805 +      } else {
   1.806 +        tty->print("MOVAPS %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
   1.807 +      }
   1.808 +#endif
   1.809 +    }
   1.810 +    return size + ((src_lo+1 == src_hi && dst_lo+1 == dst_hi) ? 4 : 3);
   1.811 +  } else {
   1.812 +    if( cbuf ) {
   1.813 +      emit_opcode(*cbuf, (src_lo+1 == src_hi && dst_lo+1 == dst_hi) ? 0xF2 : 0xF3 );
   1.814 +      emit_opcode(*cbuf, 0x0F );
   1.815 +      emit_opcode(*cbuf, 0x10 );
   1.816 +      emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
   1.817 +#ifndef PRODUCT
   1.818 +    } else if( !do_size ) {
   1.819 +      if( size != 0 ) tty->print("\n\t");
   1.820 +      if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
   1.821 +        tty->print("MOVSD  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
   1.822 +      } else {
   1.823 +        tty->print("MOVSS  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
   1.824 +      }
   1.825 +#endif
   1.826 +    }
   1.827 +    return size+4;
   1.828 +  }
   1.829 +}
   1.830 +
   1.831 +static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size ) {
   1.832 +  if( cbuf ) {
   1.833 +    emit_opcode(*cbuf, 0x8B );
   1.834 +    emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst], Matcher::_regEncode[src] );
   1.835 +#ifndef PRODUCT
   1.836 +  } else if( !do_size ) {
   1.837 +    if( size != 0 ) tty->print("\n\t");
   1.838 +    tty->print("MOV    %s,%s",Matcher::regName[dst],Matcher::regName[src]);
   1.839 +#endif
   1.840 +  }
   1.841 +  return size+2;
   1.842 +}
   1.843 +
   1.844 +static int impl_fp_store_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int src_hi, int dst_lo, int dst_hi, int offset, int size ) {
   1.845 +  if( src_lo != FPR1L_num ) {      // Move value to top of FP stack, if not already there
   1.846 +    if( cbuf ) {
   1.847 +      emit_opcode( *cbuf, 0xD9 );  // FLD (i.e., push it)
   1.848 +      emit_d8( *cbuf, 0xC0-1+Matcher::_regEncode[src_lo] );
   1.849 +#ifndef PRODUCT
   1.850 +    } else if( !do_size ) {
   1.851 +      if( size != 0 ) tty->print("\n\t");
   1.852 +      tty->print("FLD    %s",Matcher::regName[src_lo]);
   1.853 +#endif
   1.854 +    }
   1.855 +    size += 2;
   1.856 +  }
   1.857 +
   1.858 +  int st_op = (src_lo != FPR1L_num) ? EBX_num /*store & pop*/ : EDX_num /*store no pop*/;
   1.859 +  const char *op_str;
   1.860 +  int op;
   1.861 +  if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double store?
   1.862 +    op_str = (src_lo != FPR1L_num) ? "FSTP_D" : "FST_D ";
   1.863 +    op = 0xDD;
   1.864 +  } else {                   // 32-bit store
   1.865 +    op_str = (src_lo != FPR1L_num) ? "FSTP_S" : "FST_S ";
   1.866 +    op = 0xD9;
   1.867 +    assert( !OptoReg::is_valid(src_hi) && !OptoReg::is_valid(dst_hi), "no non-adjacent float-stores" );
   1.868 +  }
   1.869 +
   1.870 +  return impl_helper(cbuf,do_size,false,offset,st_op,op,op_str,size);
   1.871 +}
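          +// Note on the st_op trick above (illustrative): FST m32/m64 uses opcode
          +// extension /2 and FSTP uses /3 on the D9/DD opcodes.  impl_helper() maps
          +// its register argument through Matcher::_regEncode[], so passing EDX_num
          +// (hardware encoding 2) or EBX_num (hardware encoding 3) plants exactly
          +// the /2 or /3 digit into the ModRM reg field.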
   1.872 +
   1.873 +uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bool do_size, outputStream* st ) const {
   1.874 +  // Get registers to move
   1.875 +  OptoReg::Name src_second = ra_->get_reg_second(in(1));
   1.876 +  OptoReg::Name src_first = ra_->get_reg_first(in(1));
   1.877 +  OptoReg::Name dst_second = ra_->get_reg_second(this );
   1.878 +  OptoReg::Name dst_first = ra_->get_reg_first(this );
   1.879 +
   1.880 +  enum RC src_second_rc = rc_class(src_second);
   1.881 +  enum RC src_first_rc = rc_class(src_first);
   1.882 +  enum RC dst_second_rc = rc_class(dst_second);
   1.883 +  enum RC dst_first_rc = rc_class(dst_first);
   1.884 +
   1.885 +  assert( OptoReg::is_valid(src_first) && OptoReg::is_valid(dst_first), "must move at least 1 register" );
   1.886 +
   1.887 +  // Generate spill code!
   1.888 +  int size = 0;
   1.889 +
   1.890 +  if( src_first == dst_first && src_second == dst_second )
   1.891 +    return size;            // Self copy, no move
   1.892 +
   1.893 +  // --------------------------------------
   1.894 +  // Check for mem-mem move.  push/pop to move.
   1.895 +  if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
   1.896 +    if( src_second == dst_first ) { // overlapping stack copy ranges
   1.897 +      assert( src_second_rc == rc_stack && dst_second_rc == rc_stack, "we only expect a stk-stk copy here" );
   1.898 +      size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),ESI_num,0xFF,"PUSH  ",size);
   1.899 +      size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),EAX_num,0x8F,"POP   ",size);
   1.900 +      src_second_rc = dst_second_rc = rc_bad;  // flag as already moved the second bits
   1.901 +    }
   1.902 +    // move low bits
   1.903 +    size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),ESI_num,0xFF,"PUSH  ",size);
   1.904 +    size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),EAX_num,0x8F,"POP   ",size);
   1.905 +    if( src_second_rc == rc_stack && dst_second_rc == rc_stack ) { // mov second bits
   1.906 +      size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),ESI_num,0xFF,"PUSH  ",size);
   1.907 +      size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),EAX_num,0x8F,"POP   ",size);
   1.908 +    }
   1.909 +    return size;
   1.910 +  }
   1.911 +
   1.912 +  // --------------------------------------
   1.913 +  // Check for integer reg-reg copy
   1.914 +  if( src_first_rc == rc_int && dst_first_rc == rc_int )
   1.915 +    size = impl_mov_helper(cbuf,do_size,src_first,dst_first,size);
   1.916 +
   1.917 +  // Check for integer store
   1.918 +  if( src_first_rc == rc_int && dst_first_rc == rc_stack )
   1.919 +    size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first,0x89,"MOV ",size);
   1.920 +
   1.921 +  // Check for integer load
   1.922 +  if( dst_first_rc == rc_int && src_first_rc == rc_stack )
   1.923 +    size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first,0x8B,"MOV ",size);
   1.924 +
   1.925 +  // --------------------------------------
   1.926 +  // Check for float reg-reg copy
   1.927 +  if( src_first_rc == rc_float && dst_first_rc == rc_float ) {
   1.928 +    assert( (src_second_rc == rc_bad && dst_second_rc == rc_bad) ||
   1.929 +            (src_first+1 == src_second && dst_first+1 == dst_second), "no non-adjacent float-moves" );
   1.930 +    if( cbuf ) {
   1.931 +
   1.932 +      // Note the mucking with the register encode to compensate for the 0/1
   1.933 +      // indexing issue mentioned in a comment in the reg_def sections
   1.934 +      // for FPR registers many lines above here.
   1.935 +
   1.936 +      if( src_first != FPR1L_num ) {
   1.937 +        emit_opcode  (*cbuf, 0xD9 );           // FLD    ST(i)
   1.938 +        emit_d8      (*cbuf, 0xC0+Matcher::_regEncode[src_first]-1 );
   1.939 +        emit_opcode  (*cbuf, 0xDD );           // FSTP   ST(i)
   1.940 +        emit_d8      (*cbuf, 0xD8+Matcher::_regEncode[dst_first] );
   1.941 +     } else {
   1.942 +        emit_opcode  (*cbuf, 0xDD );           // FST    ST(i)
   1.943 +        emit_d8      (*cbuf, 0xD0+Matcher::_regEncode[dst_first]-1 );
   1.944 +     }
   1.945 +#ifndef PRODUCT
   1.946 +    } else if( !do_size ) {
   1.947 +      if( size != 0 ) st->print("\n\t");
   1.948 +      if( src_first != FPR1L_num ) st->print("FLD    %s\n\tFSTP   %s",Matcher::regName[src_first],Matcher::regName[dst_first]);
   1.949 +      else                      st->print(             "FST    %s",                            Matcher::regName[dst_first]);
   1.950 +#endif
   1.951 +    }
   1.952 +    return size + ((src_first != FPR1L_num) ? 2+2 : 2);
   1.953 +  }
   1.954 +
   1.955 +  // Check for float store
   1.956 +  if( src_first_rc == rc_float && dst_first_rc == rc_stack ) {
   1.957 +    return impl_fp_store_helper(cbuf,do_size,src_first,src_second,dst_first,dst_second,ra_->reg2offset(dst_first),size);
   1.958 +  }
   1.959 +
   1.960 +  // Check for float load
   1.961 +  if( dst_first_rc == rc_float && src_first_rc == rc_stack ) {
   1.962 +    int offset = ra_->reg2offset(src_first);
   1.963 +    const char *op_str;
   1.964 +    int op;
   1.965 +    if( src_first+1 == src_second && dst_first+1 == dst_second ) { // double load?
   1.966 +      op_str = "FLD_D";
   1.967 +      op = 0xDD;
   1.968 +    } else {                   // 32-bit load
   1.969 +      op_str = "FLD_S";
   1.970 +      op = 0xD9;
   1.971 +      assert( src_second_rc == rc_bad && dst_second_rc == rc_bad, "no non-adjacent float-loads" );
   1.972 +    }
   1.973 +    if( cbuf ) {
   1.974 +      emit_opcode  (*cbuf, op );
   1.975 +      encode_RegMem(*cbuf, 0x0, ESP_enc, 0x4, 0, offset, false);
   1.976 +      emit_opcode  (*cbuf, 0xDD );           // FSTP   ST(i)
   1.977 +      emit_d8      (*cbuf, 0xD8+Matcher::_regEncode[dst_first] );
   1.978 +#ifndef PRODUCT
   1.979 +    } else if( !do_size ) {
   1.980 +      if( size != 0 ) st->print("\n\t");
   1.981 +      st->print("%s  ST,[ESP + #%d]\n\tFSTP   %s",op_str, offset,Matcher::regName[dst_first]);
   1.982 +#endif
   1.983 +    }
   1.984 +    int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
   1.985 +    return size + 3+offset_size+2;
   1.986 +  }
   1.987 +
   1.988 +  // Check for xmm reg-reg copy
   1.989 +  if( src_first_rc == rc_xmm && dst_first_rc == rc_xmm ) {
   1.990 +    assert( (src_second_rc == rc_bad && dst_second_rc == rc_bad) ||
   1.991 +            (src_first+1 == src_second && dst_first+1 == dst_second),
   1.992 +            "no non-adjacent float-moves" );
   1.993 +    return impl_movx_helper(cbuf,do_size,src_first,dst_first,src_second, dst_second, size);
   1.994 +  }
   1.995 +
   1.996 +  // Check for xmm store
   1.997 +  if( src_first_rc == rc_xmm && dst_first_rc == rc_stack ) {
   1.998 +    return impl_x_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first, src_second, size);
   1.999 +  }
  1.1000 +
  1.1001 +  // Check for float xmm load
  1.1002 +  if( dst_first_rc == rc_xmm && src_first_rc == rc_stack ) {
  1.1003 +    return impl_x_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first, dst_second, size);
  1.1004 +  }
  1.1005 +
  1.1006 +  // Copy from float reg to xmm reg
  1.1007 +  if( dst_first_rc == rc_xmm && src_first_rc == rc_float ) {
  1.1008 +    // copy to the top of stack from floating point reg
  1.1009 +    // and use LEA to preserve flags
  1.1010 +    if( cbuf ) {
  1.1011 +      emit_opcode(*cbuf,0x8D);  // LEA  ESP,[ESP-8]
  1.1012 +      emit_rm(*cbuf, 0x1, ESP_enc, 0x04);
  1.1013 +      emit_rm(*cbuf, 0x0, 0x04, ESP_enc);
  1.1014 +      emit_d8(*cbuf,0xF8);
  1.1015 +#ifndef PRODUCT
  1.1016 +    } else if( !do_size ) {
  1.1017 +      if( size != 0 ) st->print("\n\t");
  1.1018 +      st->print("LEA    ESP,[ESP-8]");
  1.1019 +#endif
  1.1020 +    }
  1.1021 +    size += 4;
  1.1022 +
  1.1023 +    size = impl_fp_store_helper(cbuf,do_size,src_first,src_second,dst_first,dst_second,0,size);
  1.1024 +
  1.1025 +    // Copy from the temp memory to the xmm reg.
  1.1026 +    size = impl_x_helper(cbuf,do_size,true ,0,dst_first, dst_second, size);
  1.1027 +
  1.1028 +    if( cbuf ) {
  1.1029 +      emit_opcode(*cbuf,0x8D);  // LEA  ESP,[ESP+8]
  1.1030 +      emit_rm(*cbuf, 0x1, ESP_enc, 0x04);
  1.1031 +      emit_rm(*cbuf, 0x0, 0x04, ESP_enc);
  1.1032 +      emit_d8(*cbuf,0x08);
  1.1033 +#ifndef PRODUCT
  1.1034 +    } else if( !do_size ) {
  1.1035 +      if( size != 0 ) st->print("\n\t");
  1.1036 +      st->print("LEA    ESP,[ESP+8]");
  1.1037 +#endif
  1.1038 +    }
  1.1039 +    size += 4;
  1.1040 +    return size;
  1.1041 +  }
  1.1042 +
  1.1043 +  assert( size > 0, "missed a case" );
  1.1044 +
  1.1045 +  // --------------------------------------------------------------------
  1.1046 +  // Check for second bits still needing moving.
  1.1047 +  if( src_second == dst_second )
  1.1048 +    return size;               // Self copy; no move
  1.1049 +  assert( src_second_rc != rc_bad && dst_second_rc != rc_bad, "src_second & dst_second cannot be Bad" );
  1.1050 +
  1.1051 +  // Check for second word int-int move
  1.1052 +  if( src_second_rc == rc_int && dst_second_rc == rc_int )
  1.1053 +    return impl_mov_helper(cbuf,do_size,src_second,dst_second,size);
  1.1054 +
  1.1055 +  // Check for second word integer store
  1.1056 +  if( src_second_rc == rc_int && dst_second_rc == rc_stack )
  1.1057 +    return impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),src_second,0x89,"MOV ",size);
  1.1058 +
  1.1059 +  // Check for second word integer load
  1.1060 +  if( dst_second_rc == rc_int && src_second_rc == rc_stack )
  1.1061 +    return impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),dst_second,0x8B,"MOV ",size);
  1.1062 +
  1.1063 +
  1.1064 +  Unimplemented();
  1.1065 +}
  1.1066 +
  1.1067 +#ifndef PRODUCT
  1.1068 +void MachSpillCopyNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  1.1069 +  implementation( NULL, ra_, false, st );
  1.1070 +}
  1.1071 +#endif
  1.1072 +
  1.1073 +void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  1.1074 +  implementation( &cbuf, ra_, false, NULL );
  1.1075 +}
  1.1076 +
  1.1077 +uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
  1.1078 +  return implementation( NULL, ra_, true, NULL );
  1.1079 +}
  1.1080 +
  1.1081 +//=============================================================================
  1.1082 +#ifndef PRODUCT
  1.1083 +void MachNopNode::format( PhaseRegAlloc *, outputStream* st ) const {
  1.1084 +  st->print("NOP \t# %d bytes pad for loops and calls", _count);
  1.1085 +}
  1.1086 +#endif
  1.1087 +
  1.1088 +void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc * ) const {
  1.1089 +  MacroAssembler _masm(&cbuf);
  1.1090 +  __ nop(_count);
  1.1091 +}
  1.1092 +
  1.1093 +uint MachNopNode::size(PhaseRegAlloc *) const {
  1.1094 +  return _count;
  1.1095 +}
  1.1096 +
  1.1097 +
  1.1098 +//=============================================================================
  1.1099 +#ifndef PRODUCT
  1.1100 +void BoxLockNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  1.1101 +  int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
  1.1102 +  int reg = ra_->get_reg_first(this);
  1.1103 +  st->print("LEA    %s,[ESP + #%d]",Matcher::regName[reg],offset);
  1.1104 +}
  1.1105 +#endif
  1.1106 +
  1.1107 +void BoxLockNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  1.1108 +  int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
  1.1109 +  int reg = ra_->get_encode(this);
  1.1110 +  if( offset >= 128 ) {
  1.1111 +    emit_opcode(cbuf, 0x8D);      // LEA  reg,[SP+offset]
  1.1112 +    emit_rm(cbuf, 0x2, reg, 0x04);
  1.1113 +    emit_rm(cbuf, 0x0, 0x04, ESP_enc);
  1.1114 +    emit_d32(cbuf, offset);
  1.1115 +  }
  1.1116 +  else {
  1.1117 +    emit_opcode(cbuf, 0x8D);      // LEA  reg,[SP+offset]
  1.1118 +    emit_rm(cbuf, 0x1, reg, 0x04);
  1.1119 +    emit_rm(cbuf, 0x0, 0x04, ESP_enc);
  1.1120 +    emit_d8(cbuf, offset);
  1.1121 +  }
  1.1122 +}
  1.1123 +
  1.1124 +uint BoxLockNode::size(PhaseRegAlloc *ra_) const {
  1.1125 +  int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
  1.1126 +  if( offset >= 128 ) {
  1.1127 +    return 7;
  1.1128 +  }
  1.1129 +  else {
  1.1130 +    return 4;
  1.1131 +  }
  1.1132 +}
  1.1133 +
  1.1134 +//=============================================================================
  1.1135 +
  1.1136 +// emit call stub, compiled java to interpreter
  1.1137 +void emit_java_to_interp(CodeBuffer &cbuf ) {
  1.1138 +  // Stub is fixed up when the corresponding call is converted from calling
  1.1139 +  // compiled code to calling interpreted code.
  1.1140 +  // mov rbx,0
  1.1141 +  // jmp -1
  1.1142 +
  1.1143 +  address mark = cbuf.inst_mark();  // get mark within main instrs section
  1.1144 +
  1.1145 +  // Note that the code buffer's inst_mark is always relative to insts.
  1.1146 +  // That's why we must use the macroassembler to generate a stub.
  1.1147 +  MacroAssembler _masm(&cbuf);
  1.1148 +
  1.1149 +  address base =
  1.1150 +  __ start_a_stub(Compile::MAX_stubs_size);
  1.1151 +  if (base == NULL)  return;  // CodeBuffer::expand failed
  1.1152 +  // static stub relocation stores the instruction address of the call
  1.1153 +  __ relocate(static_stub_Relocation::spec(mark), RELOC_IMM32);
  1.1154 +  // static stub relocation also tags the methodOop in the code-stream.
  1.1155 +  __ movoop(rbx, (jobject)NULL);  // method is zapped till fixup time
  1.1156 +  __ jump(RuntimeAddress((address)-1));
  1.1157 +
  1.1158 +  __ end_a_stub();
  1.1159 +  // Update current stubs pointer and restore code_end.
  1.1160 +}
   1.1161 +// size of call stub, compiled java to interpreter
  1.1162 +uint size_java_to_interp() {
  1.1163 +  return 10;  // movl; jmp
  1.1164 +}
   1.1165 +// relocation entries for call stub, compiled java to interpreter
  1.1166 +uint reloc_java_to_interp() {
  1.1167 +  return 4;  // 3 in emit_java_to_interp + 1 in Java_Static_Call
  1.1168 +}
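// A sketch of where the sizes above come from: the stub is "mov rbx, imm32"
// (0xB8+rd plus a 4-byte immediate, 5 bytes) followed by "jmp rel32" (0xE9
// plus a 4-byte displacement, 5 bytes), giving the 10 bytes returned by
// size_java_to_interp().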
  1.1169 +
  1.1170 +//=============================================================================
  1.1171 +#ifndef PRODUCT
  1.1172 +void MachUEPNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  1.1173 +  st->print_cr(  "CMP    EAX,[ECX+4]\t# Inline cache check");
  1.1174 +  st->print_cr("\tJNE    SharedRuntime::handle_ic_miss_stub");
  1.1175 +  st->print_cr("\tNOP");
  1.1176 +  st->print_cr("\tNOP");
  1.1177 +  if( !OptoBreakpoint )
  1.1178 +    st->print_cr("\tNOP");
  1.1179 +}
  1.1180 +#endif
  1.1181 +
  1.1182 +void MachUEPNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  1.1183 +  MacroAssembler masm(&cbuf);
  1.1184 +#ifdef ASSERT
  1.1185 +  uint code_size = cbuf.code_size();
  1.1186 +#endif
  1.1187 +  masm.cmpl(rax, Address(rcx, oopDesc::klass_offset_in_bytes()));
  1.1188 +  masm.jump_cc(Assembler::notEqual,
  1.1189 +               RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
   1.1190 +  /* WARNING: these NOPs are critical so that the verified entry point is
   1.1191 +     properly aligned for patching by NativeJump::patch_verified_entry() */
  1.1192 +  int nops_cnt = 2;
  1.1193 +  if( !OptoBreakpoint ) // Leave space for int3
  1.1194 +     nops_cnt += 1;
  1.1195 +  masm.nop(nops_cnt);
  1.1196 +
  1.1197 +  assert(cbuf.code_size() - code_size == size(ra_), "checking code size of inline cache node");
  1.1198 +}
  1.1199 +
  1.1200 +uint MachUEPNode::size(PhaseRegAlloc *ra_) const {
  1.1201 +  return OptoBreakpoint ? 11 : 12;
  1.1202 +}
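// A sketch of the size() value above: CMP EAX,[ECX+klass_offset] encodes in
// 3 bytes (0x3B + ModRM + disp8, since the klass offset fits in a byte), the
// JNE to the IC miss stub takes 6 bytes (0x0F 0x85 + rel32), and the 2 or 3
// NOPs bring the total to 11 or 12, depending on OptoBreakpoint.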
  1.1203 +
  1.1204 +
  1.1205 +//=============================================================================
  1.1206 +uint size_exception_handler() {
  1.1207 +  // NativeCall instruction size is the same as NativeJump.
   1.1208 +  // The exception handler starts out as a jump and can be patched to
   1.1209 +  // a call by deoptimization.  (4932387)
  1.1210 +  // Note that this value is also credited (in output.cpp) to
  1.1211 +  // the size of the code section.
  1.1212 +  return NativeJump::instruction_size;
  1.1213 +}
  1.1214 +
  1.1215 +// Emit exception handler code.  Stuff framesize into a register
  1.1216 +// and call a VM stub routine.
  1.1217 +int emit_exception_handler(CodeBuffer& cbuf) {
  1.1218 +
  1.1219 +  // Note that the code buffer's inst_mark is always relative to insts.
  1.1220 +  // That's why we must use the macroassembler to generate a handler.
  1.1221 +  MacroAssembler _masm(&cbuf);
  1.1222 +  address base =
  1.1223 +  __ start_a_stub(size_exception_handler());
  1.1224 +  if (base == NULL)  return 0;  // CodeBuffer::expand failed
  1.1225 +  int offset = __ offset();
  1.1226 +  __ jump(RuntimeAddress(OptoRuntime::exception_blob()->instructions_begin()));
  1.1227 +  assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
  1.1228 +  __ end_a_stub();
  1.1229 +  return offset;
  1.1230 +}
  1.1231 +
  1.1232 +uint size_deopt_handler() {
  1.1233 +  // NativeCall instruction size is the same as NativeJump.
   1.1234 +  // The exception handler starts out as a jump and can be patched to
   1.1235 +  // a call by deoptimization.  (4932387)
  1.1236 +  // Note that this value is also credited (in output.cpp) to
  1.1237 +  // the size of the code section.
  1.1238 +  return 5 + NativeJump::instruction_size; // pushl(); jmp;
  1.1239 +}
  1.1240 +
  1.1241 +// Emit deopt handler code.
  1.1242 +int emit_deopt_handler(CodeBuffer& cbuf) {
  1.1243 +
  1.1244 +  // Note that the code buffer's inst_mark is always relative to insts.
  1.1245 +  // That's why we must use the macroassembler to generate a handler.
  1.1246 +  MacroAssembler _masm(&cbuf);
  1.1247 +  address base =
   1.1248 +  __ start_a_stub(size_deopt_handler());
  1.1249 +  if (base == NULL)  return 0;  // CodeBuffer::expand failed
  1.1250 +  int offset = __ offset();
  1.1251 +  InternalAddress here(__ pc());
  1.1252 +  __ pushptr(here.addr());
  1.1253 +
  1.1254 +  __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
  1.1255 +  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
  1.1256 +  __ end_a_stub();
  1.1257 +  return offset;
  1.1258 +}
  1.1259 +
  1.1260 +
  1.1261 +static void emit_double_constant(CodeBuffer& cbuf, double x) {
  1.1262 +  int mark = cbuf.insts()->mark_off();
  1.1263 +  MacroAssembler _masm(&cbuf);
  1.1264 +  address double_address = __ double_constant(x);
  1.1265 +  cbuf.insts()->set_mark_off(mark);  // preserve mark across masm shift
  1.1266 +  emit_d32_reloc(cbuf,
  1.1267 +                 (int)double_address,
  1.1268 +                 internal_word_Relocation::spec(double_address),
  1.1269 +                 RELOC_DISP32);
  1.1270 +}
  1.1271 +
  1.1272 +static void emit_float_constant(CodeBuffer& cbuf, float x) {
  1.1273 +  int mark = cbuf.insts()->mark_off();
  1.1274 +  MacroAssembler _masm(&cbuf);
  1.1275 +  address float_address = __ float_constant(x);
  1.1276 +  cbuf.insts()->set_mark_off(mark);  // preserve mark across masm shift
  1.1277 +  emit_d32_reloc(cbuf,
  1.1278 +                 (int)float_address,
  1.1279 +                 internal_word_Relocation::spec(float_address),
  1.1280 +                 RELOC_DISP32);
  1.1281 +}
  1.1282 +
  1.1283 +
  1.1284 +int Matcher::regnum_to_fpu_offset(int regnum) {
  1.1285 +  return regnum - 32; // The FP registers are in the second chunk
  1.1286 +}
  1.1287 +
  1.1288 +bool is_positive_zero_float(jfloat f) {
  1.1289 +  return jint_cast(f) == jint_cast(0.0F);
  1.1290 +}
  1.1291 +
  1.1292 +bool is_positive_one_float(jfloat f) {
  1.1293 +  return jint_cast(f) == jint_cast(1.0F);
  1.1294 +}
  1.1295 +
  1.1296 +bool is_positive_zero_double(jdouble d) {
  1.1297 +  return jlong_cast(d) == jlong_cast(0.0);
  1.1298 +}
  1.1299 +
  1.1300 +bool is_positive_one_double(jdouble d) {
  1.1301 +  return jlong_cast(d) == jlong_cast(1.0);
  1.1302 +}
  1.1303 +
   1.1304 +// This hook exists for UltraSparc; returning true here just means we have a fast l2f conversion
  1.1305 +const bool Matcher::convL2FSupported(void) {
  1.1306 +  return true;
  1.1307 +}
  1.1308 +
  1.1309 +// Vector width in bytes
  1.1310 +const uint Matcher::vector_width_in_bytes(void) {
  1.1311 +  return UseSSE >= 2 ? 8 : 0;
  1.1312 +}
  1.1313 +
  1.1314 +// Vector ideal reg
  1.1315 +const uint Matcher::vector_ideal_reg(void) {
  1.1316 +  return Op_RegD;
  1.1317 +}
  1.1318 +
  1.1319 +// Is this branch offset short enough that a short branch can be used?
  1.1320 +//
  1.1321 +// NOTE: If the platform does not provide any short branch variants, then
  1.1322 +//       this method should return false for offset 0.
  1.1323 +bool Matcher::is_short_branch_offset(int offset) {
  1.1324 +  return (-128 <= offset && offset <= 127);
  1.1325 +}
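// For example, a short JCC (0x7x cc rel8) can only reach targets within
// [-128, 127] bytes of the end of the instruction; anything farther away has
// to use the 6-byte 0x0F 0x8x rel32 form (see the Jcc/JccShort enc_classes in
// the encoding block below).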
  1.1326 +
  1.1327 +const bool Matcher::isSimpleConstant64(jlong value) {
   1.1328 +  // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?
  1.1329 +  return false;
  1.1330 +}
  1.1331 +
  1.1332 +// The ecx parameter to rep stos for the ClearArray node is in dwords.
  1.1333 +const bool Matcher::init_array_count_is_in_bytes = false;
  1.1334 +
  1.1335 +// Threshold size for cleararray.
  1.1336 +const int Matcher::init_array_short_size = 8 * BytesPerLong;
  1.1337 +
  1.1338 +// Should the Matcher clone shifts on addressing modes, expecting them to
  1.1339 +// be subsumed into complex addressing expressions or compute them into
  1.1340 +// registers?  True for Intel but false for most RISCs
  1.1341 +const bool Matcher::clone_shift_expressions = true;
  1.1342 +
  1.1343 +// Is it better to copy float constants, or load them directly from memory?
  1.1344 +// Intel can load a float constant from a direct address, requiring no
  1.1345 +// extra registers.  Most RISCs will have to materialize an address into a
  1.1346 +// register first, so they would do better to copy the constant from stack.
  1.1347 +const bool Matcher::rematerialize_float_constants = true;
  1.1348 +
  1.1349 +// If CPU can load and store mis-aligned doubles directly then no fixup is
  1.1350 +// needed.  Else we split the double into 2 integer pieces and move it
  1.1351 +// piece-by-piece.  Only happens when passing doubles into C code as the
  1.1352 +// Java calling convention forces doubles to be aligned.
  1.1353 +const bool Matcher::misaligned_doubles_ok = true;
  1.1354 +
  1.1355 +
  1.1356 +void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {
  1.1357 +  // Get the memory operand from the node
  1.1358 +  uint numopnds = node->num_opnds();        // Virtual call for number of operands
  1.1359 +  uint skipped  = node->oper_input_base();  // Sum of leaves skipped so far
  1.1360 +  assert( idx >= skipped, "idx too low in pd_implicit_null_fixup" );
  1.1361 +  uint opcnt     = 1;                 // First operand
  1.1362 +  uint num_edges = node->_opnds[1]->num_edges(); // leaves for first operand
  1.1363 +  while( idx >= skipped+num_edges ) {
  1.1364 +    skipped += num_edges;
  1.1365 +    opcnt++;                          // Bump operand count
  1.1366 +    assert( opcnt < numopnds, "Accessing non-existent operand" );
  1.1367 +    num_edges = node->_opnds[opcnt]->num_edges(); // leaves for next operand
  1.1368 +  }
  1.1369 +
  1.1370 +  MachOper *memory = node->_opnds[opcnt];
  1.1371 +  MachOper *new_memory = NULL;
  1.1372 +  switch (memory->opcode()) {
  1.1373 +  case DIRECT:
  1.1374 +  case INDOFFSET32X:
  1.1375 +    // No transformation necessary.
  1.1376 +    return;
  1.1377 +  case INDIRECT:
  1.1378 +    new_memory = new (C) indirect_win95_safeOper( );
  1.1379 +    break;
  1.1380 +  case INDOFFSET8:
  1.1381 +    new_memory = new (C) indOffset8_win95_safeOper(memory->disp(NULL, NULL, 0));
  1.1382 +    break;
  1.1383 +  case INDOFFSET32:
  1.1384 +    new_memory = new (C) indOffset32_win95_safeOper(memory->disp(NULL, NULL, 0));
  1.1385 +    break;
  1.1386 +  case INDINDEXOFFSET:
  1.1387 +    new_memory = new (C) indIndexOffset_win95_safeOper(memory->disp(NULL, NULL, 0));
  1.1388 +    break;
  1.1389 +  case INDINDEXSCALE:
  1.1390 +    new_memory = new (C) indIndexScale_win95_safeOper(memory->scale());
  1.1391 +    break;
  1.1392 +  case INDINDEXSCALEOFFSET:
  1.1393 +    new_memory = new (C) indIndexScaleOffset_win95_safeOper(memory->scale(), memory->disp(NULL, NULL, 0));
  1.1394 +    break;
  1.1395 +  case LOAD_LONG_INDIRECT:
  1.1396 +  case LOAD_LONG_INDOFFSET32:
   1.1397 +    // Does not use EBP as an address register; uses { EDX, EBX, EDI, ESI }
  1.1398 +    return;
  1.1399 +  default:
  1.1400 +    assert(false, "unexpected memory operand in pd_implicit_null_fixup()");
  1.1401 +    return;
  1.1402 +  }
  1.1403 +  node->_opnds[opcnt] = new_memory;
  1.1404 +}
  1.1405 +
  1.1406 +// Advertise here if the CPU requires explicit rounding operations
  1.1407 +// to implement the UseStrictFP mode.
  1.1408 +const bool Matcher::strict_fp_requires_explicit_rounding = true;
  1.1409 +
  1.1410 +// Do floats take an entire double register or just half?
  1.1411 +const bool Matcher::float_in_double = true;
  1.1412 +// Do ints take an entire long register or just half?
  1.1413 +const bool Matcher::int_in_long = false;
  1.1414 +
  1.1415 +// Return whether or not this register is ever used as an argument.  This
  1.1416 +// function is used on startup to build the trampoline stubs in generateOptoStub.
   1.1417 +// Registers not mentioned will be killed by the VM call in the trampoline, and
   1.1418 +// arguments in those registers will not be available to the callee.
  1.1419 +bool Matcher::can_be_java_arg( int reg ) {
  1.1420 +  if(  reg == ECX_num   || reg == EDX_num   ) return true;
  1.1421 +  if( (reg == XMM0a_num || reg == XMM1a_num) && UseSSE>=1 ) return true;
  1.1422 +  if( (reg == XMM0b_num || reg == XMM1b_num) && UseSSE>=2 ) return true;
  1.1423 +  return false;
  1.1424 +}
  1.1425 +
  1.1426 +bool Matcher::is_spillable_arg( int reg ) {
  1.1427 +  return can_be_java_arg(reg);
  1.1428 +}
  1.1429 +
  1.1430 +// Register for DIVI projection of divmodI
  1.1431 +RegMask Matcher::divI_proj_mask() {
  1.1432 +  return EAX_REG_mask;
  1.1433 +}
  1.1434 +
  1.1435 +// Register for MODI projection of divmodI
  1.1436 +RegMask Matcher::modI_proj_mask() {
  1.1437 +  return EDX_REG_mask;
  1.1438 +}
  1.1439 +
  1.1440 +// Register for DIVL projection of divmodL
  1.1441 +RegMask Matcher::divL_proj_mask() {
  1.1442 +  ShouldNotReachHere();
  1.1443 +  return RegMask();
  1.1444 +}
  1.1445 +
  1.1446 +// Register for MODL projection of divmodL
  1.1447 +RegMask Matcher::modL_proj_mask() {
  1.1448 +  ShouldNotReachHere();
  1.1449 +  return RegMask();
  1.1450 +}
  1.1451 +
  1.1452 +%}
  1.1453 +
  1.1454 +//----------ENCODING BLOCK-----------------------------------------------------
  1.1455 +// This block specifies the encoding classes used by the compiler to output
  1.1456 +// byte streams.  Encoding classes generate functions which are called by
  1.1457 +// Machine Instruction Nodes in order to generate the bit encoding of the
  1.1458 +// instruction.  Operands specify their base encoding interface with the
   1.1459 +// interface keyword.  Four interfaces are currently supported:
  1.1460 +// REG_INTER, CONST_INTER, MEMORY_INTER, & COND_INTER.  REG_INTER causes an
  1.1461 +// operand to generate a function which returns its register number when
  1.1462 +// queried.   CONST_INTER causes an operand to generate a function which
  1.1463 +// returns the value of the constant when queried.  MEMORY_INTER causes an
  1.1464 +// operand to generate four functions which return the Base Register, the
  1.1465 +// Index Register, the Scale Value, and the Offset Value of the operand when
  1.1466 +// queried.  COND_INTER causes an operand to generate six functions which
   1.1467 +// return the encoding code (i.e., the encoding bits for the instruction)
  1.1468 +// associated with each basic boolean condition for a conditional instruction.
  1.1469 +// Instructions specify two basic values for encoding.  They use the
  1.1470 +// ins_encode keyword to specify their encoding class (which must be one of
  1.1471 +// the class names specified in the encoding block), and they use the
  1.1472 +// opcode keyword to specify, in order, their primary, secondary, and
  1.1473 +// tertiary opcode.  Only the opcode sections which a particular instruction
  1.1474 +// needs for encoding need to be specified.
  1.1475 +encode %{
  1.1476 +  // Build emit functions for each basic byte or larger field in the intel
  1.1477 +  // encoding scheme (opcode, rm, sib, immediate), and call them from C++
  1.1478 +  // code in the enc_class source block.  Emit functions will live in the
  1.1479 +  // main source block for now.  In future, we can generalize this by
  1.1480 +  // adding a syntax that specifies the sizes of fields in an order,
  1.1481 +  // so that the adlc can build the emit functions automagically
  1.1482 +  enc_class OpcP %{             // Emit opcode
  1.1483 +    emit_opcode(cbuf,$primary);
  1.1484 +  %}
  1.1485 +
  1.1486 +  enc_class OpcS %{             // Emit opcode
  1.1487 +    emit_opcode(cbuf,$secondary);
  1.1488 +  %}
  1.1489 +
  1.1490 +  enc_class Opcode(immI d8 ) %{ // Emit opcode
  1.1491 +    emit_opcode(cbuf,$d8$$constant);
  1.1492 +  %}
  1.1493 +
  1.1494 +  enc_class SizePrefix %{
  1.1495 +    emit_opcode(cbuf,0x66);
  1.1496 +  %}
  1.1497 +
  1.1498 +  enc_class RegReg (eRegI dst, eRegI src) %{    // RegReg(Many)
  1.1499 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.1500 +  %}
  1.1501 +
  1.1502 +  enc_class OpcRegReg (immI opcode, eRegI dst, eRegI src) %{    // OpcRegReg(Many)
  1.1503 +    emit_opcode(cbuf,$opcode$$constant);
  1.1504 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.1505 +  %}
  1.1506 +
  1.1507 +  enc_class mov_r32_imm0( eRegI dst ) %{
  1.1508 +    emit_opcode( cbuf, 0xB8 + $dst$$reg ); // 0xB8+ rd   -- MOV r32  ,imm32
  1.1509 +    emit_d32   ( cbuf, 0x0  );             //                         imm32==0x0
  1.1510 +  %}
  1.1511 +
  1.1512 +  enc_class cdq_enc %{
  1.1513 +    // Full implementation of Java idiv and irem; checks for
  1.1514 +    // special case as described in JVM spec., p.243 & p.271.
  1.1515 +    //
  1.1516 +    //         normal case                           special case
  1.1517 +    //
   1.1518 +    // input : rax: dividend                           min_int
   1.1519 +    //         reg: divisor                            -1
   1.1520 +    //
   1.1521 +    // output: rax: quotient  (= rax idiv reg)         min_int
   1.1522 +    //         rdx: remainder (= rax irem reg)         0
   1.1523 +    //
   1.1524 +    //  Code sequence:
  1.1525 +    //
  1.1526 +    //  81 F8 00 00 00 80    cmp         rax,80000000h
  1.1527 +    //  0F 85 0B 00 00 00    jne         normal_case
  1.1528 +    //  33 D2                xor         rdx,edx
   1.1529 +    //  83 F9 FF             cmp         rcx,-1
  1.1530 +    //  0F 84 03 00 00 00    je          done
  1.1531 +    //                  normal_case:
  1.1532 +    //  99                   cdq
  1.1533 +    //  F7 F9                idiv        rax,ecx
  1.1534 +    //                  done:
  1.1535 +    //
  1.1536 +    emit_opcode(cbuf,0x81); emit_d8(cbuf,0xF8);
  1.1537 +    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x00);
  1.1538 +    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x80);                     // cmp rax,80000000h
  1.1539 +    emit_opcode(cbuf,0x0F); emit_d8(cbuf,0x85);
  1.1540 +    emit_opcode(cbuf,0x0B); emit_d8(cbuf,0x00);
  1.1541 +    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x00);                     // jne normal_case
  1.1542 +    emit_opcode(cbuf,0x33); emit_d8(cbuf,0xD2);                     // xor rdx,edx
   1.1543 +    emit_opcode(cbuf,0x83); emit_d8(cbuf,0xF9); emit_d8(cbuf,0xFF); // cmp rcx,-1
  1.1544 +    emit_opcode(cbuf,0x0F); emit_d8(cbuf,0x84);
  1.1545 +    emit_opcode(cbuf,0x03); emit_d8(cbuf,0x00);
  1.1546 +    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x00);                     // je done
  1.1547 +    // normal_case:
  1.1548 +    emit_opcode(cbuf,0x99);                                         // cdq
  1.1549 +    // idiv (note: must be emitted by the user of this rule)
  1.1550 +    // normal:
  1.1551 +  %}
  1.1552 +
  1.1553 +  // Dense encoding for older common ops
  1.1554 +  enc_class Opc_plus(immI opcode, eRegI reg) %{
  1.1555 +    emit_opcode(cbuf, $opcode$$constant + $reg$$reg);
  1.1556 +  %}
  1.1557 +
  1.1558 +
   1.1559 +  // Opcode enc_class for 8/32-bit immediate instructions with sign-extension
  1.1560 +  enc_class OpcSE (immI imm) %{ // Emit primary opcode and set sign-extend bit
  1.1561 +    // Check for 8-bit immediate, and set sign extend bit in opcode
  1.1562 +    if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
  1.1563 +      emit_opcode(cbuf, $primary | 0x02);
  1.1564 +    }
  1.1565 +    else {                          // If 32-bit immediate
  1.1566 +      emit_opcode(cbuf, $primary);
  1.1567 +    }
  1.1568 +  %}
  1.1569 +
  1.1570 +  enc_class OpcSErm (eRegI dst, immI imm) %{    // OpcSEr/m
  1.1571 +    // Emit primary opcode and set sign-extend bit
  1.1572 +    // Check for 8-bit immediate, and set sign extend bit in opcode
  1.1573 +    if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
   1.1574 +      emit_opcode(cbuf, $primary | 0x02);
   1.1574 +    }
  1.1575 +    else {                          // If 32-bit immediate
  1.1576 +      emit_opcode(cbuf, $primary);
  1.1577 +    }
  1.1578 +    // Emit r/m byte with secondary opcode, after primary opcode.
  1.1579 +    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
  1.1580 +  %}
  1.1581 +
  1.1582 +  enc_class Con8or32 (immI imm) %{    // Con8or32(storeImmI), 8 or 32 bits
  1.1583 +    // Check for 8-bit immediate, and set sign extend bit in opcode
  1.1584 +    if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
  1.1585 +      $$$emit8$imm$$constant;
  1.1586 +    }
  1.1587 +    else {                          // If 32-bit immediate
  1.1588 +      // Output immediate
  1.1589 +      $$$emit32$imm$$constant;
  1.1590 +    }
  1.1591 +  %}
  1.1592 +
  1.1593 +  enc_class Long_OpcSErm_Lo(eRegL dst, immL imm) %{
  1.1594 +    // Emit primary opcode and set sign-extend bit
  1.1595 +    // Check for 8-bit immediate, and set sign extend bit in opcode
  1.1596 +    int con = (int)$imm$$constant; // Throw away top bits
  1.1597 +    emit_opcode(cbuf, ((con >= -128) && (con <= 127)) ? ($primary | 0x02) : $primary);
  1.1598 +    // Emit r/m byte with secondary opcode, after primary opcode.
  1.1599 +    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
  1.1600 +    if ((con >= -128) && (con <= 127)) emit_d8 (cbuf,con);
  1.1601 +    else                               emit_d32(cbuf,con);
  1.1602 +  %}
  1.1603 +
  1.1604 +  enc_class Long_OpcSErm_Hi(eRegL dst, immL imm) %{
  1.1605 +    // Emit primary opcode and set sign-extend bit
  1.1606 +    // Check for 8-bit immediate, and set sign extend bit in opcode
  1.1607 +    int con = (int)($imm$$constant >> 32); // Throw away bottom bits
  1.1608 +    emit_opcode(cbuf, ((con >= -128) && (con <= 127)) ? ($primary | 0x02) : $primary);
  1.1609 +    // Emit r/m byte with tertiary opcode, after primary opcode.
  1.1610 +    emit_rm(cbuf, 0x3, $tertiary, HIGH_FROM_LOW($dst$$reg));
  1.1611 +    if ((con >= -128) && (con <= 127)) emit_d8 (cbuf,con);
  1.1612 +    else                               emit_d32(cbuf,con);
  1.1613 +  %}
  1.1614 +
  1.1615 +  enc_class Lbl (label labl) %{ // JMP, CALL
  1.1616 +    Label *l = $labl$$label;
  1.1617 +    emit_d32(cbuf, l ? (l->loc_pos() - (cbuf.code_size()+4)) : 0);
  1.1618 +  %}
  1.1619 +
  1.1620 +  enc_class LblShort (label labl) %{ // JMP, CALL
  1.1621 +    Label *l = $labl$$label;
  1.1622 +    int disp = l ? (l->loc_pos() - (cbuf.code_size()+1)) : 0;
  1.1623 +    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
  1.1624 +    emit_d8(cbuf, disp);
  1.1625 +  %}
  1.1626 +
  1.1627 +  enc_class OpcSReg (eRegI dst) %{    // BSWAP
  1.1628 +    emit_cc(cbuf, $secondary, $dst$$reg );
  1.1629 +  %}
  1.1630 +
  1.1631 +  enc_class bswap_long_bytes(eRegL dst) %{ // BSWAP
  1.1632 +    int destlo = $dst$$reg;
  1.1633 +    int desthi = HIGH_FROM_LOW(destlo);
  1.1634 +    // bswap lo
  1.1635 +    emit_opcode(cbuf, 0x0F);
  1.1636 +    emit_cc(cbuf, 0xC8, destlo);
  1.1637 +    // bswap hi
  1.1638 +    emit_opcode(cbuf, 0x0F);
  1.1639 +    emit_cc(cbuf, 0xC8, desthi);
  1.1640 +    // xchg lo and hi
  1.1641 +    emit_opcode(cbuf, 0x87);
  1.1642 +    emit_rm(cbuf, 0x3, destlo, desthi);
  1.1643 +  %}
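  // A worked example of the sequence above: for a long held as
  // hi=0x11223344 / lo=0x55667788, the two BSWAPs give 0x44332211 and
  // 0x88776655, and the XCHG swaps the halves, leaving the byte-reversed
  // value 0x8877665544332211 in hi=0x88776655 / lo=0x44332211.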
  1.1644 +
  1.1645 +  enc_class RegOpc (eRegI div) %{    // IDIV, IMOD, JMP indirect, ...
  1.1646 +    emit_rm(cbuf, 0x3, $secondary, $div$$reg );
  1.1647 +  %}
  1.1648 +
  1.1649 +  enc_class Jcc (cmpOp cop, label labl) %{    // JCC
  1.1650 +    Label *l = $labl$$label;
  1.1651 +    $$$emit8$primary;
  1.1652 +    emit_cc(cbuf, $secondary, $cop$$cmpcode);
  1.1653 +    emit_d32(cbuf, l ? (l->loc_pos() - (cbuf.code_size()+4)) : 0);
  1.1654 +  %}
  1.1655 +
  1.1656 +  enc_class JccShort (cmpOp cop, label labl) %{    // JCC
  1.1657 +    Label *l = $labl$$label;
  1.1658 +    emit_cc(cbuf, $primary, $cop$$cmpcode);
  1.1659 +    int disp = l ? (l->loc_pos() - (cbuf.code_size()+1)) : 0;
  1.1660 +    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
  1.1661 +    emit_d8(cbuf, disp);
  1.1662 +  %}
  1.1663 +
  1.1664 +  enc_class enc_cmov(cmpOp cop ) %{ // CMOV
  1.1665 +    $$$emit8$primary;
  1.1666 +    emit_cc(cbuf, $secondary, $cop$$cmpcode);
  1.1667 +  %}
  1.1668 +
  1.1669 +  enc_class enc_cmov_d(cmpOp cop, regD src ) %{ // CMOV
  1.1670 +    int op = 0xDA00 + $cop$$cmpcode + ($src$$reg-1);
  1.1671 +    emit_d8(cbuf, op >> 8 );
  1.1672 +    emit_d8(cbuf, op & 255);
  1.1673 +  %}
  1.1674 +
  1.1675 +  // emulate a CMOV with a conditional branch around a MOV
  1.1676 +  enc_class enc_cmov_branch( cmpOp cop, immI brOffs ) %{ // CMOV
  1.1677 +    // Invert sense of branch from sense of CMOV
  1.1678 +    emit_cc( cbuf, 0x70, ($cop$$cmpcode^1) );
  1.1679 +    emit_d8( cbuf, $brOffs$$constant );
  1.1680 +  %}
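  // The XOR with 1 above relies on the x86 Jcc opcodes (0x70..0x7F) pairing
  // each condition with its negation in the low bit (e.g. 0x74 JE vs. 0x75
  // JNE), so the branch is taken, and the MOV skipped, exactly when the CMOV
  // condition does not hold.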
  1.1681 +
  1.1682 +  enc_class enc_PartialSubtypeCheck( ) %{
  1.1683 +    Register Redi = as_Register(EDI_enc); // result register
  1.1684 +    Register Reax = as_Register(EAX_enc); // super class
  1.1685 +    Register Recx = as_Register(ECX_enc); // killed
  1.1686 +    Register Resi = as_Register(ESI_enc); // sub class
  1.1687 +    Label hit, miss;
  1.1688 +
  1.1689 +    MacroAssembler _masm(&cbuf);
  1.1690 +    // Compare super with sub directly, since super is not in its own SSA.
  1.1691 +    // The compiler used to emit this test, but we fold it in here,
  1.1692 +    // to allow platform-specific tweaking on sparc.
  1.1693 +    __ cmpl(Reax, Resi);
  1.1694 +    __ jcc(Assembler::equal, hit);
  1.1695 +#ifndef PRODUCT
  1.1696 +    __ increment(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
  1.1697 +#endif //PRODUCT
  1.1698 +    __ movl(Redi,Address(Resi,sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes()));
  1.1699 +    __ movl(Recx,Address(Redi,arrayOopDesc::length_offset_in_bytes()));
  1.1700 +    __ addl(Redi,arrayOopDesc::base_offset_in_bytes(T_OBJECT));
  1.1701 +    __ repne_scan();
  1.1702 +    __ jcc(Assembler::notEqual, miss);
  1.1703 +    __ movl(Address(Resi,sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes()),Reax);
  1.1704 +    __ bind(hit);
  1.1705 +    if( $primary )
  1.1706 +      __ xorl(Redi,Redi);
  1.1707 +    __ bind(miss);
  1.1708 +  %}
  1.1709 +
  1.1710 +  enc_class FFree_Float_Stack_All %{    // Free_Float_Stack_All
  1.1711 +    MacroAssembler masm(&cbuf);
  1.1712 +    int start = masm.offset();
  1.1713 +    if (UseSSE >= 2) {
  1.1714 +      if (VerifyFPU) {
  1.1715 +        masm.verify_FPU(0, "must be empty in SSE2+ mode");
  1.1716 +      }
  1.1717 +    } else {
  1.1718 +      // External c_calling_convention expects the FPU stack to be 'clean'.
  1.1719 +      // Compiled code leaves it dirty.  Do cleanup now.
  1.1720 +      masm.empty_FPU_stack();
  1.1721 +    }
  1.1722 +    if (sizeof_FFree_Float_Stack_All == -1) {
  1.1723 +      sizeof_FFree_Float_Stack_All = masm.offset() - start;
  1.1724 +    } else {
  1.1725 +      assert(masm.offset() - start == sizeof_FFree_Float_Stack_All, "wrong size");
  1.1726 +    }
  1.1727 +  %}
  1.1728 +
  1.1729 +  enc_class Verify_FPU_For_Leaf %{
  1.1730 +    if( VerifyFPU ) {
  1.1731 +      MacroAssembler masm(&cbuf);
  1.1732 +      masm.verify_FPU( -3, "Returning from Runtime Leaf call");
  1.1733 +    }
  1.1734 +  %}
  1.1735 +
  1.1736 +  enc_class Java_To_Runtime (method meth) %{    // CALL Java_To_Runtime, Java_To_Runtime_Leaf
  1.1737 +    // This is the instruction starting address for relocation info.
  1.1738 +    cbuf.set_inst_mark();
  1.1739 +    $$$emit8$primary;
  1.1740 +    // CALL directly to the runtime
  1.1741 +    emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.code_end()) - 4),
  1.1742 +                runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.1743 +
  1.1744 +    if (UseSSE >= 2) {
  1.1745 +      MacroAssembler _masm(&cbuf);
  1.1746 +      BasicType rt = tf()->return_type();
  1.1747 +
  1.1748 +      if ((rt == T_FLOAT || rt == T_DOUBLE) && !return_value_is_used()) {
  1.1749 +        // A C runtime call where the return value is unused.  In SSE2+
  1.1750 +        // mode the result needs to be removed from the FPU stack.  It's
  1.1751 +        // likely that this function call could be removed by the
  1.1752 +        // optimizer if the C function is a pure function.
  1.1753 +        __ ffree(0);
  1.1754 +      } else if (rt == T_FLOAT) {
  1.1755 +        __ leal(rsp, Address(rsp, -4));
  1.1756 +        __ fstp_s(Address(rsp, 0));
  1.1757 +        __ movflt(xmm0, Address(rsp, 0));
  1.1758 +        __ leal(rsp, Address(rsp,  4));
  1.1759 +      } else if (rt == T_DOUBLE) {
  1.1760 +        __ leal(rsp, Address(rsp, -8));
  1.1761 +        __ fstp_d(Address(rsp, 0));
  1.1762 +        __ movdbl(xmm0, Address(rsp, 0));
  1.1763 +        __ leal(rsp, Address(rsp,  8));
  1.1764 +      }
  1.1765 +    }
  1.1766 +  %}
  1.1767 +
  1.1768 +
  1.1769 +  enc_class pre_call_FPU %{
   1.1770 +    // If the method runs in 24-bit FP mode, restore the standard FPU control word before the call
  1.1771 +    if( Compile::current()->in_24_bit_fp_mode() ) {
  1.1772 +      MacroAssembler masm(&cbuf);
  1.1773 +      masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
  1.1774 +    }
  1.1775 +  %}
  1.1776 +
  1.1777 +  enc_class post_call_FPU %{
   1.1778 +    // If the method runs in 24-bit FP mode, restore the 24-bit FPU control word after the call
  1.1779 +    if( Compile::current()->in_24_bit_fp_mode() ) {
  1.1780 +      MacroAssembler masm(&cbuf);
  1.1781 +      masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
  1.1782 +    }
  1.1783 +  %}
  1.1784 +
  1.1785 +  enc_class Java_Static_Call (method meth) %{    // JAVA STATIC CALL
  1.1786 +    // CALL to fixup routine.  Fixup routine uses ScopeDesc info to determine
  1.1787 +    // who we intended to call.
  1.1788 +    cbuf.set_inst_mark();
  1.1789 +    $$$emit8$primary;
  1.1790 +    if ( !_method ) {
  1.1791 +      emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.code_end()) - 4),
  1.1792 +                     runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.1793 +    } else if(_optimized_virtual) {
  1.1794 +      emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.code_end()) - 4),
  1.1795 +                     opt_virtual_call_Relocation::spec(), RELOC_IMM32 );
  1.1796 +    } else {
  1.1797 +      emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.code_end()) - 4),
  1.1798 +                     static_call_Relocation::spec(), RELOC_IMM32 );
  1.1799 +    }
  1.1800 +    if( _method ) {  // Emit stub for static call
  1.1801 +      emit_java_to_interp(cbuf);
  1.1802 +    }
  1.1803 +  %}
  1.1804 +
  1.1805 +  enc_class Java_Dynamic_Call (method meth) %{    // JAVA DYNAMIC CALL
  1.1806 +    // !!!!!
  1.1807 +    // Generate  "Mov EAX,0x00", placeholder instruction to load oop-info
  1.1808 +    // emit_call_dynamic_prologue( cbuf );
  1.1809 +    cbuf.set_inst_mark();
  1.1810 +    emit_opcode(cbuf, 0xB8 + EAX_enc);        // mov    EAX,-1
  1.1811 +    emit_d32_reloc(cbuf, (int)Universe::non_oop_word(), oop_Relocation::spec_for_immediate(), RELOC_IMM32);
  1.1812 +    address  virtual_call_oop_addr = cbuf.inst_mark();
  1.1813 +    // CALL to fixup routine.  Fixup routine uses ScopeDesc info to determine
  1.1814 +    // who we intended to call.
  1.1815 +    cbuf.set_inst_mark();
  1.1816 +    $$$emit8$primary;
  1.1817 +    emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.code_end()) - 4),
  1.1818 +                virtual_call_Relocation::spec(virtual_call_oop_addr), RELOC_IMM32 );
  1.1819 +  %}
  1.1820 +
  1.1821 +  enc_class Java_Compiled_Call (method meth) %{    // JAVA COMPILED CALL
  1.1822 +    int disp = in_bytes(methodOopDesc::from_compiled_offset());
  1.1823 +    assert( -128 <= disp && disp <= 127, "compiled_code_offset isn't small");
  1.1824 +
  1.1825 +    // CALL *[EAX+in_bytes(methodOopDesc::from_compiled_code_entry_point_offset())]
  1.1826 +    cbuf.set_inst_mark();
  1.1827 +    $$$emit8$primary;
  1.1828 +    emit_rm(cbuf, 0x01, $secondary, EAX_enc );  // R/M byte
  1.1829 +    emit_d8(cbuf, disp);             // Displacement
  1.1830 +
  1.1831 +  %}
  1.1832 +
  1.1833 +  enc_class Xor_Reg (eRegI dst) %{
  1.1834 +    emit_opcode(cbuf, 0x33);
  1.1835 +    emit_rm(cbuf, 0x3, $dst$$reg, $dst$$reg);
  1.1836 +  %}
  1.1837 +
   1.1838 +//   The following encoding is no longer used, but may be restored if the
   1.1839 +//   calling convention changes significantly.
  1.1840 +//   Became: Xor_Reg(EBP), Java_To_Runtime( labl )
  1.1841 +//
  1.1842 +//   enc_class Java_Interpreter_Call (label labl) %{    // JAVA INTERPRETER CALL
  1.1843 +//     // int ic_reg     = Matcher::inline_cache_reg();
  1.1844 +//     // int ic_encode  = Matcher::_regEncode[ic_reg];
  1.1845 +//     // int imo_reg    = Matcher::interpreter_method_oop_reg();
  1.1846 +//     // int imo_encode = Matcher::_regEncode[imo_reg];
  1.1847 +//
  1.1848 +//     // // Interpreter expects method_oop in EBX, currently a callee-saved register,
  1.1849 +//     // // so we load it immediately before the call
  1.1850 +//     // emit_opcode(cbuf, 0x8B);                     // MOV    imo_reg,ic_reg  # method_oop
  1.1851 +//     // emit_rm(cbuf, 0x03, imo_encode, ic_encode ); // R/M byte
  1.1852 +//
  1.1853 +//     // xor rbp,ebp
  1.1854 +//     emit_opcode(cbuf, 0x33);
  1.1855 +//     emit_rm(cbuf, 0x3, EBP_enc, EBP_enc);
  1.1856 +//
  1.1857 +//     // CALL to interpreter.
  1.1858 +//     cbuf.set_inst_mark();
  1.1859 +//     $$$emit8$primary;
  1.1860 +//     emit_d32_reloc(cbuf, ($labl$$label - (int)(cbuf.code_end()) - 4),
  1.1861 +//                 runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.1862 +//   %}
  1.1863 +
  1.1864 +  enc_class RegOpcImm (eRegI dst, immI8 shift) %{    // SHL, SAR, SHR
  1.1865 +    $$$emit8$primary;
  1.1866 +    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
  1.1867 +    $$$emit8$shift$$constant;
  1.1868 +  %}
  1.1869 +
  1.1870 +  enc_class LdImmI (eRegI dst, immI src) %{    // Load Immediate
  1.1871 +    // Load immediate does not have a zero or sign extended version
  1.1872 +    // for 8-bit immediates
  1.1873 +    emit_opcode(cbuf, 0xB8 + $dst$$reg);
  1.1874 +    $$$emit32$src$$constant;
  1.1875 +  %}
  1.1876 +
  1.1877 +  enc_class LdImmP (eRegI dst, immI src) %{    // Load Immediate
  1.1878 +    // Load immediate does not have a zero or sign extended version
  1.1879 +    // for 8-bit immediates
  1.1880 +    emit_opcode(cbuf, $primary + $dst$$reg);
  1.1881 +    $$$emit32$src$$constant;
  1.1882 +  %}
  1.1883 +
  1.1884 +  enc_class LdImmL_Lo( eRegL dst, immL src) %{    // Load Immediate
  1.1885 +    // Load immediate does not have a zero or sign extended version
  1.1886 +    // for 8-bit immediates
  1.1887 +    int dst_enc = $dst$$reg;
  1.1888 +    int src_con = $src$$constant & 0x0FFFFFFFFL;
  1.1889 +    if (src_con == 0) {
  1.1890 +      // xor dst, dst
  1.1891 +      emit_opcode(cbuf, 0x33);
  1.1892 +      emit_rm(cbuf, 0x3, dst_enc, dst_enc);
  1.1893 +    } else {
  1.1894 +      emit_opcode(cbuf, $primary + dst_enc);
  1.1895 +      emit_d32(cbuf, src_con);
  1.1896 +    }
  1.1897 +  %}
  1.1898 +
  1.1899 +  enc_class LdImmL_Hi( eRegL dst, immL src) %{    // Load Immediate
  1.1900 +    // Load immediate does not have a zero or sign extended version
  1.1901 +    // for 8-bit immediates
  1.1902 +    int dst_enc = $dst$$reg + 2;
  1.1903 +    int src_con = ((julong)($src$$constant)) >> 32;
  1.1904 +    if (src_con == 0) {
  1.1905 +      // xor dst, dst
  1.1906 +      emit_opcode(cbuf, 0x33);
  1.1907 +      emit_rm(cbuf, 0x3, dst_enc, dst_enc);
  1.1908 +    } else {
  1.1909 +      emit_opcode(cbuf, $primary + dst_enc);
  1.1910 +      emit_d32(cbuf, src_con);
  1.1911 +    }
  1.1912 +  %}
  1.1913 +
  1.1914 +
  1.1915 +  enc_class LdImmD (immD src) %{    // Load Immediate
  1.1916 +    if( is_positive_zero_double($src$$constant)) {
  1.1917 +      // FLDZ
  1.1918 +      emit_opcode(cbuf,0xD9);
  1.1919 +      emit_opcode(cbuf,0xEE);
  1.1920 +    } else if( is_positive_one_double($src$$constant)) {
  1.1921 +      // FLD1
  1.1922 +      emit_opcode(cbuf,0xD9);
  1.1923 +      emit_opcode(cbuf,0xE8);
  1.1924 +    } else {
  1.1925 +      emit_opcode(cbuf,0xDD);
  1.1926 +      emit_rm(cbuf, 0x0, 0x0, 0x5);
  1.1927 +      emit_double_constant(cbuf, $src$$constant);
  1.1928 +    }
  1.1929 +  %}
  1.1930 +
  1.1931 +
  1.1932 +  enc_class LdImmF (immF src) %{    // Load Immediate
  1.1933 +    if( is_positive_zero_float($src$$constant)) {
  1.1934 +      emit_opcode(cbuf,0xD9);
  1.1935 +      emit_opcode(cbuf,0xEE);
  1.1936 +    } else if( is_positive_one_float($src$$constant)) {
  1.1937 +      emit_opcode(cbuf,0xD9);
  1.1938 +      emit_opcode(cbuf,0xE8);
  1.1939 +    } else {
  1.1940 +      $$$emit8$primary;
  1.1941 +      // Load immediate does not have a zero or sign extended version
  1.1942 +      // for 8-bit immediates
  1.1943 +      // First load to TOS, then move to dst
  1.1944 +      emit_rm(cbuf, 0x0, 0x0, 0x5);
  1.1945 +      emit_float_constant(cbuf, $src$$constant);
  1.1946 +    }
  1.1947 +  %}
  1.1948 +
  1.1949 +  enc_class LdImmX (regX dst, immXF con) %{    // Load Immediate
  1.1950 +    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
  1.1951 +    emit_float_constant(cbuf, $con$$constant);
  1.1952 +  %}
  1.1953 +
  1.1954 +  enc_class LdImmXD (regXD dst, immXD con) %{    // Load Immediate
  1.1955 +    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
  1.1956 +    emit_double_constant(cbuf, $con$$constant);
  1.1957 +  %}
  1.1958 +
  1.1959 +  enc_class load_conXD (regXD dst, immXD con) %{ // Load double constant
  1.1960 +    // UseXmmLoadAndClearUpper ? movsd(dst, con) : movlpd(dst, con)
  1.1961 +    emit_opcode(cbuf, UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
  1.1962 +    emit_opcode(cbuf, 0x0F);
  1.1963 +    emit_opcode(cbuf, UseXmmLoadAndClearUpper ? 0x10 : 0x12);
  1.1964 +    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
  1.1965 +    emit_double_constant(cbuf, $con$$constant);
  1.1966 +  %}
  1.1967 +
  1.1968 +  enc_class Opc_MemImm_F(immF src) %{
  1.1969 +    cbuf.set_inst_mark();
  1.1970 +    $$$emit8$primary;
  1.1971 +    emit_rm(cbuf, 0x0, $secondary, 0x5);
  1.1972 +    emit_float_constant(cbuf, $src$$constant);
  1.1973 +  %}
  1.1974 +
  1.1975 +
  1.1976 +  enc_class MovI2X_reg(regX dst, eRegI src) %{
  1.1977 +    emit_opcode(cbuf, 0x66 );     // MOVD dst,src
  1.1978 +    emit_opcode(cbuf, 0x0F );
  1.1979 +    emit_opcode(cbuf, 0x6E );
  1.1980 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.1981 +  %}
  1.1982 +
  1.1983 +  enc_class MovX2I_reg(eRegI dst, regX src) %{
  1.1984 +    emit_opcode(cbuf, 0x66 );     // MOVD dst,src
  1.1985 +    emit_opcode(cbuf, 0x0F );
  1.1986 +    emit_opcode(cbuf, 0x7E );
  1.1987 +    emit_rm(cbuf, 0x3, $src$$reg, $dst$$reg);
  1.1988 +  %}
  1.1989 +
  1.1990 +  enc_class MovL2XD_reg(regXD dst, eRegL src, regXD tmp) %{
  1.1991 +    { // MOVD $dst,$src.lo
  1.1992 +      emit_opcode(cbuf,0x66);
  1.1993 +      emit_opcode(cbuf,0x0F);
  1.1994 +      emit_opcode(cbuf,0x6E);
  1.1995 +      emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.1996 +    }
  1.1997 +    { // MOVD $tmp,$src.hi
  1.1998 +      emit_opcode(cbuf,0x66);
  1.1999 +      emit_opcode(cbuf,0x0F);
  1.2000 +      emit_opcode(cbuf,0x6E);
  1.2001 +      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg));
  1.2002 +    }
  1.2003 +    { // PUNPCKLDQ $dst,$tmp
  1.2004 +      emit_opcode(cbuf,0x66);
  1.2005 +      emit_opcode(cbuf,0x0F);
  1.2006 +      emit_opcode(cbuf,0x62);
  1.2007 +      emit_rm(cbuf, 0x3, $dst$$reg, $tmp$$reg);
  1.2008 +     }
  1.2009 +  %}
  1.2010 +
  1.2011 +  enc_class MovXD2L_reg(eRegL dst, regXD src, regXD tmp) %{
  1.2012 +    { // MOVD $dst.lo,$src
  1.2013 +      emit_opcode(cbuf,0x66);
  1.2014 +      emit_opcode(cbuf,0x0F);
  1.2015 +      emit_opcode(cbuf,0x7E);
  1.2016 +      emit_rm(cbuf, 0x3, $src$$reg, $dst$$reg);
  1.2017 +    }
  1.2018 +    { // PSHUFLW $tmp,$src,0x4E  (01001110b)
  1.2019 +      emit_opcode(cbuf,0xF2);
  1.2020 +      emit_opcode(cbuf,0x0F);
  1.2021 +      emit_opcode(cbuf,0x70);
  1.2022 +      emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
  1.2023 +      emit_d8(cbuf, 0x4E);
  1.2024 +    }
  1.2025 +    { // MOVD $dst.hi,$tmp
  1.2026 +      emit_opcode(cbuf,0x66);
  1.2027 +      emit_opcode(cbuf,0x0F);
  1.2028 +      emit_opcode(cbuf,0x7E);
  1.2029 +      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg));
  1.2030 +    }
  1.2031 +  %}
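  // The PSHUFLW immediate 0x4E (binary 01.00.11.10) swaps the two 32-bit
  // halves of the low quadword of $src, so the second MOVD can extract the
  // original high dword into $dst.hi.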
  1.2032 +
  1.2033 +
  1.2034 +  // Encode a reg-reg copy.  If it is useless, then empty encoding.
  1.2035 +  enc_class enc_Copy( eRegI dst, eRegI src ) %{
  1.2036 +    encode_Copy( cbuf, $dst$$reg, $src$$reg );
  1.2037 +  %}
  1.2038 +
  1.2039 +  enc_class enc_CopyL_Lo( eRegI dst, eRegL src ) %{
  1.2040 +    encode_Copy( cbuf, $dst$$reg, $src$$reg );
  1.2041 +  %}
  1.2042 +
  1.2043 +  // Encode xmm reg-reg copy.  If it is useless, then empty encoding.
  1.2044 +  enc_class enc_CopyXD( RegXD dst, RegXD src ) %{
  1.2045 +    encode_CopyXD( cbuf, $dst$$reg, $src$$reg );
  1.2046 +  %}
  1.2047 +
  1.2048 +  enc_class RegReg (eRegI dst, eRegI src) %{    // RegReg(Many)
  1.2049 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.2050 +  %}
  1.2051 +
  1.2052 +  enc_class RegReg_Lo(eRegL dst, eRegL src) %{    // RegReg(Many)
  1.2053 +    $$$emit8$primary;
  1.2054 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.2055 +  %}
  1.2056 +
  1.2057 +  enc_class RegReg_Hi(eRegL dst, eRegL src) %{    // RegReg(Many)
  1.2058 +    $$$emit8$secondary;
  1.2059 +    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg));
  1.2060 +  %}
  1.2061 +
  1.2062 +  enc_class RegReg_Lo2(eRegL dst, eRegL src) %{    // RegReg(Many)
  1.2063 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.2064 +  %}
  1.2065 +
  1.2066 +  enc_class RegReg_Hi2(eRegL dst, eRegL src) %{    // RegReg(Many)
  1.2067 +    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg));
  1.2068 +  %}
  1.2069 +
  1.2070 +  enc_class RegReg_HiLo( eRegL src, eRegI dst ) %{
  1.2071 +    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($src$$reg));
  1.2072 +  %}
  1.2073 +
  1.2074 +  enc_class Con32 (immI src) %{    // Con32(storeImmI)
  1.2075 +    // Output immediate
  1.2076 +    $$$emit32$src$$constant;
  1.2077 +  %}
  1.2078 +
  1.2079 +  enc_class Con32F_as_bits(immF src) %{        // storeF_imm
  1.2080 +    // Output Float immediate bits
  1.2081 +    jfloat jf = $src$$constant;
  1.2082 +    int    jf_as_bits = jint_cast( jf );
  1.2083 +    emit_d32(cbuf, jf_as_bits);
  1.2084 +  %}
  1.2085 +
  1.2086 +  enc_class Con32XF_as_bits(immXF src) %{      // storeX_imm
  1.2087 +    // Output Float immediate bits
  1.2088 +    jfloat jf = $src$$constant;
  1.2089 +    int    jf_as_bits = jint_cast( jf );
  1.2090 +    emit_d32(cbuf, jf_as_bits);
  1.2091 +  %}
  1.2092 +
  1.2093 +  enc_class Con16 (immI src) %{    // Con16(storeImmI)
  1.2094 +    // Output immediate
  1.2095 +    $$$emit16$src$$constant;
  1.2096 +  %}
  1.2097 +
  1.2098 +  enc_class Con_d32(immI src) %{
  1.2099 +    emit_d32(cbuf,$src$$constant);
  1.2100 +  %}
  1.2101 +
  1.2102 +  enc_class conmemref (eRegP t1) %{    // Con32(storeImmI)
  1.2103 +    // Output immediate memory reference
  1.2104 +    emit_rm(cbuf, 0x00, $t1$$reg, 0x05 );
  1.2105 +    emit_d32(cbuf, 0x00);
  1.2106 +  %}
  1.2107 +
  1.2108 +  enc_class lock_prefix( ) %{
  1.2109 +    if( os::is_MP() )
  1.2110 +      emit_opcode(cbuf,0xF0);         // [Lock]
  1.2111 +  %}
  1.2112 +
  1.2113 +  // Cmp-xchg long value.
   1.2114 +  // Note: we need to swap rbx and rcx before and after the
   1.2115 +  //       cmpxchg8 instruction because the instruction uses
   1.2116 +  //       rcx as the high order word of the new value to store, but
   1.2117 +  //       our register encoding uses rbx.
  1.2118 +  enc_class enc_cmpxchg8(eSIRegP mem_ptr) %{
  1.2119 +
  1.2120 +    // XCHG  rbx,ecx
  1.2121 +    emit_opcode(cbuf,0x87);
  1.2122 +    emit_opcode(cbuf,0xD9);
  1.2123 +    // [Lock]
  1.2124 +    if( os::is_MP() )
  1.2125 +      emit_opcode(cbuf,0xF0);
  1.2126 +    // CMPXCHG8 [Eptr]
  1.2127 +    emit_opcode(cbuf,0x0F);
  1.2128 +    emit_opcode(cbuf,0xC7);
  1.2129 +    emit_rm( cbuf, 0x0, 1, $mem_ptr$$reg );
  1.2130 +    // XCHG  rbx,ecx
  1.2131 +    emit_opcode(cbuf,0x87);
  1.2132 +    emit_opcode(cbuf,0xD9);
  1.2133 +  %}
  1.2134 +
  1.2135 +  enc_class enc_cmpxchg(eSIRegP mem_ptr) %{
  1.2136 +    // [Lock]
  1.2137 +    if( os::is_MP() )
  1.2138 +      emit_opcode(cbuf,0xF0);
  1.2139 +
  1.2140 +    // CMPXCHG [Eptr]
  1.2141 +    emit_opcode(cbuf,0x0F);
  1.2142 +    emit_opcode(cbuf,0xB1);
  1.2143 +    emit_rm( cbuf, 0x0, 1, $mem_ptr$$reg );
  1.2144 +  %}
  1.2145 +
  1.2146 +  enc_class enc_flags_ne_to_boolean( iRegI res ) %{
  1.2147 +    int res_encoding = $res$$reg;
  1.2148 +
  1.2149 +    // MOV  res,0
  1.2150 +    emit_opcode( cbuf, 0xB8 + res_encoding);
  1.2151 +    emit_d32( cbuf, 0 );
  1.2152 +    // JNE,s  fail
  1.2153 +    emit_opcode(cbuf,0x75);
  1.2154 +    emit_d8(cbuf, 5 );
  1.2155 +    // MOV  res,1
  1.2156 +    emit_opcode( cbuf, 0xB8 + res_encoding);
  1.2157 +    emit_d32( cbuf, 1 );
  1.2158 +    // fail:
  1.2159 +  %}
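  // In the sequence above, the short JNE's displacement of 5 is exactly the
  // length of the second "MOV res,1" (0xB8+rd plus a 4-byte immediate), so a
  // not-equal flag result skips that MOV and res keeps the 0 loaded first.
  // MOV is used rather than XOR so that the flags are left untouched.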
  1.2160 +
  1.2161 +  enc_class set_instruction_start( ) %{
  1.2162 +    cbuf.set_inst_mark();            // Mark start of opcode for reloc info in mem operand
  1.2163 +  %}
  1.2164 +
  1.2165 +  enc_class RegMem (eRegI ereg, memory mem) %{    // emit_reg_mem
  1.2166 +    int reg_encoding = $ereg$$reg;
  1.2167 +    int base  = $mem$$base;
  1.2168 +    int index = $mem$$index;
  1.2169 +    int scale = $mem$$scale;
  1.2170 +    int displace = $mem$$disp;
  1.2171 +    bool disp_is_oop = $mem->disp_is_oop();
  1.2172 +    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
  1.2173 +  %}
  1.2174 +
  1.2175 +  enc_class RegMem_Hi(eRegL ereg, memory mem) %{    // emit_reg_mem
  1.2176 +    int reg_encoding = HIGH_FROM_LOW($ereg$$reg);  // Hi register of pair, computed from lo
  1.2177 +    int base  = $mem$$base;
  1.2178 +    int index = $mem$$index;
  1.2179 +    int scale = $mem$$scale;
  1.2180 +    int displace = $mem$$disp + 4;      // Offset is 4 further in memory
  1.2181 +    assert( !$mem->disp_is_oop(), "Cannot add 4 to oop" );
  1.2182 +    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, false/*disp_is_oop*/);
  1.2183 +  %}
  1.2184 +
  1.2185 +  enc_class move_long_small_shift( eRegL dst, immI_1_31 cnt ) %{
  1.2186 +    int r1, r2;
  1.2187 +    if( $tertiary == 0xA4 ) { r1 = $dst$$reg;  r2 = HIGH_FROM_LOW($dst$$reg); }
  1.2188 +    else                    { r2 = $dst$$reg;  r1 = HIGH_FROM_LOW($dst$$reg); }
  1.2189 +    emit_opcode(cbuf,0x0F);
  1.2190 +    emit_opcode(cbuf,$tertiary);
  1.2191 +    emit_rm(cbuf, 0x3, r1, r2);
  1.2192 +    emit_d8(cbuf,$cnt$$constant);
  1.2193 +    emit_d8(cbuf,$primary);
  1.2194 +    emit_rm(cbuf, 0x3, $secondary, r1);
  1.2195 +    emit_d8(cbuf,$cnt$$constant);
  1.2196 +  %}
  1.2197 +
  1.2198 +  enc_class move_long_big_shift_sign( eRegL dst, immI_32_63 cnt ) %{
  1.2199 +    emit_opcode( cbuf, 0x8B ); // Move
  1.2200 +    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg));
  1.2201 +    emit_d8(cbuf,$primary);
  1.2202 +    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
  1.2203 +    emit_d8(cbuf,$cnt$$constant-32);
  1.2204 +    emit_d8(cbuf,$primary);
  1.2205 +    emit_rm(cbuf, 0x3, $secondary, HIGH_FROM_LOW($dst$$reg));
  1.2206 +    emit_d8(cbuf,31);
  1.2207 +  %}
  1.2208 +
  1.2209 +  enc_class move_long_big_shift_clr( eRegL dst, immI_32_63 cnt ) %{
  1.2210 +    int r1, r2;
  1.2211 +    if( $secondary == 0x5 ) { r1 = $dst$$reg;  r2 = HIGH_FROM_LOW($dst$$reg); }
  1.2212 +    else                    { r2 = $dst$$reg;  r1 = HIGH_FROM_LOW($dst$$reg); }
  1.2213 +
  1.2214 +    emit_opcode( cbuf, 0x8B ); // Move r1,r2
  1.2215 +    emit_rm(cbuf, 0x3, r1, r2);
  1.2216 +    if( $cnt$$constant > 32 ) { // Shift, if not by zero
  1.2217 +      emit_opcode(cbuf,$primary);
  1.2218 +      emit_rm(cbuf, 0x3, $secondary, r1);
  1.2219 +      emit_d8(cbuf,$cnt$$constant-32);
  1.2220 +    }
  1.2221 +    emit_opcode(cbuf,0x33);  // XOR r2,r2
  1.2222 +    emit_rm(cbuf, 0x3, r2, r2);
  1.2223 +  %}
  1.2224 +
  1.2225 +  // Clone of RegMem but accepts an extra parameter to access each
  1.2226 +  // half of a double in memory; it never needs relocation info.
  1.2227 +  enc_class Mov_MemD_half_to_Reg (immI opcode, memory mem, immI disp_for_half, eRegI rm_reg) %{
  1.2228 +    emit_opcode(cbuf,$opcode$$constant);
  1.2229 +    int reg_encoding = $rm_reg$$reg;
  1.2230 +    int base     = $mem$$base;
  1.2231 +    int index    = $mem$$index;
  1.2232 +    int scale    = $mem$$scale;
  1.2233 +    int displace = $mem$$disp + $disp_for_half$$constant;
  1.2234 +    bool disp_is_oop = false;
  1.2235 +    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
  1.2236 +  %}
  1.2237 +
  1.2238 +  // !!!!! Special Custom Code used by MemMove, and stack access instructions !!!!!
  1.2239 +  //
  1.2240 +  // Clone of RegMem except the RM-byte's reg/opcode field is an ADLC-time constant
  1.2241 +  // and it never needs relocation information.
  1.2242 +  // Frequently used to move data between FPU's Stack Top and memory.
  1.2243 +  enc_class RMopc_Mem_no_oop (immI rm_opcode, memory mem) %{
  1.2244 +    int rm_byte_opcode = $rm_opcode$$constant;
  1.2245 +    int base     = $mem$$base;
  1.2246 +    int index    = $mem$$index;
  1.2247 +    int scale    = $mem$$scale;
  1.2248 +    int displace = $mem$$disp;
  1.2249 +    assert( !$mem->disp_is_oop(), "No oops here because no relo info allowed" );
  1.2250 +    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, false);
  1.2251 +  %}
  1.2252 +
  1.2253 +  enc_class RMopc_Mem (immI rm_opcode, memory mem) %{
  1.2254 +    int rm_byte_opcode = $rm_opcode$$constant;
  1.2255 +    int base     = $mem$$base;
  1.2256 +    int index    = $mem$$index;
  1.2257 +    int scale    = $mem$$scale;
  1.2258 +    int displace = $mem$$disp;
  1.2259 +    bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
  1.2260 +    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_is_oop);
  1.2261 +  %}
  1.2262 +
  1.2263 +  enc_class RegLea (eRegI dst, eRegI src0, immI src1 ) %{    // emit_reg_lea
  1.2264 +    int reg_encoding = $dst$$reg;
  1.2265 +    int base         = $src0$$reg;      // 0xFFFFFFFF indicates no base
  1.2266 +    int index        = 0x04;            // 0x04 indicates no index
  1.2267 +    int scale        = 0x00;            // 0x00 indicates no scale
  1.2268 +    int displace     = $src1$$constant; // 0x00 indicates no displacement
  1.2269 +    bool disp_is_oop = false;
  1.2270 +    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
  1.2271 +  %}
  1.2272 +
  1.2273 +  enc_class min_enc (eRegI dst, eRegI src) %{    // MIN
  1.2274 +    // Compare dst,src
  1.2275 +    emit_opcode(cbuf,0x3B);
  1.2276 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.2277 +    // jmp dst < src around move
  1.2278 +    emit_opcode(cbuf,0x7C);
  1.2279 +    emit_d8(cbuf,2);
  1.2280 +    // move dst,src
  1.2281 +    emit_opcode(cbuf,0x8B);
  1.2282 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.2283 +  %}
  1.2284 +
  1.2285 +  enc_class max_enc (eRegI dst, eRegI src) %{    // MAX
  1.2286 +    // Compare dst,src
  1.2287 +    emit_opcode(cbuf,0x3B);
  1.2288 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.2289 +    // jmp dst > src around move
  1.2290 +    emit_opcode(cbuf,0x7F);
  1.2291 +    emit_d8(cbuf,2);
  1.2292 +    // move dst,src
  1.2293 +    emit_opcode(cbuf,0x8B);
  1.2294 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.2295 +  %}
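  // In both min_enc and max_enc the short jump's displacement of 2 is the
  // length of the following reg-reg MOV (0x8B + ModRM), so the move is
  // skipped whenever dst already holds the smaller (respectively larger)
  // value.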
  1.2296 +
  1.2297 +  enc_class enc_FP_store(memory mem, regD src) %{
  1.2298 +    // If src is FPR1, we can just FST to store it.
  1.2299 +    // Else we need to FLD it to FPR1, then FSTP to store/pop it.
  1.2300 +    int reg_encoding = 0x2; // Just store
  1.2301 +    int base  = $mem$$base;
  1.2302 +    int index = $mem$$index;
  1.2303 +    int scale = $mem$$scale;
  1.2304 +    int displace = $mem$$disp;
  1.2305 +    bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
  1.2306 +    if( $src$$reg != FPR1L_enc ) {
  1.2307 +      reg_encoding = 0x3;  // Store & pop
  1.2308 +      emit_opcode( cbuf, 0xD9 ); // FLD (i.e., push it)
  1.2309 +      emit_d8( cbuf, 0xC0-1+$src$$reg );
  1.2310 +    }
  1.2311 +    cbuf.set_inst_mark();       // Mark start of opcode for reloc info in mem operand
  1.2312 +    emit_opcode(cbuf,$primary);
  1.2313 +    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
  1.2314 +  %}
  1.2315 +
  1.2316 +  enc_class neg_reg(eRegI dst) %{
  1.2317 +    // NEG $dst
  1.2318 +    emit_opcode(cbuf,0xF7);
  1.2319 +    emit_rm(cbuf, 0x3, 0x03, $dst$$reg );
  1.2320 +  %}
  1.2321 +
  1.2322 +  enc_class setLT_reg(eCXRegI dst) %{
  1.2323 +    // SETLT $dst
  1.2324 +    emit_opcode(cbuf,0x0F);
  1.2325 +    emit_opcode(cbuf,0x9C);
  1.2326 +    emit_rm( cbuf, 0x3, 0x4, $dst$$reg );
  1.2327 +  %}
  1.2328 +
  1.2329 +  enc_class enc_cmpLTP(ncxRegI p, ncxRegI q, ncxRegI y, eCXRegI tmp) %{    // cadd_cmpLT
  1.2330 +    int tmpReg = $tmp$$reg;
  1.2331 +
  1.2332 +    // SUB $p,$q
  1.2333 +    emit_opcode(cbuf,0x2B);
  1.2334 +    emit_rm(cbuf, 0x3, $p$$reg, $q$$reg);
  1.2335 +    // SBB $tmp,$tmp
  1.2336 +    emit_opcode(cbuf,0x1B);
  1.2337 +    emit_rm(cbuf, 0x3, tmpReg, tmpReg);
  1.2338 +    // AND $tmp,$y
  1.2339 +    emit_opcode(cbuf,0x23);
  1.2340 +    emit_rm(cbuf, 0x3, tmpReg, $y$$reg);
  1.2341 +    // ADD $p,$tmp
  1.2342 +    emit_opcode(cbuf,0x03);
  1.2343 +    emit_rm(cbuf, 0x3, $p$$reg, tmpReg);
  1.2344 +  %}
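  // The four instructions above form a branch-free conditional add: the SUB
  // leaves the borrow in the carry flag, SBB tmp,tmp turns that flag into an
  // all-ones or all-zero mask, the AND keeps $y only when the mask is set,
  // and the final ADD applies it, i.e. roughly p = (p - q) + (borrow ? y : 0)
  // with no branch.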
  1.2345 +
  1.2346 +  enc_class enc_cmpLTP_mem(eRegI p, eRegI q, memory mem, eCXRegI tmp) %{    // cadd_cmpLT
  1.2347 +    int tmpReg = $tmp$$reg;
  1.2348 +
  1.2349 +    // SUB $p,$q
  1.2350 +    emit_opcode(cbuf,0x2B);
  1.2351 +    emit_rm(cbuf, 0x3, $p$$reg, $q$$reg);
  1.2352 +    // SBB $tmp,$tmp
  1.2353 +    emit_opcode(cbuf,0x1B);
  1.2354 +    emit_rm(cbuf, 0x3, tmpReg, tmpReg);
  1.2355 +    // AND $tmp,$y
  1.2356 +    cbuf.set_inst_mark();       // Mark start of opcode for reloc info in mem operand
  1.2357 +    emit_opcode(cbuf,0x23);
  1.2358 +    int reg_encoding = tmpReg;
  1.2359 +    int base  = $mem$$base;
  1.2360 +    int index = $mem$$index;
  1.2361 +    int scale = $mem$$scale;
  1.2362 +    int displace = $mem$$disp;
  1.2363 +    bool disp_is_oop = $mem->disp_is_oop();
  1.2364 +    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
  1.2365 +    // ADD $p,$tmp
  1.2366 +    emit_opcode(cbuf,0x03);
  1.2367 +    emit_rm(cbuf, 0x3, $p$$reg, tmpReg);
  1.2368 +  %}
  1.2369 +
  1.2370 +  enc_class shift_left_long( eRegL dst, eCXRegI shift ) %{
  1.2371 +    // TEST shift,32
  1.2372 +    emit_opcode(cbuf,0xF7);
  1.2373 +    emit_rm(cbuf, 0x3, 0, ECX_enc);
  1.2374 +    emit_d32(cbuf,0x20);
  1.2375 +    // JEQ,s small
  1.2376 +    emit_opcode(cbuf, 0x74);
  1.2377 +    emit_d8(cbuf, 0x04);
  1.2378 +    // MOV    $dst.hi,$dst.lo
  1.2379 +    emit_opcode( cbuf, 0x8B );
  1.2380 +    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg );
  1.2381 +    // CLR    $dst.lo
  1.2382 +    emit_opcode(cbuf, 0x33);
  1.2383 +    emit_rm(cbuf, 0x3, $dst$$reg, $dst$$reg);
  1.2384 +// small:
  1.2385 +    // SHLD   $dst.hi,$dst.lo,$shift
  1.2386 +    emit_opcode(cbuf,0x0F);
  1.2387 +    emit_opcode(cbuf,0xA5);
  1.2388 +    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg));
   1.2389 +    // SHL    $dst.lo,$shift
  1.2390 +    emit_opcode(cbuf,0xD3);
  1.2391 +    emit_rm(cbuf, 0x3, 0x4, $dst$$reg );
  1.2392 +  %}
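  // SHL and SHLD only use the low five bits of CL, so a count of 32..63 would
  // otherwise wrap around; the TEST/JEQ,s above handles that range by first
  // copying the low word into the high word and clearing the low word, after
  // which the (count mod 32) double shift finishes the job.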
  1.2393 +
  1.2394 +  enc_class shift_right_long( eRegL dst, eCXRegI shift ) %{
  1.2395 +    // TEST shift,32
  1.2396 +    emit_opcode(cbuf,0xF7);
  1.2397 +    emit_rm(cbuf, 0x3, 0, ECX_enc);
  1.2398 +    emit_d32(cbuf,0x20);
  1.2399 +    // JEQ,s small
  1.2400 +    emit_opcode(cbuf, 0x74);
  1.2401 +    emit_d8(cbuf, 0x04);
  1.2402 +    // MOV    $dst.lo,$dst.hi
  1.2403 +    emit_opcode( cbuf, 0x8B );
  1.2404 +    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg) );
  1.2405 +    // CLR    $dst.hi
  1.2406 +    emit_opcode(cbuf, 0x33);
  1.2407 +    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($dst$$reg));
  1.2408 +// small:
  1.2409 +    // SHRD   $dst.lo,$dst.hi,$shift
  1.2410 +    emit_opcode(cbuf,0x0F);
  1.2411 +    emit_opcode(cbuf,0xAD);
  1.2412 +    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg);
   1.2413 +    // SHR    $dst.hi,$shift
  1.2414 +    emit_opcode(cbuf,0xD3);
  1.2415 +    emit_rm(cbuf, 0x3, 0x5, HIGH_FROM_LOW($dst$$reg) );
  1.2416 +  %}
  1.2417 +
  1.2418 +  enc_class shift_right_arith_long( eRegL dst, eCXRegI shift ) %{
  1.2419 +    // TEST shift,32
  1.2420 +    emit_opcode(cbuf,0xF7);
  1.2421 +    emit_rm(cbuf, 0x3, 0, ECX_enc);
  1.2422 +    emit_d32(cbuf,0x20);
  1.2423 +    // JEQ,s small
  1.2424 +    emit_opcode(cbuf, 0x74);
  1.2425 +    emit_d8(cbuf, 0x05);
  1.2426 +    // MOV    $dst.lo,$dst.hi
  1.2427 +    emit_opcode( cbuf, 0x8B );
  1.2428 +    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg) );
  1.2429 +    // SAR    $dst.hi,31
  1.2430 +    emit_opcode(cbuf, 0xC1);
  1.2431 +    emit_rm(cbuf, 0x3, 7, HIGH_FROM_LOW($dst$$reg) );
  1.2432 +    emit_d8(cbuf, 0x1F );
  1.2433 +// small:
  1.2434 +    // SHRD   $dst.lo,$dst.hi,$shift
  1.2435 +    emit_opcode(cbuf,0x0F);
  1.2436 +    emit_opcode(cbuf,0xAD);
  1.2437 +    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg);
  1.2438 +    // SAR    $dst.hi,$shift"
  1.2439 +    emit_opcode(cbuf,0xD3);
  1.2440 +    emit_rm(cbuf, 0x3, 0x7, HIGH_FROM_LOW($dst$$reg) );
  1.2441 +  %}
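         +
         +  // Note on the three long-shift encodings above: they implement 64-bit shifts
         +  // with a 32-bit register pair.  SHLD/SHRD move bits across the hi/lo halves
         +  // for counts 0..31, and the TEST/JEQ prelude handles counts 32..63 by first
         +  // copying one half into the other.  A rough C sketch of the overall effect of
         +  // shift_left_long (hi/lo are the halves of $dst, n is the count in ECX, and
         +  // only its low six bits matter):
         +  //
         +  //   n &= 63;
         +  //   if (n >= 32) { hi = lo << (n - 32); lo = 0; }
         +  //   else if (n)  { hi = (hi << n) | (lo >> (32 - n)); lo <<= n; }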
  1.2442 +
  1.2443 +
  1.2444 +  // ----------------- Encodings for floating point unit -----------------
  1.2445 +  // May leave result in FPU-TOS or FPU reg depending on opcodes
  1.2446 +  enc_class OpcReg_F (regF src) %{    // FMUL, FDIV
  1.2447 +    $$$emit8$primary;
  1.2448 +    emit_rm(cbuf, 0x3, $secondary, $src$$reg );
  1.2449 +  %}
  1.2450 +
  1.2451 +  // Pop argument in FPR0 with FSTP ST(0)
  1.2452 +  enc_class PopFPU() %{
  1.2453 +    emit_opcode( cbuf, 0xDD );
  1.2454 +    emit_d8( cbuf, 0xD8 );
  1.2455 +  %}
  1.2456 +
  1.2457 +  // !!!!! equivalent to Pop_Reg_F
  1.2458 +  enc_class Pop_Reg_D( regD dst ) %{
  1.2459 +    emit_opcode( cbuf, 0xDD );           // FSTP   ST(i)
  1.2460 +    emit_d8( cbuf, 0xD8+$dst$$reg );
  1.2461 +  %}
  1.2462 +
  1.2463 +  enc_class Push_Reg_D( regD dst ) %{
  1.2464 +    emit_opcode( cbuf, 0xD9 );
  1.2465 +    emit_d8( cbuf, 0xC0-1+$dst$$reg );   // FLD ST(i-1)
  1.2466 +  %}
  1.2467 +
  1.2468 +  enc_class strictfp_bias1( regD dst ) %{
  1.2469 +    emit_opcode( cbuf, 0xDB );           // FLD m80real
  1.2470 +    emit_opcode( cbuf, 0x2D );
  1.2471 +    emit_d32( cbuf, (int)StubRoutines::addr_fpu_subnormal_bias1() );
  1.2472 +    emit_opcode( cbuf, 0xDE );           // FMULP ST(dst), ST0
  1.2473 +    emit_opcode( cbuf, 0xC8+$dst$$reg );
  1.2474 +  %}
  1.2475 +
  1.2476 +  enc_class strictfp_bias2( regD dst ) %{
  1.2477 +    emit_opcode( cbuf, 0xDB );           // FLD m80real
  1.2478 +    emit_opcode( cbuf, 0x2D );
  1.2479 +    emit_d32( cbuf, (int)StubRoutines::addr_fpu_subnormal_bias2() );
  1.2480 +    emit_opcode( cbuf, 0xDE );           // FMULP ST(dst), ST0
  1.2481 +    emit_opcode( cbuf, 0xC8+$dst$$reg );
  1.2482 +  %}
  1.2483 +
  1.2484 +  // Special case for moving an integer register to a stack slot.
  1.2485 +  enc_class OpcPRegSS( stackSlotI dst, eRegI src ) %{ // RegSS
  1.2486 +    store_to_stackslot( cbuf, $primary, $src$$reg, $dst$$disp );
  1.2487 +  %}
  1.2488 +
  1.2489 +  // Special case for moving a register to a stack slot.
  1.2490 +  enc_class RegSS( stackSlotI dst, eRegI src ) %{ // RegSS
  1.2491 +    // Opcode already emitted
  1.2492 +    emit_rm( cbuf, 0x02, $src$$reg, ESP_enc );   // R/M byte
  1.2493 +    emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);          // SIB byte
  1.2494 +    emit_d32(cbuf, $dst$$disp);   // Displacement
  1.2495 +  %}
  1.2496 +
  1.2497 +  // Push the integer in stackSlot 'src' onto FP-stack
  1.2498 +  enc_class Push_Mem_I( memory src ) %{    // FILD   [ESP+src]
  1.2499 +    store_to_stackslot( cbuf, $primary, $secondary, $src$$disp );
  1.2500 +  %}
  1.2501 +
  1.2502 +  // Push the float in stackSlot 'src' onto FP-stack
  1.2503 +  enc_class Push_Mem_F( memory src ) %{    // FLD_S   [ESP+src]
  1.2504 +    store_to_stackslot( cbuf, 0xD9, 0x00, $src$$disp );
  1.2505 +  %}
  1.2506 +
  1.2507 +  // Push the double in stackSlot 'src' onto FP-stack
  1.2508 +  enc_class Push_Mem_D( memory src ) %{    // FLD_D   [ESP+src]
  1.2509 +    store_to_stackslot( cbuf, 0xDD, 0x00, $src$$disp );
  1.2510 +  %}
  1.2511 +
  1.2512 +  // Push FPU's TOS float to a stack-slot, and pop FPU-stack
  1.2513 +  enc_class Pop_Mem_F( stackSlotF dst ) %{ // FSTP_S [ESP+dst]
  1.2514 +    store_to_stackslot( cbuf, 0xD9, 0x03, $dst$$disp );
  1.2515 +  %}
  1.2516 +
  1.2517 +  // Same as Pop_Mem_F except for opcode
  1.2518 +  // Push FPU's TOS double to a stack-slot, and pop FPU-stack
  1.2519 +  enc_class Pop_Mem_D( stackSlotD dst ) %{ // FSTP_D [ESP+dst]
  1.2520 +    store_to_stackslot( cbuf, 0xDD, 0x03, $dst$$disp );
  1.2521 +  %}
  1.2522 +
  1.2523 +  enc_class Pop_Reg_F( regF dst ) %{
  1.2524 +    emit_opcode( cbuf, 0xDD );           // FSTP   ST(i)
  1.2525 +    emit_d8( cbuf, 0xD8+$dst$$reg );
  1.2526 +  %}
  1.2527 +
  1.2528 +  enc_class Push_Reg_F( regF dst ) %{
  1.2529 +    emit_opcode( cbuf, 0xD9 );           // FLD    ST(i-1)
  1.2530 +    emit_d8( cbuf, 0xC0-1+$dst$$reg );
  1.2531 +  %}
  1.2532 +
  1.2533 +  // Push FPU's float to a stack-slot, and pop FPU-stack
  1.2534 +  enc_class Pop_Mem_Reg_F( stackSlotF dst, regF src ) %{
  1.2535 +    int pop = 0x02;
  1.2536 +    if ($src$$reg != FPR1L_enc) {
  1.2537 +      emit_opcode( cbuf, 0xD9 );         // FLD    ST(i-1)
  1.2538 +      emit_d8( cbuf, 0xC0-1+$src$$reg );
  1.2539 +      pop = 0x03;
  1.2540 +    }
  1.2541 +    store_to_stackslot( cbuf, 0xD9, pop, $dst$$disp ); // FST<P>_S  [ESP+dst]
  1.2542 +  %}
  1.2543 +
  1.2544 +  // Push FPU's double to a stack-slot, and pop FPU-stack
  1.2545 +  enc_class Pop_Mem_Reg_D( stackSlotD dst, regD src ) %{
  1.2546 +    int pop = 0x02;
  1.2547 +    if ($src$$reg != FPR1L_enc) {
  1.2548 +      emit_opcode( cbuf, 0xD9 );         // FLD    ST(i-1)
  1.2549 +      emit_d8( cbuf, 0xC0-1+$src$$reg );
  1.2550 +      pop = 0x03;
  1.2551 +    }
  1.2552 +    store_to_stackslot( cbuf, 0xDD, pop, $dst$$disp ); // FST<P>_D  [ESP+dst]
  1.2553 +  %}
  1.2554 +
  1.2555 +  // Push FPU's double to a FPU-stack-slot, and pop FPU-stack
  1.2556 +  enc_class Pop_Reg_Reg_D( regD dst, regF src ) %{
  1.2557 +    int pop = 0xD0 - 1; // -1 since we skip FLD
  1.2558 +    if ($src$$reg != FPR1L_enc) {
  1.2559 +      emit_opcode( cbuf, 0xD9 );         // FLD    ST(src-1)
  1.2560 +      emit_d8( cbuf, 0xC0-1+$src$$reg );
  1.2561 +      pop = 0xD8;
  1.2562 +    }
  1.2563 +    emit_opcode( cbuf, 0xDD );
  1.2564 +    emit_d8( cbuf, pop+$dst$$reg );      // FST<P> ST(i)
  1.2565 +  %}
  1.2566 +
  1.2567 +
  1.2568 +  enc_class Mul_Add_F( regF dst, regF src, regF src1, regF src2 ) %{
  1.2569 +    MacroAssembler masm(&cbuf);
  1.2570 +    masm.fld_s(  $src1$$reg-1);   // nothing at TOS, load TOS from src1.reg
  1.2571 +    masm.fmul(   $src2$$reg+0);   // value at TOS
  1.2572 +    masm.fadd(   $src$$reg+0);    // value at TOS
  1.2573 +    masm.fstp_d( $dst$$reg+0);    // value at TOS, popped off after store
  1.2574 +  %}
  1.2575 +
  1.2576 +
  1.2577 +  enc_class Push_Reg_Mod_D( regD dst, regD src) %{
  1.2578 +    // load dst in FPR0
  1.2579 +    emit_opcode( cbuf, 0xD9 );
  1.2580 +    emit_d8( cbuf, 0xC0-1+$dst$$reg );
  1.2581 +    if ($src$$reg != FPR1L_enc) {
  1.2582 +      // fincstp
  1.2583 +      emit_opcode (cbuf, 0xD9);
  1.2584 +      emit_opcode (cbuf, 0xF7);
  1.2585 +      // swap src with FPR1:
  1.2586 +      // FXCH FPR1 with src
  1.2587 +      emit_opcode(cbuf, 0xD9);
  1.2588 +      emit_d8(cbuf, 0xC8-1+$src$$reg );
  1.2589 +      // fdecstp
  1.2590 +      emit_opcode (cbuf, 0xD9);
  1.2591 +      emit_opcode (cbuf, 0xF6);
  1.2592 +    }
  1.2593 +  %}
  1.2594 +
  1.2595 +  enc_class Push_ModD_encoding( regXD src0, regXD src1) %{
  1.2596 +    // Allocate 8 bytes of stack space
  1.2597 +    emit_opcode(cbuf,0x83);            // SUB ESP,8
  1.2598 +    emit_opcode(cbuf,0xEC);
  1.2599 +    emit_d8(cbuf,0x08);
  1.2600 +
  1.2601 +    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src1
  1.2602 +    emit_opcode  (cbuf, 0x0F );
  1.2603 +    emit_opcode  (cbuf, 0x11 );
  1.2604 +    encode_RegMem(cbuf, $src1$$reg, ESP_enc, 0x4, 0, 0, false);
  1.2605 +
  1.2606 +    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
  1.2607 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.2608 +
  1.2609 +    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src0
  1.2610 +    emit_opcode  (cbuf, 0x0F );
  1.2611 +    emit_opcode  (cbuf, 0x11 );
  1.2612 +    encode_RegMem(cbuf, $src0$$reg, ESP_enc, 0x4, 0, 0, false);
  1.2613 +
  1.2614 +    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
  1.2615 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.2616 +
  1.2617 +  %}
  1.2618 +
  1.2619 +  enc_class Push_ModX_encoding( regX src0, regX src1) %{
  1.2620 +    // Allocate 4 bytes of stack space
  1.2621 +    emit_opcode(cbuf,0x83);            // SUB ESP,4
  1.2622 +    emit_opcode(cbuf,0xEC);
  1.2623 +    emit_d8(cbuf,0x04);
  1.2624 +
  1.2625 +    emit_opcode  (cbuf, 0xF3 );     // MOVSS [ESP], src1
  1.2626 +    emit_opcode  (cbuf, 0x0F );
  1.2627 +    emit_opcode  (cbuf, 0x11 );
  1.2628 +    encode_RegMem(cbuf, $src1$$reg, ESP_enc, 0x4, 0, 0, false);
  1.2629 +
  1.2630 +    emit_opcode(cbuf,0xD9 );      // FLD [ESP]
  1.2631 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.2632 +
  1.2633 +    emit_opcode  (cbuf, 0xF3 );     // MOVSS [ESP], src0
  1.2634 +    emit_opcode  (cbuf, 0x0F );
  1.2635 +    emit_opcode  (cbuf, 0x11 );
  1.2636 +    encode_RegMem(cbuf, $src0$$reg, ESP_enc, 0x4, 0, 0, false);
  1.2637 +
  1.2638 +    emit_opcode(cbuf,0xD9 );      // FLD [ESP]
  1.2639 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.2640 +
  1.2641 +  %}
  1.2642 +
  1.2643 +  enc_class Push_ResultXD(regXD dst) %{
  1.2644 +    store_to_stackslot( cbuf, 0xDD, 0x03, 0 ); //FSTP [ESP]
  1.2645 +
  1.2646 +    // UseXmmLoadAndClearUpper ? movsd dst,[esp] : movlpd dst,[esp]
  1.2647 +    emit_opcode  (cbuf, UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
  1.2648 +    emit_opcode  (cbuf, 0x0F );
  1.2649 +    emit_opcode  (cbuf, UseXmmLoadAndClearUpper ? 0x10 : 0x12);
  1.2650 +    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);
  1.2651 +
  1.2652 +    emit_opcode(cbuf,0x83);    // ADD ESP,8
  1.2653 +    emit_opcode(cbuf,0xC4);
  1.2654 +    emit_d8(cbuf,0x08);
  1.2655 +  %}
  1.2656 +
  1.2657 +  enc_class Push_ResultX(regX dst, immI d8) %{
  1.2658 +    store_to_stackslot( cbuf, 0xD9, 0x03, 0 ); //FSTP_S [ESP]
  1.2659 +
  1.2660 +    emit_opcode  (cbuf, 0xF3 );     // MOVSS dst(xmm), [ESP]
  1.2661 +    emit_opcode  (cbuf, 0x0F );
  1.2662 +    emit_opcode  (cbuf, 0x10 );
  1.2663 +    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);
  1.2664 +
  1.2665 +    emit_opcode(cbuf,0x83);    // ADD ESP,d8 (4 or 8)
  1.2666 +    emit_opcode(cbuf,0xC4);
  1.2667 +    emit_d8(cbuf,$d8$$constant);
  1.2668 +  %}
  1.2669 +
  1.2670 +  enc_class Push_SrcXD(regXD src) %{
  1.2671 +    // Allocate 8 bytes of stack space
  1.2672 +    emit_opcode(cbuf,0x83);            // SUB ESP,8
  1.2673 +    emit_opcode(cbuf,0xEC);
  1.2674 +    emit_d8(cbuf,0x08);
  1.2675 +
  1.2676 +    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src
  1.2677 +    emit_opcode  (cbuf, 0x0F );
  1.2678 +    emit_opcode  (cbuf, 0x11 );
  1.2679 +    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
  1.2680 +
  1.2681 +    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
  1.2682 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.2683 +  %}
  1.2684 +
  1.2685 +  enc_class push_stack_temp_qword() %{
  1.2686 +    emit_opcode(cbuf,0x83);     // SUB ESP,8
  1.2687 +    emit_opcode(cbuf,0xEC);
  1.2688 +    emit_d8    (cbuf,0x08);
  1.2689 +  %}
  1.2690 +
  1.2691 +  enc_class pop_stack_temp_qword() %{
  1.2692 +    emit_opcode(cbuf,0x83);     // ADD ESP,8
  1.2693 +    emit_opcode(cbuf,0xC4);
  1.2694 +    emit_d8    (cbuf,0x08);
  1.2695 +  %}
  1.2696 +
  1.2697 +  enc_class push_xmm_to_fpr1( regXD xmm_src ) %{
  1.2698 +    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], xmm_src
  1.2699 +    emit_opcode  (cbuf, 0x0F );
  1.2700 +    emit_opcode  (cbuf, 0x11 );
  1.2701 +    encode_RegMem(cbuf, $xmm_src$$reg, ESP_enc, 0x4, 0, 0, false);
  1.2702 +
  1.2703 +    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
  1.2704 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.2705 +  %}
  1.2706 +
  1.2707 +  // Compute X^Y using Intel's fast hardware instructions, if possible.
  1.2708 +  // Otherwise return a NaN.
  1.2709 +  enc_class pow_exp_core_encoding %{
  1.2710 +    // FPR1 holds Q = Y*log2(X).  Compute FPR1 = 2^Q = X^Y
  1.2711 +    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xC0);  // fdup = fld st(0)          Q       Q
  1.2712 +    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xFC);  // frndint               int(Q)      Q
  1.2713 +    emit_opcode(cbuf,0xDC); emit_opcode(cbuf,0xE9);  // fsub st(1) -= st(0);  int(Q) frac(Q)
  1.2714 +    emit_opcode(cbuf,0xDB);                          // FISTP [ESP]           frac(Q)
  1.2715 +    emit_opcode(cbuf,0x1C);
  1.2716 +    emit_d8(cbuf,0x24);
  1.2717 +    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xF0);  // f2xm1                 2^frac(Q)-1
  1.2718 +    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xE8);  // fld1                  1 2^frac(Q)-1
  1.2719 +    emit_opcode(cbuf,0xDE); emit_opcode(cbuf,0xC1);  // faddp                 2^frac(Q)
  1.2720 +    emit_opcode(cbuf,0x8B);                          // mov rax,[esp+0]=int(Q)
  1.2721 +    encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 0, false);
  1.2722 +    emit_opcode(cbuf,0xC7);                          // mov rcx,0xFFFFF800 - overflow mask
  1.2723 +    emit_rm(cbuf, 0x3, 0x0, ECX_enc);
  1.2724 +    emit_d32(cbuf,0xFFFFF800);
  1.2725 +    emit_opcode(cbuf,0x81);                          // add rax,1023 - the double exponent bias
  1.2726 +    emit_rm(cbuf, 0x3, 0x0, EAX_enc);
  1.2727 +    emit_d32(cbuf,1023);
  1.2728 +    emit_opcode(cbuf,0x8B);                          // mov rbx,eax
  1.2729 +    emit_rm(cbuf, 0x3, EBX_enc, EAX_enc);
  1.2730 +    emit_opcode(cbuf,0xC1);                          // shl rax,20 - Slide to exponent position
  1.2731 +    emit_rm(cbuf,0x3,0x4,EAX_enc);
  1.2732 +    emit_d8(cbuf,20);
  1.2733 +    emit_opcode(cbuf,0x85);                          // test rbx,ecx - check for overflow
  1.2734 +    emit_rm(cbuf, 0x3, EBX_enc, ECX_enc);
  1.2735 +    emit_opcode(cbuf,0x0F); emit_opcode(cbuf,0x45);  // CMOVne rax,ecx - overflow; stuff NAN into EAX
  1.2736 +    emit_rm(cbuf, 0x3, EAX_enc, ECX_enc);
  1.2737 +    emit_opcode(cbuf,0x89);                          // mov [esp+4],eax - Store as part of double word
  1.2738 +    encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 4, false);
  1.2739 +    emit_opcode(cbuf,0xC7);                          // mov [esp+0],0   - [ESP] = (double)(1<<int(Q)) = 2^int(Q)
  1.2740 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.2741 +    emit_d32(cbuf,0);
  1.2742 +    emit_opcode(cbuf,0xDC);                          // fmul qword st(0),[esp+0]; FPR1 = 2^int(Q)*2^frac(Q) = 2^Q
  1.2743 +    encode_RegMem(cbuf, 0x1, ESP_enc, 0x4, 0, 0, false);
  1.2744 +  %}
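         +
         +  // Note on pow_exp_core_encoding above: 2^Q is assembled as 2^int(Q) * 2^frac(Q).
         +  // f2xm1/fld1/faddp produce 2^frac(Q), while 2^int(Q) is built directly as an
         +  // IEEE-754 double by writing int(Q)+1023 into the exponent field on the stack
         +  // (the CMOVne path stuffs an out-of-range exponent, yielding a NaN, when
         +  // int(Q)+1023 does not fit in 11 bits).  A rough C sketch of the happy path
         +  // (illustrative only; overflow check omitted):
         +  //
         +  //   double  i  = rint(Q);                        // frndint
         +  //   double  pf = exp2(Q - i);                    // f2xm1 ; fld1 ; faddp
         +  //   int64_t bits = ((int64_t)i + 1023) << 52;    // exponent field of 2^int(Q)
         +  //   double  pi;  memcpy(&pi, &bits, sizeof pi);
         +  //   return pf * pi;                              // fmul -> 2^Q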
  1.2745 +
  1.2746 +//   enc_class Pop_Reg_Mod_D( regD dst, regD src)
  1.2747 +//   was replaced by Push_Result_Mod_D followed by Pop_Reg_X() or Pop_Mem_X()
  1.2748 +
  1.2749 +  enc_class Push_Result_Mod_D( regD src) %{
  1.2750 +    if ($src$$reg != FPR1L_enc) {
  1.2751 +      // fincstp
  1.2752 +      emit_opcode (cbuf, 0xD9);
  1.2753 +      emit_opcode (cbuf, 0xF7);
  1.2754 +      // FXCH FPR1 with src
  1.2755 +      emit_opcode(cbuf, 0xD9);
  1.2756 +      emit_d8(cbuf, 0xC8-1+$src$$reg );
  1.2757 +      // fdecstp
  1.2758 +      emit_opcode (cbuf, 0xD9);
  1.2759 +      emit_opcode (cbuf, 0xF6);
  1.2760 +    }
  1.2761 +    // // following asm replaced with Pop_Reg_F or Pop_Mem_F
  1.2762 +    // // FSTP   FPR$dst$$reg
  1.2763 +    // emit_opcode( cbuf, 0xDD );
  1.2764 +    // emit_d8( cbuf, 0xD8+$dst$$reg );
  1.2765 +  %}
  1.2766 +
  1.2767 +  enc_class fnstsw_sahf_skip_parity() %{
  1.2768 +    // fnstsw ax
  1.2769 +    emit_opcode( cbuf, 0xDF );
  1.2770 +    emit_opcode( cbuf, 0xE0 );
  1.2771 +    // sahf
  1.2772 +    emit_opcode( cbuf, 0x9E );
  1.2773 +    // jnp  ::skip
  1.2774 +    emit_opcode( cbuf, 0x7B );
  1.2775 +    emit_opcode( cbuf, 0x05 );
  1.2776 +  %}
  1.2777 +
  1.2778 +  enc_class emitModD() %{
  1.2779 +    // fprem must be iterative
  1.2780 +    // :: loop
  1.2781 +    // fprem
  1.2782 +    emit_opcode( cbuf, 0xD9 );
  1.2783 +    emit_opcode( cbuf, 0xF8 );
  1.2784 +    // wait
  1.2785 +    emit_opcode( cbuf, 0x9b );
  1.2786 +    // fnstsw ax
  1.2787 +    emit_opcode( cbuf, 0xDF );
  1.2788 +    emit_opcode( cbuf, 0xE0 );
  1.2789 +    // sahf
  1.2790 +    emit_opcode( cbuf, 0x9E );
  1.2791 +    // jp  ::loop
  1.2792 +    emit_opcode( cbuf, 0x0F );
  1.2793 +    emit_opcode( cbuf, 0x8A );
  1.2794 +    emit_opcode( cbuf, 0xF4 );
  1.2795 +    emit_opcode( cbuf, 0xFF );
  1.2796 +    emit_opcode( cbuf, 0xFF );
  1.2797 +    emit_opcode( cbuf, 0xFF );
  1.2798 +  %}
  1.2799 +
  1.2800 +  enc_class fpu_flags() %{
  1.2801 +    // fnstsw_ax
  1.2802 +    emit_opcode( cbuf, 0xDF);
  1.2803 +    emit_opcode( cbuf, 0xE0);
  1.2804 +    // test ax,0x0400
  1.2805 +    emit_opcode( cbuf, 0x66 );   // operand-size prefix for 16-bit immediate
  1.2806 +    emit_opcode( cbuf, 0xA9 );
  1.2807 +    emit_d16   ( cbuf, 0x0400 );
  1.2808 +    // // // This sequence works, but stalls for 12-16 cycles on PPro
  1.2809 +    // // test rax,0x0400
  1.2810 +    // emit_opcode( cbuf, 0xA9 );
  1.2811 +    // emit_d32   ( cbuf, 0x00000400 );
  1.2812 +    //
  1.2813 +    // jz exit (no unordered comparison)
  1.2814 +    emit_opcode( cbuf, 0x74 );
  1.2815 +    emit_d8    ( cbuf, 0x02 );
  1.2816 +    // mov ah,1 - treat as LT case (set carry flag)
  1.2817 +    emit_opcode( cbuf, 0xB4 );
  1.2818 +    emit_d8    ( cbuf, 0x01 );
  1.2819 +    // sahf
  1.2820 +    emit_opcode( cbuf, 0x9E);
  1.2821 +  %}
  1.2822 +
  1.2823 +  enc_class cmpF_P6_fixup() %{
  1.2824 +    // Fixup the integer flags in case comparison involved a NaN
  1.2825 +    //
  1.2826 +    // JNP exit (no unordered comparison, P-flag is set by NaN)
  1.2827 +    emit_opcode( cbuf, 0x7B );
  1.2828 +    emit_d8    ( cbuf, 0x03 );
  1.2829 +    // MOV AH,1 - treat as LT case (set carry flag)
  1.2830 +    emit_opcode( cbuf, 0xB4 );
  1.2831 +    emit_d8    ( cbuf, 0x01 );
  1.2832 +    // SAHF
  1.2833 +    emit_opcode( cbuf, 0x9E);
  1.2834 +    // NOP     // target for branch to avoid branch to branch
  1.2835 +    emit_opcode( cbuf, 0x90);
  1.2836 +  %}
  1.2837 +
  1.2838 +//     fnstsw_ax();
  1.2839 +//     sahf();
  1.2840 +//     movl(dst, nan_result);
  1.2841 +//     jcc(Assembler::parity, exit);
  1.2842 +//     movl(dst, less_result);
  1.2843 +//     jcc(Assembler::below, exit);
  1.2844 +//     movl(dst, equal_result);
  1.2845 +//     jcc(Assembler::equal, exit);
  1.2846 +//     movl(dst, greater_result);
  1.2847 +
  1.2848 +// less_result     =  1;
  1.2849 +// greater_result  = -1;
  1.2850 +// equal_result    = 0;
  1.2851 +// nan_result      = -1;
  1.2852 +
  1.2853 +  enc_class CmpF_Result(eRegI dst) %{
  1.2854 +    // fnstsw_ax();
  1.2855 +    emit_opcode( cbuf, 0xDF);
  1.2856 +    emit_opcode( cbuf, 0xE0);
  1.2857 +    // sahf
  1.2858 +    emit_opcode( cbuf, 0x9E);
  1.2859 +    // movl(dst, nan_result);
  1.2860 +    emit_opcode( cbuf, 0xB8 + $dst$$reg);
  1.2861 +    emit_d32( cbuf, -1 );
  1.2862 +    // jcc(Assembler::parity, exit);
  1.2863 +    emit_opcode( cbuf, 0x7A );
  1.2864 +    emit_d8    ( cbuf, 0x13 );
  1.2865 +    // movl(dst, less_result);
  1.2866 +    emit_opcode( cbuf, 0xB8 + $dst$$reg);
  1.2867 +    emit_d32( cbuf, -1 );
  1.2868 +    // jcc(Assembler::below, exit);
  1.2869 +    emit_opcode( cbuf, 0x72 );
  1.2870 +    emit_d8    ( cbuf, 0x0C );
  1.2871 +    // movl(dst, equal_result);
  1.2872 +    emit_opcode( cbuf, 0xB8 + $dst$$reg);
  1.2873 +    emit_d32( cbuf, 0 );
  1.2874 +    // jcc(Assembler::equal, exit);
  1.2875 +    emit_opcode( cbuf, 0x74 );
  1.2876 +    emit_d8    ( cbuf, 0x05 );
  1.2877 +    // movl(dst, greater_result);
  1.2878 +    emit_opcode( cbuf, 0xB8 + $dst$$reg);
  1.2879 +    emit_d32( cbuf, 1 );
  1.2880 +  %}
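         +
         +  // For reference, the sequence above settles the result register with the jump
         +  // chain below (SAHF first copies the FPU condition codes into EFLAGS; which
         +  // operand counts as "below" depends on how the matching instruct loaded the
         +  // two values onto the FPU stack):
         +  //
         +  //   dst = -1;                 // assume unordered (NaN)  -> JP  exits
         +  //   if (!parity) {
         +  //     dst = -1;               // "below"                 -> JB  exits
         +  //     if (!below) {
         +  //       dst = 0;              // equal                   -> JE  exits
         +  //       if (!equal) dst = 1;  // otherwise "greater"
         +  //     }
         +  //   }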
  1.2881 +
  1.2882 +
  1.2883 +  // XMM version of CmpF_Result.  Because the XMM compare
  1.2884 +  // instructions set EFLAGS directly, it is simpler than
  1.2885 +  // the float version above.
  1.2886 +  enc_class CmpX_Result(eRegI dst) %{
  1.2887 +    MacroAssembler _masm(&cbuf);
  1.2888 +    Label nan, inc, done;
  1.2889 +
  1.2890 +    __ jccb(Assembler::parity, nan);
  1.2891 +    __ jccb(Assembler::equal,  done);
  1.2892 +    __ jccb(Assembler::above,  inc);
  1.2893 +    __ bind(nan);
  1.2894 +    __ decrement(as_Register($dst$$reg));
  1.2895 +    __ jmpb(done);
  1.2896 +    __ bind(inc);
  1.2897 +    __ increment(as_Register($dst$$reg));
  1.2898 +    __ bind(done);
  1.2899 +  %}
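         +
         +  // The XMM variant only adjusts dst around its incoming value, so it presumably
         +  // relies on the matching instruct to zero dst beforehand (an assumption; it is
         +  // not enforced here).  Roughly:
         +  //
         +  //   if (parity || below) dst -= 1;   // NaN or less  -> -1
         +  //   else if (above)      dst += 1;   // greater      -> +1
         +  //   /* equal: dst is left unchanged */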
  1.2900 +
  1.2901 +  // Compare the longs and set flags
  1.2902 +  // BROKEN!  Do Not use as-is
  1.2903 +  enc_class cmpl_test( eRegL src1, eRegL src2 ) %{
  1.2904 +    // CMP    $src1.hi,$src2.hi
  1.2905 +    emit_opcode( cbuf, 0x3B );
  1.2906 +    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($src1$$reg), HIGH_FROM_LOW($src2$$reg) );
  1.2907 +    // JNE,s  done
  1.2908 +    emit_opcode(cbuf,0x75);
  1.2909 +    emit_d8(cbuf, 2 );
  1.2910 +    // CMP    $src1.lo,$src2.lo
  1.2911 +    emit_opcode( cbuf, 0x3B );
  1.2912 +    emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
  1.2913 +// done:
  1.2914 +  %}
  1.2915 +
  1.2916 +  enc_class convert_int_long( regL dst, eRegI src ) %{
  1.2917 +    // mov $dst.lo,$src
  1.2918 +    int dst_encoding = $dst$$reg;
  1.2919 +    int src_encoding = $src$$reg;
  1.2920 +    encode_Copy( cbuf, dst_encoding  , src_encoding );
  1.2921 +    // mov $dst.hi,$src
  1.2922 +    encode_Copy( cbuf, HIGH_FROM_LOW(dst_encoding), src_encoding );
  1.2923 +    // sar $dst.hi,31
  1.2924 +    emit_opcode( cbuf, 0xC1 );
  1.2925 +    emit_rm(cbuf, 0x3, 7, HIGH_FROM_LOW(dst_encoding) );
  1.2926 +    emit_d8(cbuf, 0x1F );
  1.2927 +  %}
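         +
         +  // This is plain sign extension split across the register pair.  Roughly:
         +  //
         +  //   dst_lo = src;
         +  //   dst_hi = (int32_t)src >> 31;   // SAR by 31 replicates the sign bit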
  1.2928 +
  1.2929 +  enc_class convert_long_double( eRegL src ) %{
  1.2930 +    // push $src.hi
  1.2931 +    emit_opcode(cbuf, 0x50+HIGH_FROM_LOW($src$$reg));
  1.2932 +    // push $src.lo
  1.2933 +    emit_opcode(cbuf, 0x50+$src$$reg  );
  1.2934 +    // fild 64-bits at [SP]
  1.2935 +    emit_opcode(cbuf,0xdf);
  1.2936 +    emit_d8(cbuf, 0x6C);
  1.2937 +    emit_d8(cbuf, 0x24);
  1.2938 +    emit_d8(cbuf, 0x00);
  1.2939 +    // pop stack
  1.2940 +    emit_opcode(cbuf, 0x83); // add  SP, #8
  1.2941 +    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
  1.2942 +    emit_d8(cbuf, 0x8);
  1.2943 +  %}
  1.2944 +
  1.2945 +  enc_class multiply_con_and_shift_high( eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32_63 cnt, eFlagsReg cr ) %{
  1.2946 +    // IMUL   EDX:EAX,$src1
  1.2947 +    emit_opcode( cbuf, 0xF7 );
  1.2948 +    emit_rm( cbuf, 0x3, 0x5, $src1$$reg );
  1.2949 +    // SAR    EDX,$cnt-32
  1.2950 +    int shift_count = ((int)$cnt$$constant) - 32;
  1.2951 +    if (shift_count > 0) {
  1.2952 +      emit_opcode(cbuf, 0xC1);
  1.2953 +      emit_rm(cbuf, 0x3, 7, $dst$$reg );
  1.2954 +      emit_d8(cbuf, shift_count);
  1.2955 +    }
  1.2956 +  %}
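         +
         +  // This is the usual multiply-high-and-shift pattern: IMUL leaves the full
         +  // 64-bit signed product in EDX:EAX, and shifting EDX by cnt-32 yields the
         +  // product shifted right by cnt (32 <= cnt <= 63; no SAR is emitted when
         +  // cnt == 32).  Roughly:
         +  //
         +  //   int64_t prod = (int64_t)(int32_t)eax * (int32_t)src1;  // IMUL EDX:EAX,$src1
         +  //   edx = (int32_t)(prod >> cnt);                          // SAR EDX,cnt-32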
  1.2957 +
  1.2958 +  // This version doesn't restore the stack (no trailing ADD ESP,8)
  1.2959 +  enc_class convert_long_double2( eRegL src ) %{
  1.2960 +    // push $src.hi
  1.2961 +    emit_opcode(cbuf, 0x50+HIGH_FROM_LOW($src$$reg));
  1.2962 +    // push $src.lo
  1.2963 +    emit_opcode(cbuf, 0x50+$src$$reg  );
  1.2964 +    // fild 64-bits at [SP]
  1.2965 +    emit_opcode(cbuf,0xdf);
  1.2966 +    emit_d8(cbuf, 0x6C);
  1.2967 +    emit_d8(cbuf, 0x24);
  1.2968 +    emit_d8(cbuf, 0x00);
  1.2969 +  %}
  1.2970 +
  1.2971 +  enc_class long_int_multiply( eADXRegL dst, nadxRegI src) %{
  1.2972 +    // Basic idea: long = (long)int * (long)int
  1.2973 +    // IMUL EDX:EAX, src
  1.2974 +    emit_opcode( cbuf, 0xF7 );
  1.2975 +    emit_rm( cbuf, 0x3, 0x5, $src$$reg);
  1.2976 +  %}
  1.2977 +
  1.2978 +  enc_class long_uint_multiply( eADXRegL dst, nadxRegI src) %{
  1.2979 +    // Basic Idea:  long = (int & 0xffffffffL) * (int & 0xffffffffL)
  1.2980 +    // MUL EDX:EAX, src
  1.2981 +    emit_opcode( cbuf, 0xF7 );
  1.2982 +    emit_rm( cbuf, 0x3, 0x4, $src$$reg);
  1.2983 +  %}
  1.2984 +
  1.2985 +  enc_class long_multiply( eADXRegL dst, eRegL src, eRegI tmp ) %{
  1.2986 +    // Basic idea: lo(result) = lo(x_lo * y_lo)
  1.2987 +    //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  1.2988 +    // MOV    $tmp,$src.lo
  1.2989 +    encode_Copy( cbuf, $tmp$$reg, $src$$reg );
  1.2990 +    // IMUL   $tmp,EDX
  1.2991 +    emit_opcode( cbuf, 0x0F );
  1.2992 +    emit_opcode( cbuf, 0xAF );
  1.2993 +    emit_rm( cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg) );
  1.2994 +    // MOV    EDX,$src.hi
  1.2995 +    encode_Copy( cbuf, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg) );
  1.2996 +    // IMUL   EDX,EAX
  1.2997 +    emit_opcode( cbuf, 0x0F );
  1.2998 +    emit_opcode( cbuf, 0xAF );
  1.2999 +    emit_rm( cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg );
  1.3000 +    // ADD    $tmp,EDX
  1.3001 +    emit_opcode( cbuf, 0x03 );
  1.3002 +    emit_rm( cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg) );
  1.3003 +    // MUL   EDX:EAX,$src.lo
  1.3004 +    emit_opcode( cbuf, 0xF7 );
  1.3005 +    emit_rm( cbuf, 0x3, 0x4, $src$$reg );
  1.3006 +    // ADD    EDX,$tmp
  1.3007 +    emit_opcode( cbuf, 0x03 );
  1.3008 +    emit_rm( cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $tmp$$reg );
  1.3009 +  %}
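         +
         +  // A C sketch of the decomposition described above (x is $dst in EDX:EAX, y is
         +  // $src; the x_hi*y_hi term never reaches the low 64 bits, so it is skipped):
         +  //
         +  //   uint64_t p  = (uint64_t)x_lo * y_lo;                        // MUL: full 64-bit product
         +  //   uint32_t lo = (uint32_t)p;
         +  //   uint32_t hi = (uint32_t)(p >> 32) + x_hi*y_lo + x_lo*y_hi;  // low 32 bits of each term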
  1.3010 +
  1.3011 +  enc_class long_multiply_con( eADXRegL dst, immL_127 src, eRegI tmp ) %{
  1.3012 +    // Basic idea: lo(result) = lo(src * y_lo)
  1.3013 +    //             hi(result) = hi(src * y_lo) + lo(src * y_hi)
  1.3014 +    // IMUL   $tmp,EDX,$src
  1.3015 +    emit_opcode( cbuf, 0x6B );
  1.3016 +    emit_rm( cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg) );
  1.3017 +    emit_d8( cbuf, (int)$src$$constant );
  1.3018 +    // MOV    EDX,$src
  1.3019 +    emit_opcode(cbuf, 0xB8 + EDX_enc);
  1.3020 +    emit_d32( cbuf, (int)$src$$constant );
  1.3021 +    // MUL   EDX:EAX,EDX
  1.3022 +    emit_opcode( cbuf, 0xF7 );
  1.3023 +    emit_rm( cbuf, 0x3, 0x4, EDX_enc );
  1.3024 +    // ADD    EDX,$tmp
  1.3025 +    emit_opcode( cbuf, 0x03 );
  1.3026 +    emit_rm( cbuf, 0x3, EDX_enc, $tmp$$reg );
  1.3027 +  %}
  1.3028 +
  1.3029 +  enc_class long_div( eRegL src1, eRegL src2 ) %{
  1.3030 +    // PUSH src1.hi
  1.3031 +    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src1$$reg) );
  1.3032 +    // PUSH src1.lo
  1.3033 +    emit_opcode(cbuf,               0x50+$src1$$reg  );
  1.3034 +    // PUSH src2.hi
  1.3035 +    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src2$$reg) );
  1.3036 +    // PUSH src2.lo
  1.3037 +    emit_opcode(cbuf,               0x50+$src2$$reg  );
  1.3038 +    // CALL directly to the runtime
  1.3039 +    cbuf.set_inst_mark();
  1.3040 +    emit_opcode(cbuf,0xE8);       // Call into runtime
  1.3041 +    emit_d32_reloc(cbuf, (CAST_FROM_FN_PTR(address, SharedRuntime::ldiv) - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.3042 +    // Restore stack
  1.3043 +    emit_opcode(cbuf, 0x83); // add  SP, #framesize
  1.3044 +    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
  1.3045 +    emit_d8(cbuf, 4*4);
  1.3046 +  %}
  1.3047 +
  1.3048 +  enc_class long_mod( eRegL src1, eRegL src2 ) %{
  1.3049 +    // PUSH src1.hi
  1.3050 +    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src1$$reg) );
  1.3051 +    // PUSH src1.lo
  1.3052 +    emit_opcode(cbuf,               0x50+$src1$$reg  );
  1.3053 +    // PUSH src2.hi
  1.3054 +    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src2$$reg) );
  1.3055 +    // PUSH src2.lo
  1.3056 +    emit_opcode(cbuf,               0x50+$src2$$reg  );
  1.3057 +    // CALL directly to the runtime
  1.3058 +    cbuf.set_inst_mark();
  1.3059 +    emit_opcode(cbuf,0xE8);       // Call into runtime
  1.3060 +    emit_d32_reloc(cbuf, (CAST_FROM_FN_PTR(address, SharedRuntime::lrem ) - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.3061 +    // Restore stack
  1.3062 +    emit_opcode(cbuf, 0x83); // add  SP, #framesize
  1.3063 +    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
  1.3064 +    emit_d8(cbuf, 4*4);
  1.3065 +  %}
  1.3066 +
  1.3067 +  enc_class long_cmp_flags0( eRegL src, eRegI tmp ) %{
  1.3068 +    // MOV   $tmp,$src.lo
  1.3069 +    emit_opcode(cbuf, 0x8B);
  1.3070 +    emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
  1.3071 +    // OR    $tmp,$src.hi
  1.3072 +    emit_opcode(cbuf, 0x0B);
  1.3073 +    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg));
  1.3074 +  %}
  1.3075 +
  1.3076 +  enc_class long_cmp_flags1( eRegL src1, eRegL src2 ) %{
  1.3077 +    // CMP    $src1.lo,$src2.lo
  1.3078 +    emit_opcode( cbuf, 0x3B );
  1.3079 +    emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
  1.3080 +    // JNE,s  skip
  1.3081 +    emit_cc(cbuf, 0x70, 0x5);
  1.3082 +    emit_d8(cbuf,2);
  1.3083 +    // CMP    $src1.hi,$src2.hi
  1.3084 +    emit_opcode( cbuf, 0x3B );
  1.3085 +    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($src1$$reg), HIGH_FROM_LOW($src2$$reg) );
  1.3086 +  %}
  1.3087 +
  1.3088 +  enc_class long_cmp_flags2( eRegL src1, eRegL src2, eRegI tmp ) %{
  1.3089 +    // CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits
  1.3090 +    emit_opcode( cbuf, 0x3B );
  1.3091 +    emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
  1.3092 +    // MOV    $tmp,$src1.hi
  1.3093 +    emit_opcode( cbuf, 0x8B );
  1.3094 +    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src1$$reg) );
  1.3095 +    // SBB   $tmp,$src2.hi\t! Compute flags for long compare
  1.3096 +    emit_opcode( cbuf, 0x1B );
  1.3097 +    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src2$$reg) );
  1.3098 +  %}
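         +
         +  // The CMP/SBB pair above performs a flags-only 64-bit subtract: the borrow from
         +  // the low halves is folded into the high-half subtract, so CF answers the
         +  // unsigned compare and SF!=OF the signed one.  ZF only reflects the high-half
         +  // result, so this variant cannot be used for equality tests.  Roughly (borrow
         +  // and flags_of are shorthand, not real identifiers):
         +  //
         +  //   borrow = (src1.lo < src2.lo);                   // CMP lo,lo (unsigned)
         +  //   flags  = flags_of(src1.hi - src2.hi - borrow);  // SBB; result lands in $tmp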
  1.3099 +
  1.3100 +  enc_class long_cmp_flags3( eRegL src, eRegI tmp ) %{
  1.3101 +    // XOR    $tmp,$tmp
  1.3102 +    emit_opcode(cbuf,0x33);  // XOR
  1.3103 +    emit_rm(cbuf,0x3, $tmp$$reg, $tmp$$reg);
  1.3104 +    // CMP    $tmp,$src.lo
  1.3105 +    emit_opcode( cbuf, 0x3B );
  1.3106 +    emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg );
  1.3107 +    // SBB    $tmp,$src.hi
  1.3108 +    emit_opcode( cbuf, 0x1B );
  1.3109 +    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg) );
  1.3110 +  %}
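         +
         +  // Same trick against zero: the flags end up describing the subtraction
         +  // 0 - src, i.e. a compare of zero against the 64-bit value.  Roughly:
         +  //
         +  //   tmp    = 0;                                  // XOR tmp,tmp
         +  //   borrow = (src.lo != 0);                      // CMP tmp,src.lo
         +  //   tmp    = 0 - src.hi - borrow;                // SBB -> flags of (0 - src)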
  1.3111 +
  1.3112 + // Sniff, sniff... smells like GNU Superoptimizer
  1.3113 +  enc_class neg_long( eRegL dst ) %{
  1.3114 +    emit_opcode(cbuf,0xF7);    // NEG hi
  1.3115 +    emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
  1.3116 +    emit_opcode(cbuf,0xF7);    // NEG lo
  1.3117 +    emit_rm    (cbuf,0x3, 0x3,               $dst$$reg );
  1.3118 +    emit_opcode(cbuf,0x83);    // SBB hi,0
  1.3119 +    emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
  1.3120 +    emit_d8    (cbuf,0 );
  1.3121 +  %}
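         +
         +  // Two's-complement negation of a 64-bit value held in a register pair.  NEG of
         +  // the low half sets the carry exactly when the low half was non-zero, and the
         +  // SBB propagates that borrow into the (already negated) high half:
         +  //
         +  //   uint32_t old_lo = lo;
         +  //   hi = 0 - hi;                  // NEG hi
         +  //   lo = 0 - lo;                  // NEG lo  (CF = (old_lo != 0))
         +  //   hi = hi - (old_lo != 0);      // SBB hi,0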
  1.3122 +
  1.3123 +  enc_class movq_ld(regXD dst, memory mem) %{
  1.3124 +    MacroAssembler _masm(&cbuf);
  1.3125 +    Address madr = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp);
  1.3126 +    __ movq(as_XMMRegister($dst$$reg), madr);
  1.3127 +  %}
  1.3128 +
  1.3129 +  enc_class movq_st(memory mem, regXD src) %{
  1.3130 +    MacroAssembler _masm(&cbuf);
  1.3131 +    Address madr = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp);
  1.3132 +    __ movq(madr, as_XMMRegister($src$$reg));
  1.3133 +  %}
  1.3134 +
  1.3135 +  enc_class pshufd_8x8(regX dst, regX src) %{
  1.3136 +    MacroAssembler _masm(&cbuf);
  1.3137 +
  1.3138 +    encode_CopyXD(cbuf, $dst$$reg, $src$$reg);
  1.3139 +    __ punpcklbw(as_XMMRegister($dst$$reg), as_XMMRegister($dst$$reg));
  1.3140 +    __ pshuflw(as_XMMRegister($dst$$reg), as_XMMRegister($dst$$reg), 0x00);
  1.3141 +  %}
  1.3142 +
  1.3143 +  enc_class pshufd_4x16(regX dst, regX src) %{
  1.3144 +    MacroAssembler _masm(&cbuf);
  1.3145 +
  1.3146 +    __ pshuflw(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg), 0x00);
  1.3147 +  %}
  1.3148 +
  1.3149 +  enc_class pshufd(regXD dst, regXD src, int mode) %{
  1.3150 +    MacroAssembler _masm(&cbuf);
  1.3151 +
  1.3152 +    __ pshufd(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg), $mode);
  1.3153 +  %}
  1.3154 +
  1.3155 +  enc_class pxor(regXD dst, regXD src) %{
  1.3156 +    MacroAssembler _masm(&cbuf);
  1.3157 +
  1.3158 +    __ pxor(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg));
  1.3159 +  %}
  1.3160 +
  1.3161 +  enc_class mov_i2x(regXD dst, eRegI src) %{
  1.3162 +    MacroAssembler _masm(&cbuf);
  1.3163 +
  1.3164 +    __ movd(as_XMMRegister($dst$$reg), as_Register($src$$reg));
  1.3165 +  %}
  1.3166 +
  1.3167 +
  1.3168 +  // Because the transitions from emitted code to the runtime
  1.3169 +  // monitorenter/exit helper stubs are so slow it's critical that
  1.3170 +  // we inline both the stack-locking fast-path and the inflated fast path.
  1.3171 +  //
  1.3172 +  // See also: cmpFastLock and cmpFastUnlock.
  1.3173 +  //
  1.3174 +  // What follows is a specialized inline transliteration of the code
  1.3175 +  // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
  1.3176 +  // another option would be to emit TrySlowEnter and TrySlowExit methods
  1.3177 +  // at startup-time.  These methods would accept arguments as
  1.3178 +  //    (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
  1.3179 +  // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
  1.3180 +  // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
  1.3181 +  // In practice, however, the # of lock sites is bounded and is usually small.
  1.3182 +  // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
  1.3183 +  //    if the processor uses simple bimodal branch predictors keyed by EIP,
  1.3184 +  //    since the helper routines would be called from multiple synchronization
  1.3185 +  // sites.
  1.3186 +  //
  1.3187 +  // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
  1.3188 +  // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
  1.3189 +  // to those specialized methods.  That'd give us a mostly platform-independent
  1.3190 +  // implementation that the JITs could optimize and inline at their pleasure.
  1.3191 +  // Done correctly, the only time we'd need to cross to native code would be
  1.3192 +  // to park() or unpark() threads.  We'd also need a few more unsafe operators
  1.3193 +  // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
  1.3194 +  // (b) provide explicit barriers or fence operations.
  1.3195 +  //
  1.3196 +  // TODO:
  1.3197 +  //
  1.3198 +  // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
  1.3199 +  //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
  1.3200 +  //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
  1.3201 +  //    the lock operators would typically be faster than reifying Self.
  1.3202 +  //
  1.3203 +  // *  Ideally I'd define the primitives as:
  1.3204 +  //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
  1.3205 +  //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
  1.3206 +  //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
  1.3207 +  //    Instead, we're stuck with the rather awkward and brittle register assignments below.
  1.3208 +  //    Furthermore the register assignments are overconstrained, possibly resulting in
  1.3209 +  //    sub-optimal code near the synchronization site.
  1.3210 +  //
  1.3211 +  // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
  1.3212 +  //    Alternately, use a better sp-proximity test.
  1.3213 +  //
  1.3214 +  // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
  1.3215 +  //    Either one is sufficient to uniquely identify a thread.
  1.3216 +  //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
  1.3217 +  //
  1.3218 +  // *  Intrinsify notify() and notifyAll() for the common cases where the
  1.3219 +  //    object is locked by the calling thread but the waitlist is empty,
  1.3220 +  //    avoiding the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
  1.3221 +  //
  1.3222 +  // *  Use jccb and jmpb instead of jcc and jmp to improve code density.
  1.3223 +  //    But beware of excessive branch density on AMD Opterons.
  1.3224 +  //
  1.3225 +  // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
  1.3226 +  //    or failure of the fast-path.  If the fast-path fails then we pass
  1.3227 +  //    control to the slow-path, typically in C.  In Fast_Lock and
  1.3228 +  //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
  1.3229 +  //    will emit a conditional branch immediately after the node.
  1.3230 +  //    So we have branches to branches and lots of ICC.ZF games.
  1.3231 +  //    Instead, it might be better to have C2 pass a "FailureLabel"
  1.3232 +  //    into Fast_Lock and Fast_Unlock.  In the case of success, control
  1.3233 +  //    will drop through the node.  ICC.ZF is undefined at exit.
  1.3234 +  //    In the case of failure, the node will branch directly to the
  1.3235 +  //    FailureLabel
  1.3236 +
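         +  // For orientation, a heavily simplified pseudo-C outline of the stack-locking
         +  // fast path emitted below (biased locking, counters, and the inflated-monitor
         +  // path are omitted; CAS, dhw and ZF are shorthand, not real identifiers):
         +  //
         +  //   mark = obj->mark;                             // [FETCH]
         +  //   if (mark & 2) goto inflated;                  // monitor bit set
         +  //   box->dhw = mark | 1;                          // anticipate a successful CAS
         +  //   if (CAS(&obj->mark, mark | 1, box)) ZF = 1;   // stack-lock acquired
         +  //   else {                                        // rax now holds the current mark
         +  //     box->dhw = (rax - rsp) & 0xFFFFF003;
         +  //     ZF = (box->dhw == 0);                       // 0 => recursive lock by this thread
         +  //   }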
  1.3237 +
  1.3238 +  // obj: object to lock
  1.3239 +  // box: on-stack box address (displaced header location) - KILLED
  1.3240 +  // rax: tmp -- KILLED
  1.3241 +  // scr: tmp -- KILLED
  1.3242 +  enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{
  1.3243 +
  1.3244 +    Register objReg = as_Register($obj$$reg);
  1.3245 +    Register boxReg = as_Register($box$$reg);
  1.3246 +    Register tmpReg = as_Register($tmp$$reg);
  1.3247 +    Register scrReg = as_Register($scr$$reg);
  1.3248 +
  1.3249 +    // Ensure the register assignments are disjoint
  1.3250 +    guarantee (objReg != boxReg, "") ;
  1.3251 +    guarantee (objReg != tmpReg, "") ;
  1.3252 +    guarantee (objReg != scrReg, "") ;
  1.3253 +    guarantee (boxReg != tmpReg, "") ;
  1.3254 +    guarantee (boxReg != scrReg, "") ;
  1.3255 +    guarantee (tmpReg == as_Register(EAX_enc), "") ;
  1.3256 +
  1.3257 +    MacroAssembler masm(&cbuf);
  1.3258 +
  1.3259 +    if (_counters != NULL) {
  1.3260 +      masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
  1.3261 +    }
  1.3262 +    if (EmitSync & 1) {
  1.3263 +        // set box->dhw = unused_mark (3)
  1.3264 +        // Force all sync thru slow-path: slow_enter() and slow_exit()
  1.3265 +        masm.movl (Address(boxReg, 0), intptr_t(markOopDesc::unused_mark())) ;
  1.3266 +        masm.cmpl (rsp, 0) ;
  1.3267 +    } else
  1.3268 +    if (EmitSync & 2) {
  1.3269 +        Label DONE_LABEL ;
  1.3270 +        if (UseBiasedLocking) {
  1.3271 +           // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
  1.3272 +           masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
  1.3273 +        }
  1.3274 +
  1.3275 +        masm.movl  (tmpReg, Address(objReg, 0)) ;          // fetch markword
  1.3276 +        masm.orl   (tmpReg, 0x1);
  1.3277 +        masm.movl  (Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
  1.3278 +        if (os::is_MP()) { masm.lock();  }
  1.3279 +        masm.cmpxchg(boxReg, Address(objReg, 0));          // Updates tmpReg
  1.3280 +        masm.jcc(Assembler::equal, DONE_LABEL);
  1.3281 +        // Recursive locking
  1.3282 +        masm.subl(tmpReg, rsp);
  1.3283 +        masm.andl(tmpReg, 0xFFFFF003 );
  1.3284 +        masm.movl(Address(boxReg, 0), tmpReg);
  1.3285 +        masm.bind(DONE_LABEL) ;
  1.3286 +    } else {
  1.3287 +      // Possible cases that we'll encounter in fast_lock
  1.3288 +      // ------------------------------------------------
  1.3289 +      // * Inflated
  1.3290 +      //    -- unlocked
  1.3291 +      //    -- Locked
  1.3292 +      //       = by self
  1.3293 +      //       = by other
  1.3294 +      // * biased
  1.3295 +      //    -- by Self
  1.3296 +      //    -- by other
  1.3297 +      // * neutral
  1.3298 +      // * stack-locked
  1.3299 +      //    -- by self
  1.3300 +      //       = sp-proximity test hits
  1.3301 +      //       = sp-proximity test generates false-negative
  1.3302 +      //    -- by other
  1.3303 +      //
  1.3304 +
  1.3305 +      Label IsInflated, DONE_LABEL, PopDone ;
  1.3306 +
  1.3307 +      // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  1.3308 +      // order to reduce the number of conditional branches in the most common cases.
  1.3309 +      // Beware -- there's a subtle invariant that fetch of the markword
  1.3310 +      // at [FETCH], below, will never observe a biased encoding (*101b).
  1.3311 +      // If this invariant is not held we risk exclusion (safety) failure.
  1.3312 +      if (UseBiasedLocking) {
  1.3313 +        masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
  1.3314 +      }
  1.3315 +
  1.3316 +      masm.movl  (tmpReg, Address(objReg, 0)) ;        // [FETCH]
  1.3317 +      masm.testl (tmpReg, 0x02) ;                      // Inflated v (Stack-locked or neutral)
  1.3318 +      masm.jccb  (Assembler::notZero, IsInflated) ;
  1.3319 +
  1.3320 +      // Attempt stack-locking ...
  1.3321 +      masm.orl   (tmpReg, 0x1);
  1.3322 +      masm.movl  (Address(boxReg, 0), tmpReg);            // Anticipate successful CAS
  1.3323 +      if (os::is_MP()) { masm.lock();  }
  1.3324 +      masm.cmpxchg(boxReg, Address(objReg, 0));           // Updates tmpReg
  1.3325 +      if (_counters != NULL) {
  1.3326 +        masm.cond_inc32(Assembler::equal,
  1.3327 +                        ExternalAddress((address)_counters->fast_path_entry_count_addr()));
  1.3328 +      }
  1.3329 +      masm.jccb (Assembler::equal, DONE_LABEL);
  1.3330 +
  1.3331 +      // Recursive locking
  1.3332 +      masm.subl(tmpReg, rsp);
  1.3333 +      masm.andl(tmpReg, 0xFFFFF003 );
  1.3334 +      masm.movl(Address(boxReg, 0), tmpReg);
  1.3335 +      if (_counters != NULL) {
  1.3336 +        masm.cond_inc32(Assembler::equal,
  1.3337 +                        ExternalAddress((address)_counters->fast_path_entry_count_addr()));
  1.3338 +      }
  1.3339 +      masm.jmp  (DONE_LABEL) ;
  1.3340 +
  1.3341 +      masm.bind (IsInflated) ;
  1.3342 +
  1.3343 +      // The object is inflated.
  1.3344 +      //
  1.3345 +      // TODO-FIXME: eliminate the ugly use of manifest constants:
  1.3346 +      //   Use markOopDesc::monitor_value instead of "2".
  1.3347 +      //   use markOop::unused_mark() instead of "3".
  1.3348 +      // The tmpReg value is an objectMonitor reference ORed with
  1.3349 +      // markOopDesc::monitor_value (2).   We can either convert tmpReg to an
  1.3350 +      // objectmonitor pointer by masking off the "2" bit or we can just
  1.3351 +      // use tmpReg as an objectmonitor pointer but bias the objectmonitor
  1.3352 +      // field offsets with "-2" to compensate for and annul the low-order tag bit.
  1.3353 +      //
  1.3354 +      // I use the latter as it avoids AGI stalls.
  1.3355 +      // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
  1.3356 +      // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
  1.3357 +      //
  1.3358 +      #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
  1.3359 +
  1.3360 +      // boxReg refers to the on-stack BasicLock in the current frame.
  1.3361 +      // We'd like to write:
  1.3362 +      //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
  1.3363 +      // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
  1.3364 +      // additional latency as we have another ST in the store buffer that must drain.
  1.3365 +
  1.3366 +      if (EmitSync & 8192) {
  1.3367 +         masm.movl  (Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
  1.3368 +         masm.get_thread (scrReg) ;
  1.3369 +         masm.movl  (boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
  1.3370 +         masm.movl  (tmpReg, 0);                         // consider: xor vs mov
  1.3371 +         if (os::is_MP()) { masm.lock(); }
  1.3372 +         masm.cmpxchg (scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
  1.3373 +      } else
  1.3374 +      if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
  1.3375 +         masm.movl (scrReg, boxReg) ;
  1.3376 +         masm.movl (boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
  1.3377 +
  1.3378 +         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
  1.3379 +         if ((EmitSync & 2048) && VM_Version::supports_3dnow() && os::is_MP()) {
  1.3380 +            // prefetchw [eax + Offset(_owner)-2]
  1.3381 +            masm.emit_raw (0x0F) ;
  1.3382 +            masm.emit_raw (0x0D) ;
  1.3383 +            masm.emit_raw (0x48) ;
  1.3384 +            masm.emit_raw (ObjectMonitor::owner_offset_in_bytes()-2) ;
  1.3385 +         }
  1.3386 +
  1.3387 +         if ((EmitSync & 64) == 0) {
  1.3388 +           // Optimistic form: consider XORL tmpReg,tmpReg
  1.3389 +           masm.movl  (tmpReg, 0 ) ;
  1.3390 +         } else {
  1.3391 +           // Can suffer RTS->RTO upgrades on shared or cold $ lines
  1.3392 +           // Test-And-CAS instead of CAS
  1.3393 +           masm.movl  (tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax = m->_owner
  1.3394 +           masm.testl (tmpReg, tmpReg) ;                   // Locked ?
  1.3395 +           masm.jccb  (Assembler::notZero, DONE_LABEL) ;
  1.3396 +         }
  1.3397 +
  1.3398 +         // Appears unlocked - try to swing _owner from null to non-null.
  1.3399 +         // Ideally, I'd manifest "Self" with get_thread and then attempt
  1.3400 +         // to CAS the register containing Self into m->Owner.
  1.3401 +         // But we don't have enough registers, so instead we can either try to CAS
  1.3402 +         // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  1.3403 +         // we later store "Self" into m->Owner.  Transiently storing a stack address
  1.3404 +         // (rsp or the address of the box) into  m->owner is harmless.
  1.3405 +         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  1.3406 +         if (os::is_MP()) { masm.lock();  }
  1.3407 +         masm.cmpxchg (scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
  1.3408 +         masm.movl  (Address(scrReg, 0), 3) ;          // box->_displaced_header = 3
  1.3409 +         masm.jccb  (Assembler::notZero, DONE_LABEL) ;
  1.3410 +         masm.get_thread (scrReg) ;                    // beware: clobbers ICCs
  1.3411 +         masm.movl  (Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ;
  1.3412 +         masm.xorl  (boxReg, boxReg) ;                 // set icc.ZFlag = 1 to indicate success
  1.3413 +
  1.3414 +         // If the CAS fails we can either retry or pass control to the slow-path.
  1.3415 +         // We use the latter tactic.
  1.3416 +         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  1.3417 +         // If the CAS was successful ...
  1.3418 +         //   Self has acquired the lock
  1.3419 +         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  1.3420 +         // Intentional fall-through into DONE_LABEL ...
  1.3421 +      } else {
  1.3422 +         masm.movl (Address(boxReg, 0), 3) ;       // results in ST-before-CAS penalty
  1.3423 +         masm.movl (boxReg, tmpReg) ;
  1.3424 +
  1.3425 +         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
  1.3426 +         if ((EmitSync & 2048) && VM_Version::supports_3dnow() && os::is_MP()) {
  1.3427 +            // prefetchw [eax + Offset(_owner)-2]
  1.3428 +            masm.emit_raw (0x0F) ;
  1.3429 +            masm.emit_raw (0x0D) ;
  1.3430 +            masm.emit_raw (0x48) ;
  1.3431 +            masm.emit_raw (ObjectMonitor::owner_offset_in_bytes()-2) ;
  1.3432 +         }
  1.3433 +
  1.3434 +         if ((EmitSync & 64) == 0) {
  1.3435 +           // Optimistic form
  1.3436 +           masm.xorl  (tmpReg, tmpReg) ;
  1.3437 +         } else {
  1.3438 +           // Can suffer RTS->RTO upgrades on shared or cold $ lines
  1.3439 +           masm.movl  (tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax = m->_owner
  1.3440 +           masm.testl (tmpReg, tmpReg) ;                   // Locked ?
  1.3441 +           masm.jccb  (Assembler::notZero, DONE_LABEL) ;
  1.3442 +         }
  1.3443 +
  1.3444 +         // Appears unlocked - try to swing _owner from null to non-null.
  1.3445 +         // Use either "Self" (in scr) or rsp as thread identity in _owner.
  1.3446 +         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  1.3447 +         masm.get_thread (scrReg) ;
  1.3448 +         if (os::is_MP()) { masm.lock(); }
  1.3449 +         masm.cmpxchg (scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
  1.3450 +
  1.3451 +         // If the CAS fails we can either retry or pass control to the slow-path.
  1.3452 +         // We use the latter tactic.
  1.3453 +         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  1.3454 +         // If the CAS was successful ...
  1.3455 +         //   Self has acquired the lock
  1.3456 +         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  1.3457 +         // Intentional fall-through into DONE_LABEL ...
  1.3458 +      }
  1.3459 +
  1.3460 +      // DONE_LABEL is a hot target - we'd really like to place it at the
  1.3461 +      // start of a cache line by padding with NOPs.
  1.3462 +      // See the AMD and Intel software optimization manuals for the
  1.3463 +      // most efficient "long" NOP encodings.
  1.3464 +      // Unfortunately none of our alignment mechanisms suffice.
  1.3465 +      masm.bind(DONE_LABEL);
  1.3466 +
  1.3467 +      // Avoid branch-to-branch on AMD processors
  1.3468 +      // This appears to be superstition.
  1.3469 +      if (EmitSync & 32) masm.nop() ;
  1.3470 +
  1.3471 +
  1.3472 +      // At DONE_LABEL the icc ZFlag is set as follows ...
  1.3473 +      // Fast_Unlock uses the same protocol.
  1.3474 +      // ZFlag == 1 -> Success
  1.3475 +      // ZFlag == 0 -> Failure - force control through the slow-path
  1.3476 +    }
  1.3477 +  %}
  1.3478 +
  1.3479 +  // obj: object to unlock
  1.3480 +  // box: box address (displaced header location), killed.  Must be EAX.
  1.3481 +  // rbx: killed tmp; cannot be obj nor box.
  1.3482 +  //
  1.3483 +  // Some commentary on balanced locking:
  1.3484 +  //
  1.3485 +  // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
  1.3486 +  // Methods that don't have provably balanced locking are forced to run in the
  1.3487 +  // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
  1.3488 +  // The interpreter provides two properties:
  1.3489 +  // I1:  At return-time the interpreter automatically and quietly unlocks any
  1.3490 +  //      objects acquired by the current activation (frame).  Recall that the
  1.3491 +  //      interpreter maintains an on-stack list of locks currently held by
  1.3492 +  //      a frame.
  1.3493 +  // I2:  If a method attempts to unlock an object that is not held by the
  1.3494 +  //      frame, the interpreter throws IMSX.
  1.3495 +  //
  1.3496 +  // Let's say A(), which has provably balanced locking, acquires O and then calls B().
  1.3497 +  // B() doesn't have provably balanced locking so it runs in the interpreter.
  1.3498 +  // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
  1.3499 +  // is still locked by A().
  1.3500 +  //
  1.3501 +  // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
  1.3502 +  // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
  1.3503 +  // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
  1.3504 +  // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
  1.3505 +
  1.3506 +  enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{
  1.3507 +
  1.3508 +    Register objReg = as_Register($obj$$reg);
  1.3509 +    Register boxReg = as_Register($box$$reg);
  1.3510 +    Register tmpReg = as_Register($tmp$$reg);
  1.3511 +
  1.3512 +    guarantee (objReg != boxReg, "") ;
  1.3513 +    guarantee (objReg != tmpReg, "") ;
  1.3514 +    guarantee (boxReg != tmpReg, "") ;
  1.3515 +    guarantee (boxReg == as_Register(EAX_enc), "") ;
  1.3516 +    MacroAssembler masm(&cbuf);
  1.3517 +
  1.3518 +    if (EmitSync & 4) {
  1.3519 +      // Disable - inhibit all inlining.  Force control through the slow-path
  1.3520 +      masm.cmpl (rsp, 0) ;
  1.3521 +    } else
  1.3522 +    if (EmitSync & 8) {
  1.3523 +      Label DONE_LABEL ;
  1.3524 +      if (UseBiasedLocking) {
  1.3525 +         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  1.3526 +      }
  1.3527 +      // classic stack-locking code ...
  1.3528 +      masm.movl  (tmpReg, Address(boxReg, 0)) ;
  1.3529 +      masm.testl (tmpReg, tmpReg) ;
  1.3530 +      masm.jcc   (Assembler::zero, DONE_LABEL) ;
  1.3531 +      if (os::is_MP()) { masm.lock(); }
  1.3532 +      masm.cmpxchg(tmpReg, Address(objReg, 0));          // Uses EAX which is box
  1.3533 +      masm.bind(DONE_LABEL);
  1.3534 +    } else {
  1.3535 +      Label DONE_LABEL, Stacked, CheckSucc, Inflated ;
  1.3536 +
  1.3537 +      // Critically, the biased locking test must have precedence over
  1.3538 +      // and appear before the (box->dhw == 0) recursive stack-lock test.
  1.3539 +      if (UseBiasedLocking) {
  1.3540 +         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  1.3541 +      }
  1.3542 +
  1.3543 +      masm.cmpl  (Address(boxReg, 0), 0) ;            // Examine the displaced header
  1.3544 +      masm.movl  (tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
  1.3545 +      masm.jccb  (Assembler::zero, DONE_LABEL) ;      // 0 indicates recursive stack-lock
  1.3546 +
  1.3547 +      masm.testl (tmpReg, 0x02) ;                     // Inflated?
  1.3548 +      masm.jccb  (Assembler::zero, Stacked) ;
  1.3549 +
  1.3550 +      masm.bind  (Inflated) ;
  1.3551 +      // It's inflated.
  1.3552 +      // Despite our balanced locking property, we still check that m->_owner == Self,
  1.3553 +      // as Java routines or native JNI code called by this thread might
  1.3554 +      // have released the lock.
  1.3555 +      // Refer to the comments in synchronizer.cpp for how we might encode extra
  1.3556 +      // state in _succ so we can avoid fetching EntryList|cxq.
  1.3557 +      //
  1.3558 +      // I'd like to add more cases in fast_lock() and fast_unlock() --
  1.3559 +      // such as recursive enter and exit -- but we have to be wary of
  1.3560 +      // I$ bloat, T$ effects and BP$ effects.
  1.3561 +      //
  1.3562 +      // If there's no contention try a 1-0 exit.  That is, exit without
  1.3563 +      // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  1.3564 +      // we detect and recover from the race that the 1-0 exit admits.
  1.3565 +      //
  1.3566 +      // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
  1.3567 +      // before it STs null into _owner, releasing the lock.  Updates
  1.3568 +      // to data protected by the critical section must be visible before
  1.3569 +      // we drop the lock (and thus before any other thread could acquire
  1.3570 +      // the lock and observe the fields protected by the lock).
  1.3571 +      // IA32's memory-model is SPO, so STs are ordered with respect to
  1.3572 +      // each other and there's no need for an explicit barrier (fence).
  1.3573 +      // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  1.3574 +
  1.3575 +      masm.get_thread (boxReg) ;
  1.3576 +      if ((EmitSync & 4096) && VM_Version::supports_3dnow() && os::is_MP()) {
  1.3577 +         // prefetchw [ebx + Offset(_owner)-2]
  1.3578 +         masm.emit_raw (0x0F) ;
  1.3579 +         masm.emit_raw (0x0D) ;
  1.3580 +         masm.emit_raw (0x4B) ;
  1.3581 +         masm.emit_raw (ObjectMonitor::owner_offset_in_bytes()-2) ;
  1.3582 +      }
  1.3583 +
  1.3584 +      // Note that we could employ various encoding schemes to reduce
  1.3585 +      // the number of loads below (currently 4) to just 2 or 3.
  1.3586 +      // Refer to the comments in synchronizer.cpp.
  1.3587 +      // In practice the chain of fetches doesn't seem to impact performance, however.
  1.3588 +      if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
  1.3589 +         // Attempt to reduce branch density - AMD's branch predictor.
  1.3590 +         masm.xorl  (boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
  1.3591 +         masm.orl   (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
  1.3592 +         masm.orl   (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
  1.3593 +         masm.orl   (boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
  1.3594 +         masm.jccb  (Assembler::notZero, DONE_LABEL) ;
  1.3595 +         masm.movl  (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ;
  1.3596 +         masm.jmpb  (DONE_LABEL) ;
  1.3597 +      } else {
  1.3598 +         masm.xorl  (boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
  1.3599 +         masm.orl   (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
  1.3600 +         masm.jccb  (Assembler::notZero, DONE_LABEL) ;
  1.3601 +         masm.movl  (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
  1.3602 +         masm.orl   (boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
  1.3603 +         masm.jccb  (Assembler::notZero, CheckSucc) ;
  1.3604 +         masm.movl  (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ;
  1.3605 +         masm.jmpb  (DONE_LABEL) ;
  1.3606 +      }
  1.3607 +
  1.3608 +      // The following code fragment (EmitSync & 65536) improves the performance of
  1.3609 +      // contended applications and contended synchronization microbenchmarks.
  1.3610 +      // Unfortunately the emission of the code - even though not executed - causes regressions
  1.3611 +      // in scimark and jetstream, evidently because of $ effects.  Replacing the code
  1.3612 +      // with an equal number of never-executed NOPs results in the same regression.
  1.3613 +      // We leave it off by default.
  1.3614 +
  1.3615 +      if ((EmitSync & 65536) != 0) {
  1.3616 +         Label LSuccess, LGoSlowPath ;
  1.3617 +
  1.3618 +         masm.bind  (CheckSucc) ;
  1.3619 +
  1.3620 +         // Optional pre-test ... it's safe to elide this
  1.3621 +         if ((EmitSync & 16) == 0) {
  1.3622 +            masm.cmpl  (Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
  1.3623 +            masm.jccb  (Assembler::zero, LGoSlowPath) ;
  1.3624 +         }
  1.3625 +
  1.3626 +         // We have a classic Dekker-style idiom:
  1.3627 +         //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
  1.3628 +         // There are a number of ways to implement the barrier:
  1.3629 +         // (1) lock:andl &m->_owner, 0
  1.3630 +         //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
  1.3631 +         //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
  1.3632 +         //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
  1.3633 +         // (2) If supported, an explicit MFENCE is appealing.
  1.3634 +         //     In older IA32 processors MFENCE is slower than lock:add or xchg
  1.3635 +         //     particularly if the write-buffer is full, as might be the case
  1.3636 +         //     when stores closely precede the fence or fence-equivalent instruction.
  1.3637 +         //     In more modern implementations MFENCE appears faster, however.
  1.3638 +         // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
  1.3639 +         //     The $lines underlying the top-of-stack should be in M-state.
  1.3640 +         //     The locked add instruction is serializing, of course.
  1.3641 +         // (4) Use xchg, which is serializing
  1.3642 +         //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
  1.3643 +         // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
  1.3644 +         //     The integer condition codes will tell us if succ was 0.
  1.3645 +         //     Since _succ and _owner should reside in the same $line and
  1.3646 +         //     we just stored into _owner, it's likely that the $line
  1.3647 +         //     remains in M-state for the lock:orl.
  1.3648 +         //
  1.3649 +         // We currently use (3), although switching to (2) is likely the
  1.3650 +         // better choice going forward.
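         +         //
         +         // A minimal sketch of option (4), assuming the assembler exposes an
         +         // xchgl(Register, Address) form (an assumption -- that form is not
         +         // used anywhere in this file):
         +         //
         +         //    masm.movl  (boxReg, 0) ;
         +         //    masm.xchgl (boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
         +         //
         +         // XCHG with a memory operand carries an implicit LOCK prefix, so the
         +         // releasing store and the serializing barrier come as one instruction
         +         // and the subsequent LD of m->_succ is ordered after it.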
  1.3651 +
  1.3652 +         masm.movl  (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ;
  1.3653 +         if (os::is_MP()) {
  1.3654 +            if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
  1.3655 +              masm.emit_raw (0x0F) ;    // MFENCE ...
  1.3656 +              masm.emit_raw (0xAE) ;
  1.3657 +              masm.emit_raw (0xF0) ;
  1.3658 +            } else {
  1.3659 +              masm.lock () ; masm.addl (Address(rsp, 0), 0) ;
  1.3660 +            }
  1.3661 +         }
  1.3662 +         // Ratify _succ remains non-null
  1.3663 +         masm.cmpl  (Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
  1.3664 +         masm.jccb  (Assembler::notZero, LSuccess) ;
  1.3665 +
  1.3666 +         masm.xorl  (boxReg, boxReg) ;                  // box is really EAX
  1.3667 +         if (os::is_MP()) { masm.lock(); }
  1.3668 +         masm.cmpxchg(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
  1.3669 +         masm.jccb  (Assembler::notEqual, LSuccess) ;
  1.3670 +         // Since we're low on registers we installed rsp as a placeholder in _owner.
  1.3671 +         // Now install Self over rsp.  This is safe as we're transitioning from
  1.3672 +         // non-null to non-null.
  1.3673 +         masm.get_thread (boxReg) ;
  1.3674 +         masm.movl  (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
  1.3675 +         // Intentional fall-through into LGoSlowPath ...
  1.3676 +
  1.3677 +         masm.bind  (LGoSlowPath) ;
  1.3678 +         masm.orl   (boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
  1.3679 +         masm.jmpb  (DONE_LABEL) ;
  1.3680 +
  1.3681 +         masm.bind  (LSuccess) ;
  1.3682 +         masm.xorl  (boxReg, boxReg) ;                 // set ICC.ZF=1 to indicate success
  1.3683 +         masm.jmpb  (DONE_LABEL) ;
  1.3684 +      }
  1.3685 +
  1.3686 +      masm.bind (Stacked) ;
  1.3687 +      // It's not inflated and it's not recursively stack-locked and it's not biased.
  1.3688 +      // It must be stack-locked.
  1.3689 +      // Try to reset the header to displaced header.
  1.3690 +      // The "box" value on the stack is stable, so we can reload
  1.3691 +      // and be assured we observe the same value as above.
  1.3692 +      masm.movl (tmpReg, Address(boxReg, 0)) ;
  1.3693 +      if (os::is_MP()) {   masm.lock();    }
  1.3694 +      masm.cmpxchg(tmpReg, Address(objReg, 0)); // Uses EAX which is box
  1.3695 +      // Intentional fall-through into DONE_LABEL
  1.3696 +
  1.3697 +
  1.3698 +      // DONE_LABEL is a hot target - we'd really like to place it at the
  1.3699 +      // start of a cache line by padding with NOPs.
  1.3700 +      // See the AMD and Intel software optimization manuals for the
  1.3701 +      // most efficient "long" NOP encodings.
  1.3702 +      // Unfortunately none of our alignment mechanisms suffice.
  1.3703 +      if ((EmitSync & 65536) == 0) {
  1.3704 +         masm.bind (CheckSucc) ;
  1.3705 +      }
  1.3706 +      masm.bind(DONE_LABEL);
  1.3707 +
  1.3708 +      // Avoid branch to branch on AMD processors
  1.3709 +      if (EmitSync & 32768) { masm.nop() ; }
  1.3710 +    }
  1.3711 +  %}
  1.3712 +
  1.3713 +  enc_class enc_String_Compare() %{
  1.3714 +    Label ECX_GOOD_LABEL, LENGTH_DIFF_LABEL,
  1.3715 +          POP_LABEL, DONE_LABEL, CONT_LABEL,
  1.3716 +          WHILE_HEAD_LABEL;
  1.3717 +    MacroAssembler masm(&cbuf);
  1.3718 +
  1.3719 +    // Get the first character position in both strings
  1.3720 +    //         [8] char array, [12] offset, [16] count
  1.3721 +    int value_offset  = java_lang_String::value_offset_in_bytes();
  1.3722 +    int offset_offset = java_lang_String::offset_offset_in_bytes();
  1.3723 +    int count_offset  = java_lang_String::count_offset_in_bytes();
  1.3724 +    int base_offset   = arrayOopDesc::base_offset_in_bytes(T_CHAR);
  1.3725 +
  1.3726 +    masm.movl(rax, Address(rsi, value_offset));
  1.3727 +    masm.movl(rcx, Address(rsi, offset_offset));
  1.3728 +    masm.leal(rax, Address(rax, rcx, Address::times_2, base_offset));
  1.3729 +    masm.movl(rbx, Address(rdi, value_offset));
  1.3730 +    masm.movl(rcx, Address(rdi, offset_offset));
  1.3731 +    masm.leal(rbx, Address(rbx, rcx, Address::times_2, base_offset));
  1.3732 +
  1.3733 +    // Compute the minimum of the string lengths (rsi) and the
  1.3734 +    // difference of the string lengths (stack)
  1.3735 +
  1.3736 +
  1.3737 +    if (VM_Version::supports_cmov()) {
  1.3738 +      masm.movl(rdi, Address(rdi, count_offset));
  1.3739 +      masm.movl(rsi, Address(rsi, count_offset));
  1.3740 +      masm.movl(rcx, rdi);
  1.3741 +      masm.subl(rdi, rsi);
  1.3742 +      masm.pushl(rdi);
  1.3743 +      masm.cmovl(Assembler::lessEqual, rsi, rcx);
  1.3744 +    } else {
  1.3745 +      masm.movl(rdi, Address(rdi, count_offset));
  1.3746 +      masm.movl(rcx, Address(rsi, count_offset));
  1.3747 +      masm.movl(rsi, rdi);
  1.3748 +      masm.subl(rdi, rcx);
  1.3749 +      masm.pushl(rdi);
  1.3750 +      masm.jcc(Assembler::lessEqual, ECX_GOOD_LABEL);
  1.3751 +      masm.movl(rsi, rcx);
  1.3752 +      // rsi holds min, rcx is unused
  1.3753 +    }
  1.3754 +
  1.3755 +    // Is the minimum length zero?
  1.3756 +    masm.bind(ECX_GOOD_LABEL);
  1.3757 +    masm.testl(rsi, rsi);
  1.3758 +    masm.jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  1.3759 +
  1.3760 +    // Load first characters
  1.3761 +    masm.load_unsigned_word(rcx, Address(rbx, 0));
  1.3762 +    masm.load_unsigned_word(rdi, Address(rax, 0));
  1.3763 +
  1.3764 +    // Compare first characters
  1.3765 +    masm.subl(rcx, rdi);
  1.3766 +    masm.jcc(Assembler::notZero,  POP_LABEL);
  1.3767 +    masm.decrement(rsi);
  1.3768 +    masm.jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  1.3769 +
  1.3770 +    {
  1.3771 +      // Check after comparing first character to see if strings are equivalent
  1.3772 +      Label LSkip2;
  1.3773 +      // Check if the strings start at same location
  1.3774 +      masm.cmpl(rbx,rax);
  1.3775 +      masm.jcc(Assembler::notEqual, LSkip2);
  1.3776 +
  1.3777 +      // Check if the length difference is zero (from stack)
  1.3778 +      masm.cmpl(Address(rsp, 0), 0x0);
  1.3779 +      masm.jcc(Assembler::equal,  LENGTH_DIFF_LABEL);
  1.3780 +
  1.3781 +      // Strings might not be equivalent
  1.3782 +      masm.bind(LSkip2);
  1.3783 +    }
  1.3784 +
  1.3785 +    // Shift rax and rbx to the end of the arrays, negate min
  1.3786 +    masm.leal(rax, Address(rax, rsi, Address::times_2, 2));
  1.3787 +    masm.leal(rbx, Address(rbx, rsi, Address::times_2, 2));
  1.3788 +    masm.negl(rsi);
  1.3789 +
  1.3790 +    // Compare the rest of the characters
  1.3791 +    masm.bind(WHILE_HEAD_LABEL);
  1.3792 +    masm.load_unsigned_word(rcx, Address(rbx, rsi, Address::times_2, 0));
  1.3793 +    masm.load_unsigned_word(rdi, Address(rax, rsi, Address::times_2, 0));
  1.3794 +    masm.subl(rcx, rdi);
  1.3795 +    masm.jcc(Assembler::notZero, POP_LABEL);
  1.3796 +    masm.increment(rsi);
  1.3797 +    masm.jcc(Assembler::notZero, WHILE_HEAD_LABEL);
  1.3798 +
  1.3799 +    // Strings are equal up to min length.  Return the length difference.
  1.3800 +    masm.bind(LENGTH_DIFF_LABEL);
  1.3801 +    masm.popl(rcx);
  1.3802 +    masm.jmp(DONE_LABEL);
  1.3803 +
  1.3804 +    // Discard the stored length difference
  1.3805 +    masm.bind(POP_LABEL);
  1.3806 +    masm.addl(rsp, 4);
  1.3807 +
  1.3808 +    // That's it
  1.3809 +    masm.bind(DONE_LABEL);
  1.3810 +  %}
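         +
         +  // The String-compare stub above implements java.lang.String.compareTo
         +  // semantics.  A hedged C-style sketch of the value it computes (names are
         +  // illustrative only, not code from this file):
         +  //
         +  //   int string_compare(const jchar* s1, int len1, const jchar* s2, int len2) {
         +  //     int min = len1 < len2 ? len1 : len2;
         +  //     for (int i = 0; i < min; i++) {
         +  //       int diff = (int) s1[i] - (int) s2[i];   // chars are unsigned 16-bit
         +  //       if (diff != 0) return diff;             // first differing character
         +  //     }
         +  //     return len1 - len2;                       // equal prefix: length difference
         +  //   }
         +  //
         +  // The assembly walks both arrays with a single negative index (rsi) that
         +  // counts up to zero, and parks the length difference on the stack until
         +  // the loop finishes.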
  1.3811 +
  1.3812 +  enc_class enc_pop_rdx() %{
  1.3813 +    emit_opcode(cbuf,0x5A);
  1.3814 +  %}
  1.3815 +
  1.3816 +  enc_class enc_rethrow() %{
  1.3817 +    cbuf.set_inst_mark();
  1.3818 +    emit_opcode(cbuf, 0xE9);        // jmp    entry
  1.3819 +    emit_d32_reloc(cbuf, (int)OptoRuntime::rethrow_stub() - ((int)cbuf.code_end())-4,
  1.3820 +                   runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.3821 +  %}
  1.3822 +
  1.3823 +
  1.3824 +  // Convert a double to an int.  Java semantics require we handle the
  1.3825 +  // corner cases specially.  So we set the rounding mode to 'zero'
  1.3826 +  // (truncate), store the double down as an int, and reset the rounding
  1.3827 +  // mode to 'nearest'.  If the hardware delivers the 'integer indefinite'
  1.3828 +  // value (0x80000000), a runtime call computes the correct result.
  1.3829 +  enc_class D2I_encoding( regD src ) %{
  1.3830 +    // Flip to round-to-zero mode.  We attempted to allow invalid-op
  1.3831 +    // exceptions here, so that a NaN or other corner-case value would
  1.3832 +    // throw an exception (but normal values get converted at full speed).
  1.3833 +    // However, I2C adapters and other float-stack manglers leave pending
  1.3834 +    // invalid-op exceptions hanging.  We would have to clear them before
  1.3835 +    // enabling them and that is more expensive than just testing for the
  1.3836 +    // invalid value Intel stores down in the corner cases.
  1.3837 +    emit_opcode(cbuf,0xD9);            // FLDCW  trunc
  1.3838 +    emit_opcode(cbuf,0x2D);
  1.3839 +    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
  1.3840 +    // Allocate a word
  1.3841 +    emit_opcode(cbuf,0x83);            // SUB ESP,4
  1.3842 +    emit_opcode(cbuf,0xEC);
  1.3843 +    emit_d8(cbuf,0x04);
  1.3844 +    // Encoding assumes a double has been pushed into FPR0.
  1.3845 +    // Store down the double as an int, popping the FPU stack
  1.3846 +    emit_opcode(cbuf,0xDB);            // FISTP [ESP]
  1.3847 +    emit_opcode(cbuf,0x1C);
  1.3848 +    emit_d8(cbuf,0x24);
  1.3849 +    // Restore the rounding mode; mask the exception
  1.3850 +    emit_opcode(cbuf,0xD9);            // FLDCW   std/24-bit mode
  1.3851 +    emit_opcode(cbuf,0x2D);
  1.3852 +    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
  1.3853 +        ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
  1.3854 +        : (int)StubRoutines::addr_fpu_cntrl_wrd_std());
  1.3855 +
  1.3856 +    // Load the converted int; adjust CPU stack
  1.3857 +    emit_opcode(cbuf,0x58);       // POP EAX
  1.3858 +    emit_opcode(cbuf,0x3D);       // CMP EAX,imm
  1.3859 +    emit_d32   (cbuf,0x80000000); //         0x80000000
  1.3860 +    emit_opcode(cbuf,0x75);       // JNE around_slow_call
  1.3861 +    emit_d8    (cbuf,0x07);       // Size of slow_call
  1.3862 +    // Push src onto stack slow-path
  1.3863 +    emit_opcode(cbuf,0xD9 );      // FLD     ST(i)
  1.3864 +    emit_d8    (cbuf,0xC0-1+$src$$reg );
  1.3865 +    // CALL directly to the runtime
  1.3866 +    cbuf.set_inst_mark();
  1.3867 +    emit_opcode(cbuf,0xE8);       // Call into runtime
  1.3868 +    emit_d32_reloc(cbuf, (StubRoutines::d2i_wrapper() - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.3869 +    // Carry on here...
  1.3870 +  %}
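         +
         +  // For reference, the value the encoding above (fast path plus the
         +  // d2i_wrapper slow path) must produce follows the Java d2i rules -- a
         +  // hedged C-style sketch, not the stub's actual code:
         +  //
         +  //   jint java_d2i(jdouble d) {
         +  //     if (d != d)                  return 0;        // NaN converts to 0
         +  //     if (d >= (jdouble) max_jint) return max_jint; // clamp to Integer.MAX_VALUE
         +  //     if (d <= (jdouble) min_jint) return min_jint; // clamp to Integer.MIN_VALUE
         +  //     return (jint) d;                              // truncate toward zero
         +  //   }
         +  //
         +  // FISTP with the truncating control word handles in-range values; the
         +  // 0x80000000 'integer indefinite' result is what routes NaN and
         +  // out-of-range inputs to the runtime call.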
  1.3871 +
  1.3872 +  enc_class D2L_encoding( regD src ) %{
  1.3873 +    emit_opcode(cbuf,0xD9);            // FLDCW  trunc
  1.3874 +    emit_opcode(cbuf,0x2D);
  1.3875 +    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
  1.3876 +    // Allocate a word
  1.3877 +    emit_opcode(cbuf,0x83);            // SUB ESP,8
  1.3878 +    emit_opcode(cbuf,0xEC);
  1.3879 +    emit_d8(cbuf,0x08);
  1.3880 +    // Encoding assumes a double has been pushed into FPR0.
  1.3881 +    // Store down the double as a long, popping the FPU stack
  1.3882 +    emit_opcode(cbuf,0xDF);            // FISTP [ESP]
  1.3883 +    emit_opcode(cbuf,0x3C);
  1.3884 +    emit_d8(cbuf,0x24);
  1.3885 +    // Restore the rounding mode; mask the exception
  1.3886 +    emit_opcode(cbuf,0xD9);            // FLDCW   std/24-bit mode
  1.3887 +    emit_opcode(cbuf,0x2D);
  1.3888 +    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
  1.3889 +        ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
  1.3890 +        : (int)StubRoutines::addr_fpu_cntrl_wrd_std());
  1.3891 +
  1.3892 +    // Load the converted int; adjust CPU stack
  1.3893 +    emit_opcode(cbuf,0x58);       // POP EAX
  1.3894 +    emit_opcode(cbuf,0x5A);       // POP EDX
  1.3895 +    emit_opcode(cbuf,0x81);       // CMP EDX,imm
  1.3896 +    emit_d8    (cbuf,0xFA);       // rdx
  1.3897 +    emit_d32   (cbuf,0x80000000); //         0x80000000
  1.3898 +    emit_opcode(cbuf,0x75);       // JNE around_slow_call
  1.3899 +    emit_d8    (cbuf,0x07+4);     // Size of slow_call
  1.3900 +    emit_opcode(cbuf,0x85);       // TEST EAX,EAX
  1.3901 +    emit_opcode(cbuf,0xC0);       // ModRM: EAX,EAX
  1.3902 +    emit_opcode(cbuf,0x75);       // JNE around_slow_call
  1.3903 +    emit_d8    (cbuf,0x07);       // Size of slow_call
  1.3904 +    // Push src onto stack slow-path
  1.3905 +    emit_opcode(cbuf,0xD9 );      // FLD     ST(i)
  1.3906 +    emit_d8    (cbuf,0xC0-1+$src$$reg );
  1.3907 +    // CALL directly to the runtime
  1.3908 +    cbuf.set_inst_mark();
  1.3909 +    emit_opcode(cbuf,0xE8);       // Call into runtime
  1.3910 +    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.3911 +    // Carry on here...
  1.3912 +  %}
  1.3913 +
  1.3914 +  enc_class X2L_encoding( regX src ) %{
  1.3915 +    // Allocate a word
  1.3916 +    emit_opcode(cbuf,0x83);      // SUB ESP,8
  1.3917 +    emit_opcode(cbuf,0xEC);
  1.3918 +    emit_d8(cbuf,0x08);
  1.3919 +
  1.3920 +    emit_opcode  (cbuf, 0xF3 );  // MOVSS [ESP], src
  1.3921 +    emit_opcode  (cbuf, 0x0F );
  1.3922 +    emit_opcode  (cbuf, 0x11 );
  1.3923 +    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
  1.3924 +
  1.3925 +    emit_opcode(cbuf,0xD9 );     // FLD_S [ESP]
  1.3926 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.3927 +
  1.3928 +    emit_opcode(cbuf,0xD9);      // FLDCW  trunc
  1.3929 +    emit_opcode(cbuf,0x2D);
  1.3930 +    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
  1.3931 +
  1.3932 +    // Encoding assumes a double has been pushed into FPR0.
  1.3933 +    // Store down the double as a long, popping the FPU stack
  1.3934 +    emit_opcode(cbuf,0xDF);      // FISTP [ESP]
  1.3935 +    emit_opcode(cbuf,0x3C);
  1.3936 +    emit_d8(cbuf,0x24);
  1.3937 +
  1.3938 +    // Restore the rounding mode; mask the exception
  1.3939 +    emit_opcode(cbuf,0xD9);      // FLDCW   std/24-bit mode
  1.3940 +    emit_opcode(cbuf,0x2D);
  1.3941 +    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
  1.3942 +      ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
  1.3943 +      : (int)StubRoutines::addr_fpu_cntrl_wrd_std());
  1.3944 +
  1.3945 +    // Load the converted int; adjust CPU stack
  1.3946 +    emit_opcode(cbuf,0x58);      // POP EAX
  1.3947 +
  1.3948 +    emit_opcode(cbuf,0x5A);      // POP EDX
  1.3949 +
  1.3950 +    emit_opcode(cbuf,0x81);      // CMP EDX,imm
  1.3951 +    emit_d8    (cbuf,0xFA);      // rdx
  1.3952 +    emit_d32   (cbuf,0x80000000);//         0x80000000
  1.3953 +
  1.3954 +    emit_opcode(cbuf,0x75);      // JNE around_slow_call
  1.3955 +    emit_d8    (cbuf,0x13+4);    // Size of slow_call
  1.3956 +
  1.3957 +    emit_opcode(cbuf,0x85);      // TEST EAX,EAX
  1.3958 +    emit_opcode(cbuf,0xC0);      // ModRM: EAX,EAX
  1.3959 +
  1.3960 +    emit_opcode(cbuf,0x75);      // JNE around_slow_call
  1.3961 +    emit_d8    (cbuf,0x13);      // Size of slow_call
  1.3962 +
  1.3963 +    // Allocate a word
  1.3964 +    emit_opcode(cbuf,0x83);      // SUB ESP,4
  1.3965 +    emit_opcode(cbuf,0xEC);
  1.3966 +    emit_d8(cbuf,0x04);
  1.3967 +
  1.3968 +    emit_opcode  (cbuf, 0xF3 );  // MOVSS [ESP], src
  1.3969 +    emit_opcode  (cbuf, 0x0F );
  1.3970 +    emit_opcode  (cbuf, 0x11 );
  1.3971 +    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
  1.3972 +
  1.3973 +    emit_opcode(cbuf,0xD9 );     // FLD_S [ESP]
  1.3974 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.3975 +
  1.3976 +    emit_opcode(cbuf,0x83);      // ADD ESP,4
  1.3977 +    emit_opcode(cbuf,0xC4);
  1.3978 +    emit_d8(cbuf,0x04);
  1.3979 +
  1.3980 +    // CALL directly to the runtime
  1.3981 +    cbuf.set_inst_mark();
  1.3982 +    emit_opcode(cbuf,0xE8);       // Call into runtime
  1.3983 +    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.3984 +    // Carry on here...
  1.3985 +  %}
  1.3986 +
  1.3987 +  enc_class XD2L_encoding( regXD src ) %{
  1.3988 +    // Allocate a word
  1.3989 +    emit_opcode(cbuf,0x83);      // SUB ESP,8
  1.3990 +    emit_opcode(cbuf,0xEC);
  1.3991 +    emit_d8(cbuf,0x08);
  1.3992 +
  1.3993 +    emit_opcode  (cbuf, 0xF2 );  // MOVSD [ESP], src
  1.3994 +    emit_opcode  (cbuf, 0x0F );
  1.3995 +    emit_opcode  (cbuf, 0x11 );
  1.3996 +    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
  1.3997 +
  1.3998 +    emit_opcode(cbuf,0xDD );     // FLD_D [ESP]
  1.3999 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.4000 +
  1.4001 +    emit_opcode(cbuf,0xD9);      // FLDCW  trunc
  1.4002 +    emit_opcode(cbuf,0x2D);
  1.4003 +    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
  1.4004 +
  1.4005 +    // Encoding assumes a double has been pushed into FPR0.
  1.4006 +    // Store down the double as a long, popping the FPU stack
  1.4007 +    emit_opcode(cbuf,0xDF);      // FISTP [ESP]
  1.4008 +    emit_opcode(cbuf,0x3C);
  1.4009 +    emit_d8(cbuf,0x24);
  1.4010 +
  1.4011 +    // Restore the rounding mode; mask the exception
  1.4012 +    emit_opcode(cbuf,0xD9);      // FLDCW   std/24-bit mode
  1.4013 +    emit_opcode(cbuf,0x2D);
  1.4014 +    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
  1.4015 +      ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
  1.4016 +      : (int)StubRoutines::addr_fpu_cntrl_wrd_std());
  1.4017 +
  1.4018 +    // Load the converted int; adjust CPU stack
  1.4019 +    emit_opcode(cbuf,0x58);      // POP EAX
  1.4020 +
  1.4021 +    emit_opcode(cbuf,0x5A);      // POP EDX
  1.4022 +
  1.4023 +    emit_opcode(cbuf,0x81);      // CMP EDX,imm
  1.4024 +    emit_d8    (cbuf,0xFA);      // rdx
  1.4025 +    emit_d32   (cbuf,0x80000000); //         0x80000000
  1.4026 +
  1.4027 +    emit_opcode(cbuf,0x75);      // JNE around_slow_call
  1.4028 +    emit_d8    (cbuf,0x13+4);    // Size of slow_call
  1.4029 +
  1.4030 +    emit_opcode(cbuf,0x85);      // TEST EAX,EAX
  1.4031 +    emit_opcode(cbuf,0xC0);       // ModRM: EAX,EAX
  1.4032 +
  1.4033 +    emit_opcode(cbuf,0x75);      // JNE around_slow_call
  1.4034 +    emit_d8    (cbuf,0x13);      // Size of slow_call
  1.4035 +
  1.4036 +    // Push src onto stack slow-path
  1.4037 +    // Allocate a word
  1.4038 +    emit_opcode(cbuf,0x83);      // SUB ESP,8
  1.4039 +    emit_opcode(cbuf,0xEC);
  1.4040 +    emit_d8(cbuf,0x08);
  1.4041 +
  1.4042 +    emit_opcode  (cbuf, 0xF2 );  // MOVSD [ESP], src
  1.4043 +    emit_opcode  (cbuf, 0x0F );
  1.4044 +    emit_opcode  (cbuf, 0x11 );
  1.4045 +    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
  1.4046 +
  1.4047 +    emit_opcode(cbuf,0xDD );     // FLD_D [ESP]
  1.4048 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.4049 +
  1.4050 +    emit_opcode(cbuf,0x83);      // ADD ESP,8
  1.4051 +    emit_opcode(cbuf,0xC4);
  1.4052 +    emit_d8(cbuf,0x08);
  1.4053 +
  1.4054 +    // CALL directly to the runtime
  1.4055 +    cbuf.set_inst_mark();
  1.4056 +    emit_opcode(cbuf,0xE8);      // Call into runtime
  1.4057 +    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.4058 +    // Carry on here...
  1.4059 +  %}
  1.4060 +
  1.4061 +  enc_class D2X_encoding( regX dst, regD src ) %{
  1.4062 +    // Allocate a word
  1.4063 +    emit_opcode(cbuf,0x83);            // SUB ESP,4
  1.4064 +    emit_opcode(cbuf,0xEC);
  1.4065 +    emit_d8(cbuf,0x04);
  1.4066 +    int pop = 0x02;
  1.4067 +    if ($src$$reg != FPR1L_enc) {
  1.4068 +      emit_opcode( cbuf, 0xD9 );       // FLD    ST(i-1)
  1.4069 +      emit_d8( cbuf, 0xC0-1+$src$$reg );
  1.4070 +      pop = 0x03;
  1.4071 +    }
  1.4072 +    store_to_stackslot( cbuf, 0xD9, pop, 0 ); // FST<P>_S  [ESP]
  1.4073 +
  1.4074 +    emit_opcode  (cbuf, 0xF3 );        // MOVSS dst(xmm), [ESP]
  1.4075 +    emit_opcode  (cbuf, 0x0F );
  1.4076 +    emit_opcode  (cbuf, 0x10 );
  1.4077 +    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);
  1.4078 +
  1.4079 +    emit_opcode(cbuf,0x83);            // ADD ESP,4
  1.4080 +    emit_opcode(cbuf,0xC4);
  1.4081 +    emit_d8(cbuf,0x04);
  1.4082 +    // Carry on here...
  1.4083 +  %}
  1.4084 +
  1.4085 +  enc_class FX2I_encoding( regX src, eRegI dst ) %{
  1.4086 +    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  1.4087 +
  1.4088 +    // Compare the result to see if we need to go to the slow path
  1.4089 +    emit_opcode(cbuf,0x81);       // CMP dst,imm
  1.4090 +    emit_rm    (cbuf,0x3,0x7,$dst$$reg);
  1.4091 +    emit_d32   (cbuf,0x80000000); //         0x80000000
  1.4092 +
  1.4093 +    emit_opcode(cbuf,0x75);       // JNE around_slow_call
  1.4094 +    emit_d8    (cbuf,0x13);       // Size of slow_call
  1.4095 +    // Store xmm to a temp memory
  1.4096 +    // location and push it onto stack.
  1.4097 +
  1.4098 +    emit_opcode(cbuf,0x83);  // SUB ESP,4
  1.4099 +    emit_opcode(cbuf,0xEC);
  1.4100 +    emit_d8(cbuf, $primary ? 0x8 : 0x4);
  1.4101 +
  1.4102 +    emit_opcode  (cbuf, $primary ? 0xF2 : 0xF3 );   // MOVSS [ESP], xmm
  1.4103 +    emit_opcode  (cbuf, 0x0F );
  1.4104 +    emit_opcode  (cbuf, 0x11 );
  1.4105 +    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
  1.4106 +
  1.4107 +    emit_opcode(cbuf, $primary ? 0xDD : 0xD9 );      // FLD [ESP]
  1.4108 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.4109 +
  1.4110 +    emit_opcode(cbuf,0x83);    // ADD ESP,4
  1.4111 +    emit_opcode(cbuf,0xC4);
  1.4112 +    emit_d8(cbuf, $primary ? 0x8 : 0x4);
  1.4113 +
  1.4114 +    // CALL directly to the runtime
  1.4115 +    cbuf.set_inst_mark();
  1.4116 +    emit_opcode(cbuf,0xE8);       // Call into runtime
  1.4117 +    emit_d32_reloc(cbuf, (StubRoutines::d2i_wrapper() - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
  1.4118 +
  1.4119 +    // Carry on here...
  1.4120 +  %}
  1.4121 +
  1.4122 +  enc_class X2D_encoding( regD dst, regX src ) %{
  1.4123 +    // Allocate a word
  1.4124 +    emit_opcode(cbuf,0x83);     // SUB ESP,4
  1.4125 +    emit_opcode(cbuf,0xEC);
  1.4126 +    emit_d8(cbuf,0x04);
  1.4127 +
  1.4128 +    emit_opcode  (cbuf, 0xF3 ); // MOVSS [ESP], xmm
  1.4129 +    emit_opcode  (cbuf, 0x0F );
  1.4130 +    emit_opcode  (cbuf, 0x11 );
  1.4131 +    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
  1.4132 +
  1.4133 +    emit_opcode(cbuf,0xD9 );    // FLD_S [ESP]
  1.4134 +    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  1.4135 +
  1.4136 +    emit_opcode(cbuf,0x83);     // ADD ESP,4
  1.4137 +    emit_opcode(cbuf,0xC4);
  1.4138 +    emit_d8(cbuf,0x04);
  1.4139 +
  1.4140 +    // Carry on here...
  1.4141 +  %}
  1.4142 +
  1.4143 +  enc_class AbsXF_encoding(regX dst) %{
  1.4144 +    address signmask_address=(address)float_signmask_pool;
  1.4145 +    // ANDPS  $dst,[signconst]
  1.4146 +    emit_opcode(cbuf, 0x0F);
  1.4147 +    emit_opcode(cbuf, 0x54);
  1.4148 +    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
  1.4149 +    emit_d32(cbuf, (int)signmask_address);
  1.4150 +  %}
  1.4151 +
  1.4152 +  enc_class AbsXD_encoding(regXD dst) %{
  1.4153 +    address signmask_address=(address)double_signmask_pool;
  1.4154 +    // ANDPD  $dst,[signconst]
  1.4155 +    emit_opcode(cbuf, 0x66);
  1.4156 +    emit_opcode(cbuf, 0x0F);
  1.4157 +    emit_opcode(cbuf, 0x54);
  1.4158 +    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
  1.4159 +    emit_d32(cbuf, (int)signmask_address);
  1.4160 +  %}
  1.4161 +
  1.4162 +  enc_class NegXF_encoding(regX dst) %{
  1.4163 +    address signmask_address=(address)float_signflip_pool;
  1.4164 +    // XORPS  $dst,[signconst]
  1.4165 +    emit_opcode(cbuf, 0x0F);
  1.4166 +    emit_opcode(cbuf, 0x57);
  1.4167 +    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
  1.4168 +    emit_d32(cbuf, (int)signmask_address);
  1.4169 +  %}
  1.4170 +
  1.4171 +  enc_class NegXD_encoding(regXD dst) %{
  1.4172 +    address signmask_address=(address)double_signflip_pool;
  1.4173 +    // XORPD  $dst,[signconst]
  1.4174 +    emit_opcode(cbuf, 0x66);
  1.4175 +    emit_opcode(cbuf, 0x0F);
  1.4176 +    emit_opcode(cbuf, 0x57);
  1.4177 +    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
  1.4178 +    emit_d32(cbuf, (int)signmask_address);
  1.4179 +  %}
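         +
         +  // For clarity: the pools referenced above hold the usual IEEE-754 sign
         +  // masks (their definitions live elsewhere in this file).  The float mask
         +  // is 0x7FFFFFFF per 32-bit lane, so ANDPS clears the sign bit (absolute
         +  // value), while the float flip constant is 0x80000000, so XORPS toggles
         +  // the sign bit (negation).  The double variants use 0x7FFFFFFFFFFFFFFF
         +  // and 0x8000000000000000 per 64-bit lane with ANDPD/XORPD.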
  1.4180 +
  1.4181 +  enc_class FMul_ST_reg( eRegF src1 ) %{
  1.4182 +    // Operand was loaded from memory into fp ST (stack top)
  1.4183 +    // FMUL   ST,$src  /* D8 C8+i */
  1.4184 +    emit_opcode(cbuf, 0xD8);
  1.4185 +    emit_opcode(cbuf, 0xC8 + $src1$$reg);
  1.4186 +  %}
  1.4187 +
  1.4188 +  enc_class FAdd_ST_reg( eRegF src2 ) %{
  1.4189 +    // FADD   ST,src2  /* D8 C0+i */
  1.4190 +    emit_opcode(cbuf, 0xD8);
  1.4191 +    emit_opcode(cbuf, 0xC0 + $src2$$reg);
  1.4192 +    // could use FADDP  src2,fpST  /* DE C0+i */
  1.4193 +  %}
  1.4194 +
  1.4195 +  enc_class FAddP_reg_ST( eRegF src2 ) %{
  1.4196 +    // FADDP  src2,ST  /* DE C0+i */
  1.4197 +    emit_opcode(cbuf, 0xDE);
  1.4198 +    emit_opcode(cbuf, 0xC0 + $src2$$reg);
  1.4199 +  %}
  1.4200 +
  1.4201 +  enc_class subF_divF_encode( eRegF src1, eRegF src2) %{
  1.4202 +    // Operand has been loaded into fp ST (stack top)
  1.4203 +      // FSUB   ST,$src1
  1.4204 +      emit_opcode(cbuf, 0xD8);
  1.4205 +      emit_opcode(cbuf, 0xE0 + $src1$$reg);
  1.4206 +
  1.4207 +      // FDIV
  1.4208 +      emit_opcode(cbuf, 0xD8);
  1.4209 +      emit_opcode(cbuf, 0xF0 + $src2$$reg);
  1.4210 +  %}
  1.4211 +
  1.4212 +  enc_class MulFAddF (eRegF src1, eRegF src2) %{
  1.4213 +    // Operand was loaded from memory into fp ST (stack top)
  1.4214 +    // FADD   ST,$src  /* D8 C0+i */
  1.4215 +    emit_opcode(cbuf, 0xD8);
  1.4216 +    emit_opcode(cbuf, 0xC0 + $src1$$reg);
  1.4217 +
  1.4218 +    // FMUL  ST,src2  /* D8 C8+i */
  1.4219 +    emit_opcode(cbuf, 0xD8);
  1.4220 +    emit_opcode(cbuf, 0xC8 + $src2$$reg);
  1.4221 +  %}
  1.4222 +
  1.4223 +
  1.4224 +  enc_class MulFAddFreverse (eRegF src1, eRegF src2) %{
  1.4225 +    // Operand was loaded from memory into fp ST (stack top)
  1.4226 +    // FADD   ST,$src  /* D8 C0+i */
  1.4227 +    emit_opcode(cbuf, 0xD8);
  1.4228 +    emit_opcode(cbuf, 0xC0 + $src1$$reg);
  1.4229 +
  1.4230 +    // FMULP  src2,ST  /* DE C8+i */
  1.4231 +    emit_opcode(cbuf, 0xDE);
  1.4232 +    emit_opcode(cbuf, 0xC8 + $src2$$reg);
  1.4233 +  %}
  1.4234 +
  1.4235 +  enc_class enc_membar_acquire %{
  1.4236 +    // Doug Lea believes this is not needed with current Sparcs and TSO.
  1.4237 +    // MacroAssembler masm(&cbuf);
  1.4238 +    // masm.membar();
  1.4239 +  %}
  1.4240 +
  1.4241 +  enc_class enc_membar_release %{
  1.4242 +    // Doug Lea believes this is not needed with current Sparcs and TSO.
  1.4243 +    // MacroAssembler masm(&cbuf);
  1.4244 +    // masm.membar();
  1.4245 +  %}
  1.4246 +
  1.4247 +  enc_class enc_membar_volatile %{
  1.4248 +    MacroAssembler masm(&cbuf);
  1.4249 +    masm.membar();
  1.4250 +  %}
  1.4251 +
  1.4252 +  // Atomically load the volatile long
  1.4253 +  enc_class enc_loadL_volatile( memory mem, stackSlotL dst ) %{
  1.4254 +    emit_opcode(cbuf,0xDF);
  1.4255 +    int rm_byte_opcode = 0x05;
  1.4256 +    int base     = $mem$$base;
  1.4257 +    int index    = $mem$$index;
  1.4258 +    int scale    = $mem$$scale;
  1.4259 +    int displace = $mem$$disp;
  1.4260 +    bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
  1.4261 +    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_is_oop);
  1.4262 +    store_to_stackslot( cbuf, 0x0DF, 0x07, $dst$$disp );
  1.4263 +  %}
  1.4264 +
  1.4265 +  enc_class enc_loadLX_volatile( memory mem, stackSlotL dst, regXD tmp ) %{
  1.4266 +    { // Atomic long load
  1.4267 +      // UseXmmLoadAndClearUpper ? movsd $tmp,$mem : movlpd $tmp,$mem
  1.4268 +      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
  1.4269 +      emit_opcode(cbuf,0x0F);
  1.4270 +      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
  1.4271 +      int base     = $mem$$base;
  1.4272 +      int index    = $mem$$index;
  1.4273 +      int scale    = $mem$$scale;
  1.4274 +      int displace = $mem$$disp;
  1.4275 +      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
  1.4276 +      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
  1.4277 +    }
  1.4278 +    { // MOVSD $dst,$tmp ! atomic long store
  1.4279 +      emit_opcode(cbuf,0xF2);
  1.4280 +      emit_opcode(cbuf,0x0F);
  1.4281 +      emit_opcode(cbuf,0x11);
  1.4282 +      int base     = $dst$$base;
  1.4283 +      int index    = $dst$$index;
  1.4284 +      int scale    = $dst$$scale;
  1.4285 +      int displace = $dst$$disp;
  1.4286 +      bool disp_is_oop = $dst->disp_is_oop(); // disp-as-oop when working with static globals
  1.4287 +      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
  1.4288 +    }
  1.4289 +  %}
  1.4290 +
  1.4291 +  enc_class enc_loadLX_reg_volatile( memory mem, eRegL dst, regXD tmp ) %{
  1.4292 +    { // Atomic long load
  1.4293 +      // UseXmmLoadAndClearUpper ? movsd $tmp,$mem : movlpd $tmp,$mem
  1.4294 +      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
  1.4295 +      emit_opcode(cbuf,0x0F);
  1.4296 +      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
  1.4297 +      int base     = $mem$$base;
  1.4298 +      int index    = $mem$$index;
  1.4299 +      int scale    = $mem$$scale;
  1.4300 +      int displace = $mem$$disp;
  1.4301 +      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
  1.4302 +      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
  1.4303 +    }
  1.4304 +    { // MOVD $dst.lo,$tmp
  1.4305 +      emit_opcode(cbuf,0x66);
  1.4306 +      emit_opcode(cbuf,0x0F);
  1.4307 +      emit_opcode(cbuf,0x7E);
  1.4308 +      emit_rm(cbuf, 0x3, $tmp$$reg, $dst$$reg);
  1.4309 +    }
  1.4310 +    { // PSRLQ $tmp,32
  1.4311 +      emit_opcode(cbuf,0x66);
  1.4312 +      emit_opcode(cbuf,0x0F);
  1.4313 +      emit_opcode(cbuf,0x73);
  1.4314 +      emit_rm(cbuf, 0x3, 0x02, $tmp$$reg);
  1.4315 +      emit_d8(cbuf, 0x20);
  1.4316 +    }
  1.4317 +    { // MOVD $dst.hi,$tmp
  1.4318 +      emit_opcode(cbuf,0x66);
  1.4319 +      emit_opcode(cbuf,0x0F);
  1.4320 +      emit_opcode(cbuf,0x7E);
  1.4321 +      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg));
  1.4322 +    }
  1.4323 +  %}
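         +
         +  // Worked example of the split above, using an illustrative value: with
         +  // 0x1122334455667788 sitting in $tmp, the first MOVD writes the low half
         +  // 0x55667788 into $dst.lo, PSRLQ $tmp,32 (the 0x20 immediate) shifts the
         +  // high half down, and the second MOVD writes 0x11223344 into $dst.hi.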
  1.4324 +
  1.4325 +  // Volatile Store Long.  Must be atomic, so move it into
  1.4326 +  // the FP TOS and then do a 64-bit FIST.  Has to probe the
  1.4327 +  // target address before the store (for null-ptr checks)
  1.4328 +  // so the memory operand is used twice in the encoding.
  1.4329 +  enc_class enc_storeL_volatile( memory mem, stackSlotL src ) %{
  1.4330 +    store_to_stackslot( cbuf, 0x0DF, 0x05, $src$$disp );
  1.4331 +    cbuf.set_inst_mark();            // Mark start of FIST in case $mem has an oop
  1.4332 +    emit_opcode(cbuf,0xDF);
  1.4333 +    int rm_byte_opcode = 0x07;
  1.4334 +    int base     = $mem$$base;
  1.4335 +    int index    = $mem$$index;
  1.4336 +    int scale    = $mem$$scale;
  1.4337 +    int displace = $mem$$disp;
  1.4338 +    bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
  1.4339 +    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_is_oop);
  1.4340 +  %}
  1.4341 +
  1.4342 +  enc_class enc_storeLX_volatile( memory mem, stackSlotL src, regXD tmp) %{
  1.4343 +    { // Atomic long load
  1.4344 +      // UseXmmLoadAndClearUpper ? movsd $tmp,[$src] : movlpd $tmp,[$src]
  1.4345 +      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
  1.4346 +      emit_opcode(cbuf,0x0F);
  1.4347 +      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
  1.4348 +      int base     = $src$$base;
  1.4349 +      int index    = $src$$index;
  1.4350 +      int scale    = $src$$scale;
  1.4351 +      int displace = $src$$disp;
  1.4352 +      bool disp_is_oop = $src->disp_is_oop(); // disp-as-oop when working with static globals
  1.4353 +      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
  1.4354 +    }
  1.4355 +    cbuf.set_inst_mark();            // Mark start of MOVSD in case $mem has an oop
  1.4356 +    { // MOVSD $mem,$tmp ! atomic long store
  1.4357 +      emit_opcode(cbuf,0xF2);
  1.4358 +      emit_opcode(cbuf,0x0F);
  1.4359 +      emit_opcode(cbuf,0x11);
  1.4360 +      int base     = $mem$$base;
  1.4361 +      int index    = $mem$$index;
  1.4362 +      int scale    = $mem$$scale;
  1.4363 +      int displace = $mem$$disp;
  1.4364 +      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
  1.4365 +      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
  1.4366 +    }
  1.4367 +  %}
  1.4368 +
  1.4369 +  enc_class enc_storeLX_reg_volatile( memory mem, eRegL src, regXD tmp, regXD tmp2) %{
  1.4370 +    { // MOVD $tmp,$src.lo
  1.4371 +      emit_opcode(cbuf,0x66);
  1.4372 +      emit_opcode(cbuf,0x0F);
  1.4373 +      emit_opcode(cbuf,0x6E);
  1.4374 +      emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
  1.4375 +    }
  1.4376 +    { // MOVD $tmp2,$src.hi
  1.4377 +      emit_opcode(cbuf,0x66);
  1.4378 +      emit_opcode(cbuf,0x0F);
  1.4379 +      emit_opcode(cbuf,0x6E);
  1.4380 +      emit_rm(cbuf, 0x3, $tmp2$$reg, HIGH_FROM_LOW($src$$reg));
  1.4381 +    }
  1.4382 +    { // PUNPCKLDQ $tmp,$tmp2
  1.4383 +      emit_opcode(cbuf,0x66);
  1.4384 +      emit_opcode(cbuf,0x0F);
  1.4385 +      emit_opcode(cbuf,0x62);
  1.4386 +      emit_rm(cbuf, 0x3, $tmp$$reg, $tmp2$$reg);
  1.4387 +    }
  1.4388 +    cbuf.set_inst_mark();            // Mark start of MOVSD in case $mem has an oop
  1.4389 +    { // MOVSD $mem,$tmp ! atomic long store
  1.4390 +      emit_opcode(cbuf,0xF2);
  1.4391 +      emit_opcode(cbuf,0x0F);
  1.4392 +      emit_opcode(cbuf,0x11);
  1.4393 +      int base     = $mem$$base;
  1.4394 +      int index    = $mem$$index;
  1.4395 +      int scale    = $mem$$scale;
  1.4396 +      int displace = $mem$$disp;
  1.4397 +      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
  1.4398 +      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
  1.4399 +    }
  1.4400 +  %}
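         +
         +  // The store-side mirror of the volatile-load split: the two MOVDs leave
         +  // $src.lo in the low dword of $tmp and $src.hi in the low dword of $tmp2;
         +  // PUNPCKLDQ $tmp,$tmp2 interleaves those low dwords so the low quadword
         +  // of $tmp becomes hi:lo, which the single MOVSD then writes as one
         +  // 8-byte (atomic) store.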
  1.4401 +
  1.4402 +  // Safepoint Poll.  This polls the safepoint page, and causes an
  1.4403 +  // exception if it is not readable.  Unfortunately, it kills the condition
  1.4404 +  // codes in the process.
  1.4405 +  // We currently use TESTL [spp],EDI
  1.4406 +  // A better choice might be TESTB [spp + pagesize() - CacheLineSize()],0
  1.4407 +
  1.4408 +  enc_class Safepoint_Poll() %{
  1.4409 +    cbuf.relocate(cbuf.inst_mark(), relocInfo::poll_type, 0);
  1.4410 +    emit_opcode(cbuf,0x85);
  1.4411 +    emit_rm (cbuf, 0x0, 0x7, 0x5);
  1.4412 +    emit_d32(cbuf, (intptr_t)os::get_polling_page());
  1.4413 +  %}
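         +
         +  // The bytes emitted above decode as 85 3D disp32, i.e. TESTL [polling_page],EDI
         +  // (ModRM 0x3D: mod=00, reg=111=EDI, r/m=101=disp32).  When a safepoint is
         +  // requested the VM protects the polling page, the load faults, and the
         +  // poll_type relocation recorded just before lets the fault handler recognize
         +  // the access as a safepoint poll rather than a real crash.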
  1.4414 +%}
  1.4415 +
  1.4416 +
  1.4417 +//----------FRAME--------------------------------------------------------------
  1.4418 +// Definition of frame structure and management information.
  1.4419 +//
  1.4420 +//  S T A C K   L A Y O U T    Allocators stack-slot number
  1.4421 +//                             |   (to get allocators register number
  1.4422 +//  G  Owned by    |        |  v    add OptoReg::stack0())
  1.4423 +//  r   CALLER     |        |
  1.4424 +//  o     |        +--------+      pad to even-align allocators stack-slot
  1.4425 +//  w     V        |  pad0  |        numbers; owned by CALLER
  1.4426 +//  t   -----------+--------+----> Matcher::_in_arg_limit, unaligned
  1.4427 +//  h     ^        |   in   |  5
  1.4428 +//        |        |  args  |  4   Holes in incoming args owned by SELF
  1.4429 +//  |     |        |        |  3
  1.4430 +//  |     |        +--------+
  1.4431 +//  V     |        | old out|      Empty on Intel, window on Sparc
  1.4432 +//        |    old |preserve|      Must be even aligned.
  1.4433 +//        |     SP-+--------+----> Matcher::_old_SP, even aligned
  1.4434 +//        |        |   in   |  3   area for Intel ret address
  1.4435 +//     Owned by    |preserve|      Empty on Sparc.
  1.4436 +//       SELF      +--------+
  1.4437 +//        |        |  pad2  |  2   pad to align old SP
  1.4438 +//        |        +--------+  1
  1.4439 +//        |        | locks  |  0
  1.4440 +//        |        +--------+----> OptoReg::stack0(), even aligned
  1.4441 +//        |        |  pad1  | 11   pad to align new SP
  1.4442 +//        |        +--------+
  1.4443 +//        |        |        | 10
  1.4444 +//        |        | spills |  9   spills
  1.4445 +//        V        |        |  8   (pad0 slot for callee)
  1.4446 +//      -----------+--------+----> Matcher::_out_arg_limit, unaligned
  1.4447 +//        ^        |  out   |  7
  1.4448 +//        |        |  args  |  6   Holes in outgoing args owned by CALLEE
  1.4449 +//     Owned by    +--------+
  1.4450 +//      CALLEE     | new out|  6   Empty on Intel, window on Sparc
  1.4451 +//        |    new |preserve|      Must be even-aligned.
  1.4452 +//        |     SP-+--------+----> Matcher::_new_SP, even aligned
  1.4453 +//        |        |        |
  1.4454 +//
  1.4455 +// Note 1: Only region 8-11 is determined by the allocator.  Region 0-5 is
  1.4456 +//         known from SELF's arguments and the Java calling convention.
  1.4457 +//         Region 6-7 is determined per call site.
  1.4458 +// Note 2: If the calling convention leaves holes in the incoming argument
  1.4459 +//         area, those holes are owned by SELF.  Holes in the outgoing area
  1.4460 +//         are owned by the CALLEE.  Holes should not be necessary in the
  1.4461 +//         incoming area, as the Java calling convention is completely under
  1.4462 +//         the control of the AD file.  Doubles can be sorted and packed to
  1.4463 +//         avoid holes.  Holes in the outgoing arguments may be necessary for
  1.4464 +//         varargs C calling conventions.
  1.4465 +// Note 3: Region 0-3 is even aligned, with pad2 as needed.  Region 3-5 is
  1.4466 +//         even aligned with pad0 as needed.
  1.4467 +//         Region 6 is even aligned.  Region 6-7 is NOT even aligned;
  1.4468 +//         region 6-11 is even aligned; it may be padded out more so that
  1.4469 +//         the region from SP to FP meets the minimum stack alignment.
  1.4470 +
  1.4471 +frame %{
  1.4472 +  // What direction does stack grow in (assumed to be same for C & Java)
  1.4473 +  stack_direction(TOWARDS_LOW);
  1.4474 +
  1.4475 +  // These three registers define part of the calling convention
  1.4476 +  // between compiled code and the interpreter.
  1.4477 +  inline_cache_reg(EAX);                // Inline Cache Register
  1.4478 +  interpreter_method_oop_reg(EBX);      // Method Oop Register when calling interpreter
  1.4479 +
  1.4480 +  // Optional: name the operand used by cisc-spilling to access [stack_pointer + offset]
  1.4481 +  cisc_spilling_operand_name(indOffset32);
  1.4482 +
  1.4483 +  // Number of stack slots consumed by locking an object
  1.4484 +  sync_stack_slots(1);
  1.4485 +
  1.4486 +  // Compiled code's Frame Pointer
  1.4487 +  frame_pointer(ESP);
  1.4488 +  // Interpreter stores its frame pointer in a register which is
  1.4489 +  // stored to the stack by I2CAdaptors.
  1.4490 +  // I2CAdaptors convert from interpreted java to compiled java.
  1.4491 +  interpreter_frame_pointer(EBP);
  1.4492 +
  1.4493 +  // Stack alignment requirement
  1.4494 +  // Alignment size in bytes (128-bit -> 16 bytes)
  1.4495 +  stack_alignment(StackAlignmentInBytes);
  1.4496 +
  1.4497 +  // Number of stack slots between incoming argument block and the start of
  1.4498 +  // a new frame.  The PROLOG must add this many slots to the stack.  The
  1.4499 +  // EPILOG must remove this many slots.  Intel needs one slot for
  1.4500 +  // return address and one for rbp, (must save rbp)
  1.4501 +  in_preserve_stack_slots(2+VerifyStackAtCalls);
  1.4502 +
  1.4503 +  // Number of outgoing stack slots killed above the out_preserve_stack_slots
  1.4504 +  // for calls to C.  Supports the var-args backing area for register parms.
  1.4505 +  varargs_C_out_slots_killed(0);
  1.4506 +
  1.4507 +  // The after-PROLOG location of the return address.  Location of
  1.4508 +  // return address specifies a type (REG or STACK) and a number
  1.4509 +  // representing the register number (i.e. - use a register name) or
  1.4510 +  // stack slot.
  1.4511 +  // Ret Addr is on stack in slot 0 if no locks or verification or alignment.
  1.4512 +  // Otherwise, it is above the locks and verification slot and alignment word
  1.4513 +  return_addr(STACK - 1 +
  1.4514 +              round_to(1+VerifyStackAtCalls+
  1.4515 +              Compile::current()->fixed_slots(),
  1.4516 +              (StackAlignmentInBytes/wordSize)));
  1.4517 +
  1.4518 +  // Body of function which returns an integer array locating
  1.4519 +  // arguments either in registers or in stack slots.  Passed an array
  1.4520 +  // of ideal registers called "sig" and a "length" count.  Stack-slot
  1.4521 +  // offsets are based on outgoing arguments, i.e. a CALLER setting up
  1.4522 +  // arguments for a CALLEE.  Incoming stack arguments are
  1.4523 +  // automatically biased by the preserve_stack_slots field above.
  1.4524 +  calling_convention %{
  1.4525 +    // No difference between incoming and outgoing, just pass false
  1.4526 +    SharedRuntime::java_calling_convention(sig_bt, regs, length, false);
  1.4527 +  %}
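         +
         +  // A hedged usage sketch of the call above (the array contents are
         +  // illustrative and the exact SharedRuntime signature is assumed rather
         +  // than restated from its header):
         +  //
         +  //   BasicType sig_bt[] = { T_OBJECT, T_INT, T_LONG, T_VOID };  // receiver, int, long (2 slots)
         +  //   VMRegPair regs[4];
         +  //   SharedRuntime::java_calling_convention(sig_bt, regs, 4, false);
         +  //
         +  // Afterwards regs[i] names either a register or a stack slot (biased by
         +  // the preserve area for incoming arguments) for each signature element.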
  1.4528 +
  1.4529 +
  1.4530 +  // Body of function which returns an integer array locating
  1.4531 +  // arguments either in registers or in stack slots.  Passed an array
  1.4532 +  // of ideal registers called "sig" and a "length" count.  Stack-slot
  1.4533 +  // offsets are based on outgoing arguments, i.e. a CALLER setting up
  1.4534 +  // arguments for a CALLEE.  Incoming stack arguments are
  1.4535 +  // automatically biased by the preserve_stack_slots field above.
  1.4536 +  c_calling_convention %{
  1.4537 +    // This is obviously always outgoing
  1.4538 +    (void) SharedRuntime::c_calling_convention(sig_bt, regs, length);
  1.4539 +  %}
  1.4540 +
  1.4541 +  // Location of C & interpreter return values
  1.4542 +  c_return_value %{
  1.4543 +    assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
  1.4544 +    static int lo[Op_RegL+1] = { 0, 0, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
  1.4545 +    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };
  1.4546 +
  1.4547 +    // In SSE2+ mode we want to keep the FPU stack clean, so pretend
  1.4548 +    // that C functions return float and double results in XMM0.
  1.4549 +    if( ideal_reg == Op_RegD && UseSSE>=2 )
  1.4550 +      return OptoRegPair(XMM0b_num,XMM0a_num);
  1.4551 +    if( ideal_reg == Op_RegF && UseSSE>=2 )
  1.4552 +      return OptoRegPair(OptoReg::Bad,XMM0a_num);
  1.4553 +
  1.4554 +    return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
  1.4555 +  %}
  1.4556 +
  1.4557 +  // Location of return values
  1.4558 +  return_value %{
  1.4559 +    assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
  1.4560 +    static int lo[Op_RegL+1] = { 0, 0, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
  1.4561 +    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };
  1.4562 +    if( ideal_reg == Op_RegD && UseSSE>=2 )
  1.4563 +      return OptoRegPair(XMM0b_num,XMM0a_num);
  1.4564 +    if( ideal_reg == Op_RegF && UseSSE>=1 )
  1.4565 +      return OptoRegPair(OptoReg::Bad,XMM0a_num);
  1.4566 +    return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
  1.4567 +  %}
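         +
         +  // Worked example: for ideal_reg == Op_RegL the tables yield the pair
         +  // (hi, lo) = (EDX_num, EAX_num), i.e. a long result comes back in EDX:EAX;
         +  // for Op_RegI only EAX is used (hi is OptoReg::Bad).  With UseSSE >= 2 a
         +  // double result is reported as the XMM0b:XMM0a pair instead of FPR1, which
         +  // is what keeps the FPU stack empty across such calls.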
  1.4568 +
  1.4569 +%}
  1.4570 +
  1.4571 +//----------ATTRIBUTES---------------------------------------------------------
  1.4572 +//----------Operand Attributes-------------------------------------------------
  1.4573 +op_attrib op_cost(0);        // Required cost attribute
  1.4574 +
  1.4575 +//----------Instruction Attributes---------------------------------------------
  1.4576 +ins_attrib ins_cost(100);       // Required cost attribute
  1.4577 +ins_attrib ins_size(8);         // Required size attribute (in bits)
  1.4578 +ins_attrib ins_pc_relative(0);  // Required PC Relative flag
  1.4579 +ins_attrib ins_short_branch(0); // Required flag: is this instruction a
  1.4580 +                                // non-matching short branch variant of some
  1.4581 +                                // long branch?
  1.4582 +ins_attrib ins_alignment(1);    // Required alignment attribute (must be a power of 2)
  1.4583 +                                // specifies the alignment that some part of the instruction (not
  1.4584 +                                // necessarily the start) requires.  If > 1, a compute_padding()
  1.4585 +                                // function must be provided for the instruction
  1.4586 +
  1.4587 +//----------OPERANDS-----------------------------------------------------------
  1.4588 +// Operand definitions must precede instruction definitions for correct parsing
  1.4589 +// in the ADLC because operands constitute user defined types which are used in
  1.4590 +// instruction definitions.
  1.4591 +
  1.4592 +//----------Simple Operands----------------------------------------------------
  1.4593 +// Immediate Operands
  1.4594 +// Integer Immediate
  1.4595 +operand immI() %{
  1.4596 +  match(ConI);
  1.4597 +
  1.4598 +  op_cost(10);
  1.4599 +  format %{ %}
  1.4600 +  interface(CONST_INTER);
  1.4601 +%}
  1.4602 +
  1.4603 +// Constant for test vs zero
  1.4604 +operand immI0() %{
  1.4605 +  predicate(n->get_int() == 0);
  1.4606 +  match(ConI);
  1.4607 +
  1.4608 +  op_cost(0);
  1.4609 +  format %{ %}
  1.4610 +  interface(CONST_INTER);
  1.4611 +%}
  1.4612 +
  1.4613 +// Constant for increment
  1.4614 +operand immI1() %{
  1.4615 +  predicate(n->get_int() == 1);
  1.4616 +  match(ConI);
  1.4617 +
  1.4618 +  op_cost(0);
  1.4619 +  format %{ %}
  1.4620 +  interface(CONST_INTER);
  1.4621 +%}
  1.4622 +
  1.4623 +// Constant for decrement
  1.4624 +operand immI_M1() %{
  1.4625 +  predicate(n->get_int() == -1);
  1.4626 +  match(ConI);
  1.4627 +
  1.4628 +  op_cost(0);
  1.4629 +  format %{ %}
  1.4630 +  interface(CONST_INTER);
  1.4631 +%}
  1.4632 +
  1.4633 +// Valid scale values for addressing modes
  1.4634 +operand immI2() %{
  1.4635 +  predicate(0 <= n->get_int() && (n->get_int() <= 3));
  1.4636 +  match(ConI);
  1.4637 +
  1.4638 +  format %{ %}
  1.4639 +  interface(CONST_INTER);
  1.4640 +%}
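         +
         +// The four legal values map directly onto the 2-bit SIB scale field:
         +// 0, 1, 2 and 3 select index*1, index*2, index*4 and index*8 respectively.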
  1.4641 +
  1.4642 +operand immI8() %{
  1.4643 +  predicate((-128 <= n->get_int()) && (n->get_int() <= 127));
  1.4644 +  match(ConI);
  1.4645 +
  1.4646 +  op_cost(5);
  1.4647 +  format %{ %}
  1.4648 +  interface(CONST_INTER);
  1.4649 +%}
  1.4650 +
  1.4651 +operand immI16() %{
  1.4652 +  predicate((-32768 <= n->get_int()) && (n->get_int() <= 32767));
  1.4653 +  match(ConI);
  1.4654 +
  1.4655 +  op_cost(10);
  1.4656 +  format %{ %}
  1.4657 +  interface(CONST_INTER);
  1.4658 +%}
  1.4659 +
  1.4660 +// Constant for long shifts
  1.4661 +operand immI_32() %{
  1.4662 +  predicate( n->get_int() == 32 );
  1.4663 +  match(ConI);
  1.4664 +
  1.4665 +  op_cost(0);
  1.4666 +  format %{ %}
  1.4667 +  interface(CONST_INTER);
  1.4668 +%}
  1.4669 +
  1.4670 +operand immI_1_31() %{
  1.4671 +  predicate( n->get_int() >= 1 && n->get_int() <= 31 );
  1.4672 +  match(ConI);
  1.4673 +
  1.4674 +  op_cost(0);
  1.4675 +  format %{ %}
  1.4676 +  interface(CONST_INTER);
  1.4677 +%}
  1.4678 +
  1.4679 +operand immI_32_63() %{
  1.4680 +  predicate( n->get_int() >= 32 && n->get_int() <= 63 );
  1.4681 +  match(ConI);
  1.4682 +  op_cost(0);
  1.4683 +
  1.4684 +  format %{ %}
  1.4685 +  interface(CONST_INTER);
  1.4686 +%}
  1.4687 +
  1.4688 +// Pointer Immediate
  1.4689 +operand immP() %{
  1.4690 +  match(ConP);
  1.4691 +
  1.4692 +  op_cost(10);
  1.4693 +  format %{ %}
  1.4694 +  interface(CONST_INTER);
  1.4695 +%}
  1.4696 +
  1.4697 +// NULL Pointer Immediate
  1.4698 +operand immP0() %{
  1.4699 +  predicate( n->get_ptr() == 0 );
  1.4700 +  match(ConP);
  1.4701 +  op_cost(0);
  1.4702 +
  1.4703 +  format %{ %}
  1.4704 +  interface(CONST_INTER);
  1.4705 +%}
  1.4706 +
  1.4707 +// Long Immediate
  1.4708 +operand immL() %{
  1.4709 +  match(ConL);
  1.4710 +
  1.4711 +  op_cost(20);
  1.4712 +  format %{ %}
  1.4713 +  interface(CONST_INTER);
  1.4714 +%}
  1.4715 +
  1.4716 +// Long Immediate zero
  1.4717 +operand immL0() %{
  1.4718 +  predicate( n->get_long() == 0L );
  1.4719 +  match(ConL);
  1.4720 +  op_cost(0);
  1.4721 +
  1.4722 +  format %{ %}
  1.4723 +  interface(CONST_INTER);
  1.4724 +%}
  1.4725 +
  1.4726 +// Long immediate from 0 to 127.
  1.4727 +// Used for a shorter form of long mul by 10.
  1.4728 +operand immL_127() %{
  1.4729 +  predicate((0 <= n->get_long()) && (n->get_long() <= 127));
  1.4730 +  match(ConL);
  1.4731 +  op_cost(0);
  1.4732 +
  1.4733 +  format %{ %}
  1.4734 +  interface(CONST_INTER);
  1.4735 +%}
  1.4736 +
  1.4737 +// Long Immediate: low 32-bit mask
  1.4738 +operand immL_32bits() %{
  1.4739 +  predicate(n->get_long() == 0xFFFFFFFFL);
  1.4740 +  match(ConL);
  1.4741 +  op_cost(0);
  1.4742 +
  1.4743 +  format %{ %}
  1.4744 +  interface(CONST_INTER);
  1.4745 +%}
  1.4746 +
  1.4747 +// Long Immediate: low 32-bit mask
  1.4748 +operand immL32() %{
  1.4749 +  predicate(n->get_long() == (int)(n->get_long()));
  1.4750 +  match(ConL);
  1.4751 +  op_cost(20);
  1.4752 +
  1.4753 +  format %{ %}
  1.4754 +  interface(CONST_INTER);
  1.4755 +%}
  1.4756 +
  1.4757 +//Double Immediate zero
  1.4758 +operand immD0() %{
  1.4759 +  // Do additional (and counter-intuitive) test against NaN to work around VC++
  1.4760 +  // bug that generates code such that NaNs compare equal to 0.0
  1.4761 +  predicate( UseSSE<=1 && n->getd() == 0.0 && !g_isnan(n->getd()) );
  1.4762 +  match(ConD);
  1.4763 +
  1.4764 +  op_cost(5);
  1.4765 +  format %{ %}
  1.4766 +  interface(CONST_INTER);
  1.4767 +%}
  1.4768 +
  1.4769 +// Double Immediate
  1.4770 +operand immD1() %{
  1.4771 +  predicate( UseSSE<=1 && n->getd() == 1.0 );
  1.4772 +  match(ConD);
  1.4773 +
  1.4774 +  op_cost(5);
  1.4775 +  format %{ %}
  1.4776 +  interface(CONST_INTER);
  1.4777 +%}
  1.4778 +
  1.4779 +// Double Immediate
  1.4780 +operand immD() %{
  1.4781 +  predicate(UseSSE<=1);
  1.4782 +  match(ConD);
  1.4783 +
  1.4784 +  op_cost(5);
  1.4785 +  format %{ %}
  1.4786 +  interface(CONST_INTER);
  1.4787 +%}
  1.4788 +
  1.4789 +operand immXD() %{
  1.4790 +  predicate(UseSSE>=2);
  1.4791 +  match(ConD);
  1.4792 +
  1.4793 +  op_cost(5);
  1.4794 +  format %{ %}
  1.4795 +  interface(CONST_INTER);
  1.4796 +%}
  1.4797 +
  1.4798 +// Double Immediate zero
  1.4799 +operand immXD0() %{
  1.4800 +  // Do additional (and counter-intuitive) test against NaN to work around VC++
  1.4801 +  // bug that generates code such that NaNs compare equal to 0.0 AND do not
  1.4802 +  // compare equal to -0.0.
  1.4803 +  predicate( UseSSE>=2 && jlong_cast(n->getd()) == 0 );
  1.4804 +  match(ConD);
  1.4805 +
  1.4806 +  format %{ %}
  1.4807 +  interface(CONST_INTER);
  1.4808 +%}
  1.4809 +
  1.4810 +// Float Immediate zero
  1.4811 +operand immF0() %{
  1.4812 +  predicate( UseSSE == 0 && n->getf() == 0.0 );
  1.4813 +  match(ConF);
  1.4814 +
  1.4815 +  op_cost(5);
  1.4816 +  format %{ %}
  1.4817 +  interface(CONST_INTER);
  1.4818 +%}
  1.4819 +
  1.4820 +// Float Immediate
  1.4821 +operand immF() %{
  1.4822 +  predicate( UseSSE == 0 );
  1.4823 +  match(ConF);
  1.4824 +
  1.4825 +  op_cost(5);
  1.4826 +  format %{ %}
  1.4827 +  interface(CONST_INTER);
  1.4828 +%}
  1.4829 +
  1.4830 +// Float Immediate
  1.4831 +operand immXF() %{
  1.4832 +  predicate(UseSSE >= 1);
  1.4833 +  match(ConF);
  1.4834 +
  1.4835 +  op_cost(5);
  1.4836 +  format %{ %}
  1.4837 +  interface(CONST_INTER);
  1.4838 +%}
  1.4839 +
  1.4840 +// Float Immediate zero.  Zero and not -0.0
  1.4841 +operand immXF0() %{
  1.4842 +  predicate( UseSSE >= 1 && jint_cast(n->getf()) == 0 );
  1.4843 +  match(ConF);
  1.4844 +
  1.4845 +  op_cost(5);
  1.4846 +  format %{ %}
  1.4847 +  interface(CONST_INTER);
  1.4848 +%}
  1.4849 +
  1.4850 +// Immediates for special shifts (sign extend)
  1.4851 +
  1.4852 +// Shift counts used by the sign-extend idioms
  1.4853 +operand immI_16() %{
  1.4854 +  predicate( n->get_int() == 16 );
  1.4855 +  match(ConI);
  1.4856 +
  1.4857 +  format %{ %}
  1.4858 +  interface(CONST_INTER);
  1.4859 +%}
  1.4860 +
  1.4861 +operand immI_24() %{
  1.4862 +  predicate( n->get_int() == 24 );
  1.4863 +  match(ConI);
  1.4864 +
  1.4865 +  format %{ %}
  1.4866 +  interface(CONST_INTER);
  1.4867 +%}
  1.4868 +
  1.4869 +// Constant for byte-wide masking
  1.4870 +operand immI_255() %{
  1.4871 +  predicate( n->get_int() == 255 );
  1.4872 +  match(ConI);
  1.4873 +
  1.4874 +  format %{ %}
  1.4875 +  interface(CONST_INTER);
  1.4876 +%}
  1.4877 +
  1.4878 +// Register Operands
  1.4879 +// Integer Register
  1.4880 +operand eRegI() %{
  1.4881 +  constraint(ALLOC_IN_RC(e_reg));
  1.4882 +  match(RegI);
  1.4883 +  match(xRegI);
  1.4884 +  match(eAXRegI);
  1.4885 +  match(eBXRegI);
  1.4886 +  match(eCXRegI);
  1.4887 +  match(eDXRegI);
  1.4888 +  match(eDIRegI);
  1.4889 +  match(eSIRegI);
  1.4890 +
  1.4891 +  format %{ %}
  1.4892 +  interface(REG_INTER);
  1.4893 +%}
  1.4894 +
  1.4895 +// Subset of Integer Register
  1.4896 +operand xRegI(eRegI reg) %{
  1.4897 +  constraint(ALLOC_IN_RC(x_reg));
  1.4898 +  match(reg);
  1.4899 +  match(eAXRegI);
  1.4900 +  match(eBXRegI);
  1.4901 +  match(eCXRegI);
  1.4902 +  match(eDXRegI);
  1.4903 +
  1.4904 +  format %{ %}
  1.4905 +  interface(REG_INTER);
  1.4906 +%}
  1.4907 +
  1.4908 +// Special Registers
  1.4909 +operand eAXRegI(xRegI reg) %{
  1.4910 +  constraint(ALLOC_IN_RC(eax_reg));
  1.4911 +  match(reg);
  1.4912 +  match(eRegI);
  1.4913 +
  1.4914 +  format %{ "EAX" %}
  1.4915 +  interface(REG_INTER);
  1.4916 +%}
  1.4917 +
  1.4918 +// Special Registers
  1.4919 +operand eBXRegI(xRegI reg) %{
  1.4920 +  constraint(ALLOC_IN_RC(ebx_reg));
  1.4921 +  match(reg);
  1.4922 +  match(eRegI);
  1.4923 +
  1.4924 +  format %{ "EBX" %}
  1.4925 +  interface(REG_INTER);
  1.4926 +%}
  1.4927 +
  1.4928 +operand eCXRegI(xRegI reg) %{
  1.4929 +  constraint(ALLOC_IN_RC(ecx_reg));
  1.4930 +  match(reg);
  1.4931 +  match(eRegI);
  1.4932 +
  1.4933 +  format %{ "ECX" %}
  1.4934 +  interface(REG_INTER);
  1.4935 +%}
  1.4936 +
  1.4937 +operand eDXRegI(xRegI reg) %{
  1.4938 +  constraint(ALLOC_IN_RC(edx_reg));
  1.4939 +  match(reg);
  1.4940 +  match(eRegI);
  1.4941 +
  1.4942 +  format %{ "EDX" %}
  1.4943 +  interface(REG_INTER);
  1.4944 +%}
  1.4945 +
  1.4946 +operand eDIRegI(xRegI reg) %{
  1.4947 +  constraint(ALLOC_IN_RC(edi_reg));
  1.4948 +  match(reg);
  1.4949 +  match(eRegI);
  1.4950 +
  1.4951 +  format %{ "EDI" %}
  1.4952 +  interface(REG_INTER);
  1.4953 +%}
  1.4954 +
  1.4955 +operand naxRegI() %{
  1.4956 +  constraint(ALLOC_IN_RC(nax_reg));
  1.4957 +  match(RegI);
  1.4958 +  match(eCXRegI);
  1.4959 +  match(eDXRegI);
  1.4960 +  match(eSIRegI);
  1.4961 +  match(eDIRegI);
  1.4962 +
  1.4963 +  format %{ %}
  1.4964 +  interface(REG_INTER);
  1.4965 +%}
  1.4966 +
  1.4967 +operand nadxRegI() %{
  1.4968 +  constraint(ALLOC_IN_RC(nadx_reg));
  1.4969 +  match(RegI);
  1.4970 +  match(eBXRegI);
  1.4971 +  match(eCXRegI);
  1.4972 +  match(eSIRegI);
  1.4973 +  match(eDIRegI);
  1.4974 +
  1.4975 +  format %{ %}
  1.4976 +  interface(REG_INTER);
  1.4977 +%}
  1.4978 +
  1.4979 +operand ncxRegI() %{
  1.4980 +  constraint(ALLOC_IN_RC(ncx_reg));
  1.4981 +  match(RegI);
  1.4982 +  match(eAXRegI);
  1.4983 +  match(eDXRegI);
  1.4984 +  match(eSIRegI);
  1.4985 +  match(eDIRegI);
  1.4986 +
  1.4987 +  format %{ %}
  1.4988 +  interface(REG_INTER);
  1.4989 +%}
  1.4990 +
  1.4991 +// // This operand was used by cmpFastUnlock, but conflicted with 'object' reg
  1.4992 +// //
  1.4993 +operand eSIRegI(xRegI reg) %{
  1.4994 +   constraint(ALLOC_IN_RC(esi_reg));
  1.4995 +   match(reg);
  1.4996 +   match(eRegI);
  1.4997 +
  1.4998 +   format %{ "ESI" %}
  1.4999 +   interface(REG_INTER);
  1.5000 +%}
  1.5001 +
  1.5002 +// Pointer Register
  1.5003 +operand anyRegP() %{
  1.5004 +  constraint(ALLOC_IN_RC(any_reg));
  1.5005 +  match(RegP);
  1.5006 +  match(eAXRegP);
  1.5007 +  match(eBXRegP);
  1.5008 +  match(eCXRegP);
  1.5009 +  match(eDIRegP);
  1.5010 +  match(eRegP);
  1.5011 +
  1.5012 +  format %{ %}
  1.5013 +  interface(REG_INTER);
  1.5014 +%}
  1.5015 +
  1.5016 +operand eRegP() %{
  1.5017 +  constraint(ALLOC_IN_RC(e_reg));
  1.5018 +  match(RegP);
  1.5019 +  match(eAXRegP);
  1.5020 +  match(eBXRegP);
  1.5021 +  match(eCXRegP);
  1.5022 +  match(eDIRegP);
  1.5023 +
  1.5024 +  format %{ %}
  1.5025 +  interface(REG_INTER);
  1.5026 +%}
  1.5027 +
  1.5028 +// On Windows 95, EBP is not safe to use for implicit null tests.
  1.5029 +operand eRegP_no_EBP() %{
  1.5030 +  constraint(ALLOC_IN_RC(e_reg_no_rbp));
  1.5031 +  match(RegP);
  1.5032 +  match(eAXRegP);
  1.5033 +  match(eBXRegP);
  1.5034 +  match(eCXRegP);
  1.5035 +  match(eDIRegP);
  1.5036 +
  1.5037 +  op_cost(100);
  1.5038 +  format %{ %}
  1.5039 +  interface(REG_INTER);
  1.5040 +%}
  1.5041 +
  1.5042 +operand naxRegP() %{
  1.5043 +  constraint(ALLOC_IN_RC(nax_reg));
  1.5044 +  match(RegP);
  1.5045 +  match(eBXRegP);
  1.5046 +  match(eDXRegP);
  1.5047 +  match(eCXRegP);
  1.5048 +  match(eSIRegP);
  1.5049 +  match(eDIRegP);
  1.5050 +
  1.5051 +  format %{ %}
  1.5052 +  interface(REG_INTER);
  1.5053 +%}
  1.5054 +
  1.5055 +operand nabxRegP() %{
  1.5056 +  constraint(ALLOC_IN_RC(nabx_reg));
  1.5057 +  match(RegP);
  1.5058 +  match(eCXRegP);
  1.5059 +  match(eDXRegP);
  1.5060 +  match(eSIRegP);
  1.5061 +  match(eDIRegP);
  1.5062 +
  1.5063 +  format %{ %}
  1.5064 +  interface(REG_INTER);
  1.5065 +%}
  1.5066 +
  1.5067 +operand pRegP() %{
  1.5068 +  constraint(ALLOC_IN_RC(p_reg));
  1.5069 +  match(RegP);
  1.5070 +  match(eBXRegP);
  1.5071 +  match(eDXRegP);
  1.5072 +  match(eSIRegP);
  1.5073 +  match(eDIRegP);
  1.5074 +
  1.5075 +  format %{ %}
  1.5076 +  interface(REG_INTER);
  1.5077 +%}
  1.5078 +
  1.5079 +// Special Registers
  1.5080 +// Return a pointer value
  1.5081 +operand eAXRegP(eRegP reg) %{
  1.5082 +  constraint(ALLOC_IN_RC(eax_reg));
  1.5083 +  match(reg);
  1.5084 +  format %{ "EAX" %}
  1.5085 +  interface(REG_INTER);
  1.5086 +%}
  1.5087 +
  1.5088 +// Used in AtomicAdd
  1.5089 +operand eBXRegP(eRegP reg) %{
  1.5090 +  constraint(ALLOC_IN_RC(ebx_reg));
  1.5091 +  match(reg);
  1.5092 +  format %{ "EBX" %}
  1.5093 +  interface(REG_INTER);
  1.5094 +%}
  1.5095 +
  1.5096 +// Tail-call (interprocedural jump) to interpreter
  1.5097 +operand eCXRegP(eRegP reg) %{
  1.5098 +  constraint(ALLOC_IN_RC(ecx_reg));
  1.5099 +  match(reg);
  1.5100 +  format %{ "ECX" %}
  1.5101 +  interface(REG_INTER);
  1.5102 +%}
  1.5103 +
  1.5104 +operand eSIRegP(eRegP reg) %{
  1.5105 +  constraint(ALLOC_IN_RC(esi_reg));
  1.5106 +  match(reg);
  1.5107 +  format %{ "ESI" %}
  1.5108 +  interface(REG_INTER);
  1.5109 +%}
  1.5110 +
  1.5111 +// Used in rep stosw
  1.5112 +operand eDIRegP(eRegP reg) %{
  1.5113 +  constraint(ALLOC_IN_RC(edi_reg));
  1.5114 +  match(reg);
  1.5115 +  format %{ "EDI" %}
  1.5116 +  interface(REG_INTER);
  1.5117 +%}
  1.5118 +
  1.5119 +operand eBPRegP() %{
  1.5120 +  constraint(ALLOC_IN_RC(ebp_reg));
  1.5121 +  match(RegP);
  1.5122 +  format %{ "EBP" %}
  1.5123 +  interface(REG_INTER);
  1.5124 +%}
  1.5125 +
  1.5126 +operand eRegL() %{
  1.5127 +  constraint(ALLOC_IN_RC(long_reg));
  1.5128 +  match(RegL);
  1.5129 +  match(eADXRegL);
  1.5130 +
  1.5131 +  format %{ %}
  1.5132 +  interface(REG_INTER);
  1.5133 +%}
  1.5134 +
  1.5135 +operand eADXRegL( eRegL reg ) %{
  1.5136 +  constraint(ALLOC_IN_RC(eadx_reg));
  1.5137 +  match(reg);
  1.5138 +
  1.5139 +  format %{ "EDX:EAX" %}
  1.5140 +  interface(REG_INTER);
  1.5141 +%}
  1.5142 +
  1.5143 +operand eBCXRegL( eRegL reg ) %{
  1.5144 +  constraint(ALLOC_IN_RC(ebcx_reg));
  1.5145 +  match(reg);
  1.5146 +
  1.5147 +  format %{ "EBX:ECX" %}
  1.5148 +  interface(REG_INTER);
  1.5149 +%}
  1.5150 +
  1.5151 +// Special case for integer high multiply
  1.5152 +operand eADXRegL_low_only() %{
  1.5153 +  constraint(ALLOC_IN_RC(eadx_reg));
  1.5154 +  match(RegL);
  1.5155 +
  1.5156 +  format %{ "EAX" %}
  1.5157 +  interface(REG_INTER);
  1.5158 +%}
  1.5159 +
  1.5160 +// Flags register, used as output of compare instructions
  1.5161 +operand eFlagsReg() %{
  1.5162 +  constraint(ALLOC_IN_RC(int_flags));
  1.5163 +  match(RegFlags);
  1.5164 +
  1.5165 +  format %{ "EFLAGS" %}
  1.5166 +  interface(REG_INTER);
  1.5167 +%}
  1.5168 +
  1.5169 +// Flags register, used as output of FLOATING POINT compare instructions
  1.5170 +operand eFlagsRegU() %{
  1.5171 +  constraint(ALLOC_IN_RC(int_flags));
  1.5172 +  match(RegFlags);
  1.5173 +
  1.5174 +  format %{ "EFLAGS_U" %}
  1.5175 +  interface(REG_INTER);
  1.5176 +%}
  1.5177 +
  1.5178 +// Condition Code Register used by long compare
  1.5179 +operand flagsReg_long_LTGE() %{
  1.5180 +  constraint(ALLOC_IN_RC(int_flags));
  1.5181 +  match(RegFlags);
  1.5182 +  format %{ "FLAGS_LTGE" %}
  1.5183 +  interface(REG_INTER);
  1.5184 +%}
  1.5185 +operand flagsReg_long_EQNE() %{
  1.5186 +  constraint(ALLOC_IN_RC(int_flags));
  1.5187 +  match(RegFlags);
  1.5188 +  format %{ "FLAGS_EQNE" %}
  1.5189 +  interface(REG_INTER);
  1.5190 +%}
  1.5191 +operand flagsReg_long_LEGT() %{
  1.5192 +  constraint(ALLOC_IN_RC(int_flags));
  1.5193 +  match(RegFlags);
  1.5194 +  format %{ "FLAGS_LEGT" %}
  1.5195 +  interface(REG_INTER);
  1.5196 +%}
  1.5197 +
  1.5198 +// Float register operands
  1.5199 +operand regD() %{
  1.5200 +  predicate( UseSSE < 2 );
  1.5201 +  constraint(ALLOC_IN_RC(dbl_reg));
  1.5202 +  match(RegD);
  1.5203 +  match(regDPR1);
  1.5204 +  match(regDPR2);
  1.5205 +  format %{ %}
  1.5206 +  interface(REG_INTER);
  1.5207 +%}
  1.5208 +
  1.5209 +operand regDPR1(regD reg) %{
  1.5210 +  predicate( UseSSE < 2 );
  1.5211 +  constraint(ALLOC_IN_RC(dbl_reg0));
  1.5212 +  match(reg);
  1.5213 +  format %{ "FPR1" %}
  1.5214 +  interface(REG_INTER);
  1.5215 +%}
  1.5216 +
  1.5217 +operand regDPR2(regD reg) %{
  1.5218 +  predicate( UseSSE < 2 );
  1.5219 +  constraint(ALLOC_IN_RC(dbl_reg1));
  1.5220 +  match(reg);
  1.5221 +  format %{ "FPR2" %}
  1.5222 +  interface(REG_INTER);
  1.5223 +%}
  1.5224 +
  1.5225 +operand regnotDPR1(regD reg) %{
  1.5226 +  predicate( UseSSE < 2 );
  1.5227 +  constraint(ALLOC_IN_RC(dbl_notreg0));
  1.5228 +  match(reg);
  1.5229 +  format %{ %}
  1.5230 +  interface(REG_INTER);
  1.5231 +%}
  1.5232 +
  1.5233 +// XMM Double register operands
  1.5234 +operand regXD() %{
  1.5235 +  predicate( UseSSE>=2 );
  1.5236 +  constraint(ALLOC_IN_RC(xdb_reg));
  1.5237 +  match(RegD);
  1.5238 +  match(regXD6);
  1.5239 +  match(regXD7);
  1.5240 +  format %{ %}
  1.5241 +  interface(REG_INTER);
  1.5242 +%}
  1.5243 +
  1.5244 +// XMM6 double register operands
  1.5245 +operand regXD6(regXD reg) %{
  1.5246 +  predicate( UseSSE>=2 );
  1.5247 +  constraint(ALLOC_IN_RC(xdb_reg6));
  1.5248 +  match(reg);
  1.5249 +  format %{ "XMM6" %}
  1.5250 +  interface(REG_INTER);
  1.5251 +%}
  1.5252 +
  1.5253 +// XMM7 double register operands
  1.5254 +operand regXD7(regXD reg) %{
  1.5255 +  predicate( UseSSE>=2 );
  1.5256 +  constraint(ALLOC_IN_RC(xdb_reg7));
  1.5257 +  match(reg);
  1.5258 +  format %{ "XMM7" %}
  1.5259 +  interface(REG_INTER);
  1.5260 +%}
  1.5261 +
  1.5262 +// Float register operands
  1.5263 +operand regF() %{
  1.5264 +  predicate( UseSSE < 2 );
  1.5265 +  constraint(ALLOC_IN_RC(flt_reg));
  1.5266 +  match(RegF);
  1.5267 +  match(regFPR1);
  1.5268 +  format %{ %}
  1.5269 +  interface(REG_INTER);
  1.5270 +%}
  1.5271 +
  1.5272 +// Float register operands
  1.5273 +operand regFPR1(regF reg) %{
  1.5274 +  predicate( UseSSE < 2 );
  1.5275 +  constraint(ALLOC_IN_RC(flt_reg0));
  1.5276 +  match(reg);
  1.5277 +  format %{ "FPR1" %}
  1.5278 +  interface(REG_INTER);
  1.5279 +%}
  1.5280 +
  1.5281 +// XMM register operands
  1.5282 +operand regX() %{
  1.5283 +  predicate( UseSSE>=1 );
  1.5284 +  constraint(ALLOC_IN_RC(xmm_reg));
  1.5285 +  match(RegF);
  1.5286 +  format %{ %}
  1.5287 +  interface(REG_INTER);
  1.5288 +%}
  1.5289 +
  1.5290 +
  1.5291 +//----------Memory Operands----------------------------------------------------
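// Reading note (an inferred sketch, not part of the matcher rules): the
// MEMORY_INTER fields below use sentinel values.  index(0x4) means "no
// index register" (ESP cannot serve as a SIB index), and base(0xFFFFFFFF)
// appears to mean "no base register"; the stackSlot operands further down
// use base(0x4) for ESP itself.  Read this way, the 'direct' operand is a
// plain absolute [disp32] address.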
  1.5292 +// Direct Memory Operand
  1.5293 +operand direct(immP addr) %{
  1.5294 +  match(addr);
  1.5295 +
  1.5296 +  format %{ "[$addr]" %}
  1.5297 +  interface(MEMORY_INTER) %{
  1.5298 +    base(0xFFFFFFFF);
  1.5299 +    index(0x4);
  1.5300 +    scale(0x0);
  1.5301 +    disp($addr);
  1.5302 +  %}
  1.5303 +%}
  1.5304 +
  1.5305 +// Indirect Memory Operand
  1.5306 +operand indirect(eRegP reg) %{
  1.5307 +  constraint(ALLOC_IN_RC(e_reg));
  1.5308 +  match(reg);
  1.5309 +
  1.5310 +  format %{ "[$reg]" %}
  1.5311 +  interface(MEMORY_INTER) %{
  1.5312 +    base($reg);
  1.5313 +    index(0x4);
  1.5314 +    scale(0x0);
  1.5315 +    disp(0x0);
  1.5316 +  %}
  1.5317 +%}
  1.5318 +
  1.5319 +// Indirect Memory Plus Short Offset Operand
  1.5320 +operand indOffset8(eRegP reg, immI8 off) %{
  1.5321 +  match(AddP reg off);
  1.5322 +
  1.5323 +  format %{ "[$reg + $off]" %}
  1.5324 +  interface(MEMORY_INTER) %{
  1.5325 +    base($reg);
  1.5326 +    index(0x4);
  1.5327 +    scale(0x0);
  1.5328 +    disp($off);
  1.5329 +  %}
  1.5330 +%}
  1.5331 +
  1.5332 +// Indirect Memory Plus Long Offset Operand
  1.5333 +operand indOffset32(eRegP reg, immI off) %{
  1.5334 +  match(AddP reg off);
  1.5335 +
  1.5336 +  format %{ "[$reg + $off]" %}
  1.5337 +  interface(MEMORY_INTER) %{
  1.5338 +    base($reg);
  1.5339 +    index(0x4);
  1.5340 +    scale(0x0);
  1.5341 +    disp($off);
  1.5342 +  %}
  1.5343 +%}
  1.5344 +
  1.5345 +// Indirect Memory Plus Long Offset Operand
  1.5346 +operand indOffset32X(eRegI reg, immP off) %{
  1.5347 +  match(AddP off reg);
  1.5348 +
  1.5349 +  format %{ "[$reg + $off]" %}
  1.5350 +  interface(MEMORY_INTER) %{
  1.5351 +    base($reg);
  1.5352 +    index(0x4);
  1.5353 +    scale(0x0);
  1.5354 +    disp($off);
  1.5355 +  %}
  1.5356 +%}
  1.5357 +
  1.5358 +// Indirect Memory Plus Index Register Plus Offset Operand
  1.5359 +operand indIndexOffset(eRegP reg, eRegI ireg, immI off) %{
  1.5360 +  match(AddP (AddP reg ireg) off);
  1.5361 +
  1.5362 +  op_cost(10);
  1.5363 +  format %{"[$reg + $off + $ireg]" %}
  1.5364 +  interface(MEMORY_INTER) %{
  1.5365 +    base($reg);
  1.5366 +    index($ireg);
  1.5367 +    scale(0x0);
  1.5368 +    disp($off);
  1.5369 +  %}
  1.5370 +%}
  1.5371 +
  1.5372 +// Indirect Memory Plus Index Register Plus Offset Operand
  1.5373 +operand indIndex(eRegP reg, eRegI ireg) %{
  1.5374 +  match(AddP reg ireg);
  1.5375 +
  1.5376 +  op_cost(10);
  1.5377 +  format %{"[$reg + $ireg]" %}
  1.5378 +  interface(MEMORY_INTER) %{
  1.5379 +    base($reg);
  1.5380 +    index($ireg);
  1.5381 +    scale(0x0);
  1.5382 +    disp(0x0);
  1.5383 +  %}
  1.5384 +%}
  1.5385 +
  1.5386 +// // -------------------------------------------------------------------------
  1.5387 +// // 486 architecture doesn't support "scale * index + offset" without a base
  1.5388 +// // -------------------------------------------------------------------------
  1.5389 +// // Scaled Memory Operands
  1.5390 +// // Indirect Memory Times Scale Plus Offset Operand
  1.5391 +// operand indScaleOffset(immP off, eRegI ireg, immI2 scale) %{
  1.5392 +//   match(AddP off (LShiftI ireg scale));
  1.5393 +//
  1.5394 +//   op_cost(10);
  1.5395 +//   format %{"[$off + $ireg << $scale]" %}
  1.5396 +//   interface(MEMORY_INTER) %{
  1.5397 +//     base(0x4);
  1.5398 +//     index($ireg);
  1.5399 +//     scale($scale);
  1.5400 +//     disp($off);
  1.5401 +//   %}
  1.5402 +// %}
  1.5403 +
  1.5404 +// Indirect Memory Times Scale Plus Index Register
  1.5405 +operand indIndexScale(eRegP reg, eRegI ireg, immI2 scale) %{
  1.5406 +  match(AddP reg (LShiftI ireg scale));
  1.5407 +
  1.5408 +  op_cost(10);
  1.5409 +  format %{"[$reg + $ireg << $scale]" %}
  1.5410 +  interface(MEMORY_INTER) %{
  1.5411 +    base($reg);
  1.5412 +    index($ireg);
  1.5413 +    scale($scale);
  1.5414 +    disp(0x0);
  1.5415 +  %}
  1.5416 +%}
  1.5417 +
  1.5418 +// Indirect Memory Times Scale Plus Index Register Plus Offset Operand
  1.5419 +operand indIndexScaleOffset(eRegP reg, immI off, eRegI ireg, immI2 scale) %{
  1.5420 +  match(AddP (AddP reg (LShiftI ireg scale)) off);
  1.5421 +
  1.5422 +  op_cost(10);
  1.5423 +  format %{"[$reg + $off + $ireg << $scale]" %}
  1.5424 +  interface(MEMORY_INTER) %{
  1.5425 +    base($reg);
  1.5426 +    index($ireg);
  1.5427 +    scale($scale);
  1.5428 +    disp($off);
  1.5429 +  %}
  1.5430 +%}
  1.5431 +
  1.5432 +//----------Load Long Memory Operands------------------------------------------
  1.5433 +// The load-long idiom will use its address expression again after loading
  1.5434 +// the first word of the long.  If the load-long destination overlaps with
  1.5435 +// registers used in the addressing expression, the 2nd half will be loaded
  1.5436 +// from a clobbered address.  Fix this by requiring that load-long use
  1.5437 +// address registers that do not overlap with the load-long target.
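// A minimal illustration of the hazard (assembly sketch, assuming the
// destination pair were allowed to overlap the address register):
//   MOV EAX,[EAX]        ; low word overwrites the base register
//   MOV EDX,[EAX+4]      ; high word now loads from a clobbered address
// Pinning the address into ESI (load_long_RegP / load_long_memory below)
// keeps the address expression valid for the second load.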
  1.5438 +
  1.5439 +// load-long support
  1.5440 +operand load_long_RegP() %{
  1.5441 +  constraint(ALLOC_IN_RC(esi_reg));
  1.5442 +  match(RegP);
  1.5443 +  match(eSIRegP);
  1.5444 +  op_cost(100);
  1.5445 +  format %{  %}
  1.5446 +  interface(REG_INTER);
  1.5447 +%}
  1.5448 +
  1.5449 +// Indirect Memory Operand Long
  1.5450 +operand load_long_indirect(load_long_RegP reg) %{
  1.5451 +  constraint(ALLOC_IN_RC(esi_reg));
  1.5452 +  match(reg);
  1.5453 +
  1.5454 +  format %{ "[$reg]" %}
  1.5455 +  interface(MEMORY_INTER) %{
  1.5456 +    base($reg);
  1.5457 +    index(0x4);
  1.5458 +    scale(0x0);
  1.5459 +    disp(0x0);
  1.5460 +  %}
  1.5461 +%}
  1.5462 +
  1.5463 +// Indirect Memory Plus Long Offset Operand
  1.5464 +operand load_long_indOffset32(load_long_RegP reg, immI off) %{
  1.5465 +  match(AddP reg off);
  1.5466 +
  1.5467 +  format %{ "[$reg + $off]" %}
  1.5468 +  interface(MEMORY_INTER) %{
  1.5469 +    base($reg);
  1.5470 +    index(0x4);
  1.5471 +    scale(0x0);
  1.5472 +    disp($off);
  1.5473 +  %}
  1.5474 +%}
  1.5475 +
  1.5476 +opclass load_long_memory(load_long_indirect, load_long_indOffset32);
  1.5477 +
  1.5478 +
  1.5479 +//----------Special Memory Operands--------------------------------------------
  1.5480 +// Stack Slot Operand - This operand is used for loading and storing temporary
  1.5481 +//                      values on the stack where a match requires a value to
  1.5482 +//                      flow through memory.
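// For example, the atomic long loads further down (loadL_volatile and
// loadLX_volatile) target a stackSlotL: the 64-bit value travels through an
// FPU or XMM register, is spilled to this stack slot, and the integer
// halves are reloaded from there.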
  1.5483 +operand stackSlotP(sRegP reg) %{
  1.5484 +  constraint(ALLOC_IN_RC(stack_slots));
  1.5485 +  // No match rule because this operand is only generated in matching
  1.5486 +  format %{ "[$reg]" %}
  1.5487 +  interface(MEMORY_INTER) %{
  1.5488 +    base(0x4);   // ESP
  1.5489 +    index(0x4);  // No Index
  1.5490 +    scale(0x0);  // No Scale
  1.5491 +    disp($reg);  // Stack Offset
  1.5492 +  %}
  1.5493 +%}
  1.5494 +
  1.5495 +operand stackSlotI(sRegI reg) %{
  1.5496 +  constraint(ALLOC_IN_RC(stack_slots));
  1.5497 +  // No match rule because this operand is only generated in matching
  1.5498 +  format %{ "[$reg]" %}
  1.5499 +  interface(MEMORY_INTER) %{
  1.5500 +    base(0x4);   // ESP
  1.5501 +    index(0x4);  // No Index
  1.5502 +    scale(0x0);  // No Scale
  1.5503 +    disp($reg);  // Stack Offset
  1.5504 +  %}
  1.5505 +%}
  1.5506 +
  1.5507 +operand stackSlotF(sRegF reg) %{
  1.5508 +  constraint(ALLOC_IN_RC(stack_slots));
  1.5509 +  // No match rule because this operand is only generated in matching
  1.5510 +  format %{ "[$reg]" %}
  1.5511 +  interface(MEMORY_INTER) %{
  1.5512 +    base(0x4);   // ESP
  1.5513 +    index(0x4);  // No Index
  1.5514 +    scale(0x0);  // No Scale
  1.5515 +    disp($reg);  // Stack Offset
  1.5516 +  %}
  1.5517 +%}
  1.5518 +
  1.5519 +operand stackSlotD(sRegD reg) %{
  1.5520 +  constraint(ALLOC_IN_RC(stack_slots));
  1.5521 +  // No match rule because this operand is only generated in matching
  1.5522 +  format %{ "[$reg]" %}
  1.5523 +  interface(MEMORY_INTER) %{
  1.5524 +    base(0x4);   // ESP
  1.5525 +    index(0x4);  // No Index
  1.5526 +    scale(0x0);  // No Scale
  1.5527 +    disp($reg);  // Stack Offset
  1.5528 +  %}
  1.5529 +%}
  1.5530 +
  1.5531 +operand stackSlotL(sRegL reg) %{
  1.5532 +  constraint(ALLOC_IN_RC(stack_slots));
  1.5533 +  // No match rule because this operand is only generated in matching
  1.5534 +  format %{ "[$reg]" %}
  1.5535 +  interface(MEMORY_INTER) %{
  1.5536 +    base(0x4);   // ESP
  1.5537 +    index(0x4);  // No Index
  1.5538 +    scale(0x0);  // No Scale
  1.5539 +    disp($reg);  // Stack Offset
  1.5540 +  %}
  1.5541 +%}
  1.5542 +
  1.5543 +//----------Memory Operands - Win95 Implicit Null Variants----------------
  1.5544 +// Indirect Memory Operand
  1.5545 +operand indirect_win95_safe(eRegP_no_EBP reg)
  1.5546 +%{
  1.5547 +  constraint(ALLOC_IN_RC(e_reg));
  1.5548 +  match(reg);
  1.5549 +
  1.5550 +  op_cost(100);
  1.5551 +  format %{ "[$reg]" %}
  1.5552 +  interface(MEMORY_INTER) %{
  1.5553 +    base($reg);
  1.5554 +    index(0x4);
  1.5555 +    scale(0x0);
  1.5556 +    disp(0x0);
  1.5557 +  %}
  1.5558 +%}
  1.5559 +
  1.5560 +// Indirect Memory Plus Short Offset Operand
  1.5561 +operand indOffset8_win95_safe(eRegP_no_EBP reg, immI8 off)
  1.5562 +%{
  1.5563 +  match(AddP reg off);
  1.5564 +
  1.5565 +  op_cost(100);
  1.5566 +  format %{ "[$reg + $off]" %}
  1.5567 +  interface(MEMORY_INTER) %{
  1.5568 +    base($reg);
  1.5569 +    index(0x4);
  1.5570 +    scale(0x0);
  1.5571 +    disp($off);
  1.5572 +  %}
  1.5573 +%}
  1.5574 +
  1.5575 +// Indirect Memory Plus Long Offset Operand
  1.5576 +operand indOffset32_win95_safe(eRegP_no_EBP reg, immI off)
  1.5577 +%{
  1.5578 +  match(AddP reg off);
  1.5579 +
  1.5580 +  op_cost(100);
  1.5581 +  format %{ "[$reg + $off]" %}
  1.5582 +  interface(MEMORY_INTER) %{
  1.5583 +    base($reg);
  1.5584 +    index(0x4);
  1.5585 +    scale(0x0);
  1.5586 +    disp($off);
  1.5587 +  %}
  1.5588 +%}
  1.5589 +
  1.5590 +// Indirect Memory Plus Index Register Plus Offset Operand
  1.5591 +operand indIndexOffset_win95_safe(eRegP_no_EBP reg, eRegI ireg, immI off)
  1.5592 +%{
  1.5593 +  match(AddP (AddP reg ireg) off);
  1.5594 +
  1.5595 +  op_cost(100);
  1.5596 +  format %{"[$reg + $off + $ireg]" %}
  1.5597 +  interface(MEMORY_INTER) %{
  1.5598 +    base($reg);
  1.5599 +    index($ireg);
  1.5600 +    scale(0x0);
  1.5601 +    disp($off);
  1.5602 +  %}
  1.5603 +%}
  1.5604 +
  1.5605 +// Indirect Memory Times Scale Plus Index Register
  1.5606 +operand indIndexScale_win95_safe(eRegP_no_EBP reg, eRegI ireg, immI2 scale)
  1.5607 +%{
  1.5608 +  match(AddP reg (LShiftI ireg scale));
  1.5609 +
  1.5610 +  op_cost(100);
  1.5611 +  format %{"[$reg + $ireg << $scale]" %}
  1.5612 +  interface(MEMORY_INTER) %{
  1.5613 +    base($reg);
  1.5614 +    index($ireg);
  1.5615 +    scale($scale);
  1.5616 +    disp(0x0);
  1.5617 +  %}
  1.5618 +%}
  1.5619 +
  1.5620 +// Indirect Memory Times Scale Plus Index Register Plus Offset Operand
  1.5621 +operand indIndexScaleOffset_win95_safe(eRegP_no_EBP reg, immI off, eRegI ireg, immI2 scale)
  1.5622 +%{
  1.5623 +  match(AddP (AddP reg (LShiftI ireg scale)) off);
  1.5624 +
  1.5625 +  op_cost(100);
  1.5626 +  format %{"[$reg + $off + $ireg << $scale]" %}
  1.5627 +  interface(MEMORY_INTER) %{
  1.5628 +    base($reg);
  1.5629 +    index($ireg);
  1.5630 +    scale($scale);
  1.5631 +    disp($off);
  1.5632 +  %}
  1.5633 +%}
  1.5634 +
  1.5635 +//----------Conditional Branch Operands----------------------------------------
  1.5636 +// Comparison Op  - This is the operation of the comparison, and is limited to
  1.5637 +//                  the following set of codes:
  1.5638 +//                  L (<), LE (<=), G (>), GE (>=), E (==), NE (!=)
  1.5639 +//
  1.5640 +// Other attributes of the comparison, such as unsignedness, are specified
  1.5641 +// by the comparison instruction that sets a condition code flags register.
  1.5642 +// That result is represented by a flags operand whose subtype is appropriate
  1.5643 +// to the unsignedness (etc.) of the comparison.
  1.5644 +//
  1.5645 +// Later, the instruction which matches both the Comparison Op (a Bool) and
  1.5646 +// the flags (produced by the Cmp) specifies the coding of the comparison op
  1.5647 +// by matching a specific subtype of Bool operand below, such as cmpOpU.
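// As a concrete reading of the encodings below (these are the standard
// IA-32 condition-code nibbles): equal(0x4) is the E/Z condition and
// less(0xC) is L, so an emitter can fold the nibble into an opcode base,
// e.g. a short conditional branch is 0x70 | cc (JE = 0x74, JL = 0x7C) and
// the near form is 0x0F, 0x80 | cc.  The unsigned variants in cmpOpU map
// onto the below/above family (JB = 0x72, JA = 0x77).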
  1.5648 +
  1.5649 +// Comparison Code
  1.5650 +operand cmpOp() %{
  1.5651 +  match(Bool);
  1.5652 +
  1.5653 +  format %{ "" %}
  1.5654 +  interface(COND_INTER) %{
  1.5655 +    equal(0x4);
  1.5656 +    not_equal(0x5);
  1.5657 +    less(0xC);
  1.5658 +    greater_equal(0xD);
  1.5659 +    less_equal(0xE);
  1.5660 +    greater(0xF);
  1.5661 +  %}
  1.5662 +%}
  1.5663 +
  1.5664 +// Comparison Code, unsigned compare.  Used by FP also, with
  1.5665 +// C2 (unordered) turned into GT or LT already.  The other bits
  1.5666 +// C0 and C3 are turned into Carry & Zero flags.
  1.5667 +operand cmpOpU() %{
  1.5668 +  match(Bool);
  1.5669 +
  1.5670 +  format %{ "" %}
  1.5671 +  interface(COND_INTER) %{
  1.5672 +    equal(0x4);
  1.5673 +    not_equal(0x5);
  1.5674 +    less(0x2);
  1.5675 +    greater_equal(0x3);
  1.5676 +    less_equal(0x6);
  1.5677 +    greater(0x7);
  1.5678 +  %}
  1.5679 +%}
  1.5680 +
  1.5681 +// Comparison Code for FP conditional move
  1.5682 +operand cmpOp_fcmov() %{
  1.5683 +  match(Bool);
  1.5684 +
  1.5685 +  format %{ "" %}
  1.5686 +  interface(COND_INTER) %{
  1.5687 +    equal        (0x0C8);
  1.5688 +    not_equal    (0x1C8);
  1.5689 +    less         (0x0C0);
  1.5690 +    greater_equal(0x1C0);
  1.5691 +    less_equal   (0x0D0);
  1.5692 +    greater      (0x1D0);
  1.5693 +  %}
  1.5694 +%}
  1.5695 +
  1.5696 +// Comparison Code used in long compares
  1.5697 +operand cmpOp_commute() %{
  1.5698 +  match(Bool);
  1.5699 +
  1.5700 +  format %{ "" %}
  1.5701 +  interface(COND_INTER) %{
  1.5702 +    equal(0x4);
  1.5703 +    not_equal(0x5);
  1.5704 +    less(0xF);
  1.5705 +    greater_equal(0xE);
  1.5706 +    less_equal(0xD);
  1.5707 +    greater(0xC);
  1.5708 +  %}
  1.5709 +%}
  1.5710 +
  1.5711 +//----------OPERAND CLASSES----------------------------------------------------
  1.5712 +// Operand Classes are groups of operands that are used to simplify
  1.5713 +// instruction definitions by not requiring the AD writer to specify separate
  1.5714 +// instructions for every form of operand when the instruction accepts
  1.5715 +// multiple operand types with the same basic encoding and format.  The classic
  1.5716 +// case of this is memory operands.
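// For example, the single loadI rule further down takes a 'memory' operand,
// so one instruct definition covers [EBX], [EBX+8], [EBX+ECX*4+16] and the
// other addressing forms listed in the opclass, instead of one rule per form.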
  1.5717 +
  1.5718 +opclass memory(direct, indirect, indOffset8, indOffset32, indOffset32X, indIndexOffset,
  1.5719 +               indIndex, indIndexScale, indIndexScaleOffset);
  1.5720 +
  1.5721 +// Long memory operations are encoded in 2 instructions and a +4 offset.
  1.5722 +// This means some kind of offset is always required and you cannot use
  1.5723 +// an oop as the offset (as is done when referencing static globals).
  1.5724 +opclass long_memory(direct, indirect, indOffset8, indOffset32, indIndexOffset,
  1.5725 +                    indIndex, indIndexScale, indIndexScaleOffset);
  1.5726 +
  1.5727 +
  1.5728 +//----------PIPELINE-----------------------------------------------------------
  1.5729 +// Rules which define the behavior of the target architecture's pipeline.
  1.5730 +pipeline %{
  1.5731 +
  1.5732 +//----------ATTRIBUTES---------------------------------------------------------
  1.5733 +attributes %{
  1.5734 +  variable_size_instructions;        // Variable-size instructions
  1.5735 +  max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
  1.5736 +  instruction_unit_size = 1;         // An instruction is 1 byte long
  1.5737 +  instruction_fetch_unit_size = 16;  // The processor fetches one line
  1.5738 +  instruction_fetch_units = 1;       // of 16 bytes
  1.5739 +
  1.5740 +  // List of nop instructions
  1.5741 +  nops( MachNop );
  1.5742 +%}
  1.5743 +
  1.5744 +//----------RESOURCES----------------------------------------------------------
  1.5745 +// Resources are the functional units available to the machine
  1.5746 +
  1.5747 +// Generic P2/P3 pipeline
  1.5748 +// 3 decoders, only D0 handles big operands; a "bundle" is the limit of
  1.5749 +// 3 instructions decoded per cycle.
  1.5750 +// 2 load/store ops per cycle, 1 branch, 1 FPU,
  1.5751 +// 2 ALU ops; only ALU0 handles mul/div instructions.
  1.5752 +resources( D0, D1, D2, DECODE = D0 | D1 | D2,
  1.5753 +           MS0, MS1, MEM = MS0 | MS1,
  1.5754 +           BR, FPU,
  1.5755 +           ALU0, ALU1, ALU = ALU0 | ALU1 );
  1.5756 +
  1.5757 +//----------PIPELINE DESCRIPTION-----------------------------------------------
  1.5758 +// Pipeline Description specifies the stages in the machine's pipeline
  1.5759 +
  1.5760 +// Generic P2/P3 pipeline
  1.5761 +pipe_desc(S0, S1, S2, S3, S4, S5);
  1.5762 +
  1.5763 +//----------PIPELINE CLASSES---------------------------------------------------
  1.5764 +// Pipeline Classes describe the stages in which input and output are
  1.5765 +// referenced by the hardware pipeline.
  1.5766 +
  1.5767 +// Naming convention: ialu or fpu
  1.5768 +// Then: _reg
  1.5769 +// Then: _reg if there is a 2nd register
  1.5770 +// Then: _long if it's a pair of instructions implementing a long
  1.5771 +// Then: _fat if it requires the big decoder
  1.5772 +//   Or: _mem if it requires the big decoder and a memory unit.
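// Reading a name such as ialu_reg_mem below under this convention: an
// integer ALU operation with a register destination and a memory source,
// which therefore claims the big decoder (D0) and a memory unit (MEM).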
  1.5773 +
  1.5774 +// Integer ALU reg operation
  1.5775 +pipe_class ialu_reg(eRegI dst) %{
  1.5776 +    single_instruction;
  1.5777 +    dst    : S4(write);
  1.5778 +    dst    : S3(read);
  1.5779 +    DECODE : S0;        // any decoder
  1.5780 +    ALU    : S3;        // any alu
  1.5781 +%}
  1.5782 +
  1.5783 +// Long ALU reg operation
  1.5784 +pipe_class ialu_reg_long(eRegL dst) %{
  1.5785 +    instruction_count(2);
  1.5786 +    dst    : S4(write);
  1.5787 +    dst    : S3(read);
  1.5788 +    DECODE : S0(2);     // any 2 decoders
  1.5789 +    ALU    : S3(2);     // both alus
  1.5790 +%}
  1.5791 +
  1.5792 +// Integer ALU reg operation using big decoder
  1.5793 +pipe_class ialu_reg_fat(eRegI dst) %{
  1.5794 +    single_instruction;
  1.5795 +    dst    : S4(write);
  1.5796 +    dst    : S3(read);
  1.5797 +    D0     : S0;        // big decoder only
  1.5798 +    ALU    : S3;        // any alu
  1.5799 +%}
  1.5800 +
  1.5801 +// Long ALU reg operation using big decoder
  1.5802 +pipe_class ialu_reg_long_fat(eRegL dst) %{
  1.5803 +    instruction_count(2);
  1.5804 +    dst    : S4(write);
  1.5805 +    dst    : S3(read);
  1.5806 +    D0     : S0(2);     // big decoder only; twice
  1.5807 +    ALU    : S3(2);     // any 2 alus
  1.5808 +%}
  1.5809 +
  1.5810 +// Integer ALU reg-reg operation
  1.5811 +pipe_class ialu_reg_reg(eRegI dst, eRegI src) %{
  1.5812 +    single_instruction;
  1.5813 +    dst    : S4(write);
  1.5814 +    src    : S3(read);
  1.5815 +    DECODE : S0;        // any decoder
  1.5816 +    ALU    : S3;        // any alu
  1.5817 +%}
  1.5818 +
  1.5819 +// Long ALU reg-reg operation
  1.5820 +pipe_class ialu_reg_reg_long(eRegL dst, eRegL src) %{
  1.5821 +    instruction_count(2);
  1.5822 +    dst    : S4(write);
  1.5823 +    src    : S3(read);
  1.5824 +    DECODE : S0(2);     // any 2 decoders
  1.5825 +    ALU    : S3(2);     // both alus
  1.5826 +%}
  1.5827 +
  1.5828 +// Integer ALU reg-reg operation
  1.5829 +pipe_class ialu_reg_reg_fat(eRegI dst, memory src) %{
  1.5830 +    single_instruction;
  1.5831 +    dst    : S4(write);
  1.5832 +    src    : S3(read);
  1.5833 +    D0     : S0;        // big decoder only
  1.5834 +    ALU    : S3;        // any alu
  1.5835 +%}
  1.5836 +
  1.5837 +// Long ALU reg-reg operation
  1.5838 +pipe_class ialu_reg_reg_long_fat(eRegL dst, eRegL src) %{
  1.5839 +    instruction_count(2);
  1.5840 +    dst    : S4(write);
  1.5841 +    src    : S3(read);
  1.5842 +    D0     : S0(2);     // big decoder only; twice
  1.5843 +    ALU    : S3(2);     // both alus
  1.5844 +%}
  1.5845 +
  1.5846 +// Integer ALU reg-mem operation
  1.5847 +pipe_class ialu_reg_mem(eRegI dst, memory mem) %{
  1.5848 +    single_instruction;
  1.5849 +    dst    : S5(write);
  1.5850 +    mem    : S3(read);
  1.5851 +    D0     : S0;        // big decoder only
  1.5852 +    ALU    : S4;        // any alu
  1.5853 +    MEM    : S3;        // any mem
  1.5854 +%}
  1.5855 +
  1.5856 +// Long ALU reg-mem operation
  1.5857 +pipe_class ialu_reg_long_mem(eRegL dst, load_long_memory mem) %{
  1.5858 +    instruction_count(2);
  1.5859 +    dst    : S5(write);
  1.5860 +    mem    : S3(read);
  1.5861 +    D0     : S0(2);     // big decoder only; twice
  1.5862 +    ALU    : S4(2);     // any 2 alus
  1.5863 +    MEM    : S3(2);     // both mems
  1.5864 +%}
  1.5865 +
  1.5866 +// Integer mem operation (prefetch)
  1.5867 +pipe_class ialu_mem(memory mem)
  1.5868 +%{
  1.5869 +    single_instruction;
  1.5870 +    mem    : S3(read);
  1.5871 +    D0     : S0;        // big decoder only
  1.5872 +    MEM    : S3;        // any mem
  1.5873 +%}
  1.5874 +
  1.5875 +// Integer Store to Memory
  1.5876 +pipe_class ialu_mem_reg(memory mem, eRegI src) %{
  1.5877 +    single_instruction;
  1.5878 +    mem    : S3(read);
  1.5879 +    src    : S5(read);
  1.5880 +    D0     : S0;        // big decoder only
  1.5881 +    ALU    : S4;        // any alu
  1.5882 +    MEM    : S3;
  1.5883 +%}
  1.5884 +
  1.5885 +// Long Store to Memory
  1.5886 +pipe_class ialu_mem_long_reg(memory mem, eRegL src) %{
  1.5887 +    instruction_count(2);
  1.5888 +    mem    : S3(read);
  1.5889 +    src    : S5(read);
  1.5890 +    D0     : S0(2);     // big decoder only; twice
  1.5891 +    ALU    : S4(2);     // any 2 alus
  1.5892 +    MEM    : S3(2);     // Both mems
  1.5893 +%}
  1.5894 +
  1.5895 +// Integer Store to Memory
  1.5896 +pipe_class ialu_mem_imm(memory mem) %{
  1.5897 +    single_instruction;
  1.5898 +    mem    : S3(read);
  1.5899 +    D0     : S0;        // big decoder only
  1.5900 +    ALU    : S4;        // any alu
  1.5901 +    MEM    : S3;
  1.5902 +%}
  1.5903 +
  1.5904 +// Integer ALU0 reg-reg operation
  1.5905 +pipe_class ialu_reg_reg_alu0(eRegI dst, eRegI src) %{
  1.5906 +    single_instruction;
  1.5907 +    dst    : S4(write);
  1.5908 +    src    : S3(read);
  1.5909 +    D0     : S0;        // Big decoder only
  1.5910 +    ALU0   : S3;        // only alu0
  1.5911 +%}
  1.5912 +
  1.5913 +// Integer ALU0 reg-mem operation
  1.5914 +pipe_class ialu_reg_mem_alu0(eRegI dst, memory mem) %{
  1.5915 +    single_instruction;
  1.5916 +    dst    : S5(write);
  1.5917 +    mem    : S3(read);
  1.5918 +    D0     : S0;        // big decoder only
  1.5919 +    ALU0   : S4;        // ALU0 only
  1.5920 +    MEM    : S3;        // any mem
  1.5921 +%}
  1.5922 +
  1.5923 +// Integer ALU reg-reg operation
  1.5924 +pipe_class ialu_cr_reg_reg(eFlagsReg cr, eRegI src1, eRegI src2) %{
  1.5925 +    single_instruction;
  1.5926 +    cr     : S4(write);
  1.5927 +    src1   : S3(read);
  1.5928 +    src2   : S3(read);
  1.5929 +    DECODE : S0;        // any decoder
  1.5930 +    ALU    : S3;        // any alu
  1.5931 +%}
  1.5932 +
  1.5933 +// Integer ALU reg-imm operation
  1.5934 +pipe_class ialu_cr_reg_imm(eFlagsReg cr, eRegI src1) %{
  1.5935 +    single_instruction;
  1.5936 +    cr     : S4(write);
  1.5937 +    src1   : S3(read);
  1.5938 +    DECODE : S0;        // any decoder
  1.5939 +    ALU    : S3;        // any alu
  1.5940 +%}
  1.5941 +
  1.5942 +// Integer ALU reg-mem operation
  1.5943 +pipe_class ialu_cr_reg_mem(eFlagsReg cr, eRegI src1, memory src2) %{
  1.5944 +    single_instruction;
  1.5945 +    cr     : S4(write);
  1.5946 +    src1   : S3(read);
  1.5947 +    src2   : S3(read);
  1.5948 +    D0     : S0;        // big decoder only
  1.5949 +    ALU    : S4;        // any alu
  1.5950 +    MEM    : S3;
  1.5951 +%}
  1.5952 +
  1.5953 +// Conditional move reg-reg
  1.5954 +pipe_class pipe_cmplt( eRegI p, eRegI q, eRegI y ) %{
  1.5955 +    instruction_count(4);
  1.5956 +    y      : S4(read);
  1.5957 +    q      : S3(read);
  1.5958 +    p      : S3(read);
  1.5959 +    DECODE : S0(4);     // any decoder
  1.5960 +%}
  1.5961 +
  1.5962 +// Conditional move reg-reg
  1.5963 +pipe_class pipe_cmov_reg( eRegI dst, eRegI src, eFlagsReg cr ) %{
  1.5964 +    single_instruction;
  1.5965 +    dst    : S4(write);
  1.5966 +    src    : S3(read);
  1.5967 +    cr     : S3(read);
  1.5968 +    DECODE : S0;        // any decoder
  1.5969 +%}
  1.5970 +
  1.5971 +// Conditional move reg-mem
  1.5972 +pipe_class pipe_cmov_mem( eFlagsReg cr, eRegI dst, memory src) %{
  1.5973 +    single_instruction;
  1.5974 +    dst    : S4(write);
  1.5975 +    src    : S3(read);
  1.5976 +    cr     : S3(read);
  1.5977 +    DECODE : S0;        // any decoder
  1.5978 +    MEM    : S3;
  1.5979 +%}
  1.5980 +
  1.5981 +// Conditional move reg-reg long
  1.5982 +pipe_class pipe_cmov_reg_long( eFlagsReg cr, eRegL dst, eRegL src) %{
  1.5983 +    single_instruction;
  1.5984 +    dst    : S4(write);
  1.5985 +    src    : S3(read);
  1.5986 +    cr     : S3(read);
  1.5987 +    DECODE : S0(2);     // any 2 decoders
  1.5988 +%}
  1.5989 +
  1.5990 +// Conditional move double reg-reg
  1.5991 +pipe_class pipe_cmovD_reg( eFlagsReg cr, regDPR1 dst, regD src) %{
  1.5992 +    single_instruction;
  1.5993 +    dst    : S4(write);
  1.5994 +    src    : S3(read);
  1.5995 +    cr     : S3(read);
  1.5996 +    DECODE : S0;        // any decoder
  1.5997 +%}
  1.5998 +
  1.5999 +// Float reg-reg operation
  1.6000 +pipe_class fpu_reg(regD dst) %{
  1.6001 +    instruction_count(2);
  1.6002 +    dst    : S3(read);
  1.6003 +    DECODE : S0(2);     // any 2 decoders
  1.6004 +    FPU    : S3;
  1.6005 +%}
  1.6006 +
  1.6007 +// Float reg-reg operation
  1.6008 +pipe_class fpu_reg_reg(regD dst, regD src) %{
  1.6009 +    instruction_count(2);
  1.6010 +    dst    : S4(write);
  1.6011 +    src    : S3(read);
  1.6012 +    DECODE : S0(2);     // any 2 decoders
  1.6013 +    FPU    : S3;
  1.6014 +%}
  1.6015 +
  1.6016 +// Float reg-reg operation
  1.6017 +pipe_class fpu_reg_reg_reg(regD dst, regD src1, regD src2) %{
  1.6018 +    instruction_count(3);
  1.6019 +    dst    : S4(write);
  1.6020 +    src1   : S3(read);
  1.6021 +    src2   : S3(read);
  1.6022 +    DECODE : S0(3);     // any 3 decoders
  1.6023 +    FPU    : S3(2);
  1.6024 +%}
  1.6025 +
  1.6026 +// Float reg-reg operation
  1.6027 +pipe_class fpu_reg_reg_reg_reg(regD dst, regD src1, regD src2, regD src3) %{
  1.6028 +    instruction_count(4);
  1.6029 +    dst    : S4(write);
  1.6030 +    src1   : S3(read);
  1.6031 +    src2   : S3(read);
  1.6032 +    src3   : S3(read);
  1.6033 +    DECODE : S0(4);     // any 4 decode slots
  1.6034 +    FPU    : S3(2);
  1.6035 +%}
  1.6036 +
  1.6037 +// Float reg-reg operation
  1.6038 +pipe_class fpu_reg_mem_reg_reg(regD dst, memory src1, regD src2, regD src3) %{
  1.6039 +    instruction_count(4);
  1.6040 +    dst    : S4(write);
  1.6041 +    src1   : S3(read);
  1.6042 +    src2   : S3(read);
  1.6043 +    src3   : S3(read);
  1.6044 +    DECODE : S1(3);     // any 3 decoders
  1.6045 +    D0     : S0;        // Big decoder only
  1.6046 +    FPU    : S3(2);
  1.6047 +    MEM    : S3;
  1.6048 +%}
  1.6049 +
  1.6050 +// Float reg-mem operation
  1.6051 +pipe_class fpu_reg_mem(regD dst, memory mem) %{
  1.6052 +    instruction_count(2);
  1.6053 +    dst    : S5(write);
  1.6054 +    mem    : S3(read);
  1.6055 +    D0     : S0;        // big decoder only
  1.6056 +    DECODE : S1;        // any decoder for FPU POP
  1.6057 +    FPU    : S4;
  1.6058 +    MEM    : S3;        // any mem
  1.6059 +%}
  1.6060 +
  1.6061 +// Float reg-mem operation
  1.6062 +pipe_class fpu_reg_reg_mem(regD dst, regD src1, memory mem) %{
  1.6063 +    instruction_count(3);
  1.6064 +    dst    : S5(write);
  1.6065 +    src1   : S3(read);
  1.6066 +    mem    : S3(read);
  1.6067 +    D0     : S0;        // big decoder only
  1.6068 +    DECODE : S1(2);     // any decoder for FPU POP
  1.6069 +    FPU    : S4;
  1.6070 +    MEM    : S3;        // any mem
  1.6071 +%}
  1.6072 +
  1.6073 +// Float mem-reg operation
  1.6074 +pipe_class fpu_mem_reg(memory mem, regD src) %{
  1.6075 +    instruction_count(2);
  1.6076 +    src    : S5(read);
  1.6077 +    mem    : S3(read);
  1.6078 +    DECODE : S0;        // any decoder for FPU PUSH
  1.6079 +    D0     : S1;        // big decoder only
  1.6080 +    FPU    : S4;
  1.6081 +    MEM    : S3;        // any mem
  1.6082 +%}
  1.6083 +
  1.6084 +pipe_class fpu_mem_reg_reg(memory mem, regD src1, regD src2) %{
  1.6085 +    instruction_count(3);
  1.6086 +    src1   : S3(read);
  1.6087 +    src2   : S3(read);
  1.6088 +    mem    : S3(read);
  1.6089 +    DECODE : S0(2);     // any decoder for FPU PUSH
  1.6090 +    D0     : S1;        // big decoder only
  1.6091 +    FPU    : S4;
  1.6092 +    MEM    : S3;        // any mem
  1.6093 +%}
  1.6094 +
  1.6095 +pipe_class fpu_mem_reg_mem(memory mem, regD src1, memory src2) %{
  1.6096 +    instruction_count(3);
  1.6097 +    src1   : S3(read);
  1.6098 +    src2   : S3(read);
  1.6099 +    mem    : S4(read);
  1.6100 +    DECODE : S0;        // any decoder for FPU PUSH
  1.6101 +    D0     : S0(2);     // big decoder only
  1.6102 +    FPU    : S4;
  1.6103 +    MEM    : S3(2);     // any mem
  1.6104 +%}
  1.6105 +
  1.6106 +pipe_class fpu_mem_mem(memory dst, memory src1) %{
  1.6107 +    instruction_count(2);
  1.6108 +    src1   : S3(read);
  1.6109 +    dst    : S4(read);
  1.6110 +    D0     : S0(2);     // big decoder only
  1.6111 +    MEM    : S3(2);     // any mem
  1.6112 +%}
  1.6113 +
  1.6114 +pipe_class fpu_mem_mem_mem(memory dst, memory src1, memory src2) %{
  1.6115 +    instruction_count(3);
  1.6116 +    src1   : S3(read);
  1.6117 +    src2   : S3(read);
  1.6118 +    dst    : S4(read);
  1.6119 +    D0     : S0(3);     // big decoder only
  1.6120 +    FPU    : S4;
  1.6121 +    MEM    : S3(3);     // any mem
  1.6122 +%}
  1.6123 +
  1.6124 +pipe_class fpu_mem_reg_con(memory mem, regD src1) %{
  1.6125 +    instruction_count(3);
  1.6126 +    src1   : S4(read);
  1.6127 +    mem    : S4(read);
  1.6128 +    DECODE : S0;        // any decoder for FPU PUSH
  1.6129 +    D0     : S0(2);     // big decoder only
  1.6130 +    FPU    : S4;
  1.6131 +    MEM    : S3(2);     // any mem
  1.6132 +%}
  1.6133 +
  1.6134 +// Float load constant
  1.6135 +pipe_class fpu_reg_con(regD dst) %{
  1.6136 +    instruction_count(2);
  1.6137 +    dst    : S5(write);
  1.6138 +    D0     : S0;        // big decoder only for the load
  1.6139 +    DECODE : S1;        // any decoder for FPU POP
  1.6140 +    FPU    : S4;
  1.6141 +    MEM    : S3;        // any mem
  1.6142 +%}
  1.6143 +
  1.6144 +// Float load constant
  1.6145 +pipe_class fpu_reg_reg_con(regD dst, regD src) %{
  1.6146 +    instruction_count(3);
  1.6147 +    dst    : S5(write);
  1.6148 +    src    : S3(read);
  1.6149 +    D0     : S0;        // big decoder only for the load
  1.6150 +    DECODE : S1(2);     // any decoder for FPU POP
  1.6151 +    FPU    : S4;
  1.6152 +    MEM    : S3;        // any mem
  1.6153 +%}
  1.6154 +
  1.6155 +// UnConditional branch
  1.6156 +pipe_class pipe_jmp( label labl ) %{
  1.6157 +    single_instruction;
  1.6158 +    BR   : S3;
  1.6159 +%}
  1.6160 +
  1.6161 +// Conditional branch
  1.6162 +pipe_class pipe_jcc( cmpOp cmp, eFlagsReg cr, label labl ) %{
  1.6163 +    single_instruction;
  1.6164 +    cr    : S1(read);
  1.6165 +    BR    : S3;
  1.6166 +%}
  1.6167 +
  1.6168 +// Allocation idiom
  1.6169 +pipe_class pipe_cmpxchg( eRegP dst, eRegP heap_ptr ) %{
  1.6170 +    instruction_count(1); force_serialization;
  1.6171 +    fixed_latency(6);
  1.6172 +    heap_ptr : S3(read);
  1.6173 +    DECODE   : S0(3);
  1.6174 +    D0       : S2;
  1.6175 +    MEM      : S3;
  1.6176 +    ALU      : S3(2);
  1.6177 +    dst      : S5(write);
  1.6178 +    BR       : S5;
  1.6179 +%}
  1.6180 +
  1.6181 +// Generic big/slow expanded idiom
  1.6182 +pipe_class pipe_slow(  ) %{
  1.6183 +    instruction_count(10); multiple_bundles; force_serialization;
  1.6184 +    fixed_latency(100);
  1.6185 +    D0  : S0(2);
  1.6186 +    MEM : S3(2);
  1.6187 +%}
  1.6188 +
  1.6189 +// The real do-nothing guy
  1.6190 +pipe_class empty( ) %{
  1.6191 +    instruction_count(0);
  1.6192 +%}
  1.6193 +
  1.6194 +// Define the class for the Nop node
  1.6195 +define %{
  1.6196 +   MachNop = empty;
  1.6197 +%}
  1.6198 +
  1.6199 +%}
  1.6200 +
  1.6201 +//----------INSTRUCTIONS-------------------------------------------------------
  1.6202 +//
  1.6203 +// match      -- States which machine-independent subtree may be replaced
  1.6204 +//               by this instruction.
  1.6205 +// ins_cost   -- The estimated cost of this instruction is used by instruction
  1.6206 +//               selection to identify a minimum cost tree of machine
  1.6207 +//               instructions that matches a tree of machine-independent
  1.6208 +//               instructions.
  1.6209 +// format     -- A string providing the disassembly for this instruction.
  1.6210 +//               The value of an instruction's operand may be inserted
  1.6211 +//               by referring to it with a '$' prefix.
  1.6212 +// opcode     -- Three instruction opcodes may be provided.  These are referred
  1.6213 +//               to within an encode class as $primary, $secondary, and $tertiary
  1.6214 +//               respectively.  The primary opcode is commonly used to
  1.6215 +//               indicate the type of machine instruction, while secondary
  1.6216 +//               and tertiary are often used for prefix options or addressing
  1.6217 +//               modes.
  1.6218 +// ins_encode -- A list of encode classes with parameters. The encode class
  1.6219 +//               name must have been defined in an 'enc_class' specification
  1.6220 +//               in the encode section of the architecture description.
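// A worked reading of these pieces, using the loadI rule further down as the
// example: match(Set dst (LoadI mem)) claims the ideal LoadI subtree,
// opcode(0x8B) supplies $primary, and ins_encode(OpcP, RegMem(dst,mem))
// emits that primary byte followed by the ModRM/SIB encoding of the two
// operands, producing the "MOV $dst,$mem" shown in its format string.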
  1.6221 +
  1.6222 +//----------BSWAP-Instruction--------------------------------------------------
  1.6223 +instruct bytes_reverse_int(eRegI dst) %{
  1.6224 +  match(Set dst (ReverseBytesI dst));
  1.6225 +
  1.6226 +  format %{ "BSWAP  $dst" %}
  1.6227 +  opcode(0x0F, 0xC8);
  1.6228 +  ins_encode( OpcP, OpcSReg(dst) );
  1.6229 +  ins_pipe( ialu_reg );
  1.6230 +%}
  1.6231 +
  1.6232 +instruct bytes_reverse_long(eRegL dst) %{
  1.6233 +  match(Set dst (ReverseBytesL dst));
  1.6234 +
  1.6235 +  format %{ "BSWAP  $dst.lo\n\t"
  1.6236 +            "BSWAP  $dst.hi\n\t"
  1.6237 +            "XCHG   $dst.lo $dst.hi" %}
  1.6238 +
  1.6239 +  ins_cost(125);
  1.6240 +  ins_encode( bswap_long_bytes(dst) );
  1.6241 +  ins_pipe( ialu_reg_reg);
  1.6242 +%}
  1.6243 +
  1.6244 +
  1.6245 +//----------Load/Store/Move Instructions---------------------------------------
  1.6246 +//----------Load Instructions--------------------------------------------------
  1.6247 +// Load Byte (8bit signed)
  1.6248 +instruct loadB(xRegI dst, memory mem) %{
  1.6249 +  match(Set dst (LoadB mem));
  1.6250 +
  1.6251 +  ins_cost(125);
  1.6252 +  format %{ "MOVSX8 $dst,$mem" %}
  1.6253 +  opcode(0xBE, 0x0F);
  1.6254 +  ins_encode( OpcS, OpcP, RegMem(dst,mem));
  1.6255 +  ins_pipe( ialu_reg_mem );
  1.6256 +%}
  1.6257 +
  1.6258 +// Load Byte (8bit UNsigned)
  1.6259 +instruct loadUB(xRegI dst, memory mem, immI_255 bytemask) %{
  1.6260 +  match(Set dst (AndI (LoadB mem) bytemask));
  1.6261 +
  1.6262 +  ins_cost(125);
  1.6263 +  format %{ "MOVZX8 $dst,$mem" %}
  1.6264 +  opcode(0xB6, 0x0F);
  1.6265 +  ins_encode( OpcS, OpcP, RegMem(dst,mem));
  1.6266 +  ins_pipe( ialu_reg_mem );
  1.6267 +%}
  1.6268 +
  1.6269 +// Load Char (16bit unsigned)
  1.6270 +instruct loadC(eRegI dst, memory mem) %{
  1.6271 +  match(Set dst (LoadC mem));
  1.6272 +
  1.6273 +  ins_cost(125);
  1.6274 +  format %{ "MOVZX  $dst,$mem" %}
  1.6275 +  opcode(0xB7, 0x0F);
  1.6276 +  ins_encode( OpcS, OpcP, RegMem(dst,mem));
  1.6277 +  ins_pipe( ialu_reg_mem );
  1.6278 +%}
  1.6279 +
  1.6280 +// Load Integer
  1.6281 +instruct loadI(eRegI dst, memory mem) %{
  1.6282 +  match(Set dst (LoadI mem));
  1.6283 +
  1.6284 +  ins_cost(125);
  1.6285 +  format %{ "MOV    $dst,$mem" %}
  1.6286 +  opcode(0x8B);
  1.6287 +  ins_encode( OpcP, RegMem(dst,mem));
  1.6288 +  ins_pipe( ialu_reg_mem );
  1.6289 +%}
  1.6290 +
  1.6291 +// Load Long.  Cannot clobber address while loading, so restrict address
  1.6292 +// register to ESI
  1.6293 +instruct loadL(eRegL dst, load_long_memory mem) %{
  1.6294 +  predicate(!((LoadLNode*)n)->require_atomic_access());
  1.6295 +  match(Set dst (LoadL mem));
  1.6296 +
  1.6297 +  ins_cost(250);
  1.6298 +  format %{ "MOV    $dst.lo,$mem\n\t"
  1.6299 +            "MOV    $dst.hi,$mem+4" %}
  1.6300 +  opcode(0x8B, 0x8B);
  1.6301 +  ins_encode( OpcP, RegMem(dst,mem), OpcS, RegMem_Hi(dst,mem));
  1.6302 +  ins_pipe( ialu_reg_long_mem );
  1.6303 +%}
  1.6304 +
  1.6305 +// Volatile Load Long.  Must be atomic, so do 64-bit FILD
  1.6306 +// then store it down to the stack and reload on the int
  1.6307 +// side.
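// The point of the detour: a single 64-bit FILD (or the XMM MOVSD variants
// below) reads both halves in one memory access, so a concurrent writer can
// never expose a torn value; the plain two-MOV loadL above gives no such
// guarantee.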
  1.6308 +instruct loadL_volatile(stackSlotL dst, memory mem) %{
  1.6309 +  predicate(UseSSE<=1 && ((LoadLNode*)n)->require_atomic_access());
  1.6310 +  match(Set dst (LoadL mem));
  1.6311 +
  1.6312 +  ins_cost(200);
  1.6313 +  format %{ "FILD   $mem\t# Atomic volatile long load\n\t"
  1.6314 +            "FISTp  $dst" %}
  1.6315 +  ins_encode(enc_loadL_volatile(mem,dst));
  1.6316 +  ins_pipe( fpu_reg_mem );
  1.6317 +%}
  1.6318 +
  1.6319 +instruct loadLX_volatile(stackSlotL dst, memory mem, regXD tmp) %{
  1.6320 +  predicate(UseSSE>=2 && ((LoadLNode*)n)->require_atomic_access());
  1.6321 +  match(Set dst (LoadL mem));
  1.6322 +  effect(TEMP tmp);
  1.6323 +  ins_cost(180);
  1.6324 +  format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
  1.6325 +            "MOVSD  $dst,$tmp" %}
  1.6326 +  ins_encode(enc_loadLX_volatile(mem, dst, tmp));
  1.6327 +  ins_pipe( pipe_slow );
  1.6328 +%}
  1.6329 +
  1.6330 +instruct loadLX_reg_volatile(eRegL dst, memory mem, regXD tmp) %{
  1.6331 +  predicate(UseSSE>=2 && ((LoadLNode*)n)->require_atomic_access());
  1.6332 +  match(Set dst (LoadL mem));
  1.6333 +  effect(TEMP tmp);
  1.6334 +  ins_cost(160);
  1.6335 +  format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
  1.6336 +            "MOVD   $dst.lo,$tmp\n\t"
  1.6337 +            "PSRLQ  $tmp,32\n\t"
  1.6338 +            "MOVD   $dst.hi,$tmp" %}
  1.6339 +  ins_encode(enc_loadLX_reg_volatile(mem, dst, tmp));
  1.6340 +  ins_pipe( pipe_slow );
  1.6341 +%}
  1.6342 +
  1.6343 +// Load Range
  1.6344 +instruct loadRange(eRegI dst, memory mem) %{
  1.6345 +  match(Set dst (LoadRange mem));
  1.6346 +
  1.6347 +  ins_cost(125);
  1.6348 +  format %{ "MOV    $dst,$mem" %}
  1.6349 +  opcode(0x8B);
  1.6350 +  ins_encode( OpcP, RegMem(dst,mem));
  1.6351 +  ins_pipe( ialu_reg_mem );
  1.6352 +%}
  1.6353 +
  1.6354 +
  1.6355 +// Load Pointer
  1.6356 +instruct loadP(eRegP dst, memory mem) %{
  1.6357 +  match(Set dst (LoadP mem));
  1.6358 +
  1.6359 +  ins_cost(125);
  1.6360 +  format %{ "MOV    $dst,$mem" %}
  1.6361 +  opcode(0x8B);
  1.6362 +  ins_encode( OpcP, RegMem(dst,mem));
  1.6363 +  ins_pipe( ialu_reg_mem );
  1.6364 +%}
  1.6365 +
  1.6366 +// Load Klass Pointer
  1.6367 +instruct loadKlass(eRegP dst, memory mem) %{
  1.6368 +  match(Set dst (LoadKlass mem));
  1.6369 +
  1.6370 +  ins_cost(125);
  1.6371 +  format %{ "MOV    $dst,$mem" %}
  1.6372 +  opcode(0x8B);
  1.6373 +  ins_encode( OpcP, RegMem(dst,mem));
  1.6374 +  ins_pipe( ialu_reg_mem );
  1.6375 +%}
  1.6376 +
  1.6377 +// Load Short (16bit signed)
  1.6378 +instruct loadS(eRegI dst, memory mem) %{
  1.6379 +  match(Set dst (LoadS mem));
  1.6380 +
  1.6381 +  ins_cost(125);
  1.6382 +  format %{ "MOVSX  $dst,$mem" %}
  1.6383 +  opcode(0xBF, 0x0F);
  1.6384 +  ins_encode( OpcS, OpcP, RegMem(dst,mem));
  1.6385 +  ins_pipe( ialu_reg_mem );
  1.6386 +%}
  1.6387 +
  1.6388 +// Load Double
  1.6389 +instruct loadD(regD dst, memory mem) %{
  1.6390 +  predicate(UseSSE<=1);
  1.6391 +  match(Set dst (LoadD mem));
  1.6392 +
  1.6393 +  ins_cost(150);
  1.6394 +  format %{ "FLD_D  ST,$mem\n\t"
  1.6395 +            "FSTP   $dst" %}
  1.6396 +  opcode(0xDD);               /* DD /0 */
  1.6397 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),
  1.6398 +              Pop_Reg_D(dst) );
  1.6399 +  ins_pipe( fpu_reg_mem );
  1.6400 +%}
  1.6401 +
  1.6402 +// Load Double to XMM
  1.6403 +instruct loadXD(regXD dst, memory mem) %{
  1.6404 +  predicate(UseSSE>=2 && UseXmmLoadAndClearUpper);
  1.6405 +  match(Set dst (LoadD mem));
  1.6406 +  ins_cost(145);
  1.6407 +  format %{ "MOVSD  $dst,$mem" %}
  1.6408 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x10), RegMem(dst,mem));
  1.6409 +  ins_pipe( pipe_slow );
  1.6410 +%}
  1.6411 +
  1.6412 +instruct loadXD_partial(regXD dst, memory mem) %{
  1.6413 +  predicate(UseSSE>=2 && !UseXmmLoadAndClearUpper);
  1.6414 +  match(Set dst (LoadD mem));
  1.6415 +  ins_cost(145);
  1.6416 +  format %{ "MOVLPD $dst,$mem" %}
  1.6417 +  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x12), RegMem(dst,mem));
  1.6418 +  ins_pipe( pipe_slow );
  1.6419 +%}
  1.6420 +
  1.6421 +// Load to XMM register (single-precision floating point)
  1.6422 +// MOVSS instruction
  1.6423 +instruct loadX(regX dst, memory mem) %{
  1.6424 +  predicate(UseSSE>=1);
  1.6425 +  match(Set dst (LoadF mem));
  1.6426 +  ins_cost(145);
  1.6427 +  format %{ "MOVSS  $dst,$mem" %}
  1.6428 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), RegMem(dst,mem));
  1.6429 +  ins_pipe( pipe_slow );
  1.6430 +%}
  1.6431 +
  1.6432 +// Load Float
  1.6433 +instruct loadF(regF dst, memory mem) %{
  1.6434 +  predicate(UseSSE==0);
  1.6435 +  match(Set dst (LoadF mem));
  1.6436 +
  1.6437 +  ins_cost(150);
  1.6438 +  format %{ "FLD_S  ST,$mem\n\t"
  1.6439 +            "FSTP   $dst" %}
  1.6440 +  opcode(0xD9);               /* D9 /0 */
  1.6441 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),
  1.6442 +              Pop_Reg_F(dst) );
  1.6443 +  ins_pipe( fpu_reg_mem );
  1.6444 +%}
  1.6445 +
  1.6446 +// Load Aligned Packed Byte to XMM register
  1.6447 +instruct loadA8B(regXD dst, memory mem) %{
  1.6448 +  predicate(UseSSE>=1);
  1.6449 +  match(Set dst (Load8B mem));
  1.6450 +  ins_cost(125);
  1.6451 +  format %{ "MOVQ  $dst,$mem\t! packed8B" %}
  1.6452 +  ins_encode( movq_ld(dst, mem));
  1.6453 +  ins_pipe( pipe_slow );
  1.6454 +%}
  1.6455 +
  1.6456 +// Load Aligned Packed Short to XMM register
  1.6457 +instruct loadA4S(regXD dst, memory mem) %{
  1.6458 +  predicate(UseSSE>=1);
  1.6459 +  match(Set dst (Load4S mem));
  1.6460 +  ins_cost(125);
  1.6461 +  format %{ "MOVQ  $dst,$mem\t! packed4S" %}
  1.6462 +  ins_encode( movq_ld(dst, mem));
  1.6463 +  ins_pipe( pipe_slow );
  1.6464 +%}
  1.6465 +
  1.6466 +// Load Aligned Packed Char to XMM register
  1.6467 +instruct loadA4C(regXD dst, memory mem) %{
  1.6468 +  predicate(UseSSE>=1);
  1.6469 +  match(Set dst (Load4C mem));
  1.6470 +  ins_cost(125);
  1.6471 +  format %{ "MOVQ  $dst,$mem\t! packed4C" %}
  1.6472 +  ins_encode( movq_ld(dst, mem));
  1.6473 +  ins_pipe( pipe_slow );
  1.6474 +%}
  1.6475 +
  1.6476 +// Load Aligned Packed Integer to XMM register
  1.6477 +instruct load2IU(regXD dst, memory mem) %{
  1.6478 +  predicate(UseSSE>=1);
  1.6479 +  match(Set dst (Load2I mem));
  1.6480 +  ins_cost(125);
  1.6481 +  format %{ "MOVQ  $dst,$mem\t! packed2I" %}
  1.6482 +  ins_encode( movq_ld(dst, mem));
  1.6483 +  ins_pipe( pipe_slow );
  1.6484 +%}
  1.6485 +
  1.6486 +// Load Aligned Packed Single to XMM
  1.6487 +instruct loadA2F(regXD dst, memory mem) %{
  1.6488 +  predicate(UseSSE>=1);
  1.6489 +  match(Set dst (Load2F mem));
  1.6490 +  ins_cost(145);
  1.6491 +  format %{ "MOVQ  $dst,$mem\t! packed2F" %}
  1.6492 +  ins_encode( movq_ld(dst, mem));
  1.6493 +  ins_pipe( pipe_slow );
  1.6494 +%}
  1.6495 +
  1.6496 +// Load Effective Address
  1.6497 +instruct leaP8(eRegP dst, indOffset8 mem) %{
  1.6498 +  match(Set dst mem);
  1.6499 +
  1.6500 +  ins_cost(110);
  1.6501 +  format %{ "LEA    $dst,$mem" %}
  1.6502 +  opcode(0x8D);
  1.6503 +  ins_encode( OpcP, RegMem(dst,mem));
  1.6504 +  ins_pipe( ialu_reg_reg_fat );
  1.6505 +%}
  1.6506 +
  1.6507 +instruct leaP32(eRegP dst, indOffset32 mem) %{
  1.6508 +  match(Set dst mem);
  1.6509 +
  1.6510 +  ins_cost(110);
  1.6511 +  format %{ "LEA    $dst,$mem" %}
  1.6512 +  opcode(0x8D);
  1.6513 +  ins_encode( OpcP, RegMem(dst,mem));
  1.6514 +  ins_pipe( ialu_reg_reg_fat );
  1.6515 +%}
  1.6516 +
  1.6517 +instruct leaPIdxOff(eRegP dst, indIndexOffset mem) %{
  1.6518 +  match(Set dst mem);
  1.6519 +
  1.6520 +  ins_cost(110);
  1.6521 +  format %{ "LEA    $dst,$mem" %}
  1.6522 +  opcode(0x8D);
  1.6523 +  ins_encode( OpcP, RegMem(dst,mem));
  1.6524 +  ins_pipe( ialu_reg_reg_fat );
  1.6525 +%}
  1.6526 +
  1.6527 +instruct leaPIdxScale(eRegP dst, indIndexScale mem) %{
  1.6528 +  match(Set dst mem);
  1.6529 +
  1.6530 +  ins_cost(110);
  1.6531 +  format %{ "LEA    $dst,$mem" %}
  1.6532 +  opcode(0x8D);
  1.6533 +  ins_encode( OpcP, RegMem(dst,mem));
  1.6534 +  ins_pipe( ialu_reg_reg_fat );
  1.6535 +%}
  1.6536 +
  1.6537 +instruct leaPIdxScaleOff(eRegP dst, indIndexScaleOffset mem) %{
  1.6538 +  match(Set dst mem);
  1.6539 +
  1.6540 +  ins_cost(110);
  1.6541 +  format %{ "LEA    $dst,$mem" %}
  1.6542 +  opcode(0x8D);
  1.6543 +  ins_encode( OpcP, RegMem(dst,mem));
  1.6544 +  ins_pipe( ialu_reg_reg_fat );
  1.6545 +%}
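// Note on the LEA rules above: LEA evaluates the full base+index*scale+disp
// addressing expression in the address-generation unit without reading or
// writing EFLAGS, e.g.
//
//   LEA EAX,[EBX+ECX*4+12]   // EAX = EBX + ECX*4 + 12, flags untouched
//
// which is why, unlike the ADD-based rules later in this file, none of these
// needs an effect(KILL cr).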
  1.6546 +
  1.6547 +// Load Constant
  1.6548 +instruct loadConI(eRegI dst, immI src) %{
  1.6549 +  match(Set dst src);
  1.6550 +
  1.6551 +  format %{ "MOV    $dst,$src" %}
  1.6552 +  ins_encode( LdImmI(dst, src) );
  1.6553 +  ins_pipe( ialu_reg_fat );
  1.6554 +%}
  1.6555 +
  1.6556 +// Load Constant zero
  1.6557 +instruct loadConI0(eRegI dst, immI0 src, eFlagsReg cr) %{
  1.6558 +  match(Set dst src);
  1.6559 +  effect(KILL cr);
  1.6560 +
  1.6561 +  ins_cost(50);
  1.6562 +  format %{ "XOR    $dst,$dst" %}
   1.6563 +  opcode(0x33);  /* 33 /r, XOR reg,reg */
  1.6564 +  ins_encode( OpcP, RegReg( dst, dst ) );
  1.6565 +  ins_pipe( ialu_reg );
  1.6566 +%}
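// XOR-ing a register with itself is the usual x86 zero idiom: two bytes
// (opcode + ModRM) versus five for "MOV r32,imm32", hence the lower cost than
// loadConI above.  The trade-off is that XOR writes EFLAGS, which is why this
// rule carries an effect(KILL cr) while the plain MOV form does not.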
  1.6567 +
  1.6568 +instruct loadConP(eRegP dst, immP src) %{
  1.6569 +  match(Set dst src);
  1.6570 +
  1.6571 +  format %{ "MOV    $dst,$src" %}
  1.6572 +  opcode(0xB8);  /* + rd */
  1.6573 +  ins_encode( LdImmP(dst, src) );
  1.6574 +  ins_pipe( ialu_reg_fat );
  1.6575 +%}
  1.6576 +
  1.6577 +instruct loadConL(eRegL dst, immL src, eFlagsReg cr) %{
  1.6578 +  match(Set dst src);
  1.6579 +  effect(KILL cr);
  1.6580 +  ins_cost(200);
  1.6581 +  format %{ "MOV    $dst.lo,$src.lo\n\t"
  1.6582 +            "MOV    $dst.hi,$src.hi" %}
  1.6583 +  opcode(0xB8);
  1.6584 +  ins_encode( LdImmL_Lo(dst, src), LdImmL_Hi(dst, src) );
  1.6585 +  ins_pipe( ialu_reg_long_fat );
  1.6586 +%}
  1.6587 +
  1.6588 +instruct loadConL0(eRegL dst, immL0 src, eFlagsReg cr) %{
  1.6589 +  match(Set dst src);
  1.6590 +  effect(KILL cr);
  1.6591 +  ins_cost(150);
  1.6592 +  format %{ "XOR    $dst.lo,$dst.lo\n\t"
  1.6593 +            "XOR    $dst.hi,$dst.hi" %}
  1.6594 +  opcode(0x33,0x33);
  1.6595 +  ins_encode( RegReg_Lo(dst,dst), RegReg_Hi(dst, dst) );
  1.6596 +  ins_pipe( ialu_reg_long );
  1.6597 +%}
  1.6598 +
  1.6599 +// The instruction usage is guarded by predicate in operand immF().
  1.6600 +instruct loadConF(regF dst, immF src) %{
  1.6601 +  match(Set dst src);
  1.6602 +  ins_cost(125);
  1.6603 +
  1.6604 +  format %{ "FLD_S  ST,$src\n\t"
  1.6605 +            "FSTP   $dst" %}
  1.6606 +  opcode(0xD9, 0x00);       /* D9 /0 */
  1.6607 +  ins_encode(LdImmF(src), Pop_Reg_F(dst) );
  1.6608 +  ins_pipe( fpu_reg_con );
  1.6609 +%}
  1.6610 +
  1.6611 +// The instruction usage is guarded by predicate in operand immXF().
  1.6612 +instruct loadConX(regX dst, immXF con) %{
  1.6613 +  match(Set dst con);
  1.6614 +  ins_cost(125);
  1.6615 +  format %{ "MOVSS  $dst,[$con]" %}
  1.6616 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), LdImmX(dst, con));
  1.6617 +  ins_pipe( pipe_slow );
  1.6618 +%}
  1.6619 +
  1.6620 +// The instruction usage is guarded by predicate in operand immXF0().
  1.6621 +instruct loadConX0(regX dst, immXF0 src) %{
  1.6622 +  match(Set dst src);
  1.6623 +  ins_cost(100);
  1.6624 +  format %{ "XORPS  $dst,$dst\t# float 0.0" %}
  1.6625 +  ins_encode( Opcode(0x0F), Opcode(0x57), RegReg(dst,dst));
  1.6626 +  ins_pipe( pipe_slow );
  1.6627 +%}
  1.6628 +
  1.6629 +// The instruction usage is guarded by predicate in operand immD().
  1.6630 +instruct loadConD(regD dst, immD src) %{
  1.6631 +  match(Set dst src);
  1.6632 +  ins_cost(125);
  1.6633 +
  1.6634 +  format %{ "FLD_D  ST,$src\n\t"
  1.6635 +            "FSTP   $dst" %}
  1.6636 +  ins_encode(LdImmD(src), Pop_Reg_D(dst) );
  1.6637 +  ins_pipe( fpu_reg_con );
  1.6638 +%}
  1.6639 +
  1.6640 +// The instruction usage is guarded by predicate in operand immXD().
  1.6641 +instruct loadConXD(regXD dst, immXD con) %{
  1.6642 +  match(Set dst con);
  1.6643 +  ins_cost(125);
  1.6644 +  format %{ "MOVSD  $dst,[$con]" %}
  1.6645 +  ins_encode(load_conXD(dst, con));
  1.6646 +  ins_pipe( pipe_slow );
  1.6647 +%}
  1.6648 +
  1.6649 +// The instruction usage is guarded by predicate in operand immXD0().
  1.6650 +instruct loadConXD0(regXD dst, immXD0 src) %{
  1.6651 +  match(Set dst src);
  1.6652 +  ins_cost(100);
  1.6653 +  format %{ "XORPD  $dst,$dst\t# double 0.0" %}
  1.6654 +  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x57), RegReg(dst,dst));
  1.6655 +  ins_pipe( pipe_slow );
  1.6656 +%}
  1.6657 +
  1.6658 +// Load Stack Slot
  1.6659 +instruct loadSSI(eRegI dst, stackSlotI src) %{
  1.6660 +  match(Set dst src);
  1.6661 +  ins_cost(125);
  1.6662 +
  1.6663 +  format %{ "MOV    $dst,$src" %}
  1.6664 +  opcode(0x8B);
  1.6665 +  ins_encode( OpcP, RegMem(dst,src));
  1.6666 +  ins_pipe( ialu_reg_mem );
  1.6667 +%}
  1.6668 +
  1.6669 +instruct loadSSL(eRegL dst, stackSlotL src) %{
  1.6670 +  match(Set dst src);
  1.6671 +
  1.6672 +  ins_cost(200);
  1.6673 +  format %{ "MOV    $dst,$src.lo\n\t"
  1.6674 +            "MOV    $dst+4,$src.hi" %}
  1.6675 +  opcode(0x8B, 0x8B);
  1.6676 +  ins_encode( OpcP, RegMem( dst, src ), OpcS, RegMem_Hi( dst, src ) );
  1.6677 +  ins_pipe( ialu_mem_long_reg );
  1.6678 +%}
  1.6679 +
  1.6680 +// Load Stack Slot
  1.6681 +instruct loadSSP(eRegP dst, stackSlotP src) %{
  1.6682 +  match(Set dst src);
  1.6683 +  ins_cost(125);
  1.6684 +
  1.6685 +  format %{ "MOV    $dst,$src" %}
  1.6686 +  opcode(0x8B);
  1.6687 +  ins_encode( OpcP, RegMem(dst,src));
  1.6688 +  ins_pipe( ialu_reg_mem );
  1.6689 +%}
  1.6690 +
  1.6691 +// Load Stack Slot
  1.6692 +instruct loadSSF(regF dst, stackSlotF src) %{
  1.6693 +  match(Set dst src);
  1.6694 +  ins_cost(125);
  1.6695 +
  1.6696 +  format %{ "FLD_S  $src\n\t"
  1.6697 +            "FSTP   $dst" %}
  1.6698 +  opcode(0xD9);               /* D9 /0, FLD m32real */
  1.6699 +  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
  1.6700 +              Pop_Reg_F(dst) );
  1.6701 +  ins_pipe( fpu_reg_mem );
  1.6702 +%}
  1.6703 +
  1.6704 +// Load Stack Slot
  1.6705 +instruct loadSSD(regD dst, stackSlotD src) %{
  1.6706 +  match(Set dst src);
  1.6707 +  ins_cost(125);
  1.6708 +
  1.6709 +  format %{ "FLD_D  $src\n\t"
  1.6710 +            "FSTP   $dst" %}
  1.6711 +  opcode(0xDD);               /* DD /0, FLD m64real */
  1.6712 +  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
  1.6713 +              Pop_Reg_D(dst) );
  1.6714 +  ins_pipe( fpu_reg_mem );
  1.6715 +%}
  1.6716 +
  1.6717 +// Prefetch instructions.
  1.6718 +// Must be safe to execute with invalid address (cannot fault).
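// A sketch of why the no-fault requirement matters (illustrative, not part of
// the original commentary): allocation-side prefetches run ahead of the bump
// pointer, so the address handed to these rules may lie beyond the end of the
// TLAB or even beyond committed heap, roughly
//
//   prefetch(alloc_top + AllocatePrefetchDistance);   // may be invalid
//
// PREFETCHNTA/T0/T2 and the 3DNow! PREFETCH forms are architectural hints that
// never raise a fault, which is what makes them usable here.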
  1.6719 +
  1.6720 +instruct prefetchr0( memory mem ) %{
  1.6721 +  predicate(UseSSE==0 && !VM_Version::supports_3dnow());
  1.6722 +  match(PrefetchRead mem);
  1.6723 +  ins_cost(0);
  1.6724 +  size(0);
  1.6725 +  format %{ "PREFETCHR (non-SSE is empty encoding)" %}
  1.6726 +  ins_encode();
  1.6727 +  ins_pipe(empty);
  1.6728 +%}
  1.6729 +
  1.6730 +instruct prefetchr( memory mem ) %{
  1.6731 +  predicate(UseSSE==0 && VM_Version::supports_3dnow() || ReadPrefetchInstr==3);
  1.6732 +  match(PrefetchRead mem);
  1.6733 +  ins_cost(100);
  1.6734 +
  1.6735 +  format %{ "PREFETCHR $mem\t! Prefetch into level 1 cache for read" %}
   1.6736 +  opcode(0x0F, 0x0D);     /* Opcode 0F 0D /0 */
  1.6737 +  ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
  1.6738 +  ins_pipe(ialu_mem);
  1.6739 +%}
  1.6740 +
  1.6741 +instruct prefetchrNTA( memory mem ) %{
  1.6742 +  predicate(UseSSE>=1 && ReadPrefetchInstr==0);
  1.6743 +  match(PrefetchRead mem);
  1.6744 +  ins_cost(100);
  1.6745 +
  1.6746 +  format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for read" %}
  1.6747 +  opcode(0x0F, 0x18);     /* Opcode 0F 18 /0 */
  1.6748 +  ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
  1.6749 +  ins_pipe(ialu_mem);
  1.6750 +%}
  1.6751 +
  1.6752 +instruct prefetchrT0( memory mem ) %{
  1.6753 +  predicate(UseSSE>=1 && ReadPrefetchInstr==1);
  1.6754 +  match(PrefetchRead mem);
  1.6755 +  ins_cost(100);
  1.6756 +
  1.6757 +  format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for read" %}
  1.6758 +  opcode(0x0F, 0x18);     /* Opcode 0F 18 /1 */
  1.6759 +  ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
  1.6760 +  ins_pipe(ialu_mem);
  1.6761 +%}
  1.6762 +
  1.6763 +instruct prefetchrT2( memory mem ) %{
  1.6764 +  predicate(UseSSE>=1 && ReadPrefetchInstr==2);
  1.6765 +  match(PrefetchRead mem);
  1.6766 +  ins_cost(100);
  1.6767 +
  1.6768 +  format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for read" %}
  1.6769 +  opcode(0x0F, 0x18);     /* Opcode 0F 18 /3 */
  1.6770 +  ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem));
  1.6771 +  ins_pipe(ialu_mem);
  1.6772 +%}
  1.6773 +
  1.6774 +instruct prefetchw0( memory mem ) %{
  1.6775 +  predicate(UseSSE==0 && !VM_Version::supports_3dnow());
  1.6776 +  match(PrefetchWrite mem);
  1.6777 +  ins_cost(0);
  1.6778 +  size(0);
   1.6779 +  format %{ "PREFETCHW (non-SSE is empty encoding)" %}
  1.6780 +  ins_encode();
  1.6781 +  ins_pipe(empty);
  1.6782 +%}
  1.6783 +
  1.6784 +instruct prefetchw( memory mem ) %{
  1.6785 +  predicate(UseSSE==0 && VM_Version::supports_3dnow() || AllocatePrefetchInstr==3);
  1.6786 +  match( PrefetchWrite mem );
  1.6787 +  ins_cost(100);
  1.6788 +
  1.6789 +  format %{ "PREFETCHW $mem\t! Prefetch into L1 cache and mark modified" %}
  1.6790 +  opcode(0x0F, 0x0D);     /* Opcode 0F 0D /1 */
  1.6791 +  ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
  1.6792 +  ins_pipe(ialu_mem);
  1.6793 +%}
  1.6794 +
  1.6795 +instruct prefetchwNTA( memory mem ) %{
  1.6796 +  predicate(UseSSE>=1 && AllocatePrefetchInstr==0);
  1.6797 +  match(PrefetchWrite mem);
  1.6798 +  ins_cost(100);
  1.6799 +
  1.6800 +  format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for write" %}
  1.6801 +  opcode(0x0F, 0x18);     /* Opcode 0F 18 /0 */
  1.6802 +  ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
  1.6803 +  ins_pipe(ialu_mem);
  1.6804 +%}
  1.6805 +
  1.6806 +instruct prefetchwT0( memory mem ) %{
  1.6807 +  predicate(UseSSE>=1 && AllocatePrefetchInstr==1);
  1.6808 +  match(PrefetchWrite mem);
  1.6809 +  ins_cost(100);
  1.6810 +
  1.6811 +  format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for write" %}
  1.6812 +  opcode(0x0F, 0x18);     /* Opcode 0F 18 /1 */
  1.6813 +  ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
  1.6814 +  ins_pipe(ialu_mem);
  1.6815 +%}
  1.6816 +
  1.6817 +instruct prefetchwT2( memory mem ) %{
  1.6818 +  predicate(UseSSE>=1 && AllocatePrefetchInstr==2);
  1.6819 +  match(PrefetchWrite mem);
  1.6820 +  ins_cost(100);
  1.6821 +
  1.6822 +  format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for write" %}
  1.6823 +  opcode(0x0F, 0x18);     /* Opcode 0F 18 /3 */
  1.6824 +  ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem));
  1.6825 +  ins_pipe(ialu_mem);
  1.6826 +%}
  1.6827 +
  1.6828 +//----------Store Instructions-------------------------------------------------
  1.6829 +
  1.6830 +// Store Byte
  1.6831 +instruct storeB(memory mem, xRegI src) %{
  1.6832 +  match(Set mem (StoreB mem src));
  1.6833 +
  1.6834 +  ins_cost(125);
  1.6835 +  format %{ "MOV8   $mem,$src" %}
  1.6836 +  opcode(0x88);
  1.6837 +  ins_encode( OpcP, RegMem( src, mem ) );
  1.6838 +  ins_pipe( ialu_mem_reg );
  1.6839 +%}
  1.6840 +
  1.6841 +// Store Char/Short
  1.6842 +instruct storeC(memory mem, eRegI src) %{
  1.6843 +  match(Set mem (StoreC mem src));
  1.6844 +
  1.6845 +  ins_cost(125);
  1.6846 +  format %{ "MOV16  $mem,$src" %}
  1.6847 +  opcode(0x89, 0x66);
  1.6848 +  ins_encode( OpcS, OpcP, RegMem( src, mem ) );
  1.6849 +  ins_pipe( ialu_mem_reg );
  1.6850 +%}
  1.6851 +
  1.6852 +// Store Integer
  1.6853 +instruct storeI(memory mem, eRegI src) %{
  1.6854 +  match(Set mem (StoreI mem src));
  1.6855 +
  1.6856 +  ins_cost(125);
  1.6857 +  format %{ "MOV    $mem,$src" %}
  1.6858 +  opcode(0x89);
  1.6859 +  ins_encode( OpcP, RegMem( src, mem ) );
  1.6860 +  ins_pipe( ialu_mem_reg );
  1.6861 +%}
  1.6862 +
  1.6863 +// Store Long
  1.6864 +instruct storeL(long_memory mem, eRegL src) %{
  1.6865 +  predicate(!((StoreLNode*)n)->require_atomic_access());
  1.6866 +  match(Set mem (StoreL mem src));
  1.6867 +
  1.6868 +  ins_cost(200);
  1.6869 +  format %{ "MOV    $mem,$src.lo\n\t"
  1.6870 +            "MOV    $mem+4,$src.hi" %}
  1.6871 +  opcode(0x89, 0x89);
  1.6872 +  ins_encode( OpcP, RegMem( src, mem ), OpcS, RegMem_Hi( src, mem ) );
  1.6873 +  ins_pipe( ialu_mem_long_reg );
  1.6874 +%}
  1.6875 +
  1.6876 +// Volatile Store Long.  Must be atomic, so move it into
  1.6877 +// the FP TOS and then do a 64-bit FIST.  Has to probe the
  1.6878 +// target address before the store (for null-ptr checks)
  1.6879 +// so the memory operand is used twice in the encoding.
  1.6880 +instruct storeL_volatile(memory mem, stackSlotL src, eFlagsReg cr ) %{
  1.6881 +  predicate(UseSSE<=1 && ((StoreLNode*)n)->require_atomic_access());
  1.6882 +  match(Set mem (StoreL mem src));
  1.6883 +  effect( KILL cr );
  1.6884 +  ins_cost(400);
  1.6885 +  format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
  1.6886 +            "FILD   $src\n\t"
  1.6887 +            "FISTp  $mem\t # 64-bit atomic volatile long store" %}
  1.6888 +  opcode(0x3B);
  1.6889 +  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeL_volatile(mem,src));
  1.6890 +  ins_pipe( fpu_reg_mem );
  1.6891 +%}
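// Illustrative expansion of the rule above (not the literal encoding): the x87
// unit moves all 64 bits in a single memory access, which provides the
// atomicity that a pair of 32-bit MOVs cannot:
//
//   CMP   $mem,EAX          // result ignored; the memory touch raises the
//                           // implicit null check before any FPU state changes
//   FILD  qword ptr [$src]  // one 64-bit load from the stack slot
//   FISTP qword ptr [$mem]  // one 64-bit store to the destination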
  1.6892 +
  1.6893 +instruct storeLX_volatile(memory mem, stackSlotL src, regXD tmp, eFlagsReg cr) %{
  1.6894 +  predicate(UseSSE>=2 && ((StoreLNode*)n)->require_atomic_access());
  1.6895 +  match(Set mem (StoreL mem src));
  1.6896 +  effect( TEMP tmp, KILL cr );
  1.6897 +  ins_cost(380);
  1.6898 +  format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
  1.6899 +            "MOVSD  $tmp,$src\n\t"
  1.6900 +            "MOVSD  $mem,$tmp\t # 64-bit atomic volatile long store" %}
  1.6901 +  opcode(0x3B);
  1.6902 +  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeLX_volatile(mem, src, tmp));
  1.6903 +  ins_pipe( pipe_slow );
  1.6904 +%}
  1.6905 +
  1.6906 +instruct storeLX_reg_volatile(memory mem, eRegL src, regXD tmp2, regXD tmp, eFlagsReg cr) %{
  1.6907 +  predicate(UseSSE>=2 && ((StoreLNode*)n)->require_atomic_access());
  1.6908 +  match(Set mem (StoreL mem src));
  1.6909 +  effect( TEMP tmp2 , TEMP tmp, KILL cr );
  1.6910 +  ins_cost(360);
  1.6911 +  format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
  1.6912 +            "MOVD   $tmp,$src.lo\n\t"
  1.6913 +            "MOVD   $tmp2,$src.hi\n\t"
  1.6914 +            "PUNPCKLDQ $tmp,$tmp2\n\t"
  1.6915 +            "MOVSD  $mem,$tmp\t # 64-bit atomic volatile long store" %}
  1.6916 +  opcode(0x3B);
  1.6917 +  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeLX_reg_volatile(mem, src, tmp, tmp2));
  1.6918 +  ins_pipe( pipe_slow );
  1.6919 +%}
  1.6920 +
  1.6921 +// Store Pointer; for storing unknown oops and raw pointers
  1.6922 +instruct storeP(memory mem, anyRegP src) %{
  1.6923 +  match(Set mem (StoreP mem src));
  1.6924 +
  1.6925 +  ins_cost(125);
  1.6926 +  format %{ "MOV    $mem,$src" %}
  1.6927 +  opcode(0x89);
  1.6928 +  ins_encode( OpcP, RegMem( src, mem ) );
  1.6929 +  ins_pipe( ialu_mem_reg );
  1.6930 +%}
  1.6931 +
  1.6932 +// Store Integer Immediate
  1.6933 +instruct storeImmI(memory mem, immI src) %{
  1.6934 +  match(Set mem (StoreI mem src));
  1.6935 +
  1.6936 +  ins_cost(150);
  1.6937 +  format %{ "MOV    $mem,$src" %}
  1.6938 +  opcode(0xC7);               /* C7 /0 */
  1.6939 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32( src ));
  1.6940 +  ins_pipe( ialu_mem_imm );
  1.6941 +%}
  1.6942 +
  1.6943 +// Store Short/Char Immediate
  1.6944 +instruct storeImmI16(memory mem, immI16 src) %{
  1.6945 +  predicate(UseStoreImmI16);
  1.6946 +  match(Set mem (StoreC mem src));
  1.6947 +
  1.6948 +  ins_cost(150);
  1.6949 +  format %{ "MOV16  $mem,$src" %}
  1.6950 +  opcode(0xC7);     /* C7 /0 Same as 32 store immediate with prefix */
  1.6951 +  ins_encode( SizePrefix, OpcP, RMopc_Mem(0x00,mem),  Con16( src ));
  1.6952 +  ins_pipe( ialu_mem_imm );
  1.6953 +%}
  1.6954 +
  1.6955 +// Store Pointer Immediate; null pointers or constant oops that do not
  1.6956 +// need card-mark barriers.
  1.6957 +instruct storeImmP(memory mem, immP src) %{
  1.6958 +  match(Set mem (StoreP mem src));
  1.6959 +
  1.6960 +  ins_cost(150);
  1.6961 +  format %{ "MOV    $mem,$src" %}
  1.6962 +  opcode(0xC7);               /* C7 /0 */
  1.6963 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32( src ));
  1.6964 +  ins_pipe( ialu_mem_imm );
  1.6965 +%}
  1.6966 +
  1.6967 +// Store Byte Immediate
  1.6968 +instruct storeImmB(memory mem, immI8 src) %{
  1.6969 +  match(Set mem (StoreB mem src));
  1.6970 +
  1.6971 +  ins_cost(150);
  1.6972 +  format %{ "MOV8   $mem,$src" %}
  1.6973 +  opcode(0xC6);               /* C6 /0 */
  1.6974 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con8or32( src ));
  1.6975 +  ins_pipe( ialu_mem_imm );
  1.6976 +%}
  1.6977 +
  1.6978 +// Store Aligned Packed Byte XMM register to memory
  1.6979 +instruct storeA8B(memory mem, regXD src) %{
  1.6980 +  predicate(UseSSE>=1);
  1.6981 +  match(Set mem (Store8B mem src));
  1.6982 +  ins_cost(145);
  1.6983 +  format %{ "MOVQ  $mem,$src\t! packed8B" %}
  1.6984 +  ins_encode( movq_st(mem, src));
  1.6985 +  ins_pipe( pipe_slow );
  1.6986 +%}
  1.6987 +
  1.6988 +// Store Aligned Packed Char/Short XMM register to memory
  1.6989 +instruct storeA4C(memory mem, regXD src) %{
  1.6990 +  predicate(UseSSE>=1);
  1.6991 +  match(Set mem (Store4C mem src));
  1.6992 +  ins_cost(145);
  1.6993 +  format %{ "MOVQ  $mem,$src\t! packed4C" %}
  1.6994 +  ins_encode( movq_st(mem, src));
  1.6995 +  ins_pipe( pipe_slow );
  1.6996 +%}
  1.6997 +
  1.6998 +// Store Aligned Packed Integer XMM register to memory
  1.6999 +instruct storeA2I(memory mem, regXD src) %{
  1.7000 +  predicate(UseSSE>=1);
  1.7001 +  match(Set mem (Store2I mem src));
  1.7002 +  ins_cost(145);
  1.7003 +  format %{ "MOVQ  $mem,$src\t! packed2I" %}
  1.7004 +  ins_encode( movq_st(mem, src));
  1.7005 +  ins_pipe( pipe_slow );
  1.7006 +%}
  1.7007 +
  1.7008 +// Store CMS card-mark Immediate
  1.7009 +instruct storeImmCM(memory mem, immI8 src) %{
  1.7010 +  match(Set mem (StoreCM mem src));
  1.7011 +
  1.7012 +  ins_cost(150);
  1.7013 +  format %{ "MOV8   $mem,$src\t! CMS card-mark imm0" %}
  1.7014 +  opcode(0xC6);               /* C6 /0 */
  1.7015 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con8or32( src ));
  1.7016 +  ins_pipe( ialu_mem_imm );
  1.7017 +%}
  1.7018 +
  1.7019 +// Store Double
  1.7020 +instruct storeD( memory mem, regDPR1 src) %{
  1.7021 +  predicate(UseSSE<=1);
  1.7022 +  match(Set mem (StoreD mem src));
  1.7023 +
  1.7024 +  ins_cost(100);
  1.7025 +  format %{ "FST_D  $mem,$src" %}
  1.7026 +  opcode(0xDD);       /* DD /2 */
  1.7027 +  ins_encode( enc_FP_store(mem,src) );
  1.7028 +  ins_pipe( fpu_mem_reg );
  1.7029 +%}
  1.7030 +
  1.7031 +// Store double does rounding on x86
  1.7032 +instruct storeD_rounded( memory mem, regDPR1 src) %{
  1.7033 +  predicate(UseSSE<=1);
  1.7034 +  match(Set mem (StoreD mem (RoundDouble src)));
  1.7035 +
  1.7036 +  ins_cost(100);
  1.7037 +  format %{ "FST_D  $mem,$src\t# round" %}
  1.7038 +  opcode(0xDD);       /* DD /2 */
  1.7039 +  ins_encode( enc_FP_store(mem,src) );
  1.7040 +  ins_pipe( fpu_mem_reg );
  1.7041 +%}
  1.7042 +
  1.7043 +// Store XMM register to memory (double-precision floating points)
  1.7044 +// MOVSD instruction
  1.7045 +instruct storeXD(memory mem, regXD src) %{
  1.7046 +  predicate(UseSSE>=2);
  1.7047 +  match(Set mem (StoreD mem src));
  1.7048 +  ins_cost(95);
  1.7049 +  format %{ "MOVSD  $mem,$src" %}
  1.7050 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x11), RegMem(src, mem));
  1.7051 +  ins_pipe( pipe_slow );
  1.7052 +%}
  1.7053 +
  1.7054 +// Store XMM register to memory (single-precision floating point)
  1.7055 +// MOVSS instruction
  1.7056 +instruct storeX(memory mem, regX src) %{
  1.7057 +  predicate(UseSSE>=1);
  1.7058 +  match(Set mem (StoreF mem src));
  1.7059 +  ins_cost(95);
  1.7060 +  format %{ "MOVSS  $mem,$src" %}
  1.7061 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x11), RegMem(src, mem));
  1.7062 +  ins_pipe( pipe_slow );
  1.7063 +%}
  1.7064 +
  1.7065 +// Store Aligned Packed Single Float XMM register to memory
  1.7066 +instruct storeA2F(memory mem, regXD src) %{
  1.7067 +  predicate(UseSSE>=1);
  1.7068 +  match(Set mem (Store2F mem src));
  1.7069 +  ins_cost(145);
  1.7070 +  format %{ "MOVQ  $mem,$src\t! packed2F" %}
  1.7071 +  ins_encode( movq_st(mem, src));
  1.7072 +  ins_pipe( pipe_slow );
  1.7073 +%}
  1.7074 +
  1.7075 +// Store Float
  1.7076 +instruct storeF( memory mem, regFPR1 src) %{
  1.7077 +  predicate(UseSSE==0);
  1.7078 +  match(Set mem (StoreF mem src));
  1.7079 +
  1.7080 +  ins_cost(100);
  1.7081 +  format %{ "FST_S  $mem,$src" %}
  1.7082 +  opcode(0xD9);       /* D9 /2 */
  1.7083 +  ins_encode( enc_FP_store(mem,src) );
  1.7084 +  ins_pipe( fpu_mem_reg );
  1.7085 +%}
  1.7086 +
  1.7087 +// Store Float does rounding on x86
  1.7088 +instruct storeF_rounded( memory mem, regFPR1 src) %{
  1.7089 +  predicate(UseSSE==0);
  1.7090 +  match(Set mem (StoreF mem (RoundFloat src)));
  1.7091 +
  1.7092 +  ins_cost(100);
  1.7093 +  format %{ "FST_S  $mem,$src\t# round" %}
  1.7094 +  opcode(0xD9);       /* D9 /2 */
  1.7095 +  ins_encode( enc_FP_store(mem,src) );
  1.7096 +  ins_pipe( fpu_mem_reg );
  1.7097 +%}
  1.7098 +
   1.7099 +// Store Float from a Double source; the store itself does the D2F rounding on x86
  1.7100 +instruct storeF_Drounded( memory mem, regDPR1 src) %{
  1.7101 +  predicate(UseSSE<=1);
  1.7102 +  match(Set mem (StoreF mem (ConvD2F src)));
  1.7103 +
  1.7104 +  ins_cost(100);
  1.7105 +  format %{ "FST_S  $mem,$src\t# D-round" %}
  1.7106 +  opcode(0xD9);       /* D9 /2 */
  1.7107 +  ins_encode( enc_FP_store(mem,src) );
  1.7108 +  ins_pipe( fpu_mem_reg );
  1.7109 +%}
  1.7110 +
  1.7111 +// Store immediate Float value (it is faster than store from FPU register)
  1.7112 +// The instruction usage is guarded by predicate in operand immF().
  1.7113 +instruct storeF_imm( memory mem, immF src) %{
  1.7114 +  match(Set mem (StoreF mem src));
  1.7115 +
  1.7116 +  ins_cost(50);
  1.7117 +  format %{ "MOV    $mem,$src\t# store float" %}
  1.7118 +  opcode(0xC7);               /* C7 /0 */
  1.7119 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32F_as_bits( src ));
  1.7120 +  ins_pipe( ialu_mem_imm );
  1.7121 +%}
  1.7122 +
  1.7123 +// Store immediate Float value (it is faster than store from XMM register)
  1.7124 +// The instruction usage is guarded by predicate in operand immXF().
  1.7125 +instruct storeX_imm( memory mem, immXF src) %{
  1.7126 +  match(Set mem (StoreF mem src));
  1.7127 +
  1.7128 +  ins_cost(50);
  1.7129 +  format %{ "MOV    $mem,$src\t# store float" %}
  1.7130 +  opcode(0xC7);               /* C7 /0 */
  1.7131 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32XF_as_bits( src ));
  1.7132 +  ins_pipe( ialu_mem_imm );
  1.7133 +%}
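// What the two immediate-float rules above exploit (a sketch, not original
// commentary): a float constant is only 32 bits, so its IEEE-754 bit pattern
// can be written with an ordinary "MOV m32,imm32" -- the moral equivalent of
//
//   int bits = float_to_raw_int_bits(value);   // hypothetical helper name
//   *(int*)mem = bits;
//
// avoiding a round trip through the FPU stack or an XMM register.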
  1.7134 +
  1.7135 +// Store Integer to stack slot
  1.7136 +instruct storeSSI(stackSlotI dst, eRegI src) %{
  1.7137 +  match(Set dst src);
  1.7138 +
  1.7139 +  ins_cost(100);
  1.7140 +  format %{ "MOV    $dst,$src" %}
  1.7141 +  opcode(0x89);
  1.7142 +  ins_encode( OpcPRegSS( dst, src ) );
  1.7143 +  ins_pipe( ialu_mem_reg );
  1.7144 +%}
  1.7145 +
   1.7146 +// Store Pointer to stack slot
  1.7147 +instruct storeSSP(stackSlotP dst, eRegP src) %{
  1.7148 +  match(Set dst src);
  1.7149 +
  1.7150 +  ins_cost(100);
  1.7151 +  format %{ "MOV    $dst,$src" %}
  1.7152 +  opcode(0x89);
  1.7153 +  ins_encode( OpcPRegSS( dst, src ) );
  1.7154 +  ins_pipe( ialu_mem_reg );
  1.7155 +%}
  1.7156 +
  1.7157 +// Store Long to stack slot
  1.7158 +instruct storeSSL(stackSlotL dst, eRegL src) %{
  1.7159 +  match(Set dst src);
  1.7160 +
  1.7161 +  ins_cost(200);
  1.7162 +  format %{ "MOV    $dst,$src.lo\n\t"
  1.7163 +            "MOV    $dst+4,$src.hi" %}
  1.7164 +  opcode(0x89, 0x89);
  1.7165 +  ins_encode( OpcP, RegMem( src, dst ), OpcS, RegMem_Hi( src, dst ) );
  1.7166 +  ins_pipe( ialu_mem_long_reg );
  1.7167 +%}
  1.7168 +
  1.7169 +//----------MemBar Instructions-----------------------------------------------
  1.7170 +// Memory barrier flavors
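// Background sketch (not from the original commentary): on IA-32 the only
// hardware reordering that matters here is StoreLoad (an earlier store passing
// a later load), so the acquire/release flavors are cheap or folded away
// entirely (see the *_lock and "unnecessary" variants below), while
// MEMBAR-volatile must emit a real StoreLoad fence -- conventionally a
// LOCK-prefixed read-modify-write of a stack location, or MFENCE once SSE2 is
// available.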
  1.7171 +
  1.7172 +instruct membar_acquire() %{
  1.7173 +  match(MemBarAcquire);
  1.7174 +  ins_cost(400);
  1.7175 +
  1.7176 +  size(0);
  1.7177 +  format %{ "MEMBAR-acquire" %}
  1.7178 +  ins_encode( enc_membar_acquire );
  1.7179 +  ins_pipe(pipe_slow);
  1.7180 +%}
  1.7181 +
  1.7182 +instruct membar_acquire_lock() %{
  1.7183 +  match(MemBarAcquire);
  1.7184 +  predicate(Matcher::prior_fast_lock(n));
  1.7185 +  ins_cost(0);
  1.7186 +
  1.7187 +  size(0);
  1.7188 +  format %{ "MEMBAR-acquire (prior CMPXCHG in FastLock so empty encoding)" %}
  1.7189 +  ins_encode( );
  1.7190 +  ins_pipe(empty);
  1.7191 +%}
  1.7192 +
  1.7193 +instruct membar_release() %{
  1.7194 +  match(MemBarRelease);
  1.7195 +  ins_cost(400);
  1.7196 +
  1.7197 +  size(0);
  1.7198 +  format %{ "MEMBAR-release" %}
  1.7199 +  ins_encode( enc_membar_release );
  1.7200 +  ins_pipe(pipe_slow);
  1.7201 +%}
  1.7202 +
  1.7203 +instruct membar_release_lock() %{
  1.7204 +  match(MemBarRelease);
  1.7205 +  predicate(Matcher::post_fast_unlock(n));
  1.7206 +  ins_cost(0);
  1.7207 +
  1.7208 +  size(0);
  1.7209 +  format %{ "MEMBAR-release (a FastUnlock follows so empty encoding)" %}
  1.7210 +  ins_encode( );
  1.7211 +  ins_pipe(empty);
  1.7212 +%}
  1.7213 +
  1.7214 +instruct membar_volatile() %{
  1.7215 +  match(MemBarVolatile);
  1.7216 +  ins_cost(400);
  1.7217 +
  1.7218 +  format %{ "MEMBAR-volatile" %}
  1.7219 +  ins_encode( enc_membar_volatile );
  1.7220 +  ins_pipe(pipe_slow);
  1.7221 +%}
  1.7222 +
  1.7223 +instruct unnecessary_membar_volatile() %{
  1.7224 +  match(MemBarVolatile);
  1.7225 +  predicate(Matcher::post_store_load_barrier(n));
  1.7226 +  ins_cost(0);
  1.7227 +
  1.7228 +  size(0);
  1.7229 +  format %{ "MEMBAR-volatile (unnecessary so empty encoding)" %}
  1.7230 +  ins_encode( );
  1.7231 +  ins_pipe(empty);
  1.7232 +%}
  1.7233 +
  1.7234 +//----------Move Instructions--------------------------------------------------
  1.7235 +instruct castX2P(eAXRegP dst, eAXRegI src) %{
  1.7236 +  match(Set dst (CastX2P src));
  1.7237 +  format %{ "# X2P  $dst, $src" %}
  1.7238 +  ins_encode( /*empty encoding*/ );
  1.7239 +  ins_cost(0);
  1.7240 +  ins_pipe(empty);
  1.7241 +%}
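// The empty encoding above is sound because both operands are constrained to
// EAX (eAXRegP/eAXRegI): reinterpreting the integer as a pointer needs no code
// at all once the register allocator has placed the value where the result
// must live.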
  1.7242 +
  1.7243 +instruct castP2X(eRegI dst, eRegP src ) %{
  1.7244 +  match(Set dst (CastP2X src));
  1.7245 +  ins_cost(50);
  1.7246 +  format %{ "MOV    $dst, $src\t# CastP2X" %}
  1.7247 +  ins_encode( enc_Copy( dst, src) );
  1.7248 +  ins_pipe( ialu_reg_reg );
  1.7249 +%}
  1.7250 +
  1.7251 +//----------Conditional Move---------------------------------------------------
  1.7252 +// Conditional move
  1.7253 +instruct cmovI_reg(eRegI dst, eRegI src, eFlagsReg cr, cmpOp cop ) %{
  1.7254 +  predicate(VM_Version::supports_cmov() );
  1.7255 +  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  1.7256 +  ins_cost(200);
  1.7257 +  format %{ "CMOV$cop $dst,$src" %}
  1.7258 +  opcode(0x0F,0x40);
  1.7259 +  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  1.7260 +  ins_pipe( pipe_cmov_reg );
  1.7261 +%}
  1.7262 +
  1.7263 +instruct cmovI_regU( eRegI dst, eRegI src, eFlagsRegU cr, cmpOpU cop ) %{
  1.7264 +  predicate(VM_Version::supports_cmov() );
  1.7265 +  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  1.7266 +  ins_cost(200);
  1.7267 +  format %{ "CMOV$cop $dst,$src" %}
  1.7268 +  opcode(0x0F,0x40);
  1.7269 +  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  1.7270 +  ins_pipe( pipe_cmov_reg );
  1.7271 +%}
  1.7272 +
  1.7273 +// Conditional move
  1.7274 +instruct cmovI_mem(cmpOp cop, eFlagsReg cr, eRegI dst, memory src) %{
  1.7275 +  predicate(VM_Version::supports_cmov() );
  1.7276 +  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
  1.7277 +  ins_cost(250);
  1.7278 +  format %{ "CMOV$cop $dst,$src" %}
  1.7279 +  opcode(0x0F,0x40);
  1.7280 +  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
  1.7281 +  ins_pipe( pipe_cmov_mem );
  1.7282 +%}
  1.7283 +
  1.7284 +// Conditional move
  1.7285 +instruct cmovI_memu(cmpOpU cop, eFlagsRegU cr, eRegI dst, memory src) %{
  1.7286 +  predicate(VM_Version::supports_cmov() );
  1.7287 +  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
  1.7288 +  ins_cost(250);
  1.7289 +  format %{ "CMOV$cop $dst,$src" %}
  1.7290 +  opcode(0x0F,0x40);
  1.7291 +  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
  1.7292 +  ins_pipe( pipe_cmov_mem );
  1.7293 +%}
  1.7294 +
  1.7295 +// Conditional move
  1.7296 +instruct cmovP_reg(eRegP dst, eRegP src, eFlagsReg cr, cmpOp cop ) %{
  1.7297 +  predicate(VM_Version::supports_cmov() );
  1.7298 +  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  1.7299 +  ins_cost(200);
  1.7300 +  format %{ "CMOV$cop $dst,$src\t# ptr" %}
  1.7301 +  opcode(0x0F,0x40);
  1.7302 +  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  1.7303 +  ins_pipe( pipe_cmov_reg );
  1.7304 +%}
  1.7305 +
  1.7306 +// Conditional move (non-P6 version)
   1.7307 +// Note: a CMoveP is generated for stubs and native wrappers
   1.7308 +//       regardless of whether we are on a P6, so we
   1.7309 +//       emulate a cmov here with a short branch around a MOV
  1.7310 +instruct cmovP_reg_nonP6(eRegP dst, eRegP src, eFlagsReg cr, cmpOp cop ) %{
  1.7311 +  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  1.7312 +  ins_cost(300);
  1.7313 +  format %{ "Jn$cop   skip\n\t"
   1.7314 +            "MOV    $dst,$src\t# pointer\n"
  1.7315 +      "skip:" %}
  1.7316 +  opcode(0x8b);
  1.7317 +  ins_encode( enc_cmov_branch(cop, 0x2), OpcP, RegReg(dst, src));
  1.7318 +  ins_pipe( pipe_cmov_reg );
  1.7319 +%}
  1.7320 +
  1.7321 +// Conditional move
  1.7322 +instruct cmovP_regU(eRegP dst, eRegP src, eFlagsRegU cr, cmpOpU cop ) %{
  1.7323 +  predicate(VM_Version::supports_cmov() );
  1.7324 +  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  1.7325 +  ins_cost(200);
  1.7326 +  format %{ "CMOV$cop $dst,$src\t# ptr" %}
  1.7327 +  opcode(0x0F,0x40);
  1.7328 +  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  1.7329 +  ins_pipe( pipe_cmov_reg );
  1.7330 +%}
  1.7331 +
  1.7332 +// DISABLED: Requires the ADLC to emit a bottom_type call that
  1.7333 +// correctly meets the two pointer arguments; one is an incoming
  1.7334 +// register but the other is a memory operand.  ALSO appears to
  1.7335 +// be buggy with implicit null checks.
  1.7336 +//
  1.7337 +//// Conditional move
  1.7338 +//instruct cmovP_mem(cmpOp cop, eFlagsReg cr, eRegP dst, memory src) %{
  1.7339 +//  predicate(VM_Version::supports_cmov() );
  1.7340 +//  match(Set dst (CMoveP (Binary cop cr) (Binary dst (LoadP src))));
  1.7341 +//  ins_cost(250);
  1.7342 +//  format %{ "CMOV$cop $dst,$src\t# ptr" %}
  1.7343 +//  opcode(0x0F,0x40);
  1.7344 +//  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
  1.7345 +//  ins_pipe( pipe_cmov_mem );
  1.7346 +//%}
  1.7347 +//
  1.7348 +//// Conditional move
  1.7349 +//instruct cmovP_memU(cmpOpU cop, eFlagsRegU cr, eRegP dst, memory src) %{
  1.7350 +//  predicate(VM_Version::supports_cmov() );
  1.7351 +//  match(Set dst (CMoveP (Binary cop cr) (Binary dst (LoadP src))));
  1.7352 +//  ins_cost(250);
  1.7353 +//  format %{ "CMOV$cop $dst,$src\t# ptr" %}
  1.7354 +//  opcode(0x0F,0x40);
  1.7355 +//  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
  1.7356 +//  ins_pipe( pipe_cmov_mem );
  1.7357 +//%}
  1.7358 +
  1.7359 +// Conditional move
  1.7360 +instruct fcmovD_regU(cmpOp_fcmov cop, eFlagsRegU cr, regDPR1 dst, regD src) %{
  1.7361 +  predicate(UseSSE<=1);
  1.7362 +  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  1.7363 +  ins_cost(200);
  1.7364 +  format %{ "FCMOV$cop $dst,$src\t# double" %}
  1.7365 +  opcode(0xDA);
  1.7366 +  ins_encode( enc_cmov_d(cop,src) );
  1.7367 +  ins_pipe( pipe_cmovD_reg );
  1.7368 +%}
  1.7369 +
  1.7370 +// Conditional move
  1.7371 +instruct fcmovF_regU(cmpOp_fcmov cop, eFlagsRegU cr, regFPR1 dst, regF src) %{
  1.7372 +  predicate(UseSSE==0);
  1.7373 +  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  1.7374 +  ins_cost(200);
  1.7375 +  format %{ "FCMOV$cop $dst,$src\t# float" %}
  1.7376 +  opcode(0xDA);
  1.7377 +  ins_encode( enc_cmov_d(cop,src) );
  1.7378 +  ins_pipe( pipe_cmovD_reg );
  1.7379 +%}
  1.7380 +
  1.7381 +// Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
  1.7382 +instruct fcmovD_regS(cmpOp cop, eFlagsReg cr, regD dst, regD src) %{
  1.7383 +  predicate(UseSSE<=1);
  1.7384 +  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  1.7385 +  ins_cost(200);
  1.7386 +  format %{ "Jn$cop   skip\n\t"
  1.7387 +            "MOV    $dst,$src\t# double\n"
  1.7388 +      "skip:" %}
  1.7389 +  opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
  1.7390 +  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_D(src), OpcP, RegOpc(dst) );
  1.7391 +  ins_pipe( pipe_cmovD_reg );
  1.7392 +%}
  1.7393 +
  1.7394 +// Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
  1.7395 +instruct fcmovF_regS(cmpOp cop, eFlagsReg cr, regF dst, regF src) %{
  1.7396 +  predicate(UseSSE==0);
  1.7397 +  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  1.7398 +  ins_cost(200);
  1.7399 +  format %{ "Jn$cop    skip\n\t"
  1.7400 +            "MOV    $dst,$src\t# float\n"
  1.7401 +      "skip:" %}
  1.7402 +  opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
  1.7403 +  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_F(src), OpcP, RegOpc(dst) );
  1.7404 +  ins_pipe( pipe_cmovD_reg );
  1.7405 +%}
  1.7406 +
  1.7407 +// No CMOVE with SSE/SSE2
  1.7408 +instruct fcmovX_regS(cmpOp cop, eFlagsReg cr, regX dst, regX src) %{
  1.7409 +  predicate (UseSSE>=1);
  1.7410 +  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  1.7411 +  ins_cost(200);
  1.7412 +  format %{ "Jn$cop   skip\n\t"
  1.7413 +            "MOVSS  $dst,$src\t# float\n"
  1.7414 +      "skip:" %}
  1.7415 +  ins_encode %{
  1.7416 +    Label skip;
  1.7417 +    // Invert sense of branch from sense of CMOV
  1.7418 +    __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
  1.7419 +    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
  1.7420 +    __ bind(skip);
  1.7421 +  %}
  1.7422 +  ins_pipe( pipe_slow );
  1.7423 +%}
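// The "$cop$$cmpcode^1" trick above (and in the rules that follow) relies on
// the x86 condition-code layout: complementary conditions differ only in bit 0
// of the encoding (e.g. 0x4 = equal, 0x5 = not-equal), so flipping that bit
// turns the branch into its inverse -- jump around the move exactly when the
// CMOV would not have moved.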
  1.7424 +
  1.7425 +// No CMOVE with SSE/SSE2
  1.7426 +instruct fcmovXD_regS(cmpOp cop, eFlagsReg cr, regXD dst, regXD src) %{
  1.7427 +  predicate (UseSSE>=2);
  1.7428 +  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  1.7429 +  ins_cost(200);
  1.7430 +  format %{ "Jn$cop   skip\n\t"
   1.7431 +            "MOVSD  $dst,$src\t# double\n"
  1.7432 +      "skip:" %}
  1.7433 +  ins_encode %{
  1.7434 +    Label skip;
  1.7435 +    // Invert sense of branch from sense of CMOV
  1.7436 +    __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
  1.7437 +    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
  1.7438 +    __ bind(skip);
  1.7439 +  %}
  1.7440 +  ins_pipe( pipe_slow );
  1.7441 +%}
  1.7442 +
  1.7443 +// unsigned version
  1.7444 +instruct fcmovX_regU(cmpOpU cop, eFlagsRegU cr, regX dst, regX src) %{
  1.7445 +  predicate (UseSSE>=1);
  1.7446 +  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  1.7447 +  ins_cost(200);
  1.7448 +  format %{ "Jn$cop   skip\n\t"
  1.7449 +            "MOVSS  $dst,$src\t# float\n"
  1.7450 +      "skip:" %}
  1.7451 +  ins_encode %{
  1.7452 +    Label skip;
  1.7453 +    // Invert sense of branch from sense of CMOV
  1.7454 +    __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
  1.7455 +    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
  1.7456 +    __ bind(skip);
  1.7457 +  %}
  1.7458 +  ins_pipe( pipe_slow );
  1.7459 +%}
  1.7460 +
  1.7461 +// unsigned version
  1.7462 +instruct fcmovXD_regU(cmpOpU cop, eFlagsRegU cr, regXD dst, regXD src) %{
  1.7463 +  predicate (UseSSE>=2);
  1.7464 +  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  1.7465 +  ins_cost(200);
  1.7466 +  format %{ "Jn$cop   skip\n\t"
   1.7467 +            "MOVSD  $dst,$src\t# double\n"
  1.7468 +      "skip:" %}
  1.7469 +  ins_encode %{
  1.7470 +    Label skip;
  1.7471 +    // Invert sense of branch from sense of CMOV
  1.7472 +    __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
  1.7473 +    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
  1.7474 +    __ bind(skip);
  1.7475 +  %}
  1.7476 +  ins_pipe( pipe_slow );
  1.7477 +%}
  1.7478 +
  1.7479 +instruct cmovL_reg(cmpOp cop, eFlagsReg cr, eRegL dst, eRegL src) %{
  1.7480 +  predicate(VM_Version::supports_cmov() );
  1.7481 +  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
  1.7482 +  ins_cost(200);
  1.7483 +  format %{ "CMOV$cop $dst.lo,$src.lo\n\t"
  1.7484 +            "CMOV$cop $dst.hi,$src.hi" %}
  1.7485 +  opcode(0x0F,0x40);
  1.7486 +  ins_encode( enc_cmov(cop), RegReg_Lo2( dst, src ), enc_cmov(cop), RegReg_Hi2( dst, src ) );
  1.7487 +  ins_pipe( pipe_cmov_reg_long );
  1.7488 +%}
  1.7489 +
  1.7490 +instruct cmovL_regU(cmpOpU cop, eFlagsRegU cr, eRegL dst, eRegL src) %{
  1.7491 +  predicate(VM_Version::supports_cmov() );
  1.7492 +  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
  1.7493 +  ins_cost(200);
  1.7494 +  format %{ "CMOV$cop $dst.lo,$src.lo\n\t"
  1.7495 +            "CMOV$cop $dst.hi,$src.hi" %}
  1.7496 +  opcode(0x0F,0x40);
  1.7497 +  ins_encode( enc_cmov(cop), RegReg_Lo2( dst, src ), enc_cmov(cop), RegReg_Hi2( dst, src ) );
  1.7498 +  ins_pipe( pipe_cmov_reg_long );
  1.7499 +%}
  1.7500 +
  1.7501 +//----------Arithmetic Instructions--------------------------------------------
  1.7502 +//----------Addition Instructions----------------------------------------------
  1.7503 +// Integer Addition Instructions
  1.7504 +instruct addI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  1.7505 +  match(Set dst (AddI dst src));
  1.7506 +  effect(KILL cr);
  1.7507 +
  1.7508 +  size(2);
  1.7509 +  format %{ "ADD    $dst,$src" %}
  1.7510 +  opcode(0x03);
  1.7511 +  ins_encode( OpcP, RegReg( dst, src) );
  1.7512 +  ins_pipe( ialu_reg_reg );
  1.7513 +%}
  1.7514 +
  1.7515 +instruct addI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  1.7516 +  match(Set dst (AddI dst src));
  1.7517 +  effect(KILL cr);
  1.7518 +
  1.7519 +  format %{ "ADD    $dst,$src" %}
  1.7520 +  opcode(0x81, 0x00); /* /0 id */
  1.7521 +  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  1.7522 +  ins_pipe( ialu_reg );
  1.7523 +%}
  1.7524 +
  1.7525 +instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
  1.7526 +  predicate(UseIncDec);
  1.7527 +  match(Set dst (AddI dst src));
  1.7528 +  effect(KILL cr);
  1.7529 +
  1.7530 +  size(1);
  1.7531 +  format %{ "INC    $dst" %}
  1.7532 +  opcode(0x40); /*  */
  1.7533 +  ins_encode( Opc_plus( primary, dst ) );
  1.7534 +  ins_pipe( ialu_reg );
  1.7535 +%}
  1.7536 +
  1.7537 +instruct leaI_eReg_immI(eRegI dst, eRegI src0, immI src1) %{
  1.7538 +  match(Set dst (AddI src0 src1));
  1.7539 +  ins_cost(110);
  1.7540 +
  1.7541 +  format %{ "LEA    $dst,[$src0 + $src1]" %}
  1.7542 +  opcode(0x8D); /* 0x8D /r */
  1.7543 +  ins_encode( OpcP, RegLea( dst, src0, src1 ) );
  1.7544 +  ins_pipe( ialu_reg_reg );
  1.7545 +%}
  1.7546 +
  1.7547 +instruct leaP_eReg_immI(eRegP dst, eRegP src0, immI src1) %{
  1.7548 +  match(Set dst (AddP src0 src1));
  1.7549 +  ins_cost(110);
  1.7550 +
  1.7551 +  format %{ "LEA    $dst,[$src0 + $src1]\t# ptr" %}
  1.7552 +  opcode(0x8D); /* 0x8D /r */
  1.7553 +  ins_encode( OpcP, RegLea( dst, src0, src1 ) );
  1.7554 +  ins_pipe( ialu_reg_reg );
  1.7555 +%}
  1.7556 +
  1.7557 +instruct decI_eReg(eRegI dst, immI_M1 src, eFlagsReg cr) %{
  1.7558 +  predicate(UseIncDec);
  1.7559 +  match(Set dst (AddI dst src));
  1.7560 +  effect(KILL cr);
  1.7561 +
  1.7562 +  size(1);
  1.7563 +  format %{ "DEC    $dst" %}
  1.7564 +  opcode(0x48); /*  */
  1.7565 +  ins_encode( Opc_plus( primary, dst ) );
  1.7566 +  ins_pipe( ialu_reg );
  1.7567 +%}
  1.7568 +
  1.7569 +instruct addP_eReg(eRegP dst, eRegI src, eFlagsReg cr) %{
  1.7570 +  match(Set dst (AddP dst src));
  1.7571 +  effect(KILL cr);
  1.7572 +
  1.7573 +  size(2);
  1.7574 +  format %{ "ADD    $dst,$src" %}
  1.7575 +  opcode(0x03);
  1.7576 +  ins_encode( OpcP, RegReg( dst, src) );
  1.7577 +  ins_pipe( ialu_reg_reg );
  1.7578 +%}
  1.7579 +
  1.7580 +instruct addP_eReg_imm(eRegP dst, immI src, eFlagsReg cr) %{
  1.7581 +  match(Set dst (AddP dst src));
  1.7582 +  effect(KILL cr);
  1.7583 +
  1.7584 +  format %{ "ADD    $dst,$src" %}
  1.7585 +  opcode(0x81,0x00); /* Opcode 81 /0 id */
  1.7586 +  // ins_encode( RegImm( dst, src) );
  1.7587 +  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  1.7588 +  ins_pipe( ialu_reg );
  1.7589 +%}
  1.7590 +
  1.7591 +instruct addI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  1.7592 +  match(Set dst (AddI dst (LoadI src)));
  1.7593 +  effect(KILL cr);
  1.7594 +
  1.7595 +  ins_cost(125);
  1.7596 +  format %{ "ADD    $dst,$src" %}
  1.7597 +  opcode(0x03);
  1.7598 +  ins_encode( OpcP, RegMem( dst, src) );
  1.7599 +  ins_pipe( ialu_reg_mem );
  1.7600 +%}
  1.7601 +
  1.7602 +instruct addI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  1.7603 +  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  1.7604 +  effect(KILL cr);
  1.7605 +
  1.7606 +  ins_cost(150);
  1.7607 +  format %{ "ADD    $dst,$src" %}
  1.7608 +  opcode(0x01);  /* Opcode 01 /r */
  1.7609 +  ins_encode( OpcP, RegMem( src, dst ) );
  1.7610 +  ins_pipe( ialu_mem_reg );
  1.7611 +%}
  1.7612 +
  1.7613 +// Add Memory with Immediate
  1.7614 +instruct addI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  1.7615 +  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  1.7616 +  effect(KILL cr);
  1.7617 +
  1.7618 +  ins_cost(125);
  1.7619 +  format %{ "ADD    $dst,$src" %}
  1.7620 +  opcode(0x81);               /* Opcode 81 /0 id */
  1.7621 +  ins_encode( OpcSE( src ), RMopc_Mem(0x00,dst), Con8or32( src ) );
  1.7622 +  ins_pipe( ialu_mem_imm );
  1.7623 +%}
  1.7624 +
  1.7625 +instruct incI_mem(memory dst, immI1 src, eFlagsReg cr) %{
  1.7626 +  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  1.7627 +  effect(KILL cr);
  1.7628 +
  1.7629 +  ins_cost(125);
  1.7630 +  format %{ "INC    $dst" %}
  1.7631 +  opcode(0xFF);               /* Opcode FF /0 */
  1.7632 +  ins_encode( OpcP, RMopc_Mem(0x00,dst));
  1.7633 +  ins_pipe( ialu_mem_imm );
  1.7634 +%}
  1.7635 +
  1.7636 +instruct decI_mem(memory dst, immI_M1 src, eFlagsReg cr) %{
  1.7637 +  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  1.7638 +  effect(KILL cr);
  1.7639 +
  1.7640 +  ins_cost(125);
  1.7641 +  format %{ "DEC    $dst" %}
  1.7642 +  opcode(0xFF);               /* Opcode FF /1 */
  1.7643 +  ins_encode( OpcP, RMopc_Mem(0x01,dst));
  1.7644 +  ins_pipe( ialu_mem_imm );
  1.7645 +%}
  1.7646 +
  1.7647 +
  1.7648 +instruct checkCastPP( eRegP dst ) %{
  1.7649 +  match(Set dst (CheckCastPP dst));
  1.7650 +
  1.7651 +  size(0);
  1.7652 +  format %{ "#checkcastPP of $dst" %}
  1.7653 +  ins_encode( /*empty encoding*/ );
  1.7654 +  ins_pipe( empty );
  1.7655 +%}
  1.7656 +
  1.7657 +instruct castPP( eRegP dst ) %{
  1.7658 +  match(Set dst (CastPP dst));
  1.7659 +  format %{ "#castPP of $dst" %}
  1.7660 +  ins_encode( /*empty encoding*/ );
  1.7661 +  ins_pipe( empty );
  1.7662 +%}
  1.7663 +
  1.7664 +instruct castII( eRegI dst ) %{
  1.7665 +  match(Set dst (CastII dst));
  1.7666 +  format %{ "#castII of $dst" %}
  1.7667 +  ins_encode( /*empty encoding*/ );
  1.7668 +  ins_cost(0);
  1.7669 +  ins_pipe( empty );
  1.7670 +%}
  1.7671 +
  1.7672 +
  1.7673 +// Load-locked - same as a regular pointer load when used with compare-swap
  1.7674 +instruct loadPLocked(eRegP dst, memory mem) %{
  1.7675 +  match(Set dst (LoadPLocked mem));
  1.7676 +
  1.7677 +  ins_cost(125);
  1.7678 +  format %{ "MOV    $dst,$mem\t# Load ptr. locked" %}
  1.7679 +  opcode(0x8B);
  1.7680 +  ins_encode( OpcP, RegMem(dst,mem));
  1.7681 +  ins_pipe( ialu_reg_mem );
  1.7682 +%}
  1.7683 +
  1.7684 +// LoadLong-locked - same as a volatile long load when used with compare-swap
  1.7685 +instruct loadLLocked(stackSlotL dst, load_long_memory mem) %{
  1.7686 +  predicate(UseSSE<=1);
  1.7687 +  match(Set dst (LoadLLocked mem));
  1.7688 +
  1.7689 +  ins_cost(200);
  1.7690 +  format %{ "FILD   $mem\t# Atomic volatile long load\n\t"
  1.7691 +            "FISTp  $dst" %}
  1.7692 +  ins_encode(enc_loadL_volatile(mem,dst));
  1.7693 +  ins_pipe( fpu_reg_mem );
  1.7694 +%}
  1.7695 +
  1.7696 +instruct loadLX_Locked(stackSlotL dst, load_long_memory mem, regXD tmp) %{
  1.7697 +  predicate(UseSSE>=2);
  1.7698 +  match(Set dst (LoadLLocked mem));
  1.7699 +  effect(TEMP tmp);
  1.7700 +  ins_cost(180);
  1.7701 +  format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
  1.7702 +            "MOVSD  $dst,$tmp" %}
  1.7703 +  ins_encode(enc_loadLX_volatile(mem, dst, tmp));
  1.7704 +  ins_pipe( pipe_slow );
  1.7705 +%}
  1.7706 +
  1.7707 +instruct loadLX_reg_Locked(eRegL dst, load_long_memory mem, regXD tmp) %{
  1.7708 +  predicate(UseSSE>=2);
  1.7709 +  match(Set dst (LoadLLocked mem));
  1.7710 +  effect(TEMP tmp);
  1.7711 +  ins_cost(160);
  1.7712 +  format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
  1.7713 +            "MOVD   $dst.lo,$tmp\n\t"
  1.7714 +            "PSRLQ  $tmp,32\n\t"
  1.7715 +            "MOVD   $dst.hi,$tmp" %}
  1.7716 +  ins_encode(enc_loadLX_reg_volatile(mem, dst, tmp));
  1.7717 +  ins_pipe( pipe_slow );
  1.7718 +%}
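// How the register form above splits the 64-bit value (restating the format
// string): MOVSD performs one 64-bit load, MOVD copies the low 32 bits of the
// XMM register into dst.lo, PSRLQ shifts the high half down into the low lane,
// and a second MOVD then extracts the original high 32 bits into dst.hi.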
  1.7719 +
  1.7720 +// Conditional-store of the updated heap-top.
  1.7721 +// Used during allocation of the shared heap.
  1.7722 +// Sets flags (EQ) on success.  Implemented with a CMPXCHG on Intel.
  1.7723 +instruct storePConditional( memory heap_top_ptr, eAXRegP oldval, eRegP newval, eFlagsReg cr ) %{
  1.7724 +  match(Set cr (StorePConditional heap_top_ptr (Binary oldval newval)));
  1.7725 +  // EAX is killed if there is contention, but then it's also unused.
  1.7726 +  // In the common case of no contention, EAX holds the new oop address.
  1.7727 +  format %{ "CMPXCHG $heap_top_ptr,$newval\t# If EAX==$heap_top_ptr Then store $newval into $heap_top_ptr" %}
  1.7728 +  ins_encode( lock_prefix, Opcode(0x0F), Opcode(0xB1), RegMem(newval,heap_top_ptr) );
  1.7729 +  ins_pipe( pipe_cmpxchg );
  1.7730 +%}
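// Sketch of the allocation fast path this rule serves (illustrative, not code
// from this file):
//
//   HeapWord* old_top = *top_addr;       // becomes oldval, pinned in EAX
//   HeapWord* new_top = old_top + size;  // becomes newval
//   // LOCK CMPXCHG [top_addr],new_top stores new_top iff *top_addr still
//   // equals EAX, and leaves ZF set on success -- the EQ condition the
//   // following branch tests.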
  1.7731 +
  1.7732 +// Conditional-store of a long value
  1.7733 +// Returns a boolean value (0/1) on success.  Implemented with a CMPXCHG8 on Intel.
  1.7734 +// mem_ptr can actually be in either ESI or EDI
  1.7735 +instruct storeLConditional( eRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
  1.7736 +  match(Set res (StoreLConditional mem_ptr (Binary oldval newval)));
  1.7737 +  effect(KILL cr);
  1.7738 +  // EDX:EAX is killed if there is contention, but then it's also unused.
  1.7739 +  // In the common case of no contention, EDX:EAX holds the new oop address.
  1.7740 +  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
  1.7741 +            "MOV    $res,0\n\t"
  1.7742 +            "JNE,s  fail\n\t"
  1.7743 +            "MOV    $res,1\n"
  1.7744 +          "fail:" %}
  1.7745 +  ins_encode( enc_cmpxchg8(mem_ptr),
  1.7746 +              enc_flags_ne_to_boolean(res) );
  1.7747 +  ins_pipe( pipe_cmpxchg );
  1.7748 +%}
  1.7749 +
  1.7750 +// Conditional-store of a long value
  1.7751 +// ZF flag is set on success, reset otherwise. Implemented with a CMPXCHG8 on Intel.
  1.7752 +// mem_ptr can actually be in either ESI or EDI
  1.7753 +instruct storeLConditional_flags( eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr, immI0 zero ) %{
  1.7754 +  match(Set cr (CmpI (StoreLConditional mem_ptr (Binary oldval newval)) zero));
  1.7755 +  // EDX:EAX is killed if there is contention, but then it's also unused.
  1.7756 +  // In the common case of no contention, EDX:EAX holds the new oop address.
   1.7757 +  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t" %}
  1.7758 +  ins_encode( enc_cmpxchg8(mem_ptr) );
  1.7759 +  ins_pipe( pipe_cmpxchg );
  1.7760 +%}
  1.7761 +
  1.7762 +// No flag versions for CompareAndSwap{P,I,L} because matcher can't match them
  1.7763 +
  1.7764 +instruct compareAndSwapL( eRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
  1.7765 +  match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
  1.7766 +  effect(KILL cr, KILL oldval);
  1.7767 +  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
  1.7768 +            "MOV    $res,0\n\t"
  1.7769 +            "JNE,s  fail\n\t"
  1.7770 +            "MOV    $res,1\n"
  1.7771 +          "fail:" %}
  1.7772 +  ins_encode( enc_cmpxchg8(mem_ptr),
  1.7773 +              enc_flags_ne_to_boolean(res) );
  1.7774 +  ins_pipe( pipe_cmpxchg );
  1.7775 +%}
  1.7776 +
  1.7777 +instruct compareAndSwapP( eRegI res,  pRegP mem_ptr, eAXRegP oldval, eCXRegP newval, eFlagsReg cr) %{
  1.7778 +  match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
  1.7779 +  effect(KILL cr, KILL oldval);
  1.7780 +  format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
  1.7781 +            "MOV    $res,0\n\t"
  1.7782 +            "JNE,s  fail\n\t"
  1.7783 +            "MOV    $res,1\n"
  1.7784 +          "fail:" %}
  1.7785 +  ins_encode( enc_cmpxchg(mem_ptr), enc_flags_ne_to_boolean(res) );
  1.7786 +  ins_pipe( pipe_cmpxchg );
  1.7787 +%}
  1.7788 +
  1.7789 +instruct compareAndSwapI( eRegI res, pRegP mem_ptr, eAXRegI oldval, eCXRegI newval, eFlagsReg cr) %{
  1.7790 +  match(Set res (CompareAndSwapI mem_ptr (Binary oldval newval)));
  1.7791 +  effect(KILL cr, KILL oldval);
  1.7792 +  format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
  1.7793 +            "MOV    $res,0\n\t"
  1.7794 +            "JNE,s  fail\n\t"
  1.7795 +            "MOV    $res,1\n"
  1.7796 +          "fail:" %}
  1.7797 +  ins_encode( enc_cmpxchg(mem_ptr), enc_flags_ne_to_boolean(res) );
  1.7798 +  ins_pipe( pipe_cmpxchg );
  1.7799 +%}
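// Two details worth noting about the CompareAndSwap rules above: CMPXCHG
// implicitly compares against, and on failure writes back through, the
// accumulator, which is why oldval is pinned to EAX (EDX:EAX for the 8-byte
// form) and listed as KILLed; and the MOV/JNE/MOV tail merely converts the ZF
// outcome of the CMPXCHG into the 0/1 integer result the caller expects.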
  1.7800 +
  1.7801 +//----------Subtraction Instructions-------------------------------------------
  1.7802 +// Integer Subtraction Instructions
  1.7803 +instruct subI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  1.7804 +  match(Set dst (SubI dst src));
  1.7805 +  effect(KILL cr);
  1.7806 +
  1.7807 +  size(2);
  1.7808 +  format %{ "SUB    $dst,$src" %}
  1.7809 +  opcode(0x2B);
  1.7810 +  ins_encode( OpcP, RegReg( dst, src) );
  1.7811 +  ins_pipe( ialu_reg_reg );
  1.7812 +%}
  1.7813 +
  1.7814 +instruct subI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  1.7815 +  match(Set dst (SubI dst src));
  1.7816 +  effect(KILL cr);
  1.7817 +
  1.7818 +  format %{ "SUB    $dst,$src" %}
  1.7819 +  opcode(0x81,0x05);  /* Opcode 81 /5 */
  1.7820 +  // ins_encode( RegImm( dst, src) );
  1.7821 +  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  1.7822 +  ins_pipe( ialu_reg );
  1.7823 +%}
  1.7824 +
  1.7825 +instruct subI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  1.7826 +  match(Set dst (SubI dst (LoadI src)));
  1.7827 +  effect(KILL cr);
  1.7828 +
  1.7829 +  ins_cost(125);
  1.7830 +  format %{ "SUB    $dst,$src" %}
  1.7831 +  opcode(0x2B);
  1.7832 +  ins_encode( OpcP, RegMem( dst, src) );
  1.7833 +  ins_pipe( ialu_reg_mem );
  1.7834 +%}
  1.7835 +
  1.7836 +instruct subI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  1.7837 +  match(Set dst (StoreI dst (SubI (LoadI dst) src)));
  1.7838 +  effect(KILL cr);
  1.7839 +
  1.7840 +  ins_cost(150);
  1.7841 +  format %{ "SUB    $dst,$src" %}
  1.7842 +  opcode(0x29);  /* Opcode 29 /r */
  1.7843 +  ins_encode( OpcP, RegMem( src, dst ) );
  1.7844 +  ins_pipe( ialu_mem_reg );
  1.7845 +%}
  1.7846 +
  1.7847 +// Subtract from a pointer
  1.7848 +instruct subP_eReg(eRegP dst, eRegI src, immI0 zero, eFlagsReg cr) %{
  1.7849 +  match(Set dst (AddP dst (SubI zero src)));
  1.7850 +  effect(KILL cr);
  1.7851 +
  1.7852 +  size(2);
  1.7853 +  format %{ "SUB    $dst,$src" %}
  1.7854 +  opcode(0x2B);
  1.7855 +  ins_encode( OpcP, RegReg( dst, src) );
  1.7856 +  ins_pipe( ialu_reg_reg );
  1.7857 +%}
  1.7858 +
  1.7859 +instruct negI_eReg(eRegI dst, immI0 zero, eFlagsReg cr) %{
  1.7860 +  match(Set dst (SubI zero dst));
  1.7861 +  effect(KILL cr);
  1.7862 +
  1.7863 +  size(2);
  1.7864 +  format %{ "NEG    $dst" %}
  1.7865 +  opcode(0xF7,0x03);  // Opcode F7 /3
  1.7866 +  ins_encode( OpcP, RegOpc( dst ) );
  1.7867 +  ins_pipe( ialu_reg );
  1.7868 +%}
  1.7869 +
  1.7870 +
  1.7871 +//----------Multiplication/Division Instructions-------------------------------
  1.7872 +// Integer Multiplication Instructions
  1.7873 +// Multiply Register
  1.7874 +instruct mulI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  1.7875 +  match(Set dst (MulI dst src));
  1.7876 +  effect(KILL cr);
  1.7877 +
  1.7878 +  size(3);
  1.7879 +  ins_cost(300);
  1.7880 +  format %{ "IMUL   $dst,$src" %}
  1.7881 +  opcode(0xAF, 0x0F);
  1.7882 +  ins_encode( OpcS, OpcP, RegReg( dst, src) );
  1.7883 +  ins_pipe( ialu_reg_reg_alu0 );
  1.7884 +%}
  1.7885 +
  1.7886 +// Multiply 32-bit Immediate
  1.7887 +instruct mulI_eReg_imm(eRegI dst, eRegI src, immI imm, eFlagsReg cr) %{
  1.7888 +  match(Set dst (MulI src imm));
  1.7889 +  effect(KILL cr);
  1.7890 +
  1.7891 +  ins_cost(300);
  1.7892 +  format %{ "IMUL   $dst,$src,$imm" %}
  1.7893 +  opcode(0x69);  /* 69 /r id */
  1.7894 +  ins_encode( OpcSE(imm), RegReg( dst, src ), Con8or32( imm ) );
  1.7895 +  ins_pipe( ialu_reg_reg_alu0 );
  1.7896 +%}
  1.7897 +
  1.7898 +instruct loadConL_low_only(eADXRegL_low_only dst, immL32 src, eFlagsReg cr) %{
  1.7899 +  match(Set dst src);
  1.7900 +  effect(KILL cr);
  1.7901 +
  1.7902 +  // Note that this is artificially increased to make it more expensive than loadConL
  1.7903 +  ins_cost(250);
  1.7904 +  format %{ "MOV    EAX,$src\t// low word only" %}
  1.7905 +  opcode(0xB8);
  1.7906 +  ins_encode( LdImmL_Lo(dst, src) );
  1.7907 +  ins_pipe( ialu_reg_fat );
  1.7908 +%}
  1.7909 +
  1.7910 +// Multiply by 32-bit Immediate, taking the shifted high order results
  1.7911 +//  (special case for shift by 32)
  1.7912 +instruct mulI_imm_high(eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32 cnt, eFlagsReg cr) %{
  1.7913 +  match(Set dst (ConvL2I (RShiftL (MulL (ConvI2L src1) src2) cnt)));
  1.7914 +  predicate( _kids[0]->_kids[0]->_kids[1]->_leaf->Opcode() == Op_ConL &&
  1.7915 +             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() >= min_jint &&
  1.7916 +             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() <= max_jint );
  1.7917 +  effect(USE src1, KILL cr);
  1.7918 +
  1.7919 +  // Note that this is adjusted by 150 to compensate for the overcosting of loadConL_low_only
  1.7920 +  ins_cost(0*100 + 1*400 - 150);
  1.7921 +  format %{ "IMUL   EDX:EAX,$src1" %}
  1.7922 +  ins_encode( multiply_con_and_shift_high( dst, src1, src2, cnt, cr ) );
  1.7923 +  ins_pipe( pipe_slow );
  1.7924 +%}
  1.7925 +
  1.7926 +// Multiply by 32-bit Immediate, taking the shifted high order results
  1.7927 +instruct mulI_imm_RShift_high(eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32_63 cnt, eFlagsReg cr) %{
  1.7928 +  match(Set dst (ConvL2I (RShiftL (MulL (ConvI2L src1) src2) cnt)));
  1.7929 +  predicate( _kids[0]->_kids[0]->_kids[1]->_leaf->Opcode() == Op_ConL &&
  1.7930 +             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() >= min_jint &&
  1.7931 +             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() <= max_jint );
  1.7932 +  effect(USE src1, KILL cr);
  1.7933 +
  1.7934 +  // Note that this is adjusted by 150 to compensate for the overcosting of loadConL_low_only
  1.7935 +  ins_cost(1*100 + 1*400 - 150);
  1.7936 +  format %{ "IMUL   EDX:EAX,$src1\n\t"
  1.7937 +            "SAR    EDX,$cnt-32" %}
  1.7938 +  ins_encode( multiply_con_and_shift_high( dst, src1, src2, cnt, cr ) );
  1.7939 +  ins_pipe( pipe_slow );
  1.7940 +%}
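         +
         +// A minimal C-style sketch (helper and variable names are illustrative) of what
         +// the two rules above compute: a 32x32->64 multiply followed by a right shift of
         +// 32..63, so only the high half of the product (plus an optional SAR) is needed.
         +// The predicates check that the long constant fits in a 32-bit immediate.
         +//
         +//   int32_t high_mul(int32_t src1, int64_t con, int cnt) {  // 32 <= cnt <= 63
         +//     int64_t prod = (int64_t)src1 * con;   // IMUL leaves the product in EDX:EAX
         +//     return (int32_t)(prod >> cnt);        // EDX already holds prod >> 32;
         +//   }                                       // SAR EDX,cnt-32 covers cnt > 32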
  1.7941 +
  1.7942 +// Multiply Memory 32-bit Immediate
  1.7943 +instruct mulI_mem_imm(eRegI dst, memory src, immI imm, eFlagsReg cr) %{
  1.7944 +  match(Set dst (MulI (LoadI src) imm));
  1.7945 +  effect(KILL cr);
  1.7946 +
  1.7947 +  ins_cost(300);
  1.7948 +  format %{ "IMUL   $dst,$src,$imm" %}
  1.7949 +  opcode(0x69);  /* 69 /r id */
  1.7950 +  ins_encode( OpcSE(imm), RegMem( dst, src ), Con8or32( imm ) );
  1.7951 +  ins_pipe( ialu_reg_mem_alu0 );
  1.7952 +%}
  1.7953 +
  1.7954 +// Multiply Memory
  1.7955 +instruct mulI(eRegI dst, memory src, eFlagsReg cr) %{
  1.7956 +  match(Set dst (MulI dst (LoadI src)));
  1.7957 +  effect(KILL cr);
  1.7958 +
  1.7959 +  ins_cost(350);
  1.7960 +  format %{ "IMUL   $dst,$src" %}
  1.7961 +  opcode(0xAF, 0x0F);
  1.7962 +  ins_encode( OpcS, OpcP, RegMem( dst, src) );
  1.7963 +  ins_pipe( ialu_reg_mem_alu0 );
  1.7964 +%}
  1.7965 +
  1.7966 +// Multiply Register Int to Long
  1.7967 +instruct mulI2L(eADXRegL dst, eAXRegI src, nadxRegI src1, eFlagsReg flags) %{
  1.7968 +  // Basic Idea: long = (long)int * (long)int
  1.7969 +  match(Set dst (MulL (ConvI2L src) (ConvI2L src1)));
  1.7970 +  effect(DEF dst, USE src, USE src1, KILL flags);
  1.7971 +
  1.7972 +  ins_cost(300);
  1.7973 +  format %{ "IMUL   $dst,$src1" %}
  1.7974 +
  1.7975 +  ins_encode( long_int_multiply( dst, src1 ) );
  1.7976 +  ins_pipe( ialu_reg_reg_alu0 );
  1.7977 +%}
  1.7978 +
  1.7979 +instruct mulIS_eReg(eADXRegL dst, immL_32bits mask, eFlagsReg flags, eAXRegI src, nadxRegI src1) %{
  1.7980 +  // Basic Idea:  long = (int & 0xffffffffL) * (int & 0xffffffffL)
  1.7981 +  match(Set dst (MulL (AndL (ConvI2L src) mask) (AndL (ConvI2L src1) mask)));
  1.7982 +  effect(KILL flags);
  1.7983 +
  1.7984 +  ins_cost(300);
  1.7985 +  format %{ "MUL    $dst,$src1" %}
  1.7986 +
  1.7987 +  ins_encode( long_uint_multiply(dst, src1) );
  1.7988 +  ins_pipe( ialu_reg_reg_alu0 );
  1.7989 +%}
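         +
         +// Sketch of the two widening multiplies above (names are illustrative): a single
         +// one-operand IMUL or MUL of 32-bit inputs already produces the full 64-bit
         +// product in EDX:EAX, so no long arithmetic is required.
         +//
         +//   int64_t mul_signed  (int32_t a, int32_t b)   { return (int64_t)a * b; }   // IMUL
         +//   int64_t mul_unsigned(uint32_t a, uint32_t b) {                            // MUL
         +//     return (int64_t)((uint64_t)a * b);
         +//   }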
  1.7990 +
  1.7991 +// Multiply Register Long
  1.7992 +instruct mulL_eReg(eADXRegL dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
  1.7993 +  match(Set dst (MulL dst src));
  1.7994 +  effect(KILL cr, TEMP tmp);
  1.7995 +  ins_cost(4*100+3*400);
  1.7996 +// Basic idea: lo(result) = lo(x_lo * y_lo)
  1.7997 +//             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  1.7998 +  format %{ "MOV    $tmp,$src.lo\n\t"
  1.7999 +            "IMUL   $tmp,EDX\n\t"
  1.8000 +            "MOV    EDX,$src.hi\n\t"
  1.8001 +            "IMUL   EDX,EAX\n\t"
  1.8002 +            "ADD    $tmp,EDX\n\t"
  1.8003 +            "MUL    EDX:EAX,$src.lo\n\t"
  1.8004 +            "ADD    EDX,$tmp" %}
  1.8005 +  ins_encode( long_multiply( dst, src, tmp ) );
  1.8006 +  ins_pipe( pipe_slow );
  1.8007 +%}
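         +
         +// Sketch of the decomposition described above, assuming the operands are kept as
         +// uint32_t halves (x = x_hi:x_lo, y = y_hi:y_lo).  Only the low 64 bits of the
         +// product are needed, so x_hi * y_hi never contributes:
         +//
         +//   uint64_t p  = (uint64_t)x_lo * y_lo;          // one MUL gives both halves
         +//   uint32_t lo = (uint32_t)p;
         +//   uint32_t hi = (uint32_t)(p >> 32)             // hi(x_lo * y_lo)
         +//               + x_hi * y_lo                     // lo(x_hi * y_lo), plain IMUL
         +//               + x_lo * y_hi;                    // lo(x_lo * y_hi), plain IMUL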
  1.8008 +
  1.8009 +// Multiply Register Long by small constant
  1.8010 +instruct mulL_eReg_con(eADXRegL dst, immL_127 src, eRegI tmp, eFlagsReg cr) %{
  1.8011 +  match(Set dst (MulL dst src));
  1.8012 +  effect(KILL cr, TEMP tmp);
  1.8013 +  ins_cost(2*100+2*400);
  1.8014 +  size(12);
  1.8015 +// Basic idea: lo(result) = lo(src * EAX)
  1.8016 +//             hi(result) = hi(src * EAX) + lo(src * EDX)
  1.8017 +  format %{ "IMUL   $tmp,EDX,$src\n\t"
  1.8018 +            "MOV    EDX,$src\n\t"
  1.8019 +            "MUL    EDX\t# EDX*EAX -> EDX:EAX\n\t"
  1.8020 +            "ADD    EDX,$tmp" %}
  1.8021 +  ins_encode( long_multiply_con( dst, src, tmp ) );
  1.8022 +  ins_pipe( pipe_slow );
  1.8023 +%}
  1.8024 +
  1.8025 +// Integer DIV with Register
  1.8026 +instruct divI_eReg(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{
  1.8027 +  match(Set rax (DivI rax div));
  1.8028 +  effect(KILL rdx, KILL cr);
  1.8029 +  size(26);
  1.8030 +  ins_cost(30*100+10*100);
  1.8031 +  format %{ "CMP    EAX,0x80000000\n\t"
  1.8032 +            "JNE,s  normal\n\t"
  1.8033 +            "XOR    EDX,EDX\n\t"
  1.8034 +            "CMP    ECX,-1\n\t"
  1.8035 +            "JE,s   done\n"
  1.8036 +    "normal: CDQ\n\t"
  1.8037 +            "IDIV   $div\n\t"
  1.8038 +    "done:"        %}
  1.8039 +  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  1.8040 +  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  1.8041 +  ins_pipe( ialu_reg_reg_alu0 );
  1.8042 +%}
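         +
         +// Sketch of the guard encoded above (helper name is hypothetical): IDIV of
         +// 0x80000000 by -1 overflows and would fault, so that one case is answered
         +// without dividing.  EAX keeps min_int as the quotient and EDX is zeroed so the
         +// remainder is 0, which is what Java requires for Integer.MIN_VALUE / -1.
         +//
         +//   int32_t java_idiv(int32_t a, int32_t b) {     // zero divisors handled elsewhere
         +//     if (a == INT32_MIN && b == -1) return a;    // skip IDIV, EDX := 0
         +//     return a / b;                               // CDQ ; IDIV b
         +//   }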
  1.8043 +
  1.8044 +// Divide Register Long
  1.8045 +instruct divL_eReg( eADXRegL dst, eRegL src1, eRegL src2, eFlagsReg cr, eCXRegI cx, eBXRegI bx ) %{
  1.8046 +  match(Set dst (DivL src1 src2));
  1.8047 +  effect( KILL cr, KILL cx, KILL bx );
  1.8048 +  ins_cost(10000);
  1.8049 +  format %{ "PUSH   $src1.hi\n\t"
  1.8050 +            "PUSH   $src1.lo\n\t"
  1.8051 +            "PUSH   $src2.hi\n\t"
  1.8052 +            "PUSH   $src2.lo\n\t"
  1.8053 +            "CALL   SharedRuntime::ldiv\n\t"
  1.8054 +            "ADD    ESP,16" %}
  1.8055 +  ins_encode( long_div(src1,src2) );
  1.8056 +  ins_pipe( pipe_slow );
  1.8057 +%}
  1.8058 +
  1.8059 +// Integer DIVMOD with Register, both quotient and mod results
  1.8060 +instruct divModI_eReg_divmod(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{
  1.8061 +  match(DivModI rax div);
  1.8062 +  effect(KILL cr);
  1.8063 +  size(26);
  1.8064 +  ins_cost(30*100+10*100);
  1.8065 +  format %{ "CMP    EAX,0x80000000\n\t"
  1.8066 +            "JNE,s  normal\n\t"
  1.8067 +            "XOR    EDX,EDX\n\t"
  1.8068 +            "CMP    ECX,-1\n\t"
  1.8069 +            "JE,s   done\n"
  1.8070 +    "normal: CDQ\n\t"
  1.8071 +            "IDIV   $div\n\t"
  1.8072 +    "done:"        %}
  1.8073 +  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  1.8074 +  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  1.8075 +  ins_pipe( pipe_slow );
  1.8076 +%}
  1.8077 +
  1.8078 +// Integer MOD with Register
  1.8079 +instruct modI_eReg(eDXRegI rdx, eAXRegI rax, eCXRegI div, eFlagsReg cr) %{
  1.8080 +  match(Set rdx (ModI rax div));
  1.8081 +  effect(KILL rax, KILL cr);
  1.8082 +
  1.8083 +  size(26);
  1.8084 +  ins_cost(300);
  1.8085 +  format %{ "CDQ\n\t"
  1.8086 +            "IDIV   $div" %}
  1.8087 +  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  1.8088 +  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  1.8089 +  ins_pipe( ialu_reg_reg_alu0 );
  1.8090 +%}
  1.8091 +
  1.8092 +// Remainder Register Long
  1.8093 +instruct modL_eReg( eADXRegL dst, eRegL src1, eRegL src2, eFlagsReg cr, eCXRegI cx, eBXRegI bx ) %{
  1.8094 +  match(Set dst (ModL src1 src2));
  1.8095 +  effect( KILL cr, KILL cx, KILL bx );
  1.8096 +  ins_cost(10000);
  1.8097 +  format %{ "PUSH   $src1.hi\n\t"
  1.8098 +            "PUSH   $src1.lo\n\t"
  1.8099 +            "PUSH   $src2.hi\n\t"
  1.8100 +            "PUSH   $src2.lo\n\t"
  1.8101 +            "CALL   SharedRuntime::lrem\n\t"
  1.8102 +            "ADD    ESP,16" %}
  1.8103 +  ins_encode( long_mod(src1,src2) );
  1.8104 +  ins_pipe( pipe_slow );
  1.8105 +%}
  1.8106 +
  1.8107 +// Integer Shift Instructions
  1.8108 +// Shift Left by one
  1.8109 +instruct shlI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  1.8110 +  match(Set dst (LShiftI dst shift));
  1.8111 +  effect(KILL cr);
  1.8112 +
  1.8113 +  size(2);
  1.8114 +  format %{ "SHL    $dst,$shift" %}
  1.8115 +  opcode(0xD1, 0x4);  /* D1 /4 */
  1.8116 +  ins_encode( OpcP, RegOpc( dst ) );
  1.8117 +  ins_pipe( ialu_reg );
  1.8118 +%}
  1.8119 +
  1.8120 +// Shift Left by 8-bit immediate
  1.8121 +instruct salI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
  1.8122 +  match(Set dst (LShiftI dst shift));
  1.8123 +  effect(KILL cr);
  1.8124 +
  1.8125 +  size(3);
  1.8126 +  format %{ "SHL    $dst,$shift" %}
  1.8127 +  opcode(0xC1, 0x4);  /* C1 /4 ib */
  1.8128 +  ins_encode( RegOpcImm( dst, shift) );
  1.8129 +  ins_pipe( ialu_reg );
  1.8130 +%}
  1.8131 +
  1.8132 +// Shift Left by variable
  1.8133 +instruct salI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
  1.8134 +  match(Set dst (LShiftI dst shift));
  1.8135 +  effect(KILL cr);
  1.8136 +
  1.8137 +  size(2);
  1.8138 +  format %{ "SHL    $dst,$shift" %}
  1.8139 +  opcode(0xD3, 0x4);  /* D3 /4 */
  1.8140 +  ins_encode( OpcP, RegOpc( dst ) );
  1.8141 +  ins_pipe( ialu_reg_reg );
  1.8142 +%}
  1.8143 +
  1.8144 +// Arithmetic shift right by one
  1.8145 +instruct sarI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  1.8146 +  match(Set dst (RShiftI dst shift));
  1.8147 +  effect(KILL cr);
  1.8148 +
  1.8149 +  size(2);
  1.8150 +  format %{ "SAR    $dst,$shift" %}
  1.8151 +  opcode(0xD1, 0x7);  /* D1 /7 */
  1.8152 +  ins_encode( OpcP, RegOpc( dst ) );
  1.8153 +  ins_pipe( ialu_reg );
  1.8154 +%}
  1.8155 +
  1.8156 +// Arithmetic shift right a memory operand by one
  1.8157 +instruct sarI_mem_1(memory dst, immI1 shift, eFlagsReg cr) %{
  1.8158 +  match(Set dst (StoreI dst (RShiftI (LoadI dst) shift)));
  1.8159 +  effect(KILL cr);
  1.8160 +  format %{ "SAR    $dst,$shift" %}
  1.8161 +  opcode(0xD1, 0x7);  /* D1 /7 */
  1.8162 +  ins_encode( OpcP, RMopc_Mem(secondary,dst) );
  1.8163 +  ins_pipe( ialu_mem_imm );
  1.8164 +%}
  1.8165 +
  1.8166 +// Arithmetic Shift Right by 8-bit immediate
  1.8167 +instruct sarI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
  1.8168 +  match(Set dst (RShiftI dst shift));
  1.8169 +  effect(KILL cr);
  1.8170 +
  1.8171 +  size(3);
  1.8172 +  format %{ "SAR    $dst,$shift" %}
  1.8173 +  opcode(0xC1, 0x7);  /* C1 /7 ib */
  1.8174 +  ins_encode( RegOpcImm( dst, shift ) );
  1.8175 +  ins_pipe( ialu_mem_imm );
  1.8176 +%}
  1.8177 +
  1.8178 +// Arithmetic Shift Right by 8-bit immediate
  1.8179 +instruct sarI_mem_imm(memory dst, immI8 shift, eFlagsReg cr) %{
  1.8180 +  match(Set dst (StoreI dst (RShiftI (LoadI dst) shift)));
  1.8181 +  effect(KILL cr);
  1.8182 +
  1.8183 +  format %{ "SAR    $dst,$shift" %}
  1.8184 +  opcode(0xC1, 0x7);  /* C1 /7 ib */
  1.8185 +  ins_encode( OpcP, RMopc_Mem(secondary, dst ), Con8or32( shift ) );
  1.8186 +  ins_pipe( ialu_mem_imm );
  1.8187 +%}
  1.8188 +
  1.8189 +// Arithmetic Shift Right by variable
  1.8190 +instruct sarI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
  1.8191 +  match(Set dst (RShiftI dst shift));
  1.8192 +  effect(KILL cr);
  1.8193 +
  1.8194 +  size(2);
  1.8195 +  format %{ "SAR    $dst,$shift" %}
  1.8196 +  opcode(0xD3, 0x7);  /* D3 /7 */
  1.8197 +  ins_encode( OpcP, RegOpc( dst ) );
  1.8198 +  ins_pipe( ialu_reg_reg );
  1.8199 +%}
  1.8200 +
  1.8201 +// Logical shift right by one
  1.8202 +instruct shrI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  1.8203 +  match(Set dst (URShiftI dst shift));
  1.8204 +  effect(KILL cr);
  1.8205 +
  1.8206 +  size(2);
  1.8207 +  format %{ "SHR    $dst,$shift" %}
  1.8208 +  opcode(0xD1, 0x5);  /* D1 /5 */
  1.8209 +  ins_encode( OpcP, RegOpc( dst ) );
  1.8210 +  ins_pipe( ialu_reg );
  1.8211 +%}
  1.8212 +
  1.8213 +// Logical Shift Right by 8-bit immediate
  1.8214 +instruct shrI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
  1.8215 +  match(Set dst (URShiftI dst shift));
  1.8216 +  effect(KILL cr);
  1.8217 +
  1.8218 +  size(3);
  1.8219 +  format %{ "SHR    $dst,$shift" %}
  1.8220 +  opcode(0xC1, 0x5);  /* C1 /5 ib */
  1.8221 +  ins_encode( RegOpcImm( dst, shift) );
  1.8222 +  ins_pipe( ialu_reg );
  1.8223 +%}
  1.8224 +
  1.8225 +// Shift Left by 24, followed by Arithmetic Shift Right by 24.
  1.8226 +// This idiom is used by the compiler for the i2b bytecode.
  1.8227 +instruct i2b(eRegI dst, xRegI src, immI_24 twentyfour, eFlagsReg cr) %{
  1.8228 +  match(Set dst (RShiftI (LShiftI src twentyfour) twentyfour));
  1.8229 +  effect(KILL cr);
  1.8230 +
  1.8231 +  size(3);
  1.8232 +  format %{ "MOVSX  $dst,$src :8" %}
  1.8233 +  opcode(0xBE, 0x0F);
  1.8234 +  ins_encode( OpcS, OpcP, RegReg( dst, src));
  1.8235 +  ins_pipe( ialu_reg_reg );
  1.8236 +%}
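         +
         +// The shift pair matched above is simply sign extension of the low byte, which is
         +// why a single MOVSX replaces both shifts.  As an identity on 32-bit
         +// two's-complement ints:
         +//
         +//   (x << 24) >> 24  ==  (int32_t)(int8_t)x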
  1.8237 +
  1.8238 +// Shift Left by 16, followed by Arithmetic Shift Right by 16.
  1.8239 +// This idiom is used by the compiler for the i2s bytecode.
  1.8240 +instruct i2s(eRegI dst, xRegI src, immI_16 sixteen, eFlagsReg cr) %{
  1.8241 +  match(Set dst (RShiftI (LShiftI src sixteen) sixteen));
  1.8242 +  effect(KILL cr);
  1.8243 +
  1.8244 +  size(3);
  1.8245 +  format %{ "MOVSX  $dst,$src :16" %}
  1.8246 +  opcode(0xBF, 0x0F);
  1.8247 +  ins_encode( OpcS, OpcP, RegReg( dst, src));
  1.8248 +  ins_pipe( ialu_reg_reg );
  1.8249 +%}
  1.8250 +
  1.8251 +
  1.8252 +// Logical Shift Right by variable
  1.8253 +instruct shrI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
  1.8254 +  match(Set dst (URShiftI dst shift));
  1.8255 +  effect(KILL cr);
  1.8256 +
  1.8257 +  size(2);
  1.8258 +  format %{ "SHR    $dst,$shift" %}
  1.8259 +  opcode(0xD3, 0x5);  /* D3 /5 */
  1.8260 +  ins_encode( OpcP, RegOpc( dst ) );
  1.8261 +  ins_pipe( ialu_reg_reg );
  1.8262 +%}
  1.8263 +
  1.8264 +
  1.8265 +//----------Logical Instructions-----------------------------------------------
  1.8266 +//----------Integer Logical Instructions---------------------------------------
  1.8267 +// And Instructions
  1.8268 +// And Register with Register
  1.8269 +instruct andI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  1.8270 +  match(Set dst (AndI dst src));
  1.8271 +  effect(KILL cr);
  1.8272 +
  1.8273 +  size(2);
  1.8274 +  format %{ "AND    $dst,$src" %}
  1.8275 +  opcode(0x23);
  1.8276 +  ins_encode( OpcP, RegReg( dst, src) );
  1.8277 +  ins_pipe( ialu_reg_reg );
  1.8278 +%}
  1.8279 +
  1.8280 +// And Register with Immediate
  1.8281 +instruct andI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  1.8282 +  match(Set dst (AndI dst src));
  1.8283 +  effect(KILL cr);
  1.8284 +
  1.8285 +  format %{ "AND    $dst,$src" %}
  1.8286 +  opcode(0x81,0x04);  /* Opcode 81 /4 */
  1.8287 +  // ins_encode( RegImm( dst, src) );
  1.8288 +  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  1.8289 +  ins_pipe( ialu_reg );
  1.8290 +%}
  1.8291 +
  1.8292 +// And Register with Memory
  1.8293 +instruct andI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  1.8294 +  match(Set dst (AndI dst (LoadI src)));
  1.8295 +  effect(KILL cr);
  1.8296 +
  1.8297 +  ins_cost(125);
  1.8298 +  format %{ "AND    $dst,$src" %}
  1.8299 +  opcode(0x23);
  1.8300 +  ins_encode( OpcP, RegMem( dst, src) );
  1.8301 +  ins_pipe( ialu_reg_mem );
  1.8302 +%}
  1.8303 +
  1.8304 +// And Memory with Register
  1.8305 +instruct andI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  1.8306 +  match(Set dst (StoreI dst (AndI (LoadI dst) src)));
  1.8307 +  effect(KILL cr);
  1.8308 +
  1.8309 +  ins_cost(150);
  1.8310 +  format %{ "AND    $dst,$src" %}
  1.8311 +  opcode(0x21);  /* Opcode 21 /r */
  1.8312 +  ins_encode( OpcP, RegMem( src, dst ) );
  1.8313 +  ins_pipe( ialu_mem_reg );
  1.8314 +%}
  1.8315 +
  1.8316 +// And Memory with Immediate
  1.8317 +instruct andI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  1.8318 +  match(Set dst (StoreI dst (AndI (LoadI dst) src)));
  1.8319 +  effect(KILL cr);
  1.8320 +
  1.8321 +  ins_cost(125);
  1.8322 +  format %{ "AND    $dst,$src" %}
  1.8323 +  opcode(0x81, 0x4);  /* Opcode 81 /4 id */
  1.8324 +  // ins_encode( MemImm( dst, src) );
  1.8325 +  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  1.8326 +  ins_pipe( ialu_mem_imm );
  1.8327 +%}
  1.8328 +
  1.8329 +// Or Instructions
  1.8330 +// Or Register with Register
  1.8331 +instruct orI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  1.8332 +  match(Set dst (OrI dst src));
  1.8333 +  effect(KILL cr);
  1.8334 +
  1.8335 +  size(2);
  1.8336 +  format %{ "OR     $dst,$src" %}
  1.8337 +  opcode(0x0B);
  1.8338 +  ins_encode( OpcP, RegReg( dst, src) );
  1.8339 +  ins_pipe( ialu_reg_reg );
  1.8340 +%}
  1.8341 +
  1.8342 +// Or Register with Immediate
  1.8343 +instruct orI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  1.8344 +  match(Set dst (OrI dst src));
  1.8345 +  effect(KILL cr);
  1.8346 +
  1.8347 +  format %{ "OR     $dst,$src" %}
  1.8348 +  opcode(0x81,0x01);  /* Opcode 81 /1 id */
  1.8349 +  // ins_encode( RegImm( dst, src) );
  1.8350 +  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  1.8351 +  ins_pipe( ialu_reg );
  1.8352 +%}
  1.8353 +
  1.8354 +// Or Register with Memory
  1.8355 +instruct orI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  1.8356 +  match(Set dst (OrI dst (LoadI src)));
  1.8357 +  effect(KILL cr);
  1.8358 +
  1.8359 +  ins_cost(125);
  1.8360 +  format %{ "OR     $dst,$src" %}
  1.8361 +  opcode(0x0B);
  1.8362 +  ins_encode( OpcP, RegMem( dst, src) );
  1.8363 +  ins_pipe( ialu_reg_mem );
  1.8364 +%}
  1.8365 +
  1.8366 +// Or Memory with Register
  1.8367 +instruct orI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  1.8368 +  match(Set dst (StoreI dst (OrI (LoadI dst) src)));
  1.8369 +  effect(KILL cr);
  1.8370 +
  1.8371 +  ins_cost(150);
  1.8372 +  format %{ "OR     $dst,$src" %}
  1.8373 +  opcode(0x09);  /* Opcode 09 /r */
  1.8374 +  ins_encode( OpcP, RegMem( src, dst ) );
  1.8375 +  ins_pipe( ialu_mem_reg );
  1.8376 +%}
  1.8377 +
  1.8378 +// Or Memory with Immediate
  1.8379 +instruct orI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  1.8380 +  match(Set dst (StoreI dst (OrI (LoadI dst) src)));
  1.8381 +  effect(KILL cr);
  1.8382 +
  1.8383 +  ins_cost(125);
  1.8384 +  format %{ "OR     $dst,$src" %}
  1.8385 +  opcode(0x81,0x1);  /* Opcode 81 /1 id */
  1.8386 +  // ins_encode( MemImm( dst, src) );
  1.8387 +  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  1.8388 +  ins_pipe( ialu_mem_imm );
  1.8389 +%}
  1.8390 +
  1.8391 +// ROL/ROR
  1.8392 +// ROL expand
  1.8393 +instruct rolI_eReg_imm1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  1.8394 +  effect(USE_DEF dst, USE shift, KILL cr);
  1.8395 +
  1.8396 +  format %{ "ROL    $dst, $shift" %}
  1.8397 +  opcode(0xD1, 0x0); /* Opcode D1 /0 */
  1.8398 +  ins_encode( OpcP, RegOpc( dst ));
  1.8399 +  ins_pipe( ialu_reg );
  1.8400 +%}
  1.8401 +
  1.8402 +instruct rolI_eReg_imm8(eRegI dst, immI8 shift, eFlagsReg cr) %{
  1.8403 +  effect(USE_DEF dst, USE shift, KILL cr);
  1.8404 +
  1.8405 +  format %{ "ROL    $dst, $shift" %}
  1.8406 +  opcode(0xC1, 0x0); /* Opcode C1 /0 ib */
  1.8407 +  ins_encode( RegOpcImm(dst, shift) );
  1.8408 +  ins_pipe(ialu_reg);
  1.8409 +%}
  1.8410 +
  1.8411 +instruct rolI_eReg_CL(ncxRegI dst, eCXRegI shift, eFlagsReg cr) %{
  1.8412 +  effect(USE_DEF dst, USE shift, KILL cr);
  1.8413 +
  1.8414 +  format %{ "ROL    $dst, $shift" %}
  1.8415 +  opcode(0xD3, 0x0);    /* Opcode D3 /0 */
  1.8416 +  ins_encode(OpcP, RegOpc(dst));
  1.8417 +  ins_pipe( ialu_reg_reg );
  1.8418 +%}
  1.8419 +// end of ROL expand
  1.8420 +
  1.8421 +// ROL 32bit by one once
  1.8422 +instruct rolI_eReg_i1(eRegI dst, immI1 lshift, immI_M1 rshift, eFlagsReg cr) %{
  1.8423 +  match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));
  1.8424 +
  1.8425 +  expand %{
  1.8426 +    rolI_eReg_imm1(dst, lshift, cr);
  1.8427 +  %}
  1.8428 +%}
  1.8429 +
  1.8430 +// ROL 32bit var by imm8 once
  1.8431 +instruct rolI_eReg_i8(eRegI dst, immI8 lshift, immI8 rshift, eFlagsReg cr) %{
  1.8432 +  predicate(  0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 0x1f));
  1.8433 +  match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));
  1.8434 +
  1.8435 +  expand %{
  1.8436 +    rolI_eReg_imm8(dst, lshift, cr);
  1.8437 +  %}
  1.8438 +%}
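         +
         +// Sketch of the identity behind the rotate matches in this block: an OR of a left
         +// shift by k and an unsigned right shift by 32-k is a rotate, which is why the
         +// predicate above requires the two shift counts to sum to 0 mod 32.  The "Var"
         +// forms below rely on the hardware masking 32-bit shift counts to 5 bits, so
         +// shifting right by (0 - k) or (32 - k) is the same operation.
         +//
         +//   uint32_t rotl32(uint32_t x, unsigned k) {   // 0 < k < 32
         +//     return (x << k) | (x >> (32 - k));        // matched as OrI(LShiftI, URShiftI)
         +//   }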
  1.8439 +
  1.8440 +// ROL 32bit var by var once
  1.8441 +instruct rolI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI0 zero, eFlagsReg cr) %{
  1.8442 +  match(Set dst ( OrI (LShiftI dst shift) (URShiftI dst (SubI zero shift))));
  1.8443 +
  1.8444 +  expand %{
  1.8445 +    rolI_eReg_CL(dst, shift, cr);
  1.8446 +  %}
  1.8447 +%}
  1.8448 +
  1.8449 +// ROL 32bit var by var once
  1.8450 +instruct rolI_eReg_Var_C32(ncxRegI dst, eCXRegI shift, immI_32 c32, eFlagsReg cr) %{
  1.8451 +  match(Set dst ( OrI (LShiftI dst shift) (URShiftI dst (SubI c32 shift))));
  1.8452 +
  1.8453 +  expand %{
  1.8454 +    rolI_eReg_CL(dst, shift, cr);
  1.8455 +  %}
  1.8456 +%}
  1.8457 +
  1.8458 +// ROR expand
  1.8459 +instruct rorI_eReg_imm1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  1.8460 +  effect(USE_DEF dst, USE shift, KILL cr);
  1.8461 +
  1.8462 +  format %{ "ROR    $dst, $shift" %}
  1.8463 +  opcode(0xD1,0x1);  /* Opcode D1 /1 */
  1.8464 +  ins_encode( OpcP, RegOpc( dst ) );
  1.8465 +  ins_pipe( ialu_reg );
  1.8466 +%}
  1.8467 +
  1.8468 +instruct rorI_eReg_imm8(eRegI dst, immI8 shift, eFlagsReg cr) %{
  1.8469 +  effect (USE_DEF dst, USE shift, KILL cr);
  1.8470 +
  1.8471 +  format %{ "ROR    $dst, $shift" %}
  1.8472 +  opcode(0xC1, 0x1); /* Opcode C1 /1 ib */
  1.8473 +  ins_encode( RegOpcImm(dst, shift) );
  1.8474 +  ins_pipe( ialu_reg );
  1.8475 +%}
  1.8476 +
  1.8477 +instruct rorI_eReg_CL(ncxRegI dst, eCXRegI shift, eFlagsReg cr)%{
  1.8478 +  effect(USE_DEF dst, USE shift, KILL cr);
  1.8479 +
  1.8480 +  format %{ "ROR    $dst, $shift" %}
  1.8481 +  opcode(0xD3, 0x1);    /* Opcode D3 /1 */
  1.8482 +  ins_encode(OpcP, RegOpc(dst));
  1.8483 +  ins_pipe( ialu_reg_reg );
  1.8484 +%}
  1.8485 +// end of ROR expand
  1.8486 +
  1.8487 +// ROR 32bit by one once
  1.8488 +instruct rorI_eReg_i1(eRegI dst, immI1 rshift, immI_M1 lshift, eFlagsReg cr) %{
  1.8489 +  match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));
  1.8490 +
  1.8491 +  expand %{
  1.8492 +    rorI_eReg_imm1(dst, rshift, cr);
  1.8493 +  %}
  1.8494 +%}
  1.8495 +
  1.8496 +// ROR 32bit by immI8 once
  1.8497 +instruct rorI_eReg_i8(eRegI dst, immI8 rshift, immI8 lshift, eFlagsReg cr) %{
  1.8498 +  predicate(  0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 0x1f));
  1.8499 +  match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));
  1.8500 +
  1.8501 +  expand %{
  1.8502 +    rorI_eReg_imm8(dst, rshift, cr);
  1.8503 +  %}
  1.8504 +%}
  1.8505 +
  1.8506 +// ROR 32bit var by var once
  1.8507 +instruct rorI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI0 zero, eFlagsReg cr) %{
  1.8508 +  match(Set dst ( OrI (URShiftI dst shift) (LShiftI dst (SubI zero shift))));
  1.8509 +
  1.8510 +  expand %{
  1.8511 +    rorI_eReg_CL(dst, shift, cr);
  1.8512 +  %}
  1.8513 +%}
  1.8514 +
  1.8515 +// ROR 32bit var by var once
  1.8516 +instruct rorI_eReg_Var_C32(ncxRegI dst, eCXRegI shift, immI_32 c32, eFlagsReg cr) %{
  1.8517 +  match(Set dst ( OrI (URShiftI dst shift) (LShiftI dst (SubI c32 shift))));
  1.8518 +
  1.8519 +  expand %{
  1.8520 +    rorI_eReg_CL(dst, shift, cr);
  1.8521 +  %}
  1.8522 +%}
  1.8523 +
  1.8524 +// Xor Instructions
  1.8525 +// Xor Register with Register
  1.8526 +instruct xorI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  1.8527 +  match(Set dst (XorI dst src));
  1.8528 +  effect(KILL cr);
  1.8529 +
  1.8530 +  size(2);
  1.8531 +  format %{ "XOR    $dst,$src" %}
  1.8532 +  opcode(0x33);
  1.8533 +  ins_encode( OpcP, RegReg( dst, src) );
  1.8534 +  ins_pipe( ialu_reg_reg );
  1.8535 +%}
  1.8536 +
  1.8537 +// Xor Register with Immediate
  1.8538 +instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  1.8539 +  match(Set dst (XorI dst src));
  1.8540 +  effect(KILL cr);
  1.8541 +
  1.8542 +  format %{ "XOR    $dst,$src" %}
  1.8543 +  opcode(0x81,0x06);  /* Opcode 81 /6 id */
  1.8544 +  // ins_encode( RegImm( dst, src) );
  1.8545 +  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  1.8546 +  ins_pipe( ialu_reg );
  1.8547 +%}
  1.8548 +
  1.8549 +// Xor Register with Memory
  1.8550 +instruct xorI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  1.8551 +  match(Set dst (XorI dst (LoadI src)));
  1.8552 +  effect(KILL cr);
  1.8553 +
  1.8554 +  ins_cost(125);
  1.8555 +  format %{ "XOR    $dst,$src" %}
  1.8556 +  opcode(0x33);
  1.8557 +  ins_encode( OpcP, RegMem(dst, src) );
  1.8558 +  ins_pipe( ialu_reg_mem );
  1.8559 +%}
  1.8560 +
  1.8561 +// Xor Memory with Register
  1.8562 +instruct xorI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  1.8563 +  match(Set dst (StoreI dst (XorI (LoadI dst) src)));
  1.8564 +  effect(KILL cr);
  1.8565 +
  1.8566 +  ins_cost(150);
  1.8567 +  format %{ "XOR    $dst,$src" %}
  1.8568 +  opcode(0x31);  /* Opcode 31 /r */
  1.8569 +  ins_encode( OpcP, RegMem( src, dst ) );
  1.8570 +  ins_pipe( ialu_mem_reg );
  1.8571 +%}
  1.8572 +
  1.8573 +// Xor Memory with Immediate
  1.8574 +instruct xorI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  1.8575 +  match(Set dst (StoreI dst (XorI (LoadI dst) src)));
  1.8576 +  effect(KILL cr);
  1.8577 +
  1.8578 +  ins_cost(125);
  1.8579 +  format %{ "XOR    $dst,$src" %}
  1.8580 +  opcode(0x81,0x6);  /* Opcode 81 /6 id */
  1.8581 +  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  1.8582 +  ins_pipe( ialu_mem_imm );
  1.8583 +%}
  1.8584 +
  1.8585 +//----------Convert Int to Boolean---------------------------------------------
  1.8586 +
  1.8587 +instruct movI_nocopy(eRegI dst, eRegI src) %{
  1.8588 +  effect( DEF dst, USE src );
  1.8589 +  format %{ "MOV    $dst,$src" %}
  1.8590 +  ins_encode( enc_Copy( dst, src) );
  1.8591 +  ins_pipe( ialu_reg_reg );
  1.8592 +%}
  1.8593 +
  1.8594 +instruct ci2b( eRegI dst, eRegI src, eFlagsReg cr ) %{
  1.8595 +  effect( USE_DEF dst, USE src, KILL cr );
  1.8596 +
  1.8597 +  size(4);
  1.8598 +  format %{ "NEG    $dst\n\t"
  1.8599 +            "ADC    $dst,$src" %}
  1.8600 +  ins_encode( neg_reg(dst),
  1.8601 +              OpcRegReg(0x13,dst,src) );
  1.8602 +  ins_pipe( ialu_reg_reg_long );
  1.8603 +%}
  1.8604 +
  1.8605 +instruct convI2B( eRegI dst, eRegI src, eFlagsReg cr ) %{
  1.8606 +  match(Set dst (Conv2B src));
  1.8607 +
  1.8608 +  expand %{
  1.8609 +    movI_nocopy(dst,src);
  1.8610 +    ci2b(dst,src,cr);
  1.8611 +  %}
  1.8612 +%}
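         +
         +// Sketch of the NEG/ADC trick expanded above: after "MOV dst,src ; NEG dst" the
         +// carry flag is set exactly when src is non-zero, so "ADC dst,src" computes
         +// (-src) + src + CF, i.e. the 0/1 boolean value of src.
         +//
         +//   int conv2b(int src) {                     // helper name is illustrative
         +//     unsigned d = 0u - (unsigned)src;        // NEG: CF = (src != 0)
         +//     d = d + (unsigned)src + (src != 0);     // ADC dst,src
         +//     return (int)d;                          // == (src != 0 ? 1 : 0)
         +//   }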
  1.8613 +
  1.8614 +instruct movP_nocopy(eRegI dst, eRegP src) %{
  1.8615 +  effect( DEF dst, USE src );
  1.8616 +  format %{ "MOV    $dst,$src" %}
  1.8617 +  ins_encode( enc_Copy( dst, src) );
  1.8618 +  ins_pipe( ialu_reg_reg );
  1.8619 +%}
  1.8620 +
  1.8621 +instruct cp2b( eRegI dst, eRegP src, eFlagsReg cr ) %{
  1.8622 +  effect( USE_DEF dst, USE src, KILL cr );
  1.8623 +  format %{ "NEG    $dst\n\t"
  1.8624 +            "ADC    $dst,$src" %}
  1.8625 +  ins_encode( neg_reg(dst),
  1.8626 +              OpcRegReg(0x13,dst,src) );
  1.8627 +  ins_pipe( ialu_reg_reg_long );
  1.8628 +%}
  1.8629 +
  1.8630 +instruct convP2B( eRegI dst, eRegP src, eFlagsReg cr ) %{
  1.8631 +  match(Set dst (Conv2B src));
  1.8632 +
  1.8633 +  expand %{
  1.8634 +    movP_nocopy(dst,src);
  1.8635 +    cp2b(dst,src,cr);
  1.8636 +  %}
  1.8637 +%}
  1.8638 +
  1.8639 +instruct cmpLTMask( eCXRegI dst, ncxRegI p, ncxRegI q, eFlagsReg cr ) %{
  1.8640 +  match(Set dst (CmpLTMask p q));
  1.8641 +  effect( KILL cr );
  1.8642 +  ins_cost(400);
  1.8643 +
  1.8644 +  // SETlt can only use the low byte of EAX, EBX, ECX, or EDX as its destination
  1.8645 +  format %{ "XOR    $dst,$dst\n\t"
  1.8646 +            "CMP    $p,$q\n\t"
  1.8647 +            "SETlt  $dst\n\t"
  1.8648 +            "NEG    $dst" %}
  1.8649 +  ins_encode( OpcRegReg(0x33,dst,dst),
  1.8650 +              OpcRegReg(0x3B,p,q),
  1.8651 +              setLT_reg(dst), neg_reg(dst) );
  1.8652 +  ins_pipe( pipe_slow );
  1.8653 +%}
  1.8654 +
  1.8655 +instruct cmpLTMask0( eRegI dst, immI0 zero, eFlagsReg cr ) %{
  1.8656 +  match(Set dst (CmpLTMask dst zero));
  1.8657 +  effect( DEF dst, KILL cr );
  1.8658 +  ins_cost(100);
  1.8659 +
  1.8660 +  format %{ "SAR    $dst,31" %}
  1.8661 +  opcode(0xC1, 0x7);  /* C1 /7 ib */
  1.8662 +  ins_encode( RegOpcImm( dst, 0x1F ) );
  1.8663 +  ins_pipe( ialu_reg );
  1.8664 +%}
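         +
         +// Sketch of the two CmpLTMask forms above: the node produces an all-ones or
         +// all-zero mask from a signed compare, and comparing against zero reduces to an
         +// arithmetic shift of the sign bit.
         +//
         +//   int cmpLTMask (int p, int q) { return (p < q) ? -1 : 0; }  // XOR/CMP/SETlt/NEG
         +//   int cmpLTMask0(int x)        { return x >> 31; }           // SAR x,31 (arithmetic)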
  1.8665 +
  1.8666 +
  1.8667 +instruct cadd_cmpLTMask( ncxRegI p, ncxRegI q, ncxRegI y, eCXRegI tmp, eFlagsReg cr ) %{
  1.8668 +  match(Set p (AddI (AndI (CmpLTMask p q) y) (SubI p q)));
  1.8669 +  effect( KILL tmp, KILL cr );
  1.8670 +  ins_cost(400);
  1.8671 +  // annoyingly, $tmp has no edges so you can't ask for it in
  1.8672 +  // any format or encoding
  1.8673 +  format %{ "SUB    $p,$q\n\t"
  1.8674 +            "SBB    ECX,ECX\n\t"
  1.8675 +            "AND    ECX,$y\n\t"
  1.8676 +            "ADD    $p,ECX" %}
  1.8677 +  ins_encode( enc_cmpLTP(p,q,y,tmp) );
  1.8678 +  ins_pipe( pipe_cmplt );
  1.8679 +%}
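         +
         +// The fused rule above matches AddI(AndI(CmpLTMask p q) y, SubI p q), i.e.
         +// "subtract q, then add y back only when p was (signed) less than q"; the
         +// SUB/SBB/AND/ADD sequence in the format string is how it is emitted here.
         +// A sketch of the matched semantics:
         +//
         +//   int cadd_cmpLTMask(int p, int q, int y) {
         +//     return (p - q) + ((p < q) ? y : 0);
         +//   }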
  1.8680 +
  1.8681 +/* If I enable this, I encourage spilling in the inner loop of compress.
  1.8682 +instruct cadd_cmpLTMask_mem( ncxRegI p, ncxRegI q, memory y, eCXRegI tmp, eFlagsReg cr ) %{
  1.8683 +  match(Set p (AddI (AndI (CmpLTMask p q) (LoadI y)) (SubI p q)));
  1.8684 +  effect( USE_KILL tmp, KILL cr );
  1.8685 +  ins_cost(400);
  1.8686 +
  1.8687 +  format %{ "SUB    $p,$q\n\t"
  1.8688 +            "SBB    ECX,ECX\n\t"
  1.8689 +            "AND    ECX,$y\n\t"
  1.8690 +            "ADD    $p,ECX" %}
  1.8691 +  ins_encode( enc_cmpLTP_mem(p,q,y,tmp) );
  1.8692 +%}
  1.8693 +*/
  1.8694 +
  1.8695 +//----------Long Instructions------------------------------------------------
  1.8696 +// Add Long Register with Register
  1.8697 +instruct addL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  1.8698 +  match(Set dst (AddL dst src));
  1.8699 +  effect(KILL cr);
  1.8700 +  ins_cost(200);
  1.8701 +  format %{ "ADD    $dst.lo,$src.lo\n\t"
  1.8702 +            "ADC    $dst.hi,$src.hi" %}
  1.8703 +  opcode(0x03, 0x13);
  1.8704 +  ins_encode( RegReg_Lo(dst, src), RegReg_Hi(dst,src) );
  1.8705 +  ins_pipe( ialu_reg_reg_long );
  1.8706 +%}
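         +
         +// The long (64-bit) ALU rules in this block keep a value as uint32_t lo/hi
         +// halves; a sketch of the carry chain the ADD/ADC pair implements (the SUB/SBB
         +// rules are the same idea with a borrow instead of a carry):
         +//
         +//   uint32_t lo = x_lo + y_lo;                        // ADD, sets CF on carry out
         +//   uint32_t hi = x_hi + y_hi + (lo < x_lo ? 1 : 0);  // ADC folds the carry in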
  1.8707 +
  1.8708 +// Add Long Register with Immediate
  1.8709 +instruct addL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  1.8710 +  match(Set dst (AddL dst src));
  1.8711 +  effect(KILL cr);
  1.8712 +  format %{ "ADD    $dst.lo,$src.lo\n\t"
  1.8713 +            "ADC    $dst.hi,$src.hi" %}
  1.8714 +  opcode(0x81,0x00,0x02);  /* Opcode 81 /0, 81 /2 */
  1.8715 +  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  1.8716 +  ins_pipe( ialu_reg_long );
  1.8717 +%}
  1.8718 +
  1.8719 +// Add Long Register with Memory
  1.8720 +instruct addL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  1.8721 +  match(Set dst (AddL dst (LoadL mem)));
  1.8722 +  effect(KILL cr);
  1.8723 +  ins_cost(125);
  1.8724 +  format %{ "ADD    $dst.lo,$mem\n\t"
  1.8725 +            "ADC    $dst.hi,$mem+4" %}
  1.8726 +  opcode(0x03, 0x13);
  1.8727 +  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  1.8728 +  ins_pipe( ialu_reg_long_mem );
  1.8729 +%}
  1.8730 +
  1.8731 +// Subtract Long Register with Register.
  1.8732 +instruct subL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  1.8733 +  match(Set dst (SubL dst src));
  1.8734 +  effect(KILL cr);
  1.8735 +  ins_cost(200);
  1.8736 +  format %{ "SUB    $dst.lo,$src.lo\n\t"
  1.8737 +            "SBB    $dst.hi,$src.hi" %}
  1.8738 +  opcode(0x2B, 0x1B);
  1.8739 +  ins_encode( RegReg_Lo(dst, src), RegReg_Hi(dst,src) );
  1.8740 +  ins_pipe( ialu_reg_reg_long );
  1.8741 +%}
  1.8742 +
  1.8743 +// Subtract Long Register with Immediate
  1.8744 +instruct subL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  1.8745 +  match(Set dst (SubL dst src));
  1.8746 +  effect(KILL cr);
  1.8747 +  format %{ "SUB    $dst.lo,$src.lo\n\t"
  1.8748 +            "SBB    $dst.hi,$src.hi" %}
  1.8749 +  opcode(0x81,0x05,0x03);  /* Opcode 81 /5, 81 /3 */
  1.8750 +  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  1.8751 +  ins_pipe( ialu_reg_long );
  1.8752 +%}
  1.8753 +
  1.8754 +// Subtract Long Register with Memory
  1.8755 +instruct subL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  1.8756 +  match(Set dst (SubL dst (LoadL mem)));
  1.8757 +  effect(KILL cr);
  1.8758 +  ins_cost(125);
  1.8759 +  format %{ "SUB    $dst.lo,$mem\n\t"
  1.8760 +            "SBB    $dst.hi,$mem+4" %}
  1.8761 +  opcode(0x2B, 0x1B);
  1.8762 +  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  1.8763 +  ins_pipe( ialu_reg_long_mem );
  1.8764 +%}
  1.8765 +
  1.8766 +instruct negL_eReg(eRegL dst, immL0 zero, eFlagsReg cr) %{
  1.8767 +  match(Set dst (SubL zero dst));
  1.8768 +  effect(KILL cr);
  1.8769 +  ins_cost(300);
  1.8770 +  format %{ "NEG    $dst.hi\n\tNEG    $dst.lo\n\tSBB    $dst.hi,0" %}
  1.8771 +  ins_encode( neg_long(dst) );
  1.8772 +  ins_pipe( ialu_reg_reg_long );
  1.8773 +%}
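         +
         +// Sketch of the two's-complement negate above, on the uint32_t (lo, hi) halves:
         +//
         +//   uint32_t new_lo = 0u - lo;                        // NEG $dst.lo: CF = (lo != 0)
         +//   uint32_t new_hi = (0u - hi) - (lo != 0 ? 1 : 0);  // NEG $dst.hi ; SBB $dst.hi,0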
  1.8774 +
  1.8775 +// And Long Register with Register
  1.8776 +instruct andL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  1.8777 +  match(Set dst (AndL dst src));
  1.8778 +  effect(KILL cr);
  1.8779 +  format %{ "AND    $dst.lo,$src.lo\n\t"
  1.8780 +            "AND    $dst.hi,$src.hi" %}
  1.8781 +  opcode(0x23,0x23);
  1.8782 +  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  1.8783 +  ins_pipe( ialu_reg_reg_long );
  1.8784 +%}
  1.8785 +
  1.8786 +// And Long Register with Immediate
  1.8787 +instruct andL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  1.8788 +  match(Set dst (AndL dst src));
  1.8789 +  effect(KILL cr);
  1.8790 +  format %{ "AND    $dst.lo,$src.lo\n\t"
  1.8791 +            "AND    $dst.hi,$src.hi" %}
  1.8792 +  opcode(0x81,0x04,0x04);  /* Opcode 81 /4, 81 /4 */
  1.8793 +  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  1.8794 +  ins_pipe( ialu_reg_long );
  1.8795 +%}
  1.8796 +
  1.8797 +// And Long Register with Memory
  1.8798 +instruct andL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  1.8799 +  match(Set dst (AndL dst (LoadL mem)));
  1.8800 +  effect(KILL cr);
  1.8801 +  ins_cost(125);
  1.8802 +  format %{ "AND    $dst.lo,$mem\n\t"
  1.8803 +            "AND    $dst.hi,$mem+4" %}
  1.8804 +  opcode(0x23, 0x23);
  1.8805 +  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  1.8806 +  ins_pipe( ialu_reg_long_mem );
  1.8807 +%}
  1.8808 +
  1.8809 +// Or Long Register with Register
  1.8810 +instruct orl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  1.8811 +  match(Set dst (OrL dst src));
  1.8812 +  effect(KILL cr);
  1.8813 +  format %{ "OR     $dst.lo,$src.lo\n\t"
  1.8814 +            "OR     $dst.hi,$src.hi" %}
  1.8815 +  opcode(0x0B,0x0B);
  1.8816 +  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  1.8817 +  ins_pipe( ialu_reg_reg_long );
  1.8818 +%}
  1.8819 +
  1.8820 +// Or Long Register with Immediate
  1.8821 +instruct orl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  1.8822 +  match(Set dst (OrL dst src));
  1.8823 +  effect(KILL cr);
  1.8824 +  format %{ "OR     $dst.lo,$src.lo\n\t"
  1.8825 +            "OR     $dst.hi,$src.hi" %}
  1.8826 +  opcode(0x81,0x01,0x01);  /* Opcode 81 /1, 81 /1 */
  1.8827 +  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  1.8828 +  ins_pipe( ialu_reg_long );
  1.8829 +%}
  1.8830 +
  1.8831 +// Or Long Register with Memory
  1.8832 +instruct orl_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  1.8833 +  match(Set dst (OrL dst (LoadL mem)));
  1.8834 +  effect(KILL cr);
  1.8835 +  ins_cost(125);
  1.8836 +  format %{ "OR     $dst.lo,$mem\n\t"
  1.8837 +            "OR     $dst.hi,$mem+4" %}
  1.8838 +  opcode(0x0B,0x0B);
  1.8839 +  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  1.8840 +  ins_pipe( ialu_reg_long_mem );
  1.8841 +%}
  1.8842 +
  1.8843 +// Xor Long Register with Register
  1.8844 +instruct xorl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  1.8845 +  match(Set dst (XorL dst src));
  1.8846 +  effect(KILL cr);
  1.8847 +  format %{ "XOR    $dst.lo,$src.lo\n\t"
  1.8848 +            "XOR    $dst.hi,$src.hi" %}
  1.8849 +  opcode(0x33,0x33);
  1.8850 +  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  1.8851 +  ins_pipe( ialu_reg_reg_long );
  1.8852 +%}
  1.8853 +
  1.8854 +// Xor Long Register with Immediate
  1.8855 +instruct xorl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  1.8856 +  match(Set dst (XorL dst src));
  1.8857 +  effect(KILL cr);
  1.8858 +  format %{ "XOR    $dst.lo,$src.lo\n\t"
  1.8859 +            "XOR    $dst.hi,$src.hi" %}
  1.8860 +  opcode(0x81,0x06,0x06);  /* Opcode 81 /6, 81 /6 */
  1.8861 +  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  1.8862 +  ins_pipe( ialu_reg_long );
  1.8863 +%}
  1.8864 +
  1.8865 +// Xor Long Register with Memory
  1.8866 +instruct xorl_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  1.8867 +  match(Set dst (XorL dst (LoadL mem)));
  1.8868 +  effect(KILL cr);
  1.8869 +  ins_cost(125);
  1.8870 +  format %{ "XOR    $dst.lo,$mem\n\t"
  1.8871 +            "XOR    $dst.hi,$mem+4" %}
  1.8872 +  opcode(0x33,0x33);
  1.8873 +  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  1.8874 +  ins_pipe( ialu_reg_long_mem );
  1.8875 +%}
  1.8876 +
  1.8877 +// Shift Left Long by 1-31
  1.8878 +instruct shlL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  1.8879 +  match(Set dst (LShiftL dst cnt));
  1.8880 +  effect(KILL cr);
  1.8881 +  ins_cost(200);
  1.8882 +  format %{ "SHLD   $dst.hi,$dst.lo,$cnt\n\t"
  1.8883 +            "SHL    $dst.lo,$cnt" %}
  1.8884 +  opcode(0xC1, 0x4, 0xA4);  /* 0F/A4, then C1 /4 ib */
  1.8885 +  ins_encode( move_long_small_shift(dst,cnt) );
  1.8886 +  ins_pipe( ialu_reg_long );
  1.8887 +%}
  1.8888 +
  1.8889 +// Shift Left Long by 32-63
  1.8890 +instruct shlL_eReg_32_63(eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  1.8891 +  match(Set dst (LShiftL dst cnt));
  1.8892 +  effect(KILL cr);
  1.8893 +  ins_cost(300);
  1.8894 +  format %{ "MOV    $dst.hi,$dst.lo\n"
  1.8895 +          "\tSHL    $dst.hi,$cnt-32\n"
  1.8896 +          "\tXOR    $dst.lo,$dst.lo" %}
  1.8897 +  opcode(0xC1, 0x4);  /* C1 /4 ib */
  1.8898 +  ins_encode( move_long_big_shift_clr(dst,cnt) );
  1.8899 +  ins_pipe( ialu_reg_long );
  1.8900 +%}
  1.8901 +
  1.8902 +// Shift Left Long by variable
  1.8903 +instruct salL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  1.8904 +  match(Set dst (LShiftL dst shift));
  1.8905 +  effect(KILL cr);
  1.8906 +  ins_cost(500+200);
  1.8907 +  size(17);
  1.8908 +  format %{ "TEST   $shift,32\n\t"
  1.8909 +            "JEQ,s  small\n\t"
  1.8910 +            "MOV    $dst.hi,$dst.lo\n\t"
  1.8911 +            "XOR    $dst.lo,$dst.lo\n"
  1.8912 +    "small:\tSHLD   $dst.hi,$dst.lo,$shift\n\t"
  1.8913 +            "SHL    $dst.lo,$shift" %}
  1.8914 +  ins_encode( shift_left_long( dst, shift ) );
  1.8915 +  ins_pipe( pipe_slow );
  1.8916 +%}
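         +
         +// Sketch of the variable 64-bit left shift above, on uint32_t lo/hi halves: the
         +// hardware masks 32-bit shift counts to 5 bits, so counts of 32..63 are handled
         +// explicitly by moving the low word into the high word and clearing the low word
         +// before the SHLD/SHL pair covers the remaining 0..31 bits.  The right-shift and
         +// arithmetic variants below follow the same scheme.  (The 'c != 0' guard only
         +// keeps the C well defined; SHLD/SHL with a zero count are no-ops.)
         +//
         +//   unsigned c = shift & 31;
         +//   if (shift & 32) { hi = lo; lo = 0; }              // TEST $shift,32 path
         +//   if (c != 0) {
         +//     hi = (hi << c) | (lo >> (32 - c));              // SHLD $dst.hi,$dst.lo,$shift
         +//     lo = lo << c;                                   // SHL  $dst.lo,$shift
         +//   }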
  1.8917 +
  1.8918 +// Shift Right Long by 1-31
  1.8919 +instruct shrL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  1.8920 +  match(Set dst (URShiftL dst cnt));
  1.8921 +  effect(KILL cr);
  1.8922 +  ins_cost(200);
  1.8923 +  format %{ "SHRD   $dst.lo,$dst.hi,$cnt\n\t"
  1.8924 +            "SHR    $dst.hi,$cnt" %}
  1.8925 +  opcode(0xC1, 0x5, 0xAC);  /* 0F/AC, then C1 /5 ib */
  1.8926 +  ins_encode( move_long_small_shift(dst,cnt) );
  1.8927 +  ins_pipe( ialu_reg_long );
  1.8928 +%}
  1.8929 +
  1.8930 +// Shift Right Long by 32-63
  1.8931 +instruct shrL_eReg_32_63(eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  1.8932 +  match(Set dst (URShiftL dst cnt));
  1.8933 +  effect(KILL cr);
  1.8934 +  ins_cost(300);
  1.8935 +  format %{ "MOV    $dst.lo,$dst.hi\n"
  1.8936 +          "\tSHR    $dst.lo,$cnt-32\n"
  1.8937 +          "\tXOR    $dst.hi,$dst.hi" %}
  1.8938 +  opcode(0xC1, 0x5);  /* C1 /5 ib */
  1.8939 +  ins_encode( move_long_big_shift_clr(dst,cnt) );
  1.8940 +  ins_pipe( ialu_reg_long );
  1.8941 +%}
  1.8942 +
  1.8943 +// Shift Right Long by variable
  1.8944 +instruct shrL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  1.8945 +  match(Set dst (URShiftL dst shift));
  1.8946 +  effect(KILL cr);
  1.8947 +  ins_cost(600);
  1.8948 +  size(17);
  1.8949 +  format %{ "TEST   $shift,32\n\t"
  1.8950 +            "JEQ,s  small\n\t"
  1.8951 +            "MOV    $dst.lo,$dst.hi\n\t"
  1.8952 +            "XOR    $dst.hi,$dst.hi\n"
  1.8953 +    "small:\tSHRD   $dst.lo,$dst.hi,$shift\n\t"
  1.8954 +            "SHR    $dst.hi,$shift" %}
  1.8955 +  ins_encode( shift_right_long( dst, shift ) );
  1.8956 +  ins_pipe( pipe_slow );
  1.8957 +%}
  1.8958 +
  1.8959 +// Shift Right Long by 1-31
  1.8960 +instruct sarL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  1.8961 +  match(Set dst (RShiftL dst cnt));
  1.8962 +  effect(KILL cr);
  1.8963 +  ins_cost(200);
  1.8964 +  format %{ "SHRD   $dst.lo,$dst.hi,$cnt\n\t"
  1.8965 +            "SAR    $dst.hi,$cnt" %}
  1.8966 +  opcode(0xC1, 0x7, 0xAC);  /* 0F/AC, then C1 /7 ib */
  1.8967 +  ins_encode( move_long_small_shift(dst,cnt) );
  1.8968 +  ins_pipe( ialu_reg_long );
  1.8969 +%}
  1.8970 +
  1.8971 +// Shift Right Long by 32-63
  1.8972 +instruct sarL_eReg_32_63( eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  1.8973 +  match(Set dst (RShiftL dst cnt));
  1.8974 +  effect(KILL cr);
  1.8975 +  ins_cost(300);
  1.8976 +  format %{ "MOV    $dst.lo,$dst.hi\n"
  1.8977 +          "\tSAR    $dst.lo,$cnt-32\n"
  1.8978 +          "\tSAR    $dst.hi,31" %}
  1.8979 +  opcode(0xC1, 0x7);  /* C1 /7 ib */
  1.8980 +  ins_encode( move_long_big_shift_sign(dst,cnt) );
  1.8981 +  ins_pipe( ialu_reg_long );
  1.8982 +%}
  1.8983 +
  1.8984 +// Shift Right arithmetic Long by variable
  1.8985 +instruct sarL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  1.8986 +  match(Set dst (RShiftL dst shift));
  1.8987 +  effect(KILL cr);
  1.8988 +  ins_cost(600);
  1.8989 +  size(18);
  1.8990 +  format %{ "TEST   $shift,32\n\t"
  1.8991 +            "JEQ,s  small\n\t"
  1.8992 +            "MOV    $dst.lo,$dst.hi\n\t"
  1.8993 +            "SAR    $dst.hi,31\n"
  1.8994 +    "small:\tSHRD   $dst.lo,$dst.hi,$shift\n\t"
  1.8995 +            "SAR    $dst.hi,$shift" %}
  1.8996 +  ins_encode( shift_right_arith_long( dst, shift ) );
  1.8997 +  ins_pipe( pipe_slow );
  1.8998 +%}
  1.8999 +
  1.9000 +
  1.9001 +//----------Double Instructions------------------------------------------------
  1.9002 +// Double Math
  1.9003 +
  1.9004 +// Compare & branch
  1.9005 +
  1.9006 +// P6 version of double compare, sets condition codes in EFLAGS
  1.9007 +instruct cmpD_cc_P6(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
  1.9008 +  predicate(VM_Version::supports_cmov() && UseSSE <=1);
  1.9009 +  match(Set cr (CmpD src1 src2));
  1.9010 +  effect(KILL rax);
  1.9011 +  ins_cost(150);
  1.9012 +  format %{ "FLD    $src1\n\t"
  1.9013 +            "FUCOMIP ST,$src2  // P6 instruction\n\t"
  1.9014 +            "JNP    exit\n\t"
  1.9015 +            "MOV    ah,1       // saw a NaN, set CF\n\t"
  1.9016 +            "SAHF\n"
  1.9017 +     "exit:\tNOP               // avoid branch to branch" %}
  1.9018 +  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  1.9019 +  ins_encode( Push_Reg_D(src1),
  1.9020 +              OpcP, RegOpc(src2),
  1.9021 +              cmpF_P6_fixup );
  1.9022 +  ins_pipe( pipe_slow );
  1.9023 +%}
  1.9024 +
  1.9025 +// Compare & branch
  1.9026 +instruct cmpD_cc(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
  1.9027 +  predicate(UseSSE<=1);
  1.9028 +  match(Set cr (CmpD src1 src2));
  1.9029 +  effect(KILL rax);
  1.9030 +  ins_cost(200);
  1.9031 +  format %{ "FLD    $src1\n\t"
  1.9032 +            "FCOMp  $src2\n\t"
  1.9033 +            "FNSTSW AX\n\t"
  1.9034 +            "TEST   AX,0x400\n\t"
  1.9035 +            "JZ,s   flags\n\t"
  1.9036 +            "MOV    AH,1\t# unordered treat as LT\n"
  1.9037 +    "flags:\tSAHF" %}
  1.9038 +  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  1.9039 +  ins_encode( Push_Reg_D(src1),
  1.9040 +              OpcP, RegOpc(src2),
  1.9041 +              fpu_flags);
  1.9042 +  ins_pipe( pipe_slow );
  1.9043 +%}
  1.9044 +
  1.9045 +// Compare vs zero into -1,0,1
  1.9046 +instruct cmpD_0(eRegI dst, regD src1, immD0 zero, eAXRegI rax, eFlagsReg cr) %{
  1.9047 +  predicate(UseSSE<=1);
  1.9048 +  match(Set dst (CmpD3 src1 zero));
  1.9049 +  effect(KILL cr, KILL rax);
  1.9050 +  ins_cost(280);
  1.9051 +  format %{ "FTSTD  $dst,$src1" %}
  1.9052 +  opcode(0xE4, 0xD9);
  1.9053 +  ins_encode( Push_Reg_D(src1),
  1.9054 +              OpcS, OpcP, PopFPU,
  1.9055 +              CmpF_Result(dst));
  1.9056 +  ins_pipe( pipe_slow );
  1.9057 +%}
  1.9058 +
  1.9059 +// Compare into -1,0,1
  1.9060 +instruct cmpD_reg(eRegI dst, regD src1, regD src2, eAXRegI rax, eFlagsReg cr) %{
  1.9061 +  predicate(UseSSE<=1);
  1.9062 +  match(Set dst (CmpD3 src1 src2));
  1.9063 +  effect(KILL cr, KILL rax);
  1.9064 +  ins_cost(300);
  1.9065 +  format %{ "FCMPD  $dst,$src1,$src2" %}
  1.9066 +  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  1.9067 +  ins_encode( Push_Reg_D(src1),
  1.9068 +              OpcP, RegOpc(src2),
  1.9069 +              CmpF_Result(dst));
  1.9070 +  ins_pipe( pipe_slow );
  1.9071 +%}
  1.9072 +
  1.9073 +// double compare and set condition codes in EFLAGS by XMM regs
  1.9074 +instruct cmpXD_cc(eFlagsRegU cr, regXD dst, regXD src, eAXRegI rax) %{
  1.9075 +  predicate(UseSSE>=2);
  1.9076 +  match(Set cr (CmpD dst src));
  1.9077 +  effect(KILL rax);
  1.9078 +  ins_cost(125);
  1.9079 +  format %{ "COMISD $dst,$src\n"
  1.9080 +          "\tJNP    exit\n"
  1.9081 +          "\tMOV    ah,1       // saw a NaN, set CF\n"
  1.9082 +          "\tSAHF\n"
  1.9083 +     "exit:\tNOP               // avoid branch to branch" %}
  1.9084 +  opcode(0x66, 0x0F, 0x2F);
  1.9085 +  ins_encode(OpcP, OpcS, Opcode(tertiary), RegReg(dst, src), cmpF_P6_fixup);
  1.9086 +  ins_pipe( pipe_slow );
  1.9087 +%}
  1.9088 +
  1.9089 +// double compare and set condition codes in EFLAGS by XMM regs
  1.9090 +instruct cmpXD_ccmem(eFlagsRegU cr, regXD dst, memory src, eAXRegI rax) %{
  1.9091 +  predicate(UseSSE>=2);
  1.9092 +  match(Set cr (CmpD dst (LoadD src)));
  1.9093 +  effect(KILL rax);
  1.9094 +  ins_cost(145);
  1.9095 +  format %{ "COMISD $dst,$src\n"
  1.9096 +          "\tJNP    exit\n"
  1.9097 +          "\tMOV    ah,1       // saw a NaN, set CF\n"
  1.9098 +          "\tSAHF\n"
  1.9099 +     "exit:\tNOP               // avoid branch to branch" %}
  1.9100 +  opcode(0x66, 0x0F, 0x2F);
  1.9101 +  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(dst, src), cmpF_P6_fixup);
  1.9102 +  ins_pipe( pipe_slow );
  1.9103 +%}
  1.9104 +
  1.9105 +// Compare into -1,0,1 in XMM
  1.9106 +instruct cmpXD_reg(eRegI dst, regXD src1, regXD src2, eFlagsReg cr) %{
  1.9107 +  predicate(UseSSE>=2);
  1.9108 +  match(Set dst (CmpD3 src1 src2));
  1.9109 +  effect(KILL cr);
  1.9110 +  ins_cost(255);
  1.9111 +  format %{ "XOR    $dst,$dst\n"
  1.9112 +          "\tCOMISD $src1,$src2\n"
  1.9113 +          "\tJP,s   nan\n"
  1.9114 +          "\tJEQ,s  exit\n"
  1.9115 +          "\tJA,s   inc\n"
  1.9116 +      "nan:\tDEC    $dst\n"
  1.9117 +          "\tJMP,s  exit\n"
  1.9118 +      "inc:\tINC    $dst\n"
  1.9119 +      "exit:"
  1.9120 +                %}
  1.9121 +  opcode(0x66, 0x0F, 0x2F);
  1.9122 +  ins_encode(Xor_Reg(dst), OpcP, OpcS, Opcode(tertiary), RegReg(src1, src2),
  1.9123 +             CmpX_Result(dst));
  1.9124 +  ins_pipe( pipe_slow );
  1.9125 +%}
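         +
         +// Sketch of the value the CmpD3 rules above materialise, matching the JP/JEQ/JA
         +// ladder in the format string (an unordered compare, i.e. a NaN input, lands in
         +// the -1 case):
         +//
         +//   int cmpD3(double a, double b) {    // helper name is illustrative
         +//     if (a > b)  return  1;           // JA  -> INC
         +//     if (a == b) return  0;           // JEQ -> dst stays 0
         +//     return -1;                       // below or NaN (JP) -> DEC
         +//   }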
  1.9126 +
  1.9127 +// Compare into -1,0,1 in XMM and memory
  1.9128 +instruct cmpXD_regmem(eRegI dst, regXD src1, memory mem, eFlagsReg cr) %{
  1.9129 +  predicate(UseSSE>=2);
  1.9130 +  match(Set dst (CmpD3 src1 (LoadD mem)));
  1.9131 +  effect(KILL cr);
  1.9132 +  ins_cost(275);
  1.9133 +  format %{ "COMISD $src1,$mem\n"
  1.9134 +          "\tMOV    $dst,0\t\t# do not blow flags\n"
  1.9135 +          "\tJP,s   nan\n"
  1.9136 +          "\tJEQ,s  exit\n"
  1.9137 +          "\tJA,s   inc\n"
  1.9138 +      "nan:\tDEC    $dst\n"
  1.9139 +          "\tJMP,s  exit\n"
  1.9140 +      "inc:\tINC    $dst\n"
  1.9141 +      "exit:"
  1.9142 +                %}
  1.9143 +  opcode(0x66, 0x0F, 0x2F);
  1.9144 +  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(src1, mem),
  1.9145 +             LdImmI(dst,0x0), CmpX_Result(dst));
  1.9146 +  ins_pipe( pipe_slow );
  1.9147 +%}
  1.9148 +
  1.9149 +
  1.9150 +instruct subD_reg(regD dst, regD src) %{
  1.9151 +  predicate (UseSSE <=1);
  1.9152 +  match(Set dst (SubD dst src));
  1.9153 +
  1.9154 +  format %{ "FLD    $src\n\t"
  1.9155 +            "DSUBp  $dst,ST" %}
  1.9156 +  opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
  1.9157 +  ins_cost(150);
  1.9158 +  ins_encode( Push_Reg_D(src),
  1.9159 +              OpcP, RegOpc(dst) );
  1.9160 +  ins_pipe( fpu_reg_reg );
  1.9161 +%}
  1.9162 +
  1.9163 +instruct subD_reg_round(stackSlotD dst, regD src1, regD src2) %{
  1.9164 +  predicate (UseSSE <=1);
  1.9165 +  match(Set dst (RoundDouble (SubD src1 src2)));
  1.9166 +  ins_cost(250);
  1.9167 +
  1.9168 +  format %{ "FLD    $src2\n\t"
  1.9169 +            "DSUB   ST,$src1\n\t"
  1.9170 +            "FSTP_D $dst\t# D-round" %}
  1.9171 +  opcode(0xD8, 0x5);
  1.9172 +  ins_encode( Push_Reg_D(src2),
  1.9173 +              OpcP, RegOpc(src1), Pop_Mem_D(dst) );
  1.9174 +  ins_pipe( fpu_mem_reg_reg );
  1.9175 +%}
  1.9176 +
  1.9177 +
  1.9178 +instruct subD_reg_mem(regD dst, memory src) %{
  1.9179 +  predicate (UseSSE <=1);
  1.9180 +  match(Set dst (SubD dst (LoadD src)));
  1.9181 +  ins_cost(150);
  1.9182 +
  1.9183 +  format %{ "FLD    $src\n\t"
  1.9184 +            "DSUBp  $dst,ST" %}
  1.9185 +  opcode(0xDE, 0x5, 0xDD); /* DE C0+i */  /* LoadD  DD /0 */
  1.9186 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
  1.9187 +              OpcP, RegOpc(dst) );
  1.9188 +  ins_pipe( fpu_reg_mem );
  1.9189 +%}
  1.9190 +
  1.9191 +instruct absD_reg(regDPR1 dst, regDPR1 src) %{
  1.9192 +  predicate (UseSSE<=1);
  1.9193 +  match(Set dst (AbsD src));
  1.9194 +  ins_cost(100);
  1.9195 +  format %{ "FABS" %}
  1.9196 +  opcode(0xE1, 0xD9);
  1.9197 +  ins_encode( OpcS, OpcP );
  1.9198 +  ins_pipe( fpu_reg_reg );
  1.9199 +%}
  1.9200 +
  1.9201 +instruct absXD_reg( regXD dst ) %{
  1.9202 +  predicate(UseSSE>=2);
  1.9203 +  match(Set dst (AbsD dst));
  1.9204 +  format %{ "ANDPD  $dst,[0x7FFFFFFFFFFFFFFF]\t# ABS D by sign masking" %}
  1.9205 +  ins_encode( AbsXD_encoding(dst));
  1.9206 +  ins_pipe( pipe_slow );
  1.9207 +%}
  1.9208 +
  1.9209 +instruct negD_reg(regDPR1 dst, regDPR1 src) %{
  1.9210 +  predicate(UseSSE<=1);
  1.9211 +  match(Set dst (NegD src));
  1.9212 +  ins_cost(100);
  1.9213 +  format %{ "FCHS" %}
  1.9214 +  opcode(0xE0, 0xD9);
  1.9215 +  ins_encode( OpcS, OpcP );
  1.9216 +  ins_pipe( fpu_reg_reg );
  1.9217 +%}
  1.9218 +
  1.9219 +instruct negXD_reg( regXD dst ) %{
  1.9220 +  predicate(UseSSE>=2);
  1.9221 +  match(Set dst (NegD dst));
  1.9222 +  format %{ "XORPD  $dst,[0x8000000000000000]\t# CHS D by sign flipping" %}
  1.9223 +  ins_encode %{
  1.9224 +     __ xorpd($dst$$XMMRegister,
  1.9225 +              ExternalAddress((address)double_signflip_pool));
  1.9226 +  %}
  1.9227 +  ins_pipe( pipe_slow );
  1.9228 +%}
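         +
         +// The two XMM rules above (AbsD and NegD) work directly on the IEEE-754 sign
         +// bit: AbsD clears bit 63 with an AND mask, NegD flips it with an XOR mask.
         +// On the raw bit pattern of a double:
         +//
         +//   uint64_t abs_bits = bits & 0x7FFFFFFFFFFFFFFFULL;  // ANDPD mask
         +//   uint64_t neg_bits = bits ^ 0x8000000000000000ULL;  // XORPD mask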
  1.9229 +
  1.9230 +instruct addD_reg(regD dst, regD src) %{
  1.9231 +  predicate(UseSSE<=1);
  1.9232 +  match(Set dst (AddD dst src));
  1.9233 +  format %{ "FLD    $src\n\t"
  1.9234 +            "DADD   $dst,ST" %}
  1.9235 +  size(4);
  1.9236 +  ins_cost(150);
  1.9237 +  opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
  1.9238 +  ins_encode( Push_Reg_D(src),
  1.9239 +              OpcP, RegOpc(dst) );
  1.9240 +  ins_pipe( fpu_reg_reg );
  1.9241 +%}
  1.9242 +
  1.9243 +
  1.9244 +instruct addD_reg_round(stackSlotD dst, regD src1, regD src2) %{
  1.9245 +  predicate(UseSSE<=1);
  1.9246 +  match(Set dst (RoundDouble (AddD src1 src2)));
  1.9247 +  ins_cost(250);
  1.9248 +
  1.9249 +  format %{ "FLD    $src2\n\t"
  1.9250 +            "DADD   ST,$src1\n\t"
  1.9251 +            "FSTP_D $dst\t# D-round" %}
  1.9252 +  opcode(0xD8, 0x0); /* D8 C0+i or D8 /0*/
  1.9253 +  ins_encode( Push_Reg_D(src2),
  1.9254 +              OpcP, RegOpc(src1), Pop_Mem_D(dst) );
  1.9255 +  ins_pipe( fpu_mem_reg_reg );
  1.9256 +%}
  1.9257 +
  1.9258 +
  1.9259 +instruct addD_reg_mem(regD dst, memory src) %{
  1.9260 +  predicate(UseSSE<=1);
  1.9261 +  match(Set dst (AddD dst (LoadD src)));
  1.9262 +  ins_cost(150);
  1.9263 +
  1.9264 +  format %{ "FLD    $src\n\t"
  1.9265 +            "DADDp  $dst,ST" %}
  1.9266 +  opcode(0xDE, 0x0, 0xDD); /* DE C0+i */  /* LoadD  DD /0 */
  1.9267 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
  1.9268 +              OpcP, RegOpc(dst) );
  1.9269 +  ins_pipe( fpu_reg_mem );
  1.9270 +%}
  1.9271 +
  1.9272 +// add-to-memory
  1.9273 +instruct addD_mem_reg(memory dst, regD src) %{
  1.9274 +  predicate(UseSSE<=1);
  1.9275 +  match(Set dst (StoreD dst (RoundDouble (AddD (LoadD dst) src))));
  1.9276 +  ins_cost(150);
  1.9277 +
  1.9278 +  format %{ "FLD_D  $dst\n\t"
  1.9279 +            "DADD   ST,$src\n\t"
  1.9280 +            "FST_D  $dst" %}
  1.9281 +  opcode(0xDD, 0x0);
  1.9282 +  ins_encode( Opcode(0xDD), RMopc_Mem(0x00,dst),
  1.9283 +              Opcode(0xD8), RegOpc(src),
  1.9284 +              set_instruction_start,
  1.9285 +              Opcode(0xDD), RMopc_Mem(0x03,dst) );
  1.9286 +  ins_pipe( fpu_reg_mem );
  1.9287 +%}
  1.9288 +
  1.9289 +instruct addD_reg_imm1(regD dst, immD1 src) %{
  1.9290 +  predicate(UseSSE<=1);
  1.9291 +  match(Set dst (AddD dst src));
  1.9292 +  ins_cost(125);
  1.9293 +  format %{ "FLD1\n\t"
  1.9294 +            "DADDp  $dst,ST" %}
  1.9295 +  opcode(0xDE, 0x00);
  1.9296 +  ins_encode( LdImmD(src),
  1.9297 +              OpcP, RegOpc(dst) );
  1.9298 +  ins_pipe( fpu_reg );
  1.9299 +%}
  1.9300 +
  1.9301 +instruct addD_reg_imm(regD dst, immD src) %{
  1.9302 +  predicate(UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
  1.9303 +  match(Set dst (AddD dst src));
  1.9304 +  ins_cost(200);
  1.9305 +  format %{ "FLD_D  [$src]\n\t"
  1.9306 +            "DADDp  $dst,ST" %}
  1.9307 +  opcode(0xDE, 0x00);       /* DE /0 */
  1.9308 +  ins_encode( LdImmD(src),
  1.9309 +              OpcP, RegOpc(dst));
  1.9310 +  ins_pipe( fpu_reg_mem );
  1.9311 +%}
  1.9312 +
  1.9313 +instruct addD_reg_imm_round(stackSlotD dst, regD src, immD con) %{
  1.9314 +  predicate(UseSSE<=1 && _kids[0]->_kids[1]->_leaf->getd() != 0.0 && _kids[0]->_kids[1]->_leaf->getd() != 1.0 );
  1.9315 +  match(Set dst (RoundDouble (AddD src con)));
  1.9316 +  ins_cost(200);
  1.9317 +  format %{ "FLD_D  [$con]\n\t"
  1.9318 +            "DADD   ST,$src\n\t"
  1.9319 +            "FSTP_D $dst\t# D-round" %}
  1.9320 +  opcode(0xD8, 0x00);       /* D8 /0 */
  1.9321 +  ins_encode( LdImmD(con),
  1.9322 +              OpcP, RegOpc(src), Pop_Mem_D(dst));
  1.9323 +  ins_pipe( fpu_mem_reg_con );
  1.9324 +%}
  1.9325 +
  1.9326 +// Add two double precision floating point values in xmm
  1.9327 +instruct addXD_reg(regXD dst, regXD src) %{
  1.9328 +  predicate(UseSSE>=2);
  1.9329 +  match(Set dst (AddD dst src));
  1.9330 +  format %{ "ADDSD  $dst,$src" %}
  1.9331 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), RegReg(dst, src));
  1.9332 +  ins_pipe( pipe_slow );
  1.9333 +%}
  1.9334 +
  1.9335 +instruct addXD_imm(regXD dst, immXD con) %{
  1.9336 +  predicate(UseSSE>=2);
  1.9337 +  match(Set dst (AddD dst con));
  1.9338 +  format %{ "ADDSD  $dst,[$con]" %}
  1.9339 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), LdImmXD(dst, con) );
  1.9340 +  ins_pipe( pipe_slow );
  1.9341 +%}
  1.9342 +
  1.9343 +instruct addXD_mem(regXD dst, memory mem) %{
  1.9344 +  predicate(UseSSE>=2);
  1.9345 +  match(Set dst (AddD dst (LoadD mem)));
  1.9346 +  format %{ "ADDSD  $dst,$mem" %}
  1.9347 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), RegMem(dst,mem));
  1.9348 +  ins_pipe( pipe_slow );
  1.9349 +%}
  1.9350 +
  1.9351 +// Sub two double precision floating point values in xmm
  1.9352 +instruct subXD_reg(regXD dst, regXD src) %{
  1.9353 +  predicate(UseSSE>=2);
  1.9354 +  match(Set dst (SubD dst src));
  1.9355 +  format %{ "SUBSD  $dst,$src" %}
  1.9356 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), RegReg(dst, src));
  1.9357 +  ins_pipe( pipe_slow );
  1.9358 +%}
  1.9359 +
  1.9360 +instruct subXD_imm(regXD dst, immXD con) %{
  1.9361 +  predicate(UseSSE>=2);
  1.9362 +  match(Set dst (SubD dst con));
  1.9363 +  format %{ "SUBSD  $dst,[$con]" %}
  1.9364 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), LdImmXD(dst, con) );
  1.9365 +  ins_pipe( pipe_slow );
  1.9366 +%}
  1.9367 +
  1.9368 +instruct subXD_mem(regXD dst, memory mem) %{
  1.9369 +  predicate(UseSSE>=2);
  1.9370 +  match(Set dst (SubD dst (LoadD mem)));
  1.9371 +  format %{ "SUBSD  $dst,$mem" %}
  1.9372 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), RegMem(dst,mem));
  1.9373 +  ins_pipe( pipe_slow );
  1.9374 +%}
  1.9375 +
  1.9376 +// Mul two double precision floating point values in xmm
  1.9377 +instruct mulXD_reg(regXD dst, regXD src) %{
  1.9378 +  predicate(UseSSE>=2);
  1.9379 +  match(Set dst (MulD dst src));
  1.9380 +  format %{ "MULSD  $dst,$src" %}
  1.9381 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), RegReg(dst, src));
  1.9382 +  ins_pipe( pipe_slow );
  1.9383 +%}
  1.9384 +
  1.9385 +instruct mulXD_imm(regXD dst, immXD con) %{
  1.9386 +  predicate(UseSSE>=2);
  1.9387 +  match(Set dst (MulD dst con));
  1.9388 +  format %{ "MULSD  $dst,[$con]" %}
  1.9389 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), LdImmXD(dst, con) );
  1.9390 +  ins_pipe( pipe_slow );
  1.9391 +%}
  1.9392 +
  1.9393 +instruct mulXD_mem(regXD dst, memory mem) %{
  1.9394 +  predicate(UseSSE>=2);
  1.9395 +  match(Set dst (MulD dst (LoadD mem)));
  1.9396 +  format %{ "MULSD  $dst,$mem" %}
  1.9397 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), RegMem(dst,mem));
  1.9398 +  ins_pipe( pipe_slow );
  1.9399 +%}
  1.9400 +
  1.9401 +// Div two double precision floating point values in xmm
  1.9402 +instruct divXD_reg(regXD dst, regXD src) %{
  1.9403 +  predicate(UseSSE>=2);
  1.9404 +  match(Set dst (DivD dst src));
  1.9405 +  format %{ "DIVSD  $dst,$src" %}
  1.9406 +  opcode(0xF2, 0x0F, 0x5E);
  1.9407 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), RegReg(dst, src));
  1.9408 +  ins_pipe( pipe_slow );
  1.9409 +%}
  1.9410 +
  1.9411 +instruct divXD_imm(regXD dst, immXD con) %{
  1.9412 +  predicate(UseSSE>=2);
  1.9413 +  match(Set dst (DivD dst con));
  1.9414 +  format %{ "DIVSD  $dst,[$con]" %}
  1.9415 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), LdImmXD(dst, con));
  1.9416 +  ins_pipe( pipe_slow );
  1.9417 +%}
  1.9418 +
  1.9419 +instruct divXD_mem(regXD dst, memory mem) %{
  1.9420 +  predicate(UseSSE>=2);
  1.9421 +  match(Set dst (DivD dst (LoadD mem)));
  1.9422 +  format %{ "DIVSD  $dst,$mem" %}
  1.9423 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), RegMem(dst,mem));
  1.9424 +  ins_pipe( pipe_slow );
  1.9425 +%}
  1.9426 +
  1.9427 +
  1.9428 +instruct mulD_reg(regD dst, regD src) %{
  1.9429 +  predicate(UseSSE<=1);
  1.9430 +  match(Set dst (MulD dst src));
  1.9431 +  format %{ "FLD    $src\n\t"
  1.9432 +            "DMULp  $dst,ST" %}
  1.9433 +  opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
  1.9434 +  ins_cost(150);
  1.9435 +  ins_encode( Push_Reg_D(src),
  1.9436 +              OpcP, RegOpc(dst) );
  1.9437 +  ins_pipe( fpu_reg_reg );
  1.9438 +%}
  1.9439 +
  1.9440 +// Strict FP instruction biases argument before multiply then
  1.9441 +// biases result to avoid double rounding of subnormals.
  1.9442 +//
  1.9443 +// scale arg1 by multiplying arg1 by 2^(-15360)
  1.9444 +// load arg2
  1.9445 +// multiply scaled arg1 by arg2
  1.9446 +// rescale product by 2^(15360)
  1.9447 +//
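         +// (The bias constants are powers of two.  15360 is the difference between the
         +// extended and double exponent biases, 16383 - 1023, so the down-scale maps the
         +// double subnormal threshold 2^-1022 onto the extended subnormal threshold
         +// 2^-16382: a product that would be a double subnormal is then rounded once, at
         +// the reduced precision a double subnormal would get, and the final rescale by
         +// 2^(15360) is an exact power-of-two shift.)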
  1.9448 +instruct strictfp_mulD_reg(regDPR1 dst, regnotDPR1 src) %{
  1.9449 +  predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
  1.9450 +  match(Set dst (MulD dst src));
  1.9451 +  ins_cost(1);   // Select this instruction for all strict FP double multiplies
  1.9452 +
  1.9453 +  format %{ "FLD    StubRoutines::_fpu_subnormal_bias1\n\t"
  1.9454 +            "DMULp  $dst,ST\n\t"
  1.9455 +            "FLD    $src\n\t"
  1.9456 +            "DMULp  $dst,ST\n\t"
  1.9457 +            "FLD    StubRoutines::_fpu_subnormal_bias2\n\t"
  1.9458 +            "DMULp  $dst,ST\n\t" %}
  1.9459 +  opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
  1.9460 +  ins_encode( strictfp_bias1(dst),
  1.9461 +              Push_Reg_D(src),
  1.9462 +              OpcP, RegOpc(dst),
  1.9463 +              strictfp_bias2(dst) );
  1.9464 +  ins_pipe( fpu_reg_reg );
  1.9465 +%}
  1.9466 +
  1.9467 +instruct mulD_reg_imm(regD dst, immD src) %{
  1.9468 +  predicate( UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
  1.9469 +  match(Set dst (MulD dst src));
  1.9470 +  ins_cost(200);
  1.9471 +  format %{ "FLD_D  [$src]\n\t"
  1.9472 +            "DMULp  $dst,ST" %}
  1.9473 +  opcode(0xDE, 0x1); /* DE /1 */
  1.9474 +  ins_encode( LdImmD(src),
  1.9475 +              OpcP, RegOpc(dst) );
  1.9476 +  ins_pipe( fpu_reg_mem );
  1.9477 +%}
  1.9478 +
  1.9479 +
  1.9480 +instruct mulD_reg_mem(regD dst, memory src) %{
  1.9481 +  predicate( UseSSE<=1 );
  1.9482 +  match(Set dst (MulD dst (LoadD src)));
  1.9483 +  ins_cost(200);
  1.9484 +  format %{ "FLD_D  $src\n\t"
  1.9485 +            "DMULp  $dst,ST" %}
  1.9486 +  opcode(0xDE, 0x1, 0xDD); /* DE C8+i or DE /1*/  /* LoadD  DD /0 */
  1.9487 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
  1.9488 +              OpcP, RegOpc(dst) );
  1.9489 +  ins_pipe( fpu_reg_mem );
  1.9490 +%}
  1.9491 +
  1.9492 +//
  1.9493 +// Cisc-alternate to reg-reg multiply
  1.9494 +instruct mulD_reg_mem_cisc(regD dst, regD src, memory mem) %{
  1.9495 +  predicate( UseSSE<=1 );
  1.9496 +  match(Set dst (MulD src (LoadD mem)));
  1.9497 +  ins_cost(250);
  1.9498 +  format %{ "FLD_D  $mem\n\t"
  1.9499 +            "DMUL   ST,$src\n\t"
  1.9500 +            "FSTP_D $dst" %}
  1.9501 +  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadD D9 /0 */
  1.9502 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem),
  1.9503 +              OpcReg_F(src),
  1.9504 +              Pop_Reg_D(dst) );
  1.9505 +  ins_pipe( fpu_reg_reg_mem );
  1.9506 +%}
  1.9507 +
  1.9508 +
  1.9509 +// MACRO3 -- addD a mulD
  1.9510 +// This instruction is a '2-address' instruction in that the result goes
  1.9511 +// back to src2.  This eliminates a move from the macro; possibly the
  1.9512 +// register allocator will have to add it back (and maybe not).
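         +// (FADDP ST(i),ST adds into ST(i) and then pops, so the accumulator must be one
         +// of the inputs; that is why the sum lands back in src2 here.)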
  1.9513 +instruct addD_mulD_reg(regD src2, regD src1, regD src0) %{
  1.9514 +  predicate( UseSSE<=1 );
  1.9515 +  match(Set src2 (AddD (MulD src0 src1) src2));
  1.9516 +  format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
  1.9517 +            "DMUL   ST,$src1\n\t"
  1.9518 +            "DADDp  $src2,ST" %}
  1.9519 +  ins_cost(250);
  1.9520 +  opcode(0xDD); /* LoadD DD /0 */
  1.9521 +  ins_encode( Push_Reg_F(src0),
  1.9522 +              FMul_ST_reg(src1),
  1.9523 +              FAddP_reg_ST(src2) );
  1.9524 +  ins_pipe( fpu_reg_reg_reg );
  1.9525 +%}
  1.9526 +
  1.9527 +
  1.9528 +// MACRO3 -- subD a mulD
  1.9529 +instruct subD_mulD_reg(regD src2, regD src1, regD src0) %{
  1.9530 +  predicate( UseSSE<=1 );
  1.9531 +  match(Set src2 (SubD (MulD src0 src1) src2));
  1.9532 +  format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
  1.9533 +            "DMUL   ST,$src1\n\t"
  1.9534 +            "DSUBRp $src2,ST" %}
  1.9535 +  ins_cost(250);
  1.9536 +  ins_encode( Push_Reg_F(src0),
  1.9537 +              FMul_ST_reg(src1),
  1.9538 +              Opcode(0xDE), Opc_plus(0xE0,src2));
  1.9539 +  ins_pipe( fpu_reg_reg_reg );
  1.9540 +%}
  1.9541 +
  1.9542 +
  1.9543 +instruct divD_reg(regD dst, regD src) %{
  1.9544 +  predicate( UseSSE<=1 );
  1.9545 +  match(Set dst (DivD dst src));
  1.9546 +
  1.9547 +  format %{ "FLD    $src\n\t"
  1.9548 +            "FDIVp  $dst,ST" %}
  1.9549 +  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  1.9550 +  ins_cost(150);
  1.9551 +  ins_encode( Push_Reg_D(src),
  1.9552 +              OpcP, RegOpc(dst) );
  1.9553 +  ins_pipe( fpu_reg_reg );
  1.9554 +%}
  1.9555 +
  1.9556 +// Strict FP instruction biases argument before division then
  1.9557 +// biases result, to avoid double rounding of subnormals.
  1.9558 +//
  1.9559 +// scale dividend by multiplying dividend by 2^(-15360)
  1.9560 +// load divisor
  1.9561 +// divide scaled dividend by divisor
  1.9562 +// rescale quotient by 2^(15360)
  1.9563 +//
  1.9564 +instruct strictfp_divD_reg(regDPR1 dst, regnotDPR1 src) %{
  1.9565 +  // Select this instruction for all strict FP double divides
  1.9566 +  predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
  1.9567 +  match(Set dst (DivD dst src));
  1.9568 +  ins_cost(1);
  1.9569 +
  1.9570 +  format %{ "FLD    StubRoutines::_fpu_subnormal_bias1\n\t"
  1.9571 +            "DMULp  $dst,ST\n\t"
  1.9572 +            "FLD    $src\n\t"
  1.9573 +            "FDIVp  $dst,ST\n\t"
  1.9574 +            "FLD    StubRoutines::_fpu_subnormal_bias2\n\t"
  1.9575 +            "DMULp  $dst,ST\n\t" %}
  1.9576 +  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  1.9577 +  ins_encode( strictfp_bias1(dst),
  1.9578 +              Push_Reg_D(src),
  1.9579 +              OpcP, RegOpc(dst),
  1.9580 +              strictfp_bias2(dst) );
  1.9581 +  ins_pipe( fpu_reg_reg );
  1.9582 +%}
  1.9583 +
  1.9584 +instruct divD_reg_round(stackSlotD dst, regD src1, regD src2) %{
  1.9585 +  predicate( UseSSE<=1 && !(Compile::current()->has_method() && Compile::current()->method()->is_strict()) );
  1.9586 +  match(Set dst (RoundDouble (DivD src1 src2)));
  1.9587 +
  1.9588 +  format %{ "FLD    $src1\n\t"
  1.9589 +            "FDIV   ST,$src2\n\t"
  1.9590 +            "FSTP_D $dst\t# D-round" %}
  1.9591 +  opcode(0xD8, 0x6); /* D8 F0+i or D8 /6 */
  1.9592 +  ins_encode( Push_Reg_D(src1),
  1.9593 +              OpcP, RegOpc(src2), Pop_Mem_D(dst) );
  1.9594 +  ins_pipe( fpu_mem_reg_reg );
  1.9595 +%}
  1.9596 +
  1.9597 +
  1.9598 +instruct modD_reg(regD dst, regD src, eAXRegI rax, eFlagsReg cr) %{
  1.9599 +  predicate(UseSSE<=1);
  1.9600 +  match(Set dst (ModD dst src));
  1.9601 +  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS
  1.9602 +
  1.9603 +  format %{ "DMOD   $dst,$src" %}
  1.9604 +  ins_cost(250);
  1.9605 +  ins_encode(Push_Reg_Mod_D(dst, src),
  1.9606 +              emitModD(),
  1.9607 +              Push_Result_Mod_D(src),
  1.9608 +              Pop_Reg_D(dst));
  1.9609 +  ins_pipe( pipe_slow );
  1.9610 +%}
  1.9611 +
  1.9612 +instruct modXD_reg(regXD dst, regXD src0, regXD src1, eAXRegI rax, eFlagsReg cr) %{
  1.9613 +  predicate(UseSSE>=2);
  1.9614 +  match(Set dst (ModD src0 src1));
  1.9615 +  effect(KILL rax, KILL cr);
  1.9616 +
  1.9617 +  format %{ "SUB    ESP,8\t # DMOD\n"
  1.9618 +          "\tMOVSD  [ESP+0],$src1\n"
  1.9619 +          "\tFLD_D  [ESP+0]\n"
  1.9620 +          "\tMOVSD  [ESP+0],$src0\n"
  1.9621 +          "\tFLD_D  [ESP+0]\n"
  1.9622 +     "loop:\tFPREM\n"
  1.9623 +          "\tFWAIT\n"
  1.9624 +          "\tFNSTSW AX\n"
  1.9625 +          "\tSAHF\n"
  1.9626 +          "\tJP     loop\n"
  1.9627 +          "\tFSTP_D [ESP+0]\n"
  1.9628 +          "\tMOVSD  $dst,[ESP+0]\n"
  1.9629 +          "\tADD    ESP,8\n"
  1.9630 +          "\tFSTP   ST0\t # Restore FPU Stack"
  1.9631 +    %}
  1.9632 +  ins_cost(250);
  1.9633 +  ins_encode( Push_ModD_encoding(src0, src1), emitModD(), Push_ResultXD(dst), PopFPU);
  1.9634 +  ins_pipe( pipe_slow );
  1.9635 +%}
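         +// Note on the FPREM loop above: FPREM may return only a partial remainder and
         +// sets the FPU condition code C2 while the reduction is incomplete.  FNSTSW AX
         +// copies the status word into AX and SAHF moves its high byte into EFLAGS,
         +// where C2 lands in the parity flag, so "JP loop" repeats until the remainder
         +// is final.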
  1.9636 +
  1.9637 +instruct sinD_reg(regDPR1 dst, regDPR1 src) %{
  1.9638 +  predicate (UseSSE<=1);
  1.9639 +  match(Set dst (SinD src));
  1.9640 +  ins_cost(1800);
  1.9641 +  format %{ "DSIN   $dst" %}
  1.9642 +  opcode(0xD9, 0xFE);
  1.9643 +  ins_encode( OpcP, OpcS );
  1.9644 +  ins_pipe( pipe_slow );
  1.9645 +%}
  1.9646 +
  1.9647 +instruct sinXD_reg(regXD dst, eFlagsReg cr) %{
  1.9648 +  predicate (UseSSE>=2);
  1.9649 +  match(Set dst (SinD dst));
  1.9650 +  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  1.9651 +  ins_cost(1800);
  1.9652 +  format %{ "DSIN   $dst" %}
  1.9653 +  opcode(0xD9, 0xFE);
  1.9654 +  ins_encode( Push_SrcXD(dst), OpcP, OpcS, Push_ResultXD(dst) );
  1.9655 +  ins_pipe( pipe_slow );
  1.9656 +%}
  1.9657 +
  1.9658 +instruct cosD_reg(regDPR1 dst, regDPR1 src) %{
  1.9659 +  predicate (UseSSE<=1);
  1.9660 +  match(Set dst (CosD src));
  1.9661 +  ins_cost(1800);
  1.9662 +  format %{ "DCOS   $dst" %}
  1.9663 +  opcode(0xD9, 0xFF);
  1.9664 +  ins_encode( OpcP, OpcS );
  1.9665 +  ins_pipe( pipe_slow );
  1.9666 +%}
  1.9667 +
  1.9668 +instruct cosXD_reg(regXD dst, eFlagsReg cr) %{
  1.9669 +  predicate (UseSSE>=2);
  1.9670 +  match(Set dst (CosD dst));
  1.9671 +  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  1.9672 +  ins_cost(1800);
  1.9673 +  format %{ "DCOS   $dst" %}
  1.9674 +  opcode(0xD9, 0xFF);
  1.9675 +  ins_encode( Push_SrcXD(dst), OpcP, OpcS, Push_ResultXD(dst) );
  1.9676 +  ins_pipe( pipe_slow );
  1.9677 +%}
  1.9678 +
  1.9679 +instruct tanD_reg(regDPR1 dst, regDPR1 src) %{
  1.9680 +  predicate (UseSSE<=1);
  1.9681 +  match(Set dst(TanD src));
  1.9682 +  format %{ "DTAN   $dst" %}
  1.9683 +  ins_encode( Opcode(0xD9), Opcode(0xF2),    // fptan
  1.9684 +              Opcode(0xDD), Opcode(0xD8));   // fstp st
  1.9685 +  ins_pipe( pipe_slow );
  1.9686 +%}
  1.9687 +
  1.9688 +instruct tanXD_reg(regXD dst, eFlagsReg cr) %{
  1.9689 +  predicate (UseSSE>=2);
  1.9690 +  match(Set dst(TanD dst));
  1.9691 +  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  1.9692 +  format %{ "DTAN   $dst" %}
  1.9693 +  ins_encode( Push_SrcXD(dst),
  1.9694 +              Opcode(0xD9), Opcode(0xF2),    // fptan
  1.9695 +              Opcode(0xDD), Opcode(0xD8),   // fstp st
  1.9696 +              Push_ResultXD(dst) );
  1.9697 +  ins_pipe( pipe_slow );
  1.9698 +%}
  1.9699 +
  1.9700 +instruct atanD_reg(regD dst, regD src) %{
  1.9701 +  predicate (UseSSE<=1);
  1.9702 +  match(Set dst(AtanD dst src));
  1.9703 +  format %{ "DATA   $dst,$src" %}
  1.9704 +  opcode(0xD9, 0xF3);
  1.9705 +  ins_encode( Push_Reg_D(src),
  1.9706 +              OpcP, OpcS, RegOpc(dst) );
  1.9707 +  ins_pipe( pipe_slow );
  1.9708 +%}
  1.9709 +
  1.9710 +instruct atanXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
  1.9711 +  predicate (UseSSE>=2);
  1.9712 +  match(Set dst(AtanD dst src));
  1.9713 +  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  1.9714 +  format %{ "DATA   $dst,$src" %}
  1.9715 +  opcode(0xD9, 0xF3);
  1.9716 +  ins_encode( Push_SrcXD(src),
  1.9717 +              OpcP, OpcS, Push_ResultXD(dst) );
  1.9718 +  ins_pipe( pipe_slow );
  1.9719 +%}
  1.9720 +
  1.9721 +instruct sqrtD_reg(regD dst, regD src) %{
  1.9722 +  predicate (UseSSE<=1);
  1.9723 +  match(Set dst (SqrtD src));
  1.9724 +  format %{ "DSQRT  $dst,$src" %}
  1.9725 +  opcode(0xFA, 0xD9);
  1.9726 +  ins_encode( Push_Reg_D(src),
  1.9727 +              OpcS, OpcP, Pop_Reg_D(dst) );
  1.9728 +  ins_pipe( pipe_slow );
  1.9729 +%}
  1.9730 +
  1.9731 +instruct powD_reg(regD X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
  1.9732 +  predicate (UseSSE<=1);
  1.9733 +  match(Set Y (PowD X Y));  // Raise X to the Yth power
  1.9734 +  effect(KILL rax, KILL rbx, KILL rcx);
  1.9735 +  format %{ "SUB    ESP,8\t\t# Fast-path POW encoding\n\t"
  1.9736 +            "FLD_D  $X\n\t"
  1.9737 +            "FYL2X  \t\t\t# Q=Y*ln2(X)\n\t"
  1.9738 +
  1.9739 +            "FDUP   \t\t\t# Q Q\n\t"
  1.9740 +            "FRNDINT\t\t\t# int(Q) Q\n\t"
  1.9741 +            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
  1.9742 +            "FISTP  dword [ESP]\n\t"
  1.9743 +            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
  1.9744 +            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
  1.9745 +            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
  1.9746 +            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
  1.9747 +            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
  1.9748 +            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
  1.9749 +            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
  1.9750 +            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
  1.9751 +            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
  1.9752 +            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
  1.9753 +            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
  1.9754 +            "MOV    [ESP+0],0\n\t"
  1.9755 +            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
  1.9756 +
  1.9757 +            "ADD    ESP,8"
  1.9758 +             %}
  1.9759 +  ins_encode( push_stack_temp_qword,
  1.9760 +              Push_Reg_D(X),
  1.9761 +              Opcode(0xD9), Opcode(0xF1),   // fyl2x
  1.9762 +              pow_exp_core_encoding,
  1.9763 +              pop_stack_temp_qword);
  1.9764 +  ins_pipe( pipe_slow );
  1.9765 +%}
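         +// How the fast path above rebuilds 2^Q (Q = Y*log2(X) from FYL2X):
         +// 2^Q = 2^frac(Q) * 2^int(Q).  FRNDINT (round-to-nearest under the usual
         +// control word) keeps frac(Q) in [-0.5,0.5], inside F2XM1's domain, and
         +// 2^int(Q) is manufactured directly as a double whose high word is
         +// (int(Q)+1023)<<20 and whose low word is 0.  If int(Q)+1023 does not fit in
         +// the 11-bit exponent field (the 0xFFFFF800 mask test), the high word is
         +// replaced with 0xFFFFF800, which encodes a NaN.
         +// Rough C sketch of the scaling step (illustration only, overflow check
         +// omitted; not the stub code):
         +//   int    iq   = (int) rint(q);      // int(Q), round to nearest
         +//   double frac = q - iq;             // frac(Q) in [-0.5,0.5]
         +//   int    hi   = (iq + 1023) << 20;  // high word of the double 2^iq
         +// Example: Q = 10.25 gives iq = 10, hi = 0x40900000 (the double 1024.0),
         +// and (2^0.25 - 1 + 1) * 1024 = 2^10.25, roughly 1217.7.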
  1.9766 +
  1.9767 +instruct powXD_reg(regXD dst, regXD src0, regXD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx ) %{
  1.9768 +  predicate (UseSSE>=2);
  1.9769 +  match(Set dst (PowD src0 src1));  // Raise src0 to the src1'th power
  1.9770 +  effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx );
  1.9771 +  format %{ "SUB    ESP,8\t\t# Fast-path POW encoding\n\t"
  1.9772 +            "MOVSD  [ESP],$src1\n\t"
  1.9773 +            "FLD    FPR1,$src1\n\t"
  1.9774 +            "MOVSD  [ESP],$src0\n\t"
  1.9775 +            "FLD    FPR1,$src0\n\t"
  1.9776 +            "FYL2X  \t\t\t# Q=Y*ln2(X)\n\t"
  1.9777 +
  1.9778 +            "FDUP   \t\t\t# Q Q\n\t"
  1.9779 +            "FRNDINT\t\t\t# int(Q) Q\n\t"
  1.9780 +            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
  1.9781 +            "FISTP  dword [ESP]\n\t"
  1.9782 +            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
  1.9783 +            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
  1.9784 +            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
  1.9785 +            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
  1.9786 +            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
  1.9787 +            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
  1.9788 +            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
  1.9789 +            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
  1.9790 +            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
  1.9791 +            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
  1.9792 +            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
  1.9793 +            "MOV    [ESP+0],0\n\t"
  1.9794 +            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
  1.9795 +
  1.9796 +            "FST_D  [ESP]\n\t"
  1.9797 +            "MOVSD  $dst,[ESP]\n\t"
  1.9798 +            "ADD    ESP,8"
  1.9799 +             %}
  1.9800 +  ins_encode( push_stack_temp_qword,
  1.9801 +              push_xmm_to_fpr1(src1),
  1.9802 +              push_xmm_to_fpr1(src0),
  1.9803 +              Opcode(0xD9), Opcode(0xF1),   // fyl2x
  1.9804 +              pow_exp_core_encoding,
  1.9805 +              Push_ResultXD(dst) );
  1.9806 +  ins_pipe( pipe_slow );
  1.9807 +%}
  1.9808 +
  1.9809 +
  1.9810 +instruct expD_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
  1.9811 +  predicate (UseSSE<=1);
  1.9812 +  match(Set dpr1 (ExpD dpr1));
  1.9813 +  effect(KILL rax, KILL rbx, KILL rcx);
  1.9814 +  format %{ "SUB    ESP,8\t\t# Fast-path EXP encoding\n\t"
  1.9815 +            "FLDL2E \t\t\t# Ld log2(e) X\n\t"
  1.9816 +            "FMULP  \t\t\t# Q=X*log2(e)\n\t"
  1.9817 +
  1.9818 +            "FDUP   \t\t\t# Q Q\n\t"
  1.9819 +            "FRNDINT\t\t\t# int(Q) Q\n\t"
  1.9820 +            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
  1.9821 +            "FISTP  dword [ESP]\n\t"
  1.9822 +            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
  1.9823 +            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
  1.9824 +            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
  1.9825 +            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
  1.9826 +            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
  1.9827 +            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
  1.9828 +            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
  1.9829 +            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
  1.9830 +            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
  1.9831 +            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
  1.9832 +            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
  1.9833 +            "MOV    [ESP+0],0\n\t"
  1.9834 +            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
  1.9835 +
  1.9836 +            "ADD    ESP,8"
  1.9837 +             %}
  1.9838 +  ins_encode( push_stack_temp_qword,
  1.9839 +              Opcode(0xD9), Opcode(0xEA),   // fldl2e
  1.9840 +              Opcode(0xDE), Opcode(0xC9),   // fmulp
  1.9841 +              pow_exp_core_encoding,
  1.9842 +              pop_stack_temp_qword);
  1.9843 +  ins_pipe( pipe_slow );
  1.9844 +%}
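         +// Same core as the POW fast path: FLDL2E/FMULP rewrite e^X as 2^(X*log2(e)),
         +// so the exponent-reconstruction sequence above is reused unchanged.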
  1.9845 +
  1.9846 +instruct expXD_reg(regXD dst, regXD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
  1.9847 +  predicate (UseSSE>=2);
  1.9848 +  match(Set dst (ExpD src));
  1.9849 +  effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx);
  1.9850 +  format %{ "SUB    ESP,8\t\t# Fast-path EXP encoding\n\t"
  1.9851 +            "MOVSD  [ESP],$src\n\t"
  1.9852 +            "FLDL2E \t\t\t# Ld log2(e) X\n\t"
  1.9853 +            "FMULP  \t\t\t# Q=X*log2(e) X\n\t"
  1.9854 +
  1.9855 +            "FDUP   \t\t\t# Q Q\n\t"
  1.9856 +            "FRNDINT\t\t\t# int(Q) Q\n\t"
  1.9857 +            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
  1.9858 +            "FISTP  dword [ESP]\n\t"
  1.9859 +            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
  1.9860 +            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
  1.9861 +            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
  1.9862 +            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
  1.9863 +            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
  1.9864 +            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
  1.9865 +            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
  1.9866 +            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
  1.9867 +            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
  1.9868 +            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
  1.9869 +            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
  1.9870 +            "MOV    [ESP+0],0\n\t"
  1.9871 +            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
  1.9872 +
  1.9873 +            "FST_D  [ESP]\n\t"
  1.9874 +            "MOVSD  $dst,[ESP]\n\t"
  1.9875 +            "ADD    ESP,8"
  1.9876 +             %}
  1.9877 +  ins_encode( Push_SrcXD(src),
  1.9878 +              Opcode(0xD9), Opcode(0xEA),   // fldl2e
  1.9879 +              Opcode(0xDE), Opcode(0xC9),   // fmulp
  1.9880 +              pow_exp_core_encoding,
  1.9881 +              Push_ResultXD(dst) );
  1.9882 +  ins_pipe( pipe_slow );
  1.9883 +%}
  1.9884 +
  1.9885 +
  1.9886 +
  1.9887 +instruct log10D_reg(regDPR1 dst, regDPR1 src) %{
  1.9888 +  predicate (UseSSE<=1);
  1.9889 +  // The source Double operand on FPU stack
  1.9890 +  match(Set dst (Log10D src));
  1.9891 +  // fldlg2       ; push log_10(2) on the FPU stack; full 80-bit number
  1.9892 +  // fxch         ; swap ST(0) with ST(1)
  1.9893 +  // fyl2x        ; compute log_10(2) * log_2(x)
  1.9894 +  format %{ "FLDLG2 \t\t\t#Log10\n\t"
  1.9895 +            "FXCH   \n\t"
  1.9896 +            "FYL2X  \t\t\t# Q=Log10*Log_2(x)"
  1.9897 +         %}
  1.9898 +  ins_encode( Opcode(0xD9), Opcode(0xEC),   // fldlg2
  1.9899 +              Opcode(0xD9), Opcode(0xC9),   // fxch
  1.9900 +              Opcode(0xD9), Opcode(0xF1));  // fyl2x
  1.9901 +
  1.9902 +  ins_pipe( pipe_slow );
  1.9903 +%}
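         +// FYL2X computes ST(1)*log2(ST(0)) and pops, so with log10(2) preloaded the
         +// result is log10(2)*log2(x) = log10(x).  The natural-log rules below play the
         +// same trick with FLDLN2: ln(2)*log2(x) = ln(x).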
  1.9904 +
  1.9905 +instruct log10XD_reg(regXD dst, regXD src, eFlagsReg cr) %{
  1.9906 +  predicate (UseSSE>=2);
  1.9907 +  effect(KILL cr);
  1.9908 +  match(Set dst (Log10D src));
  1.9909 +  // fldlg2       ; push log_10(2) on the FPU stack; full 80-bit number
  1.9910 +  // fyl2x        ; compute log_10(2) * log_2(x)
  1.9911 +  format %{ "FLDLG2 \t\t\t#Log10\n\t"
  1.9912 +            "FYL2X  \t\t\t# Q=Log10*Log_2(x)"
  1.9913 +         %}
  1.9914 +  ins_encode( Opcode(0xD9), Opcode(0xEC),   // fldlg2
  1.9915 +              Push_SrcXD(src),
  1.9916 +              Opcode(0xD9), Opcode(0xF1),   // fyl2x
  1.9917 +              Push_ResultXD(dst));
  1.9918 +
  1.9919 +  ins_pipe( pipe_slow );
  1.9920 +%}
  1.9921 +
  1.9922 +instruct logD_reg(regDPR1 dst, regDPR1 src) %{
  1.9923 +  predicate (UseSSE<=1);
  1.9924 +  // The source Double operand on FPU stack
  1.9925 +  match(Set dst (LogD src));
  1.9926 +  // fldln2       ; push log_e(2) on the FPU stack; full 80-bit number
  1.9927 +  // fxch         ; swap ST(0) with ST(1)
  1.9928 +  // fyl2x        ; compute log_e(2) * log_2(x)
  1.9929 +  format %{ "FLDLN2 \t\t\t#Log_e\n\t"
  1.9930 +            "FXCH   \n\t"
  1.9931 +            "FYL2X  \t\t\t# Q=Log_e*Log_2(x)"
  1.9932 +         %}
  1.9933 +  ins_encode( Opcode(0xD9), Opcode(0xED),   // fldln2
  1.9934 +              Opcode(0xD9), Opcode(0xC9),   // fxch
  1.9935 +              Opcode(0xD9), Opcode(0xF1));  // fyl2x
  1.9936 +
  1.9937 +  ins_pipe( pipe_slow );
  1.9938 +%}
  1.9939 +
  1.9940 +instruct logXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
  1.9941 +  predicate (UseSSE>=2);
  1.9942 +  effect(KILL cr);
  1.9943 +  // The source and result Double operands in XMM registers
  1.9944 +  match(Set dst (LogD src));
  1.9945 +  // fldln2       ; push log_e(2) on the FPU stack; full 80-bit number
  1.9946 +  // fyl2x        ; compute log_e(2) * log_2(x)
  1.9947 +  format %{ "FLDLN2 \t\t\t#Log_e\n\t"
  1.9948 +            "FYL2X  \t\t\t# Q=Log_e*Log_2(x)"
  1.9949 +         %}
  1.9950 +  ins_encode( Opcode(0xD9), Opcode(0xED),   // fldln2
  1.9951 +              Push_SrcXD(src),
  1.9952 +              Opcode(0xD9), Opcode(0xF1),   // fyl2x
  1.9953 +              Push_ResultXD(dst));
  1.9954 +  ins_pipe( pipe_slow );
  1.9955 +%}
  1.9956 +
  1.9957 +//-------------Float Instructions-------------------------------
  1.9958 +// Float Math
  1.9959 +
  1.9960 +// Code for float compare:
  1.9961 +//     fcompp();
  1.9962 +//     fwait(); fnstsw_ax();
  1.9963 +//     sahf();
  1.9964 +//     movl(dst, unordered_result);
  1.9965 +//     jcc(Assembler::parity, exit);
  1.9966 +//     movl(dst, less_result);
  1.9967 +//     jcc(Assembler::below, exit);
  1.9968 +//     movl(dst, equal_result);
  1.9969 +//     jcc(Assembler::equal, exit);
  1.9970 +//     movl(dst, greater_result);
  1.9971 +//   exit:
  1.9972 +
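         +// NaN handling in the compares below: FUCOMIP and COMISS report "unordered"
         +// directly as PF=1, while the pre-P6 form tests C2 (bit 0x400) of the FPU
         +// status word after FNSTSW.  Either way the fix-up stuffs 1 into AH and does
         +// SAHF, setting CF so an unordered compare is treated as "less than".
         +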
  1.9973 +// P6 version of float compare, sets condition codes in EFLAGS
  1.9974 +instruct cmpF_cc_P6(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
  1.9975 +  predicate(VM_Version::supports_cmov() && UseSSE == 0);
  1.9976 +  match(Set cr (CmpF src1 src2));
  1.9977 +  effect(KILL rax);
  1.9978 +  ins_cost(150);
  1.9979 +  format %{ "FLD    $src1\n\t"
  1.9980 +            "FUCOMIP ST,$src2  // P6 instruction\n\t"
  1.9981 +            "JNP    exit\n\t"
  1.9982 +            "MOV    ah,1       // saw a NaN, set CF (treat as LT)\n\t"
  1.9983 +            "SAHF\n"
  1.9984 +     "exit:\tNOP               // avoid branch to branch" %}
  1.9985 +  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  1.9986 +  ins_encode( Push_Reg_D(src1),
  1.9987 +              OpcP, RegOpc(src2),
  1.9988 +              cmpF_P6_fixup );
  1.9989 +  ins_pipe( pipe_slow );
  1.9990 +%}
  1.9991 +
  1.9992 +
  1.9993 +// Compare & branch
  1.9994 +instruct cmpF_cc(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
  1.9995 +  predicate(UseSSE == 0);
  1.9996 +  match(Set cr (CmpF src1 src2));
  1.9997 +  effect(KILL rax);
  1.9998 +  ins_cost(200);
  1.9999 +  format %{ "FLD    $src1\n\t"
 1.10000 +            "FCOMp  $src2\n\t"
 1.10001 +            "FNSTSW AX\n\t"
 1.10002 +            "TEST   AX,0x400\n\t"
 1.10003 +            "JZ,s   flags\n\t"
 1.10004 +            "MOV    AH,1\t# unordered treat as LT\n"
 1.10005 +    "flags:\tSAHF" %}
 1.10006 +  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
 1.10007 +  ins_encode( Push_Reg_D(src1),
 1.10008 +              OpcP, RegOpc(src2),
 1.10009 +              fpu_flags);
 1.10010 +  ins_pipe( pipe_slow );
 1.10011 +%}
 1.10012 +
 1.10013 +// Compare vs zero into -1,0,1
 1.10014 +instruct cmpF_0(eRegI dst, regF src1, immF0 zero, eAXRegI rax, eFlagsReg cr) %{
 1.10015 +  predicate(UseSSE == 0);
 1.10016 +  match(Set dst (CmpF3 src1 zero));
 1.10017 +  effect(KILL cr, KILL rax);
 1.10018 +  ins_cost(280);
 1.10019 +  format %{ "FTSTF  $dst,$src1" %}
 1.10020 +  opcode(0xE4, 0xD9);
 1.10021 +  ins_encode( Push_Reg_D(src1),
 1.10022 +              OpcS, OpcP, PopFPU,
 1.10023 +              CmpF_Result(dst));
 1.10024 +  ins_pipe( pipe_slow );
 1.10025 +%}
 1.10026 +
 1.10027 +// Compare into -1,0,1
 1.10028 +instruct cmpF_reg(eRegI dst, regF src1, regF src2, eAXRegI rax, eFlagsReg cr) %{
 1.10029 +  predicate(UseSSE == 0);
 1.10030 +  match(Set dst (CmpF3 src1 src2));
 1.10031 +  effect(KILL cr, KILL rax);
 1.10032 +  ins_cost(300);
 1.10033 +  format %{ "FCMPF  $dst,$src1,$src2" %}
 1.10034 +  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
 1.10035 +  ins_encode( Push_Reg_D(src1),
 1.10036 +              OpcP, RegOpc(src2),
 1.10037 +              CmpF_Result(dst));
 1.10038 +  ins_pipe( pipe_slow );
 1.10039 +%}
 1.10040 +
 1.10041 +// float compare and set condition codes in EFLAGS by XMM regs
 1.10042 +instruct cmpX_cc(eFlagsRegU cr, regX dst, regX src, eAXRegI rax) %{
 1.10043 +  predicate(UseSSE>=1);
 1.10044 +  match(Set cr (CmpF dst src));
 1.10045 +  effect(KILL rax);
 1.10046 +  ins_cost(145);
 1.10047 +  format %{ "COMISS $dst,$src\n"
 1.10048 +          "\tJNP    exit\n"
 1.10049 +          "\tMOV    ah,1       // saw a NaN, set CF\n"
 1.10050 +          "\tSAHF\n"
 1.10051 +     "exit:\tNOP               // avoid branch to branch" %}
 1.10052 +  opcode(0x0F, 0x2F);
 1.10053 +  ins_encode(OpcP, OpcS, RegReg(dst, src), cmpF_P6_fixup);
 1.10054 +  ins_pipe( pipe_slow );
 1.10055 +%}
 1.10056 +
 1.10057 +// float compare and set condition codes in EFLAGS by XMM regs
 1.10058 +instruct cmpX_ccmem(eFlagsRegU cr, regX dst, memory src, eAXRegI rax) %{
 1.10059 +  predicate(UseSSE>=1);
 1.10060 +  match(Set cr (CmpF dst (LoadF src)));
 1.10061 +  effect(KILL rax);
 1.10062 +  ins_cost(165);
 1.10063 +  format %{ "COMISS $dst,$src\n"
 1.10064 +          "\tJNP    exit\n"
 1.10065 +          "\tMOV    ah,1       // saw a NaN, set CF\n"
 1.10066 +          "\tSAHF\n"
 1.10067 +     "exit:\tNOP               // avoid branch to branch" %}
 1.10068 +  opcode(0x0F, 0x2F);
 1.10069 +  ins_encode(OpcP, OpcS, RegMem(dst, src), cmpF_P6_fixup);
 1.10070 +  ins_pipe( pipe_slow );
 1.10071 +%}
 1.10072 +
 1.10073 +// Compare into -1,0,1 in XMM
 1.10074 +instruct cmpX_reg(eRegI dst, regX src1, regX src2, eFlagsReg cr) %{
 1.10075 +  predicate(UseSSE>=1);
 1.10076 +  match(Set dst (CmpF3 src1 src2));
 1.10077 +  effect(KILL cr);
 1.10078 +  ins_cost(255);
 1.10079 +  format %{ "XOR    $dst,$dst\n"
 1.10080 +          "\tCOMISS $src1,$src2\n"
 1.10081 +          "\tJP,s   nan\n"
 1.10082 +          "\tJEQ,s  exit\n"
 1.10083 +          "\tJA,s   inc\n"
 1.10084 +      "nan:\tDEC    $dst\n"
 1.10085 +          "\tJMP,s  exit\n"
 1.10086 +      "inc:\tINC    $dst\n"
 1.10087 +      "exit:"
 1.10088 +                %}
 1.10089 +  opcode(0x0F, 0x2F);
 1.10090 +  ins_encode(Xor_Reg(dst), OpcP, OpcS, RegReg(src1, src2), CmpX_Result(dst));
 1.10091 +  ins_pipe( pipe_slow );
 1.10092 +%}
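         +// Reading the ladder above: $dst is cleared up front, so "equal" exits with 0
         +// and "above" takes the INC path (+1).  There is no explicit "below" branch;
         +// a less-than result simply falls through to the DEC at "nan:", so both
         +// "less than" and unordered produce -1.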
 1.10093 +
 1.10094 +// Compare into -1,0,1 in XMM and memory
 1.10095 +instruct cmpX_regmem(eRegI dst, regX src1, memory mem, eFlagsReg cr) %{
 1.10096 +  predicate(UseSSE>=1);
 1.10097 +  match(Set dst (CmpF3 src1 (LoadF mem)));
 1.10098 +  effect(KILL cr);
 1.10099 +  ins_cost(275);
 1.10100 +  format %{ "COMISS $src1,$mem\n"
 1.10101 +          "\tMOV    $dst,0\t\t# do not blow flags\n"
 1.10102 +          "\tJP,s   nan\n"
 1.10103 +          "\tJEQ,s  exit\n"
 1.10104 +          "\tJA,s   inc\n"
 1.10105 +      "nan:\tDEC    $dst\n"
 1.10106 +          "\tJMP,s  exit\n"
 1.10107 +      "inc:\tINC    $dst\n"
 1.10108 +      "exit:"
 1.10109 +                %}
 1.10110 +  opcode(0x0F, 0x2F);
 1.10111 +  ins_encode(OpcP, OpcS, RegMem(src1, mem), LdImmI(dst,0x0), CmpX_Result(dst));
 1.10112 +  ins_pipe( pipe_slow );
 1.10113 +%}
 1.10114 +
 1.10115 +// Spill to obtain 24-bit precision
 1.10116 +instruct subF24_reg(stackSlotF dst, regF src1, regF src2) %{
 1.10117 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10118 +  match(Set dst (SubF src1 src2));
 1.10119 +
 1.10120 +  format %{ "FSUB   $dst,$src1 - $src2" %}
 1.10121 +  opcode(0xD8, 0x4); /* D8 E0+i or D8 /4 mod==0x3 ;; result in TOS */
 1.10122 +  ins_encode( Push_Reg_F(src1),
 1.10123 +              OpcReg_F(src2),
 1.10124 +              Pop_Mem_F(dst) );
 1.10125 +  ins_pipe( fpu_mem_reg_reg );
 1.10126 +%}
 1.10127 +//
 1.10128 +// This instruction does not round to 24-bits
 1.10129 +instruct subF_reg(regF dst, regF src) %{
 1.10130 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10131 +  match(Set dst (SubF dst src));
 1.10132 +
 1.10133 +  format %{ "FSUB   $dst,$src" %}
 1.10134 +  opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
 1.10135 +  ins_encode( Push_Reg_F(src),
 1.10136 +              OpcP, RegOpc(dst) );
 1.10137 +  ins_pipe( fpu_reg_reg );
 1.10138 +%}
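         +// Pattern for the x87 float rules in this file: when the compiler requests
         +// strict 24-bit results (select_24_bit_instr()), the "...24" variants target a
         +// stackSlotF and pop the result with a 32-bit store, forcing a round to IEEE
         +// single precision; the register variants keep the value on the FPU stack and
         +// deliberately skip that rounding.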
 1.10139 +
 1.10140 +// Spill to obtain 24-bit precision
 1.10141 +instruct addF24_reg(stackSlotF dst, regF src1, regF src2) %{
 1.10142 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10143 +  match(Set dst (AddF src1 src2));
 1.10144 +
 1.10145 +  format %{ "FADD   $dst,$src1,$src2" %}
 1.10146 +  opcode(0xD8, 0x0); /* D8 C0+i */
 1.10147 +  ins_encode( Push_Reg_F(src2),
 1.10148 +              OpcReg_F(src1),
 1.10149 +              Pop_Mem_F(dst) );
 1.10150 +  ins_pipe( fpu_mem_reg_reg );
 1.10151 +%}
 1.10152 +//
 1.10153 +// This instruction does not round to 24-bits
 1.10154 +instruct addF_reg(regF dst, regF src) %{
 1.10155 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10156 +  match(Set dst (AddF dst src));
 1.10157 +
 1.10158 +  format %{ "FLD    $src\n\t"
 1.10159 +            "FADDp  $dst,ST" %}
 1.10160 +  opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
 1.10161 +  ins_encode( Push_Reg_F(src),
 1.10162 +              OpcP, RegOpc(dst) );
 1.10163 +  ins_pipe( fpu_reg_reg );
 1.10164 +%}
 1.10165 +
 1.10166 +// Add two single precision floating point values in xmm
 1.10167 +instruct addX_reg(regX dst, regX src) %{
 1.10168 +  predicate(UseSSE>=1);
 1.10169 +  match(Set dst (AddF dst src));
 1.10170 +  format %{ "ADDSS  $dst,$src" %}
 1.10171 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), RegReg(dst, src));
 1.10172 +  ins_pipe( pipe_slow );
 1.10173 +%}
 1.10174 +
 1.10175 +instruct addX_imm(regX dst, immXF con) %{
 1.10176 +  predicate(UseSSE>=1);
 1.10177 +  match(Set dst (AddF dst con));
 1.10178 +  format %{ "ADDSS  $dst,[$con]" %}
 1.10179 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), LdImmX(dst, con) );
 1.10180 +  ins_pipe( pipe_slow );
 1.10181 +%}
 1.10182 +
 1.10183 +instruct addX_mem(regX dst, memory mem) %{
 1.10184 +  predicate(UseSSE>=1);
 1.10185 +  match(Set dst (AddF dst (LoadF mem)));
 1.10186 +  format %{ "ADDSS  $dst,$mem" %}
 1.10187 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), RegMem(dst, mem));
 1.10188 +  ins_pipe( pipe_slow );
 1.10189 +%}
 1.10190 +
 1.10191 +// Subtract two single precision floating point values in xmm
 1.10192 +instruct subX_reg(regX dst, regX src) %{
 1.10193 +  predicate(UseSSE>=1);
 1.10194 +  match(Set dst (SubF dst src));
 1.10195 +  format %{ "SUBSS  $dst,$src" %}
 1.10196 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), RegReg(dst, src));
 1.10197 +  ins_pipe( pipe_slow );
 1.10198 +%}
 1.10199 +
 1.10200 +instruct subX_imm(regX dst, immXF con) %{
 1.10201 +  predicate(UseSSE>=1);
 1.10202 +  match(Set dst (SubF dst con));
 1.10203 +  format %{ "SUBSS  $dst,[$con]" %}
 1.10204 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), LdImmX(dst, con) );
 1.10205 +  ins_pipe( pipe_slow );
 1.10206 +%}
 1.10207 +
 1.10208 +instruct subX_mem(regX dst, memory mem) %{
 1.10209 +  predicate(UseSSE>=1);
 1.10210 +  match(Set dst (SubF dst (LoadF mem)));
 1.10211 +  format %{ "SUBSS  $dst,$mem" %}
 1.10212 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), RegMem(dst,mem));
 1.10213 +  ins_pipe( pipe_slow );
 1.10214 +%}
 1.10215 +
 1.10216 +// Multiply two single precision floating point values in xmm
 1.10217 +instruct mulX_reg(regX dst, regX src) %{
 1.10218 +  predicate(UseSSE>=1);
 1.10219 +  match(Set dst (MulF dst src));
 1.10220 +  format %{ "MULSS  $dst,$src" %}
 1.10221 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), RegReg(dst, src));
 1.10222 +  ins_pipe( pipe_slow );
 1.10223 +%}
 1.10224 +
 1.10225 +instruct mulX_imm(regX dst, immXF con) %{
 1.10226 +  predicate(UseSSE>=1);
 1.10227 +  match(Set dst (MulF dst con));
 1.10228 +  format %{ "MULSS  $dst,[$con]" %}
 1.10229 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), LdImmX(dst, con) );
 1.10230 +  ins_pipe( pipe_slow );
 1.10231 +%}
 1.10232 +
 1.10233 +instruct mulX_mem(regX dst, memory mem) %{
 1.10234 +  predicate(UseSSE>=1);
 1.10235 +  match(Set dst (MulF dst (LoadF mem)));
 1.10236 +  format %{ "MULSS  $dst,$mem" %}
 1.10237 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), RegMem(dst,mem));
 1.10238 +  ins_pipe( pipe_slow );
 1.10239 +%}
 1.10240 +
 1.10241 +// Divide two single precision floating point values in xmm
 1.10242 +instruct divX_reg(regX dst, regX src) %{
 1.10243 +  predicate(UseSSE>=1);
 1.10244 +  match(Set dst (DivF dst src));
 1.10245 +  format %{ "DIVSS  $dst,$src" %}
 1.10246 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), RegReg(dst, src));
 1.10247 +  ins_pipe( pipe_slow );
 1.10248 +%}
 1.10249 +
 1.10250 +instruct divX_imm(regX dst, immXF con) %{
 1.10251 +  predicate(UseSSE>=1);
 1.10252 +  match(Set dst (DivF dst con));
 1.10253 +  format %{ "DIVSS  $dst,[$con]" %}
 1.10254 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), LdImmX(dst, con) );
 1.10255 +  ins_pipe( pipe_slow );
 1.10256 +%}
 1.10257 +
 1.10258 +instruct divX_mem(regX dst, memory mem) %{
 1.10259 +  predicate(UseSSE>=1);
 1.10260 +  match(Set dst (DivF dst (LoadF mem)));
 1.10261 +  format %{ "DIVSS  $dst,$mem" %}
 1.10262 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), RegMem(dst,mem));
 1.10263 +  ins_pipe( pipe_slow );
 1.10264 +%}
 1.10265 +
 1.10266 +// Get the square root of a single precision floating point value in xmm
 1.10267 +instruct sqrtX_reg(regX dst, regX src) %{
 1.10268 +  predicate(UseSSE>=1);
 1.10269 +  match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
 1.10270 +  format %{ "SQRTSS $dst,$src" %}
 1.10271 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x51), RegReg(dst, src));
 1.10272 +  ins_pipe( pipe_slow );
 1.10273 +%}
 1.10274 +
 1.10275 +instruct sqrtX_mem(regX dst, memory mem) %{
 1.10276 +  predicate(UseSSE>=1);
 1.10277 +  match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF mem)))));
 1.10278 +  format %{ "SQRTSS $dst,$mem" %}
 1.10279 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x51), RegMem(dst, mem));
 1.10280 +  ins_pipe( pipe_slow );
 1.10281 +%}
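         +// Matching ConvD2F(SqrtD(ConvF2D x)) with a plain SQRTSS is safe: square root
         +// is correctly rounded, and a double carries enough spare bits (53 >= 2*24+2)
         +// that rounding its result back to float cannot change the answer, so the
         +// single-precision instruction computes the same value.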
 1.10282 +
 1.10283 +// Get the square root of a double precision floating point value in xmm
 1.10284 +instruct sqrtXD_reg(regXD dst, regXD src) %{
 1.10285 +  predicate(UseSSE>=2);
 1.10286 +  match(Set dst (SqrtD src));
 1.10287 +  format %{ "SQRTSD $dst,$src" %}
 1.10288 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x51), RegReg(dst, src));
 1.10289 +  ins_pipe( pipe_slow );
 1.10290 +%}
 1.10291 +
 1.10292 +instruct sqrtXD_mem(regXD dst, memory mem) %{
 1.10293 +  predicate(UseSSE>=2);
 1.10294 +  match(Set dst (SqrtD (LoadD mem)));
 1.10295 +  format %{ "SQRTSD $dst,$mem" %}
 1.10296 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x51), RegMem(dst, mem));
 1.10297 +  ins_pipe( pipe_slow );
 1.10298 +%}
 1.10299 +
 1.10300 +instruct absF_reg(regFPR1 dst, regFPR1 src) %{
 1.10301 +  predicate(UseSSE==0);
 1.10302 +  match(Set dst (AbsF src));
 1.10303 +  ins_cost(100);
 1.10304 +  format %{ "FABS" %}
 1.10305 +  opcode(0xE1, 0xD9);
 1.10306 +  ins_encode( OpcS, OpcP );
 1.10307 +  ins_pipe( fpu_reg_reg );
 1.10308 +%}
 1.10309 +
 1.10310 +instruct absX_reg(regX dst ) %{
 1.10311 +  predicate(UseSSE>=1);
 1.10312 +  match(Set dst (AbsF dst));
 1.10313 +  format %{ "ANDPS  $dst,[0x7FFFFFFF]\t# ABS F by sign masking" %}
 1.10314 +  ins_encode( AbsXF_encoding(dst));
 1.10315 +  ins_pipe( pipe_slow );
 1.10316 +%}
 1.10317 +
 1.10318 +instruct negF_reg(regFPR1 dst, regFPR1 src) %{
 1.10319 +  predicate(UseSSE==0);
 1.10320 +  match(Set dst (NegF src));
 1.10321 +  ins_cost(100);
 1.10322 +  format %{ "FCHS" %}
 1.10323 +  opcode(0xE0, 0xD9);
 1.10324 +  ins_encode( OpcS, OpcP );
 1.10325 +  ins_pipe( fpu_reg_reg );
 1.10326 +%}
 1.10327 +
 1.10328 +instruct negX_reg( regX dst ) %{
 1.10329 +  predicate(UseSSE>=1);
 1.10330 +  match(Set dst (NegF dst));
 1.10331 +  format %{ "XORPS  $dst,[0x80000000]\t# CHS F by sign flipping" %}
 1.10332 +  ins_encode( NegXF_encoding(dst));
 1.10333 +  ins_pipe( pipe_slow );
 1.10334 +%}
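         +// ABS and NEG of a float in XMM are pure bit operations: ANDPS with 0x7FFFFFFF
         +// clears the sign bit and XORPS with 0x80000000 flips it, leaving the rest of
         +// the encoding (including NaNs) untouched.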
 1.10335 +
 1.10336 +// Cisc-alternate to addF_reg
 1.10337 +// Spill to obtain 24-bit precision
 1.10338 +instruct addF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
 1.10339 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10340 +  match(Set dst (AddF src1 (LoadF src2)));
 1.10341 +
 1.10342 +  format %{ "FLD    $src2\n\t"
 1.10343 +            "FADD   ST,$src1\n\t"
 1.10344 +            "FSTP_S $dst" %}
 1.10345 +  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
 1.10346 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
 1.10347 +              OpcReg_F(src1),
 1.10348 +              Pop_Mem_F(dst) );
 1.10349 +  ins_pipe( fpu_mem_reg_mem );
 1.10350 +%}
 1.10351 +//
 1.10352 +// Cisc-alternate to addF_reg
 1.10353 +// This instruction does not round to 24-bits
 1.10354 +instruct addF_reg_mem(regF dst, memory src) %{
 1.10355 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10356 +  match(Set dst (AddF dst (LoadF src)));
 1.10357 +
 1.10358 +  format %{ "FADD   $dst,$src" %}
 1.10359 +  opcode(0xDE, 0x0, 0xD9); /* DE C0+i or DE /0*/  /* LoadF  D9 /0 */
 1.10360 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
 1.10361 +              OpcP, RegOpc(dst) );
 1.10362 +  ins_pipe( fpu_reg_mem );
 1.10363 +%}
 1.10364 +
 1.10365 +// Following two instructions for _222_mpegaudio
 1.10366 +// Spill to obtain 24-bit precision
 1.10367 +instruct addF24_mem_reg(stackSlotF dst, regF src2, memory src1 ) %{
 1.10368 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10369 +  match(Set dst (AddF src1 src2));
 1.10370 +
 1.10371 +  format %{ "FADD   $dst,$src1,$src2" %}
 1.10372 +  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
 1.10373 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src1),
 1.10374 +              OpcReg_F(src2),
 1.10375 +              Pop_Mem_F(dst) );
 1.10376 +  ins_pipe( fpu_mem_reg_mem );
 1.10377 +%}
 1.10378 +
 1.10379 +// Cisc-spill variant
 1.10380 +// Spill to obtain 24-bit precision
 1.10381 +instruct addF24_mem_cisc(stackSlotF dst, memory src1, memory src2) %{
 1.10382 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10383 +  match(Set dst (AddF src1 (LoadF src2)));
 1.10384 +
 1.10385 +  format %{ "FADD   $dst,$src1,$src2 cisc" %}
 1.10386 +  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
 1.10387 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
 1.10388 +              set_instruction_start,
 1.10389 +              OpcP, RMopc_Mem(secondary,src1),
 1.10390 +              Pop_Mem_F(dst) );
 1.10391 +  ins_pipe( fpu_mem_mem_mem );
 1.10392 +%}
 1.10393 +
 1.10394 +// Spill to obtain 24-bit precision
 1.10395 +instruct addF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
 1.10396 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10397 +  match(Set dst (AddF src1 src2));
 1.10398 +
 1.10399 +  format %{ "FADD   $dst,$src1,$src2" %}
 1.10400 +  opcode(0xD8, 0x0, 0xD9); /* D8 /0 */  /* LoadF  D9 /0 */
 1.10401 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
 1.10402 +              set_instruction_start,
 1.10403 +              OpcP, RMopc_Mem(secondary,src1),
 1.10404 +              Pop_Mem_F(dst) );
 1.10405 +  ins_pipe( fpu_mem_mem_mem );
 1.10406 +%}
 1.10407 +
 1.10408 +
 1.10409 +// Spill to obtain 24-bit precision
 1.10410 +instruct addF24_reg_imm(stackSlotF dst, regF src1, immF src2) %{
 1.10411 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10412 +  match(Set dst (AddF src1 src2));
 1.10413 +  format %{ "FLD    $src1\n\t"
 1.10414 +            "FADD   $src2\n\t"
 1.10415 +            "FSTP_S $dst"  %}
 1.10416 +  opcode(0xD8, 0x00);       /* D8 /0 */
 1.10417 +  ins_encode( Push_Reg_F(src1),
 1.10418 +              Opc_MemImm_F(src2),
 1.10419 +              Pop_Mem_F(dst));
 1.10420 +  ins_pipe( fpu_mem_reg_con );
 1.10421 +%}
 1.10422 +//
 1.10423 +// This instruction does not round to 24-bits
 1.10424 +instruct addF_reg_imm(regF dst, regF src1, immF src2) %{
 1.10425 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10426 +  match(Set dst (AddF src1 src2));
 1.10427 +  format %{ "FLD    $src1\n\t"
 1.10428 +            "FADD   $src2\n\t"
 1.10429 +            "FSTP_S $dst"  %}
 1.10430 +  opcode(0xD8, 0x00);       /* D8 /0 */
 1.10431 +  ins_encode( Push_Reg_F(src1),
 1.10432 +              Opc_MemImm_F(src2),
 1.10433 +              Pop_Reg_F(dst));
 1.10434 +  ins_pipe( fpu_reg_reg_con );
 1.10435 +%}
 1.10436 +
 1.10437 +// Spill to obtain 24-bit precision
 1.10438 +instruct mulF24_reg(stackSlotF dst, regF src1, regF src2) %{
 1.10439 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10440 +  match(Set dst (MulF src1 src2));
 1.10441 +
 1.10442 +  format %{ "FLD    $src1\n\t"
 1.10443 +            "FMUL   $src2\n\t"
 1.10444 +            "FSTP_S $dst"  %}
 1.10445 +  opcode(0xD8, 0x1); /* D8 C8+i or D8 /1 ;; result in TOS */
 1.10446 +  ins_encode( Push_Reg_F(src1),
 1.10447 +              OpcReg_F(src2),
 1.10448 +              Pop_Mem_F(dst) );
 1.10449 +  ins_pipe( fpu_mem_reg_reg );
 1.10450 +%}
 1.10451 +//
 1.10452 +// This instruction does not round to 24-bits
 1.10453 +instruct mulF_reg(regF dst, regF src1, regF src2) %{
 1.10454 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10455 +  match(Set dst (MulF src1 src2));
 1.10456 +
 1.10457 +  format %{ "FLD    $src1\n\t"
 1.10458 +            "FMUL   $src2\n\t"
 1.10459 +            "FSTP_S $dst"  %}
 1.10460 +  opcode(0xD8, 0x1); /* D8 C8+i */
 1.10461 +  ins_encode( Push_Reg_F(src2),
 1.10462 +              OpcReg_F(src1),
 1.10463 +              Pop_Reg_F(dst) );
 1.10464 +  ins_pipe( fpu_reg_reg_reg );
 1.10465 +%}
 1.10466 +
 1.10467 +
 1.10468 +// Spill to obtain 24-bit precision
 1.10469 +// Cisc-alternate to reg-reg multiply
 1.10470 +instruct mulF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
 1.10471 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10472 +  match(Set dst (MulF src1 (LoadF src2)));
 1.10473 +
 1.10474 +  format %{ "FLD_S  $src2\n\t"
 1.10475 +            "FMUL   $src1\n\t"
 1.10476 +            "FSTP_S $dst"  %}
 1.10477 +  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or DE /1*/  /* LoadF D9 /0 */
 1.10478 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
 1.10479 +              OpcReg_F(src1),
 1.10480 +              Pop_Mem_F(dst) );
 1.10481 +  ins_pipe( fpu_mem_reg_mem );
 1.10482 +%}
 1.10483 +//
 1.10484 +// This instruction does not round to 24-bits
 1.10485 +// Cisc-alternate to reg-reg multiply
 1.10486 +instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 1.10487 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10488 +  match(Set dst (MulF src1 (LoadF src2)));
 1.10489 +
 1.10490 +  format %{ "FMUL   $dst,$src1,$src2" %}
 1.10491 +  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadF D9 /0 */
 1.10492 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
 1.10493 +              OpcReg_F(src1),
 1.10494 +              Pop_Reg_F(dst) );
 1.10495 +  ins_pipe( fpu_reg_reg_mem );
 1.10496 +%}
 1.10497 +
 1.10498 +// Spill to obtain 24-bit precision
 1.10499 +instruct mulF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
 1.10500 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10501 +  match(Set dst (MulF src1 src2));
 1.10502 +
 1.10503 +  format %{ "FMUL   $dst,$src1,$src2" %}
 1.10504 +  opcode(0xD8, 0x1, 0xD9); /* D8 /1 */  /* LoadF D9 /0 */
 1.10505 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
 1.10506 +              set_instruction_start,
 1.10507 +              OpcP, RMopc_Mem(secondary,src1),
 1.10508 +              Pop_Mem_F(dst) );
 1.10509 +  ins_pipe( fpu_mem_mem_mem );
 1.10510 +%}
 1.10511 +
 1.10512 +// Spill to obtain 24-bit precision
 1.10513 +instruct mulF24_reg_imm(stackSlotF dst, regF src1, immF src2) %{
 1.10514 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10515 +  match(Set dst (MulF src1 src2));
 1.10516 +
 1.10517 +  format %{ "FMULc $dst,$src1,$src2" %}
 1.10518 +  opcode(0xD8, 0x1);  /* D8 /1*/
 1.10519 +  ins_encode( Push_Reg_F(src1),
 1.10520 +              Opc_MemImm_F(src2),
 1.10521 +              Pop_Mem_F(dst));
 1.10522 +  ins_pipe( fpu_mem_reg_con );
 1.10523 +%}
 1.10524 +//
 1.10525 +// This instruction does not round to 24-bits
 1.10526 +instruct mulF_reg_imm(regF dst, regF src1, immF src2) %{
 1.10527 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10528 +  match(Set dst (MulF src1 src2));
 1.10529 +
 1.10530 +  format %{ "FMULc $dst,$src1,$src2" %}
 1.10531 +  opcode(0xD8, 0x1);  /* D8 /1*/
 1.10532 +  ins_encode( Push_Reg_F(src1),
 1.10533 +              Opc_MemImm_F(src2),
 1.10534 +              Pop_Reg_F(dst));
 1.10535 +  ins_pipe( fpu_reg_reg_con );
 1.10536 +%}
 1.10537 +
 1.10538 +
 1.10539 +//
 1.10540 +// MACRO1 -- subsume unshared load into mulF
 1.10541 +// This instruction does not round to 24-bits
 1.10542 +instruct mulF_reg_load1(regF dst, regF src, memory mem1 ) %{
 1.10543 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10544 +  match(Set dst (MulF (LoadF mem1) src));
 1.10545 +
 1.10546 +  format %{ "FLD    $mem1    ===MACRO1===\n\t"
 1.10547 +            "FMUL   ST,$src\n\t"
 1.10548 +            "FSTP   $dst" %}
 1.10549 +  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or D8 /1 */  /* LoadF D9 /0 */
 1.10550 +  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem1),
 1.10551 +              OpcReg_F(src),
 1.10552 +              Pop_Reg_F(dst) );
 1.10553 +  ins_pipe( fpu_reg_reg_mem );
 1.10554 +%}
 1.10555 +//
 1.10556 +// MACRO2 -- addF a mulF which subsumed an unshared load
 1.10557 +// This instruction does not round to 24-bits
 1.10558 +instruct addF_mulF_reg_load1(regF dst, memory mem1, regF src1, regF src2) %{
 1.10559 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10560 +  match(Set dst (AddF (MulF (LoadF mem1) src1) src2));
 1.10561 +  ins_cost(95);
 1.10562 +
 1.10563 +  format %{ "FLD    $mem1     ===MACRO2===\n\t"
 1.10564 +            "FMUL   ST,$src1  subsume mulF left load\n\t"
 1.10565 +            "FADD   ST,$src2\n\t"
 1.10566 +            "FSTP   $dst" %}
 1.10567 +  opcode(0xD9); /* LoadF D9 /0 */
 1.10568 +  ins_encode( OpcP, RMopc_Mem(0x00,mem1),
 1.10569 +              FMul_ST_reg(src1),
 1.10570 +              FAdd_ST_reg(src2),
 1.10571 +              Pop_Reg_F(dst) );
 1.10572 +  ins_pipe( fpu_reg_mem_reg_reg );
 1.10573 +%}
 1.10574 +
 1.10575 +// MACRO3 -- addF a mulF
 1.10576 +// This instruction does not round to 24-bits.  It is a '2-address'
 1.10577 +// instruction in that the result goes back to src2.  This eliminates
 1.10578 +// a move from the macro; possibly the register allocator will have
 1.10579 +// to add it back (and maybe not).
 1.10580 +instruct addF_mulF_reg(regF src2, regF src1, regF src0) %{
 1.10581 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10582 +  match(Set src2 (AddF (MulF src0 src1) src2));
 1.10583 +
 1.10584 +  format %{ "FLD    $src0     ===MACRO3===\n\t"
 1.10585 +            "FMUL   ST,$src1\n\t"
 1.10586 +            "FADDP  $src2,ST" %}
 1.10587 +  opcode(0xD9); /* LoadF D9 /0 */
 1.10588 +  ins_encode( Push_Reg_F(src0),
 1.10589 +              FMul_ST_reg(src1),
 1.10590 +              FAddP_reg_ST(src2) );
 1.10591 +  ins_pipe( fpu_reg_reg_reg );
 1.10592 +%}
 1.10593 +
 1.10594 +// MACRO4 -- divF subF
 1.10595 +// This instruction does not round to 24-bits
 1.10596 +instruct subF_divF_reg(regF dst, regF src1, regF src2, regF src3) %{
 1.10597 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10598 +  match(Set dst (DivF (SubF src2 src1) src3));
 1.10599 +
 1.10600 +  format %{ "FLD    $src2   ===MACRO4===\n\t"
 1.10601 +            "FSUB   ST,$src1\n\t"
 1.10602 +            "FDIV   ST,$src3\n\t"
 1.10603 +            "FSTP   $dst" %}
 1.10604 +  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
 1.10605 +  ins_encode( Push_Reg_F(src2),
 1.10606 +              subF_divF_encode(src1,src3),
 1.10607 +              Pop_Reg_F(dst) );
 1.10608 +  ins_pipe( fpu_reg_reg_reg_reg );
 1.10609 +%}
 1.10610 +
 1.10611 +// Spill to obtain 24-bit precision
 1.10612 +instruct divF24_reg(stackSlotF dst, regF src1, regF src2) %{
 1.10613 +  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10614 +  match(Set dst (DivF src1 src2));
 1.10615 +
 1.10616 +  format %{ "FDIV   $dst,$src1,$src2" %}
 1.10617 +  opcode(0xD8, 0x6); /* D8 F0+i or DE /6*/
 1.10618 +  ins_encode( Push_Reg_F(src1),
 1.10619 +              OpcReg_F(src2),
 1.10620 +              Pop_Mem_F(dst) );
 1.10621 +  ins_pipe( fpu_mem_reg_reg );
 1.10622 +%}
 1.10623 +//
 1.10624 +// This instruction does not round to 24-bits
 1.10625 +instruct divF_reg(regF dst, regF src) %{
 1.10626 +  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10627 +  match(Set dst (DivF dst src));
 1.10628 +
 1.10629 +  format %{ "FDIV   $dst,$src" %}
 1.10630 +  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
 1.10631 +  ins_encode( Push_Reg_F(src),
 1.10632 +              OpcP, RegOpc(dst) );
 1.10633 +  ins_pipe( fpu_reg_reg );
 1.10634 +%}
 1.10635 +
 1.10636 +
 1.10637 +// Spill to obtain 24-bit precision
 1.10638 +instruct modF24_reg(stackSlotF dst, regF src1, regF src2, eAXRegI rax, eFlagsReg cr) %{
 1.10639 +  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.10640 +  match(Set dst (ModF src1 src2));
 1.10641 +  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS
 1.10642 +
 1.10643 +  format %{ "FMOD   $dst,$src1,$src2" %}
 1.10644 +  ins_encode( Push_Reg_Mod_D(src1, src2),
 1.10645 +              emitModD(),
 1.10646 +              Push_Result_Mod_D(src2),
 1.10647 +              Pop_Mem_F(dst));
 1.10648 +  ins_pipe( pipe_slow );
 1.10649 +%}
 1.10650 +//
 1.10651 +// This instruction does not round to 24-bits
 1.10652 +instruct modF_reg(regF dst, regF src, eAXRegI rax, eFlagsReg cr) %{
 1.10653 +  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.10654 +  match(Set dst (ModF dst src));
 1.10655 +  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS
 1.10656 +
 1.10657 +  format %{ "FMOD   $dst,$src" %}
 1.10658 +  ins_encode(Push_Reg_Mod_D(dst, src),
 1.10659 +              emitModD(),
 1.10660 +              Push_Result_Mod_D(src),
 1.10661 +              Pop_Reg_F(dst));
 1.10662 +  ins_pipe( pipe_slow );
 1.10663 +%}
 1.10664 +
 1.10665 +instruct modX_reg(regX dst, regX src0, regX src1, eAXRegI rax, eFlagsReg cr) %{
 1.10666 +  predicate(UseSSE>=1);
 1.10667 +  match(Set dst (ModF src0 src1));
 1.10668 +  effect(KILL rax, KILL cr);
 1.10669 +  format %{ "SUB    ESP,4\t # FMOD\n"
 1.10670 +          "\tMOVSS  [ESP+0],$src1\n"
 1.10671 +          "\tFLD_S  [ESP+0]\n"
 1.10672 +          "\tMOVSS  [ESP+0],$src0\n"
 1.10673 +          "\tFLD_S  [ESP+0]\n"
 1.10674 +     "loop:\tFPREM\n"
 1.10675 +          "\tFWAIT\n"
 1.10676 +          "\tFNSTSW AX\n"
 1.10677 +          "\tSAHF\n"
 1.10678 +          "\tJP     loop\n"
 1.10679 +          "\tFSTP_S [ESP+0]\n"
 1.10680 +          "\tMOVSS  $dst,[ESP+0]\n"
 1.10681 +          "\tADD    ESP,4\n"
 1.10682 +          "\tFSTP   ST0\t # Restore FPU Stack"
 1.10683 +    %}
 1.10684 +  ins_cost(250);
 1.10685 +  ins_encode( Push_ModX_encoding(src0, src1), emitModD(), Push_ResultX(dst,0x4), PopFPU);
 1.10686 +  ins_pipe( pipe_slow );
 1.10687 +%}
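         +// (FPREM may leave only a partial remainder; it signals this in the C2 bit of
         +// the FPU status word.  FNSTSW AX / SAHF copy C2 into the parity flag, so the
         +// JP above simply retries the reduction until it is complete.)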
 1.10688 +
 1.10689 +
 1.10690 +//----------Arithmetic Conversion Instructions---------------------------------
 1.10691 +// The conversion operations are all alpha sorted.  Please keep it that way!
 1.10692 +
 1.10693 +instruct roundFloat_mem_reg(stackSlotF dst, regF src) %{
 1.10694 +  predicate(UseSSE==0);
 1.10695 +  match(Set dst (RoundFloat src));
 1.10696 +  ins_cost(125);
 1.10697 +  format %{ "FST_S  $dst,$src\t# F-round" %}
 1.10698 +  ins_encode( Pop_Mem_Reg_F(dst, src) );
 1.10699 +  ins_pipe( fpu_mem_reg );
 1.10700 +%}
 1.10701 +
 1.10702 +instruct roundDouble_mem_reg(stackSlotD dst, regD src) %{
 1.10703 +  predicate(UseSSE<=1);
 1.10704 +  match(Set dst (RoundDouble src));
 1.10705 +  ins_cost(125);
 1.10706 +  format %{ "FST_D  $dst,$src\t# D-round" %}
 1.10707 +  ins_encode( Pop_Mem_Reg_D(dst, src) );
 1.10708 +  ins_pipe( fpu_mem_reg );
 1.10709 +%}
 1.10710 +
 1.10711 +// Force rounding to 24-bit precision and 8-bit exponent
 1.10712 +instruct convD2F_reg(stackSlotF dst, regD src) %{
 1.10713 +  predicate(UseSSE==0);
 1.10714 +  match(Set dst (ConvD2F src));
 1.10715 +  format %{ "FST_S  $dst,$src\t# F-round" %}
 1.10716 +  expand %{
 1.10717 +    roundFloat_mem_reg(dst,src);
 1.10718 +  %}
 1.10719 +%}
 1.10720 +
 1.10721 +// Force rounding to 24-bit precision and 8-bit exponent
 1.10722 +instruct convD2X_reg(regX dst, regD src, eFlagsReg cr) %{
 1.10723 +  predicate(UseSSE==1);
 1.10724 +  match(Set dst (ConvD2F src));
 1.10725 +  effect( KILL cr );
 1.10726 +  format %{ "SUB    ESP,4\n\t"
 1.10727 +            "FST_S  [ESP],$src\t# F-round\n\t"
 1.10728 +            "MOVSS  $dst,[ESP]\n\t"
 1.10729 +            "ADD    ESP,4" %}
 1.10730 +  ins_encode( D2X_encoding(dst, src) );
 1.10731 +  ins_pipe( pipe_slow );
 1.10732 +%}
 1.10733 +
 1.10734 +// Force rounding double precision to single precision
 1.10735 +instruct convXD2X_reg(regX dst, regXD src) %{
 1.10736 +  predicate(UseSSE>=2);
 1.10737 +  match(Set dst (ConvD2F src));
 1.10738 +  format %{ "CVTSD2SS $dst,$src\t# F-round" %}
 1.10739 +  opcode(0xF2, 0x0F, 0x5A);
 1.10740 +  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
 1.10741 +  ins_pipe( pipe_slow );
 1.10742 +%}
 1.10743 +
 1.10744 +instruct convF2D_reg_reg(regD dst, regF src) %{
 1.10745 +  predicate(UseSSE==0);
 1.10746 +  match(Set dst (ConvF2D src));
 1.10747 +  format %{ "FST_S  $dst,$src\t# D-round" %}
 1.10748 +  ins_encode( Pop_Reg_Reg_D(dst, src));
 1.10749 +  ins_pipe( fpu_reg_reg );
 1.10750 +%}
 1.10751 +
 1.10752 +instruct convF2D_reg(stackSlotD dst, regF src) %{
 1.10753 +  predicate(UseSSE==1);
 1.10754 +  match(Set dst (ConvF2D src));
 1.10755 +  format %{ "FST_D  $dst,$src\t# D-round" %}
 1.10756 +  expand %{
 1.10757 +    roundDouble_mem_reg(dst,src);
 1.10758 +  %}
 1.10759 +%}
 1.10760 +
 1.10761 +instruct convX2D_reg(regD dst, regX src, eFlagsReg cr) %{
 1.10762 +  predicate(UseSSE==1);
 1.10763 +  match(Set dst (ConvF2D src));
 1.10764 +  effect( KILL cr );
 1.10765 +  format %{ "SUB    ESP,4\n\t"
 1.10766 +            "MOVSS  [ESP] $src\n\t"
 1.10767 +            "FLD_S  [ESP]\n\t"
 1.10768 +            "ADD    ESP,4\n\t"
 1.10769 +            "FSTP   $dst\t# D-round" %}
 1.10770 +  ins_encode( X2D_encoding(dst, src), Pop_Reg_D(dst));
 1.10771 +  ins_pipe( pipe_slow );
 1.10772 +%}
 1.10773 +
 1.10774 +instruct convX2XD_reg(regXD dst, regX src) %{
 1.10775 +  predicate(UseSSE>=2);
 1.10776 +  match(Set dst (ConvF2D src));
 1.10777 +  format %{ "CVTSS2SD $dst,$src\t# D-round" %}
 1.10778 +  opcode(0xF3, 0x0F, 0x5A);
 1.10779 +  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
 1.10780 +  ins_pipe( pipe_slow );
 1.10781 +%}
 1.10782 +
 1.10783 +// Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
 1.10784 +instruct convD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regD src, eFlagsReg cr ) %{
 1.10785 +  predicate(UseSSE<=1);
 1.10786 +  match(Set dst (ConvD2I src));
 1.10787 +  effect( KILL tmp, KILL cr );
 1.10788 +  format %{ "FLD    $src\t# Convert double to int \n\t"
 1.10789 +            "FLDCW  trunc mode\n\t"
 1.10790 +            "SUB    ESP,4\n\t"
 1.10791 +            "FISTp  [ESP + #0]\n\t"
 1.10792 +            "FLDCW  std/24-bit mode\n\t"
 1.10793 +            "POP    EAX\n\t"
 1.10794 +            "CMP    EAX,0x80000000\n\t"
 1.10795 +            "JNE,s  fast\n\t"
 1.10796 +            "FLD_D  $src\n\t"
 1.10797 +            "CALL   d2i_wrapper\n"
 1.10798 +      "fast:" %}
 1.10799 +  ins_encode( Push_Reg_D(src), D2I_encoding(src) );
 1.10800 +  ins_pipe( pipe_slow );
 1.10801 +%}
 1.10802 +
 1.10803 +// Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
 1.10804 +instruct convXD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regXD src, eFlagsReg cr ) %{
 1.10805 +  predicate(UseSSE>=2);
 1.10806 +  match(Set dst (ConvD2I src));
 1.10807 +  effect( KILL tmp, KILL cr );
 1.10808 +  format %{ "CVTTSD2SI $dst, $src\n\t"
 1.10809 +            "CMP    $dst,0x80000000\n\t"
 1.10810 +            "JNE,s  fast\n\t"
 1.10811 +            "SUB    ESP, 8\n\t"
 1.10812 +            "MOVSD  [ESP], $src\n\t"
 1.10813 +            "FLD_D  [ESP]\n\t"
 1.10814 +            "ADD    ESP, 8\n\t"
 1.10815 +            "CALL   d2i_wrapper\n"
 1.10816 +      "fast:" %}
 1.10817 +  opcode(0x1); // double-precision conversion
 1.10818 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x2C), FX2I_encoding(src,dst));
 1.10819 +  ins_pipe( pipe_slow );
 1.10820 +%}
 1.10821 +
 1.10822 +instruct convD2L_reg_reg( eADXRegL dst, regD src, eFlagsReg cr ) %{
 1.10823 +  predicate(UseSSE<=1);
 1.10824 +  match(Set dst (ConvD2L src));
 1.10825 +  effect( KILL cr );
 1.10826 +  format %{ "FLD    $src\t# Convert double to long\n\t"
 1.10827 +            "FLDCW  trunc mode\n\t"
 1.10828 +            "SUB    ESP,8\n\t"
 1.10829 +            "FISTp  [ESP + #0]\n\t"
 1.10830 +            "FLDCW  std/24-bit mode\n\t"
 1.10831 +            "POP    EAX\n\t"
 1.10832 +            "POP    EDX\n\t"
 1.10833 +            "CMP    EDX,0x80000000\n\t"
 1.10834 +            "JNE,s  fast\n\t"
 1.10835 +            "TEST   EAX,EAX\n\t"
 1.10836 +            "JNE,s  fast\n\t"
 1.10837 +            "FLD    $src\n\t"
 1.10838 +            "CALL   d2l_wrapper\n"
 1.10839 +      "fast:" %}
 1.10840 +  ins_encode( Push_Reg_D(src),  D2L_encoding(src) );
 1.10841 +  ins_pipe( pipe_slow );
 1.10842 +%}
 1.10843 +
 1.10844 +// XMM lacks a float/double->long conversion, so use the old FPU stack.
 1.10845 +instruct convXD2L_reg_reg( eADXRegL dst, regXD src, eFlagsReg cr ) %{
 1.10846 +  predicate (UseSSE>=2);
 1.10847 +  match(Set dst (ConvD2L src));
 1.10848 +  effect( KILL cr );
 1.10849 +  format %{ "SUB    ESP,8\t# Convert double to long\n\t"
 1.10850 +            "MOVSD  [ESP],$src\n\t"
 1.10851 +            "FLD_D  [ESP]\n\t"
 1.10852 +            "FLDCW  trunc mode\n\t"
 1.10853 +            "FISTp  [ESP + #0]\n\t"
 1.10854 +            "FLDCW  std/24-bit mode\n\t"
 1.10855 +            "POP    EAX\n\t"
 1.10856 +            "POP    EDX\n\t"
 1.10857 +            "CMP    EDX,0x80000000\n\t"
 1.10858 +            "JNE,s  fast\n\t"
 1.10859 +            "TEST   EAX,EAX\n\t"
 1.10860 +            "JNE,s  fast\n\t"
 1.10861 +            "SUB    ESP,8\n\t"
 1.10862 +            "MOVSD  [ESP],$src\n\t"
 1.10863 +            "FLD_D  [ESP]\n\t"
 1.10864 +            "CALL   d2l_wrapper\n"
 1.10865 +      "fast:" %}
 1.10866 +  ins_encode( XD2L_encoding(src) );
 1.10867 +  ins_pipe( pipe_slow );
 1.10868 +%}
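         +// (In 32-bit mode CVTTSD2SI can only produce a 32-bit result, so the value is
         +// spilled to the stack and converted with the x87 FISTP above, which can store
         +// a 64-bit integer directly.)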
 1.10869 +
 1.10870 +// Convert a double to an int.  Java semantics require we do complex
 1.10871 +// manglations in the corner cases.  So we set the rounding mode to
 1.10872 +// 'zero', store the darned double down as an int, and reset the
 1.10873 +// rounding mode to 'nearest'.  The hardware stores a flag value down
 1.10874 +// if we would overflow or converted a NAN; we check for this and
 1.10875 +// go the slow path if needed.
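         +//
         +// For illustration only, the semantics that the trunc-store plus the
         +// d2i_wrapper slow path implement are roughly the following (the helper name
         +// java_d2i below is just descriptive, not a real VM entry point):
         +//
         +//   jint java_d2i(double x) {
         +//     if (x != x)               return 0;            // NaN -> 0
         +//     if (x >=  2147483648.0)   return 0x7FFFFFFF;   // clamp to max_jint
         +//     if (x <= -2147483648.0)   return 0x80000000;   // clamp to min_jint
         +//     return (jint) x;                               // truncate toward zero
         +//   }
         +//
         +// The fast path relies on FIST/CVTTSD2SI producing the sentinel 0x80000000 for
         +// any NaN or out-of-range input, which is why that value is checked before
         +// calling the slow wrapper.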
 1.10876 +instruct convF2I_reg_reg(eAXRegI dst, eDXRegI tmp, regF src, eFlagsReg cr ) %{
 1.10877 +  predicate(UseSSE==0);
 1.10878 +  match(Set dst (ConvF2I src));
 1.10879 +  effect( KILL tmp, KILL cr );
 1.10880 +  format %{ "FLD    $src\t# Convert float to int \n\t"
 1.10881 +            "FLDCW  trunc mode\n\t"
 1.10882 +            "SUB    ESP,4\n\t"
 1.10883 +            "FISTp  [ESP + #0]\n\t"
 1.10884 +            "FLDCW  std/24-bit mode\n\t"
 1.10885 +            "POP    EAX\n\t"
 1.10886 +            "CMP    EAX,0x80000000\n\t"
 1.10887 +            "JNE,s  fast\n\t"
 1.10888 +            "FLD    $src\n\t"
 1.10889 +            "CALL   d2i_wrapper\n"
 1.10890 +      "fast:" %}
 1.10891 +  // D2I_encoding works for F2I
 1.10892 +  ins_encode( Push_Reg_F(src), D2I_encoding(src) );
 1.10893 +  ins_pipe( pipe_slow );
 1.10894 +%}
 1.10895 +
 1.10896 +// Convert a float in xmm to an int reg.
 1.10897 +instruct convX2I_reg(eAXRegI dst, eDXRegI tmp, regX src, eFlagsReg cr ) %{
 1.10898 +  predicate(UseSSE>=1);
 1.10899 +  match(Set dst (ConvF2I src));
 1.10900 +  effect( KILL tmp, KILL cr );
 1.10901 +  format %{ "CVTTSS2SI $dst, $src\n\t"
 1.10902 +            "CMP    $dst,0x80000000\n\t"
 1.10903 +            "JNE,s  fast\n\t"
 1.10904 +            "SUB    ESP, 4\n\t"
 1.10905 +            "MOVSS  [ESP], $src\n\t"
 1.10906 +            "FLD    [ESP]\n\t"
 1.10907 +            "ADD    ESP, 4\n\t"
 1.10908 +            "CALL   d2i_wrapper\n"
 1.10909 +      "fast:" %}
 1.10910 +  opcode(0x0); // single-precision conversion
 1.10911 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x2C), FX2I_encoding(src,dst));
 1.10912 +  ins_pipe( pipe_slow );
 1.10913 +%}
 1.10914 +
 1.10915 +instruct convF2L_reg_reg( eADXRegL dst, regF src, eFlagsReg cr ) %{
 1.10916 +  predicate(UseSSE==0);
 1.10917 +  match(Set dst (ConvF2L src));
 1.10918 +  effect( KILL cr );
 1.10919 +  format %{ "FLD    $src\t# Convert float to long\n\t"
 1.10920 +            "FLDCW  trunc mode\n\t"
 1.10921 +            "SUB    ESP,8\n\t"
 1.10922 +            "FISTp  [ESP + #0]\n\t"
 1.10923 +            "FLDCW  std/24-bit mode\n\t"
 1.10924 +            "POP    EAX\n\t"
 1.10925 +            "POP    EDX\n\t"
 1.10926 +            "CMP    EDX,0x80000000\n\t"
 1.10927 +            "JNE,s  fast\n\t"
 1.10928 +            "TEST   EAX,EAX\n\t"
 1.10929 +            "JNE,s  fast\n\t"
 1.10930 +            "FLD    $src\n\t"
 1.10931 +            "CALL   d2l_wrapper\n"
 1.10932 +      "fast:" %}
 1.10933 +  // D2L_encoding works for F2L
 1.10934 +  ins_encode( Push_Reg_F(src), D2L_encoding(src) );
 1.10935 +  ins_pipe( pipe_slow );
 1.10936 +%}
 1.10937 +
 1.10938 +// XMM lacks a float/double->long conversion, so use the old FPU stack.
 1.10939 +instruct convX2L_reg_reg( eADXRegL dst, regX src, eFlagsReg cr ) %{
 1.10940 +  predicate (UseSSE>=1);
 1.10941 +  match(Set dst (ConvF2L src));
 1.10942 +  effect( KILL cr );
 1.10943 +  format %{ "SUB    ESP,8\t# Convert float to long\n\t"
 1.10944 +            "MOVSS  [ESP],$src\n\t"
 1.10945 +            "FLD_S  [ESP]\n\t"
 1.10946 +            "FLDCW  trunc mode\n\t"
 1.10947 +            "FISTp  [ESP + #0]\n\t"
 1.10948 +            "FLDCW  std/24-bit mode\n\t"
 1.10949 +            "POP    EAX\n\t"
 1.10950 +            "POP    EDX\n\t"
 1.10951 +            "CMP    EDX,0x80000000\n\t"
 1.10952 +            "JNE,s  fast\n\t"
 1.10953 +            "TEST   EAX,EAX\n\t"
 1.10954 +            "JNE,s  fast\n\t"
 1.10955 +            "SUB    ESP,4\t# Convert float to long\n\t"
 1.10956 +            "MOVSS  [ESP],$src\n\t"
 1.10957 +            "FLD_S  [ESP]\n\t"
 1.10958 +            "ADD    ESP,4\n\t"
 1.10959 +            "CALL   d2l_wrapper\n"
 1.10960 +      "fast:" %}
 1.10961 +  ins_encode( X2L_encoding(src) );
 1.10962 +  ins_pipe( pipe_slow );
 1.10963 +%}
 1.10964 +
 1.10965 +instruct convI2D_reg(regD dst, stackSlotI src) %{
 1.10966 +  predicate( UseSSE<=1 );
 1.10967 +  match(Set dst (ConvI2D src));
 1.10968 +  format %{ "FILD   $src\n\t"
 1.10969 +            "FSTP   $dst" %}
 1.10970 +  opcode(0xDB, 0x0);  /* DB /0 */
 1.10971 +  ins_encode(Push_Mem_I(src), Pop_Reg_D(dst));
 1.10972 +  ins_pipe( fpu_reg_mem );
 1.10973 +%}
 1.10974 +
 1.10975 +instruct convI2XD_reg(regXD dst, eRegI src) %{
 1.10976 +  predicate( UseSSE>=2 );
 1.10977 +  match(Set dst (ConvI2D src));
 1.10978 +  format %{ "CVTSI2SD $dst,$src" %}
 1.10979 +  opcode(0xF2, 0x0F, 0x2A);
 1.10980 +  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
 1.10981 +  ins_pipe( pipe_slow );
 1.10982 +%}
 1.10983 +
 1.10984 +instruct convI2XD_mem(regXD dst, memory mem) %{
 1.10985 +  predicate( UseSSE>=2 );
 1.10986 +  match(Set dst (ConvI2D (LoadI mem)));
 1.10987 +  format %{ "CVTSI2SD $dst,$mem" %}
 1.10988 +  opcode(0xF2, 0x0F, 0x2A);
 1.10989 +  ins_encode( OpcP, OpcS, Opcode(tertiary), RegMem(dst, mem));
 1.10990 +  ins_pipe( pipe_slow );
 1.10991 +%}
 1.10992 +
 1.10993 +instruct convI2D_mem(regD dst, memory mem) %{
 1.10994 +  predicate( UseSSE<=1 && !Compile::current()->select_24_bit_instr());
 1.10995 +  match(Set dst (ConvI2D (LoadI mem)));
 1.10996 +  format %{ "FILD   $mem\n\t"
 1.10997 +            "FSTP   $dst" %}
 1.10998 +  opcode(0xDB);      /* DB /0 */
 1.10999 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),
 1.11000 +              Pop_Reg_D(dst));
 1.11001 +  ins_pipe( fpu_reg_mem );
 1.11002 +%}
 1.11003 +
 1.11004 +// Convert a byte to a float; no rounding step needed.
 1.11005 +instruct conv24I2F_reg(regF dst, stackSlotI src) %{
 1.11006 +  predicate( UseSSE==0 && n->in(1)->Opcode() == Op_AndI && n->in(1)->in(2)->is_Con() && n->in(1)->in(2)->get_int() == 255 );
 1.11007 +  match(Set dst (ConvI2F src));
 1.11008 +  format %{ "FILD   $src\n\t"
 1.11009 +            "FSTP   $dst" %}
 1.11010 +
 1.11011 +  opcode(0xDB, 0x0);  /* DB /0 */
 1.11012 +  ins_encode(Push_Mem_I(src), Pop_Reg_F(dst));
 1.11013 +  ins_pipe( fpu_reg_mem );
 1.11014 +%}
 1.11015 +
 1.11016 +// In 24-bit mode, force exponent rounding by storing back out
 1.11017 +instruct convI2F_SSF(stackSlotF dst, stackSlotI src) %{
 1.11018 +  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.11019 +  match(Set dst (ConvI2F src));
 1.11020 +  ins_cost(200);
 1.11021 +  format %{ "FILD   $src\n\t"
 1.11022 +            "FSTP_S $dst" %}
 1.11023 +  opcode(0xDB, 0x0);  /* DB /0 */
 1.11024 +  ins_encode( Push_Mem_I(src),
 1.11025 +              Pop_Mem_F(dst));
 1.11026 +  ins_pipe( fpu_mem_mem );
 1.11027 +%}
 1.11028 +
 1.11029 +// In 24-bit mode, force exponent rounding by storing back out
 1.11030 +instruct convI2F_SSF_mem(stackSlotF dst, memory mem) %{
 1.11031 +  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
 1.11032 +  match(Set dst (ConvI2F (LoadI mem)));
 1.11033 +  ins_cost(200);
 1.11034 +  format %{ "FILD   $mem\n\t"
 1.11035 +            "FSTP_S $dst" %}
 1.11036 +  opcode(0xDB);  /* DB /0 */
 1.11037 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),
 1.11038 +              Pop_Mem_F(dst));
 1.11039 +  ins_pipe( fpu_mem_mem );
 1.11040 +%}
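         +// (The 24-bit precision-control mode rounds only the significand; the exponent
         +// keeps its extended range, so the FSTP_S through a 32-bit stack slot is what
         +// actually narrows the exponent to float range.)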
 1.11041 +
 1.11042 +// This instruction does not round to 24-bits
 1.11043 +instruct convI2F_reg(regF dst, stackSlotI src) %{
 1.11044 +  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.11045 +  match(Set dst (ConvI2F src));
 1.11046 +  format %{ "FILD   $src\n\t"
 1.11047 +            "FSTP   $dst" %}
 1.11048 +  opcode(0xDB, 0x0);  /* DB /0 */
 1.11049 +  ins_encode( Push_Mem_I(src),
 1.11050 +              Pop_Reg_F(dst));
 1.11051 +  ins_pipe( fpu_reg_mem );
 1.11052 +%}
 1.11053 +
 1.11054 +// This instruction does not round to 24-bits
 1.11055 +instruct convI2F_mem(regF dst, memory mem) %{
 1.11056 +  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
 1.11057 +  match(Set dst (ConvI2F (LoadI mem)));
 1.11058 +  format %{ "FILD   $mem\n\t"
 1.11059 +            "FSTP   $dst" %}
 1.11060 +  opcode(0xDB);      /* DB /0 */
 1.11061 +  ins_encode( OpcP, RMopc_Mem(0x00,mem),
 1.11062 +              Pop_Reg_F(dst));
 1.11063 +  ins_pipe( fpu_reg_mem );
 1.11064 +%}
 1.11065 +
 1.11066 +// Convert an int to a float in xmm; no rounding step needed.
 1.11067 +instruct convI2X_reg(regX dst, eRegI src) %{
 1.11068 +  predicate(UseSSE>=1);
 1.11069 +  match(Set dst (ConvI2F src));
 1.11070 +  format %{ "CVTSI2SS $dst, $src" %}
 1.11071 +
 1.11072 +  opcode(0xF3, 0x0F, 0x2A);  /* F3 0F 2A /r */
 1.11073 +  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
 1.11074 +  ins_pipe( pipe_slow );
 1.11075 +%}
 1.11076 +
 1.11077 +instruct convI2L_reg( eRegL dst, eRegI src, eFlagsReg cr) %{
 1.11078 +  match(Set dst (ConvI2L src));
 1.11079 +  effect(KILL cr);
 1.11080 +  format %{ "MOV    $dst.lo,$src\n\t"
 1.11081 +            "MOV    $dst.hi,$src\n\t"
 1.11082 +            "SAR    $dst.hi,31" %}
 1.11083 +  ins_encode(convert_int_long(dst,src));
 1.11084 +  ins_pipe( ialu_reg_reg_long );
 1.11085 +%}
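         +// (Example: src = -5 = 0xFFFFFFFB.  After both copies, SAR $dst.hi,31 smears
         +// the sign bit through the high half, leaving 0xFFFFFFFF:FFFFFFFB, i.e. -5 as
         +// a long.)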
 1.11086 +
 1.11087 +// Zero-extend convert int to long
 1.11088 +instruct convI2L_reg_zex(eRegL dst, eRegI src, immL_32bits mask, eFlagsReg flags ) %{
 1.11089 +  match(Set dst (AndL (ConvI2L src) mask) );
 1.11090 +  effect( KILL flags );
 1.11091 +  format %{ "MOV    $dst.lo,$src\n\t"
 1.11092 +            "XOR    $dst.hi,$dst.hi" %}
 1.11093 +  opcode(0x33); // XOR
 1.11094 +  ins_encode(enc_Copy(dst,src), OpcP, RegReg_Hi2(dst,dst) );
 1.11095 +  ins_pipe( ialu_reg_reg_long );
 1.11096 +%}
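         +// (This pattern is what the Java idiom ((long) i) & 0xFFFFFFFFL -- an unsigned
         +// int-to-long widening -- boils down to: the low half is a plain copy and the
         +// high half is simply zeroed, so no sign-propagating SAR is needed.)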
 1.11097 +
 1.11098 +// Zero-extend long
 1.11099 +instruct zerox_long(eRegL dst, eRegL src, immL_32bits mask, eFlagsReg flags ) %{
 1.11100 +  match(Set dst (AndL src mask) );
 1.11101 +  effect( KILL flags );
 1.11102 +  format %{ "MOV    $dst.lo,$src.lo\n\t"
 1.11103 +            "XOR    $dst.hi,$dst.hi" %}
 1.11104 +  opcode(0x33); // XOR
 1.11105 +  ins_encode(enc_Copy(dst,src), OpcP, RegReg_Hi2(dst,dst) );
 1.11106 +  ins_pipe( ialu_reg_reg_long );
 1.11107 +%}
 1.11108 +
 1.11109 +instruct convL2D_reg( stackSlotD dst, eRegL src, eFlagsReg cr) %{
 1.11110 +  predicate (UseSSE<=1);
 1.11111 +  match(Set dst (ConvL2D src));
 1.11112 +  effect( KILL cr );
 1.11113 +  format %{ "PUSH   $src.hi\t# Convert long to double\n\t"
 1.11114 +            "PUSH   $src.lo\n\t"
 1.11115 +            "FILD   ST,[ESP + #0]\n\t"
 1.11116 +            "ADD    ESP,8\n\t"
 1.11117 +            "FSTP_D $dst\t# D-round" %}
 1.11118 +  opcode(0xDF, 0x5);  /* DF /5 */
 1.11119 +  ins_encode(convert_long_double(src), Pop_Mem_D(dst));
 1.11120 +  ins_pipe( pipe_slow );
 1.11121 +%}
 1.11122 +
 1.11123 +instruct convL2XD_reg( regXD dst, eRegL src, eFlagsReg cr) %{
 1.11124 +  predicate (UseSSE>=2);
 1.11125 +  match(Set dst (ConvL2D src));
 1.11126 +  effect( KILL cr );
 1.11127 +  format %{ "PUSH   $src.hi\t# Convert long to double\n\t"
 1.11128 +            "PUSH   $src.lo\n\t"
 1.11129 +            "FILD_D [ESP]\n\t"
 1.11130 +            "FSTP_D [ESP]\n\t"
 1.11131 +            "MOVSD  $dst,[ESP]\n\t"
 1.11132 +            "ADD    ESP,8" %}
 1.11133 +  opcode(0xDF, 0x5);  /* DF /5 */
 1.11134 +  ins_encode(convert_long_double2(src), Push_ResultXD(dst));
 1.11135 +  ins_pipe( pipe_slow );
 1.11136 +%}
 1.11137 +
 1.11138 +instruct convL2X_reg( regX dst, eRegL src, eFlagsReg cr) %{
 1.11139 +  predicate (UseSSE>=1);
 1.11140 +  match(Set dst (ConvL2F src));
 1.11141 +  effect( KILL cr );
 1.11142 +  format %{ "PUSH   $src.hi\t# Convert long to single float\n\t"
 1.11143 +            "PUSH   $src.lo\n\t"
 1.11144 +            "FILD_D [ESP]\n\t"
 1.11145 +            "FSTP_S [ESP]\n\t"
 1.11146 +            "MOVSS  $dst,[ESP]\n\t"
 1.11147 +            "ADD    ESP,8" %}
 1.11148 +  opcode(0xDF, 0x5);  /* DF /5 */
 1.11149 +  ins_encode(convert_long_double2(src), Push_ResultX(dst,0x8));
 1.11150 +  ins_pipe( pipe_slow );
 1.11151 +%}
 1.11152 +
 1.11153 +instruct convL2F_reg( stackSlotF dst, eRegL src, eFlagsReg cr) %{
 1.11154 +  match(Set dst (ConvL2F src));
 1.11155 +  effect( KILL cr );
 1.11156 +  format %{ "PUSH   $src.hi\t# Convert long to single float\n\t"
 1.11157 +            "PUSH   $src.lo\n\t"
 1.11158 +            "FILD   ST,[ESP + #0]\n\t"
 1.11159 +            "ADD    ESP,8\n\t"
 1.11160 +            "FSTP_S $dst\t# F-round" %}
 1.11161 +  opcode(0xDF, 0x5);  /* DF /5 */
 1.11162 +  ins_encode(convert_long_double(src), Pop_Mem_F(dst));
 1.11163 +  ins_pipe( pipe_slow );
 1.11164 +%}
 1.11165 +
 1.11166 +instruct convL2I_reg( eRegI dst, eRegL src ) %{
 1.11167 +  match(Set dst (ConvL2I src));
 1.11168 +  effect( DEF dst, USE src );
 1.11169 +  format %{ "MOV    $dst,$src.lo" %}
 1.11170 +  ins_encode(enc_CopyL_Lo(dst,src));
 1.11171 +  ins_pipe( ialu_reg_reg );
 1.11172 +%}
 1.11173 +
 1.11174 +
 1.11175 +instruct MoveF2I_stack_reg(eRegI dst, stackSlotF src) %{
 1.11176 +  match(Set dst (MoveF2I src));
 1.11177 +  effect( DEF dst, USE src );
 1.11178 +  ins_cost(100);
 1.11179 +  format %{ "MOV    $dst,$src\t# MoveF2I_stack_reg" %}
 1.11180 +  opcode(0x8B);
 1.11181 +  ins_encode( OpcP, RegMem(dst,src));
 1.11182 +  ins_pipe( ialu_reg_mem );
 1.11183 +%}
 1.11184 +
 1.11185 +instruct MoveF2I_reg_stack(stackSlotI dst, regF src) %{
 1.11186 +  predicate(UseSSE==0);
 1.11187 +  match(Set dst (MoveF2I src));
 1.11188 +  effect( DEF dst, USE src );
 1.11189 +
 1.11190 +  ins_cost(125);
 1.11191 +  format %{ "FST_S  $dst,$src\t# MoveF2I_reg_stack" %}
 1.11192 +  ins_encode( Pop_Mem_Reg_F(dst, src) );
 1.11193 +  ins_pipe( fpu_mem_reg );
 1.11194 +%}
 1.11195 +
 1.11196 +instruct MoveF2I_reg_stack_sse(stackSlotI dst, regX src) %{
 1.11197 +  predicate(UseSSE>=1);
 1.11198 +  match(Set dst (MoveF2I src));
 1.11199 +  effect( DEF dst, USE src );
 1.11200 +
 1.11201 +  ins_cost(95);
 1.11202 +  format %{ "MOVSS  $dst,$src\t# MoveF2I_reg_stack_sse" %}
 1.11203 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x11), RegMem(src, dst));
 1.11204 +  ins_pipe( pipe_slow );
 1.11205 +%}
 1.11206 +
 1.11207 +instruct MoveF2I_reg_reg_sse(eRegI dst, regX src) %{
 1.11208 +  predicate(UseSSE>=2);
 1.11209 +  match(Set dst (MoveF2I src));
 1.11210 +  effect( DEF dst, USE src );
 1.11211 +  ins_cost(85);
 1.11212 +  format %{ "MOVD   $dst,$src\t# MoveF2I_reg_reg_sse" %}
 1.11213 +  ins_encode( MovX2I_reg(dst, src));
 1.11214 +  ins_pipe( pipe_slow );
 1.11215 +%}
 1.11216 +
 1.11217 +instruct MoveI2F_reg_stack(stackSlotF dst, eRegI src) %{
 1.11218 +  match(Set dst (MoveI2F src));
 1.11219 +  effect( DEF dst, USE src );
 1.11220 +
 1.11221 +  ins_cost(100);
 1.11222 +  format %{ "MOV    $dst,$src\t# MoveI2F_reg_stack" %}
 1.11223 +  opcode(0x89);
 1.11224 +  ins_encode( OpcPRegSS( dst, src ) );
 1.11225 +  ins_pipe( ialu_mem_reg );
 1.11226 +%}
 1.11227 +
 1.11228 +
 1.11229 +instruct MoveI2F_stack_reg(regF dst, stackSlotI src) %{
 1.11230 +  predicate(UseSSE==0);
 1.11231 +  match(Set dst (MoveI2F src));
 1.11232 +  effect(DEF dst, USE src);
 1.11233 +
 1.11234 +  ins_cost(125);
 1.11235 +  format %{ "FLD_S  $src\n\t"
 1.11236 +            "FSTP   $dst\t# MoveI2F_stack_reg" %}
 1.11237 +  opcode(0xD9);               /* D9 /0, FLD m32real */
 1.11238 +  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
 1.11239 +              Pop_Reg_F(dst) );
 1.11240 +  ins_pipe( fpu_reg_mem );
 1.11241 +%}
 1.11242 +
 1.11243 +instruct MoveI2F_stack_reg_sse(regX dst, stackSlotI src) %{
 1.11244 +  predicate(UseSSE>=1);
 1.11245 +  match(Set dst (MoveI2F src));
 1.11246 +  effect( DEF dst, USE src );
 1.11247 +
 1.11248 +  ins_cost(95);
 1.11249 +  format %{ "MOVSS  $dst,$src\t# MoveI2F_stack_reg_sse" %}
 1.11250 +  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), RegMem(dst,src));
 1.11251 +  ins_pipe( pipe_slow );
 1.11252 +%}
 1.11253 +
 1.11254 +instruct MoveI2F_reg_reg_sse(regX dst, eRegI src) %{
 1.11255 +  predicate(UseSSE>=2);
 1.11256 +  match(Set dst (MoveI2F src));
 1.11257 +  effect( DEF dst, USE src );
 1.11258 +
 1.11259 +  ins_cost(85);
 1.11260 +  format %{ "MOVD   $dst,$src\t# MoveI2F_reg_reg_sse" %}
 1.11261 +  ins_encode( MovI2X_reg(dst, src) );
 1.11262 +  ins_pipe( pipe_slow );
 1.11263 +%}
 1.11264 +
 1.11265 +instruct MoveD2L_stack_reg(eRegL dst, stackSlotD src) %{
 1.11266 +  match(Set dst (MoveD2L src));
 1.11267 +  effect(DEF dst, USE src);
 1.11268 +
 1.11269 +  ins_cost(250);
 1.11270 +  format %{ "MOV    $dst.lo,$src\n\t"
 1.11271 +            "MOV    $dst.hi,$src+4\t# MoveD2L_stack_reg" %}
 1.11272 +  opcode(0x8B, 0x8B);
 1.11273 +  ins_encode( OpcP, RegMem(dst,src), OpcS, RegMem_Hi(dst,src));
 1.11274 +  ins_pipe( ialu_mem_long_reg );
 1.11275 +%}
 1.11276 +
 1.11277 +instruct MoveD2L_reg_stack(stackSlotL dst, regD src) %{
 1.11278 +  predicate(UseSSE<=1);
 1.11279 +  match(Set dst (MoveD2L src));
 1.11280 +  effect(DEF dst, USE src);
 1.11281 +
 1.11282 +  ins_cost(125);
 1.11283 +  format %{ "FST_D  $dst,$src\t# MoveD2L_reg_stack" %}
 1.11284 +  ins_encode( Pop_Mem_Reg_D(dst, src) );
 1.11285 +  ins_pipe( fpu_mem_reg );
 1.11286 +%}
 1.11287 +
 1.11288 +instruct MoveD2L_reg_stack_sse(stackSlotL dst, regXD src) %{
 1.11289 +  predicate(UseSSE>=2);
 1.11290 +  match(Set dst (MoveD2L src));
 1.11291 +  effect(DEF dst, USE src);
 1.11292 +  ins_cost(95);
 1.11293 +
 1.11294 +  format %{ "MOVSD  $dst,$src\t# MoveD2L_reg_stack_sse" %}
 1.11295 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x11), RegMem(src,dst));
 1.11296 +  ins_pipe( pipe_slow );
 1.11297 +%}
 1.11298 +
 1.11299 +instruct MoveD2L_reg_reg_sse(eRegL dst, regXD src, regXD tmp) %{
 1.11300 +  predicate(UseSSE>=2);
 1.11301 +  match(Set dst (MoveD2L src));
 1.11302 +  effect(DEF dst, USE src, TEMP tmp);
 1.11303 +  ins_cost(85);
 1.11304 +  format %{ "MOVD   $dst.lo,$src\n\t"
 1.11305 +            "PSHUFLW $tmp,$src,0x4E\n\t"
 1.11306 +            "MOVD   $dst.hi,$tmp\t# MoveD2L_reg_reg_sse" %}
 1.11307 +  ins_encode( MovXD2L_reg(dst, src, tmp) );
 1.11308 +  ins_pipe( pipe_slow );
 1.11309 +%}
 1.11310 +
 1.11311 +instruct MoveL2D_reg_stack(stackSlotD dst, eRegL src) %{
 1.11312 +  match(Set dst (MoveL2D src));
 1.11313 +  effect(DEF dst, USE src);
 1.11314 +
 1.11315 +  ins_cost(200);
 1.11316 +  format %{ "MOV    $dst,$src.lo\n\t"
 1.11317 +            "MOV    $dst+4,$src.hi\t# MoveL2D_reg_stack" %}
 1.11318 +  opcode(0x89, 0x89);
 1.11319 +  ins_encode( OpcP, RegMem( src, dst ), OpcS, RegMem_Hi( src, dst ) );
 1.11320 +  ins_pipe( ialu_mem_long_reg );
 1.11321 +%}
 1.11322 +
 1.11323 +
 1.11324 +instruct MoveL2D_stack_reg(regD dst, stackSlotL src) %{
 1.11325 +  predicate(UseSSE<=1);
 1.11326 +  match(Set dst (MoveL2D src));
 1.11327 +  effect(DEF dst, USE src);
 1.11328 +  ins_cost(125);
 1.11329 +
 1.11330 +  format %{ "FLD_D  $src\n\t"
 1.11331 +            "FSTP   $dst\t# MoveL2D_stack_reg" %}
 1.11332 +  opcode(0xDD);               /* DD /0, FLD m64real */
 1.11333 +  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
 1.11334 +              Pop_Reg_D(dst) );
 1.11335 +  ins_pipe( fpu_reg_mem );
 1.11336 +%}
 1.11337 +
 1.11338 +
 1.11339 +instruct MoveL2D_stack_reg_sse(regXD dst, stackSlotL src) %{
 1.11340 +  predicate(UseSSE>=2 && UseXmmLoadAndClearUpper);
 1.11341 +  match(Set dst (MoveL2D src));
 1.11342 +  effect(DEF dst, USE src);
 1.11343 +
 1.11344 +  ins_cost(95);
 1.11345 +  format %{ "MOVSD  $dst,$src\t# MoveL2D_stack_reg_sse" %}
 1.11346 +  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x10), RegMem(dst,src));
 1.11347 +  ins_pipe( pipe_slow );
 1.11348 +%}
 1.11349 +
 1.11350 +instruct MoveL2D_stack_reg_sse_partial(regXD dst, stackSlotL src) %{
 1.11351 +  predicate(UseSSE>=2 && !UseXmmLoadAndClearUpper);
 1.11352 +  match(Set dst (MoveL2D src));
 1.11353 +  effect(DEF dst, USE src);
 1.11354 +
 1.11355 +  ins_cost(95);
 1.11356 +  format %{ "MOVLPD $dst,$src\t# MoveL2D_stack_reg_sse" %}
 1.11357 +  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x12), RegMem(dst,src));
 1.11358 +  ins_pipe( pipe_slow );
 1.11359 +%}
 1.11360 +
 1.11361 +instruct MoveL2D_reg_reg_sse(regXD dst, eRegL src, regXD tmp) %{
 1.11362 +  predicate(UseSSE>=2);
 1.11363 +  match(Set dst (MoveL2D src));
 1.11364 +  effect(TEMP dst, USE src, TEMP tmp);
 1.11365 +  ins_cost(85);
 1.11366 +  format %{ "MOVD   $dst,$src.lo\n\t"
 1.11367 +            "MOVD   $tmp,$src.hi\n\t"
 1.11368 +            "PUNPCKLDQ $dst,$tmp\t# MoveL2D_reg_reg_sse" %}
 1.11369 +  ins_encode( MovL2XD_reg(dst, src, tmp) );
 1.11370 +  ins_pipe( pipe_slow );
 1.11371 +%}
 1.11372 +
 1.11373 +// Replicate scalar to packed byte (1 byte) values in xmm
 1.11374 +instruct Repl8B_reg(regXD dst, regXD src) %{
 1.11375 +  predicate(UseSSE>=2);
 1.11376 +  match(Set dst (Replicate8B src));
 1.11377 +  format %{ "MOVDQA  $dst,$src\n\t"
 1.11378 +            "PUNPCKLBW $dst,$dst\n\t"
 1.11379 +            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
 1.11380 +  ins_encode( pshufd_8x8(dst, src));
 1.11381 +  ins_pipe( pipe_slow );
 1.11382 +%}
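         +// (PUNPCKLBW of the register with itself widens each low byte into both halves
         +// of a 16-bit lane, and PSHUFLW 0x00 then copies lane 0 into all four low
         +// lanes, leaving eight copies of the original byte.)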
 1.11383 +
 1.11384 +// Replicate scalar to packed byte (1 byte) values in xmm
 1.11385 +instruct Repl8B_eRegI(regXD dst, eRegI src) %{
 1.11386 +  predicate(UseSSE>=2);
 1.11387 +  match(Set dst (Replicate8B src));
 1.11388 +  format %{ "MOVD    $dst,$src\n\t"
 1.11389 +            "PUNPCKLBW $dst,$dst\n\t"
 1.11390 +            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
 1.11391 +  ins_encode( mov_i2x(dst, src), pshufd_8x8(dst, dst));
 1.11392 +  ins_pipe( pipe_slow );
 1.11393 +%}
 1.11394 +
 1.11395 +// Replicate scalar zero to packed byte (1 byte) values in xmm
 1.11396 +instruct Repl8B_immI0(regXD dst, immI0 zero) %{
 1.11397 +  predicate(UseSSE>=2);
 1.11398 +  match(Set dst (Replicate8B zero));
 1.11399 +  format %{ "PXOR  $dst,$dst\t! replicate8B" %}
 1.11400 +  ins_encode( pxor(dst, dst));
 1.11401 +  ins_pipe( fpu_reg_reg );
 1.11402 +%}
 1.11403 +
 1.11404 +// Replicate scalar to packed short (2 byte) values in xmm
 1.11405 +instruct Repl4S_reg(regXD dst, regXD src) %{
 1.11406 +  predicate(UseSSE>=2);
 1.11407 +  match(Set dst (Replicate4S src));
 1.11408 +  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4S" %}
 1.11409 +  ins_encode( pshufd_4x16(dst, src));
 1.11410 +  ins_pipe( fpu_reg_reg );
 1.11411 +%}
 1.11412 +
 1.11413 +// Replicate scalar to packed short (2 byte) values in xmm
 1.11414 +instruct Repl4S_eRegI(regXD dst, eRegI src) %{
 1.11415 +  predicate(UseSSE>=2);
 1.11416 +  match(Set dst (Replicate4S src));
 1.11417 +  format %{ "MOVD    $dst,$src\n\t"
 1.11418 +            "PSHUFLW $dst,$dst,0x00\t! replicate4S" %}
 1.11419 +  ins_encode( mov_i2x(dst, src), pshufd_4x16(dst, dst));
 1.11420 +  ins_pipe( fpu_reg_reg );
 1.11421 +%}
 1.11422 +
 1.11423 +// Replicate scalar zero to packed short (2 byte) values in xmm
 1.11424 +instruct Repl4S_immI0(regXD dst, immI0 zero) %{
 1.11425 +  predicate(UseSSE>=2);
 1.11426 +  match(Set dst (Replicate4S zero));
 1.11427 +  format %{ "PXOR  $dst,$dst\t! replicate4S" %}
 1.11428 +  ins_encode( pxor(dst, dst));
 1.11429 +  ins_pipe( fpu_reg_reg );
 1.11430 +%}
 1.11431 +
 1.11432 +// Replicate scalar to packed char (2 byte) values in xmm
 1.11433 +instruct Repl4C_reg(regXD dst, regXD src) %{
 1.11434 +  predicate(UseSSE>=2);
 1.11435 +  match(Set dst (Replicate4C src));
 1.11436 +  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4C" %}
 1.11437 +  ins_encode( pshufd_4x16(dst, src));
 1.11438 +  ins_pipe( fpu_reg_reg );
 1.11439 +%}
 1.11440 +
 1.11441 +// Replicate scalar to packed char (2 byte) values in xmm
 1.11442 +instruct Repl4C_eRegI(regXD dst, eRegI src) %{
 1.11443 +  predicate(UseSSE>=2);
 1.11444 +  match(Set dst (Replicate4C src));
 1.11445 +  format %{ "MOVD    $dst,$src\n\t"
 1.11446 +            "PSHUFLW $dst,$dst,0x00\t! replicate4C" %}
 1.11447 +  ins_encode( mov_i2x(dst, src), pshufd_4x16(dst, dst));
 1.11448 +  ins_pipe( fpu_reg_reg );
 1.11449 +%}
 1.11450 +
 1.11451 +// Replicate scalar zero to packed char (2 byte) values in xmm
 1.11452 +instruct Repl4C_immI0(regXD dst, immI0 zero) %{
 1.11453 +  predicate(UseSSE>=2);
 1.11454 +  match(Set dst (Replicate4C zero));
 1.11455 +  format %{ "PXOR  $dst,$dst\t! replicate4C" %}
 1.11456 +  ins_encode( pxor(dst, dst));
 1.11457 +  ins_pipe( fpu_reg_reg );
 1.11458 +%}
 1.11459 +
 1.11460 +// Replicate scalar to packed integer (4 byte) values in xmm
 1.11461 +instruct Repl2I_reg(regXD dst, regXD src) %{
 1.11462 +  predicate(UseSSE>=2);
 1.11463 +  match(Set dst (Replicate2I src));
 1.11464 +  format %{ "PSHUFD $dst,$src,0x00\t! replicate2I" %}
 1.11465 +  ins_encode( pshufd(dst, src, 0x00));
 1.11466 +  ins_pipe( fpu_reg_reg );
 1.11467 +%}
 1.11468 +
 1.11469 +// Replicate scalar to packed integer (4 byte) values in xmm
 1.11470 +instruct Repl2I_eRegI(regXD dst, eRegI src) %{
 1.11471 +  predicate(UseSSE>=2);
 1.11472 +  match(Set dst (Replicate2I src));
 1.11473 +  format %{ "MOVD   $dst,$src\n\t"
 1.11474 +            "PSHUFD $dst,$dst,0x00\t! replicate2I" %}
 1.11475 +  ins_encode( mov_i2x(dst, src), pshufd(dst, dst, 0x00));
 1.11476 +  ins_pipe( fpu_reg_reg );
 1.11477 +%}
 1.11478 +
 1.11479 +// Replicate scalar zero to packed integer (4 byte) values in xmm
 1.11480 +instruct Repl2I_immI0(regXD dst, immI0 zero) %{
 1.11481 +  predicate(UseSSE>=2);
 1.11482 +  match(Set dst (Replicate2I zero));
 1.11483 +  format %{ "PXOR  $dst,$dst\t! replicate2I" %}
 1.11484 +  ins_encode( pxor(dst, dst));
 1.11485 +  ins_pipe( fpu_reg_reg );
 1.11486 +%}
 1.11487 +
 1.11488 +// Replicate scalar to packed single precision floating point values in xmm
 1.11489 +instruct Repl2F_reg(regXD dst, regXD src) %{
 1.11490 +  predicate(UseSSE>=2);
 1.11491 +  match(Set dst (Replicate2F src));
 1.11492 +  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
 1.11493 +  ins_encode( pshufd(dst, src, 0xe0));
 1.11494 +  ins_pipe( fpu_reg_reg );
 1.11495 +%}
 1.11496 +
 1.11497 +// Replicate scalar to packed single precision floating point values in xmm
 1.11498 +instruct Repl2F_regX(regXD dst, regX src) %{
 1.11499 +  predicate(UseSSE>=2);
 1.11500 +  match(Set dst (Replicate2F src));
 1.11501 +  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
 1.11502 +  ins_encode( pshufd(dst, src, 0xe0));
 1.11503 +  ins_pipe( fpu_reg_reg );
 1.11504 +%}
 1.11505 +
 1.11506 +// Replicate scalar to packed single precision floating point values in xmm
 1.11507 +instruct Repl2F_immXF0(regXD dst, immXF0 zero) %{
 1.11508 +  predicate(UseSSE>=2);
 1.11509 +  match(Set dst (Replicate2F zero));
 1.11510 +  format %{ "PXOR  $dst,$dst\t! replicate2F" %}
 1.11511 +  ins_encode( pxor(dst, dst));
 1.11512 +  ins_pipe( fpu_reg_reg );
 1.11513 +%}
 1.11514 +
 1.11515 +
 1.11516 +
 1.11517 +// =======================================================================
 1.11518 +// fast clearing of an array
 1.11519 +
 1.11520 +instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
 1.11521 +  match(Set dummy (ClearArray cnt base));
 1.11522 +  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
 1.11523 +  format %{ "SHL    ECX,1\t# Convert doublewords to words\n\t"
 1.11524 +            "XOR    EAX,EAX\n\t"
 1.11525 +            "REP STOS\t# store EAX into [EDI++] while ECX--" %}
 1.11526 +  opcode(0,0x4);
 1.11527 +  ins_encode( Opcode(0xD1), RegOpc(ECX),
 1.11528 +              OpcRegReg(0x33,EAX,EAX),
 1.11529 +              Opcode(0xF3), Opcode(0xAB) );
 1.11530 +  ins_pipe( pipe_slow );
 1.11531 +%}
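         +// For illustration: the count here is in 8-byte doublewords (hence the SHL by
         +// 1, since REP STOS with EAX stores 4-byte words).  The net effect is roughly
         +//
         +//   void clear_array(void* base, size_t count_in_dwords) {
         +//     memset(base, 0, count_in_dwords * 8);   // zero 8 bytes per count unit
         +//   }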
 1.11532 +
 1.11533 +instruct string_compare(eDIRegP str1, eSIRegP str2, eAXRegI tmp1, eBXRegI tmp2, eCXRegI result, eFlagsReg cr) %{
 1.11534 +  match(Set result (StrComp str1 str2));
 1.11535 +  effect(USE_KILL str1, USE_KILL str2, KILL tmp1, KILL tmp2, KILL cr);
 1.11536 +  //ins_cost(300);
 1.11537 +
 1.11538 +  format %{ "String Compare $str1,$str2 -> $result    // KILL EAX, EBX" %}
 1.11539 +  ins_encode( enc_String_Compare() );
 1.11540 +  ins_pipe( pipe_slow );
 1.11541 +%}
 1.11542 +
 1.11543 +//----------Control Flow Instructions------------------------------------------
 1.11544 +// Signed compare Instructions
 1.11545 +instruct compI_eReg(eFlagsReg cr, eRegI op1, eRegI op2) %{
 1.11546 +  match(Set cr (CmpI op1 op2));
 1.11547 +  effect( DEF cr, USE op1, USE op2 );
 1.11548 +  format %{ "CMP    $op1,$op2" %}
 1.11549 +  opcode(0x3B);  /* Opcode 3B /r */
 1.11550 +  ins_encode( OpcP, RegReg( op1, op2) );
 1.11551 +  ins_pipe( ialu_cr_reg_reg );
 1.11552 +%}
 1.11553 +
 1.11554 +instruct compI_eReg_imm(eFlagsReg cr, eRegI op1, immI op2) %{
 1.11555 +  match(Set cr (CmpI op1 op2));
 1.11556 +  effect( DEF cr, USE op1 );
 1.11557 +  format %{ "CMP    $op1,$op2" %}
 1.11558 +  opcode(0x81,0x07);  /* Opcode 81 /7 */
 1.11559 +  // ins_encode( RegImm( op1, op2) );  /* Was CmpImm */
 1.11560 +  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
 1.11561 +  ins_pipe( ialu_cr_reg_imm );
 1.11562 +%}
 1.11563 +
 1.11564 +// Cisc-spilled version of cmpI_eReg
 1.11565 +instruct compI_eReg_mem(eFlagsReg cr, eRegI op1, memory op2) %{
 1.11566 +  match(Set cr (CmpI op1 (LoadI op2)));
 1.11567 +
 1.11568 +  format %{ "CMP    $op1,$op2" %}
 1.11569 +  ins_cost(500);
 1.11570 +  opcode(0x3B);  /* Opcode 3B /r */
 1.11571 +  ins_encode( OpcP, RegMem( op1, op2) );
 1.11572 +  ins_pipe( ialu_cr_reg_mem );
 1.11573 +%}
 1.11574 +
 1.11575 +instruct testI_reg( eFlagsReg cr, eRegI src, immI0 zero ) %{
 1.11576 +  match(Set cr (CmpI src zero));
 1.11577 +  effect( DEF cr, USE src );
 1.11578 +
 1.11579 +  format %{ "TEST   $src,$src" %}
 1.11580 +  opcode(0x85);
 1.11581 +  ins_encode( OpcP, RegReg( src, src ) );
 1.11582 +  ins_pipe( ialu_cr_reg_imm );
 1.11583 +%}
 1.11584 +
 1.11585 +instruct testI_reg_imm( eFlagsReg cr, eRegI src, immI con, immI0 zero ) %{
 1.11586 +  match(Set cr (CmpI (AndI src con) zero));
 1.11587 +
 1.11588 +  format %{ "TEST   $src,$con" %}
 1.11589 +  opcode(0xF7,0x00);
 1.11590 +  ins_encode( OpcP, RegOpc(src), Con32(con) );
 1.11591 +  ins_pipe( ialu_cr_reg_imm );
 1.11592 +%}
 1.11593 +
 1.11594 +instruct testI_reg_mem( eFlagsReg cr, eRegI src, memory mem, immI0 zero ) %{
 1.11595 +  match(Set cr (CmpI (AndI src mem) zero));
 1.11596 +
 1.11597 +  format %{ "TEST   $src,$mem" %}
 1.11598 +  opcode(0x85);
 1.11599 +  ins_encode( OpcP, RegMem( src, mem ) );
 1.11600 +  ins_pipe( ialu_cr_reg_mem );
 1.11601 +%}
 1.11602 +
 1.11603 +// Unsigned compare Instructions; really, same as signed except they
 1.11604 +// produce an eFlagsRegU instead of eFlagsReg.
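         +// (Example: comparing 0xFFFFFFFF with 1.  As signed values -1 < 1, so JL would
         +// be taken; as unsigned values 0xFFFFFFFF > 1, so JB would not.  The separate
         +// eFlagsRegU class keeps signed branches paired with signed compares and
         +// unsigned branches with unsigned compares.)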
 1.11605 +instruct compU_eReg(eFlagsRegU cr, eRegI op1, eRegI op2) %{
 1.11606 +  match(Set cr (CmpU op1 op2));
 1.11607 +
 1.11608 +  format %{ "CMPu   $op1,$op2" %}
 1.11609 +  opcode(0x3B);  /* Opcode 3B /r */
 1.11610 +  ins_encode( OpcP, RegReg( op1, op2) );
 1.11611 +  ins_pipe( ialu_cr_reg_reg );
 1.11612 +%}
 1.11613 +
 1.11614 +instruct compU_eReg_imm(eFlagsRegU cr, eRegI op1, immI op2) %{
 1.11615 +  match(Set cr (CmpU op1 op2));
 1.11616 +
 1.11617 +  format %{ "CMPu   $op1,$op2" %}
 1.11618 +  opcode(0x81,0x07);  /* Opcode 81 /7 */
 1.11619 +  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
 1.11620 +  ins_pipe( ialu_cr_reg_imm );
 1.11621 +%}
 1.11622 +
 1.11623 +// // Cisc-spilled version of cmpU_eReg
 1.11624 +instruct compU_eReg_mem(eFlagsRegU cr, eRegI op1, memory op2) %{
 1.11625 +  match(Set cr (CmpU op1 (LoadI op2)));
 1.11626 +
 1.11627 +  format %{ "CMPu   $op1,$op2" %}
 1.11628 +  ins_cost(500);
 1.11629 +  opcode(0x3B);  /* Opcode 3B /r */
 1.11630 +  ins_encode( OpcP, RegMem( op1, op2) );
 1.11631 +  ins_pipe( ialu_cr_reg_mem );
 1.11632 +%}
 1.11633 +
 1.11634 +// // Cisc-spilled version of cmpU_eReg
 1.11635 +//instruct compU_mem_eReg(eFlagsRegU cr, memory op1, eRegI op2) %{
 1.11636 +//  match(Set cr (CmpU (LoadI op1) op2));
 1.11637 +//
 1.11638 +//  format %{ "CMPu   $op1,$op2" %}
 1.11639 +//  ins_cost(500);
 1.11640 +//  opcode(0x39);  /* Opcode 39 /r */
 1.11641 +//  ins_encode( OpcP, RegMem( op1, op2) );
 1.11642 +//%}
 1.11643 +
 1.11644 +instruct testU_reg( eFlagsRegU cr, eRegI src, immI0 zero ) %{
 1.11645 +  match(Set cr (CmpU src zero));
 1.11646 +
 1.11647 +  format %{ "TESTu  $src,$src" %}
 1.11648 +  opcode(0x85);
 1.11649 +  ins_encode( OpcP, RegReg( src, src ) );
 1.11650 +  ins_pipe( ialu_cr_reg_imm );
 1.11651 +%}
 1.11652 +
 1.11653 +// Unsigned pointer compare Instructions
 1.11654 +instruct compP_eReg(eFlagsRegU cr, eRegP op1, eRegP op2) %{
 1.11655 +  match(Set cr (CmpP op1 op2));
 1.11656 +
 1.11657 +  format %{ "CMPu   $op1,$op2" %}
 1.11658 +  opcode(0x3B);  /* Opcode 3B /r */
 1.11659 +  ins_encode( OpcP, RegReg( op1, op2) );
 1.11660 +  ins_pipe( ialu_cr_reg_reg );
 1.11661 +%}
 1.11662 +
 1.11663 +instruct compP_eReg_imm(eFlagsRegU cr, eRegP op1, immP op2) %{
 1.11664 +  match(Set cr (CmpP op1 op2));
 1.11665 +
 1.11666 +  format %{ "CMPu   $op1,$op2" %}
 1.11667 +  opcode(0x81,0x07);  /* Opcode 81 /7 */
 1.11668 +  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
 1.11669 +  ins_pipe( ialu_cr_reg_imm );
 1.11670 +%}
 1.11671 +
 1.11672 +// // Cisc-spilled version of cmpP_eReg
 1.11673 +instruct compP_eReg_mem(eFlagsRegU cr, eRegP op1, memory op2) %{
 1.11674 +  match(Set cr (CmpP op1 (LoadP op2)));
 1.11675 +
 1.11676 +  format %{ "CMPu   $op1,$op2" %}
 1.11677 +  ins_cost(500);
 1.11678 +  opcode(0x3B);  /* Opcode 3B /r */
 1.11679 +  ins_encode( OpcP, RegMem( op1, op2) );
 1.11680 +  ins_pipe( ialu_cr_reg_mem );
 1.11681 +%}
 1.11682 +
 1.11683 +// // Cisc-spilled version of cmpP_eReg
 1.11684 +//instruct compP_mem_eReg(eFlagsRegU cr, memory op1, eRegP op2) %{
 1.11685 +//  match(Set cr (CmpP (LoadP op1) op2));
 1.11686 +//
 1.11687 +//  format %{ "CMPu   $op1,$op2" %}
 1.11688 +//  ins_cost(500);
 1.11689 +//  opcode(0x39);  /* Opcode 39 /r */
 1.11690 +//  ins_encode( OpcP, RegMem( op1, op2) );
 1.11691 +//%}
 1.11692 +
 1.11693 +// Compare raw pointer (used in out-of-heap check).
 1.11694 +// Only works because non-oop pointers must be raw pointers
 1.11695 +// and raw pointers have no anti-dependencies.
 1.11696 +instruct compP_mem_eReg( eFlagsRegU cr, eRegP op1, memory op2 ) %{
 1.11697 +  predicate( !n->in(2)->in(2)->bottom_type()->isa_oop_ptr() );
 1.11698 +  match(Set cr (CmpP op1 (LoadP op2)));
 1.11699 +
 1.11700 +  format %{ "CMPu   $op1,$op2" %}
 1.11701 +  opcode(0x3B);  /* Opcode 3B /r */
 1.11702 +  ins_encode( OpcP, RegMem( op1, op2) );
 1.11703 +  ins_pipe( ialu_cr_reg_mem );
 1.11704 +%}
 1.11705 +
 1.11706 +//
 1.11707 +// This will generate a signed flags result. This should be ok
 1.11708 +// since any compare to a zero should be eq/neq.
 1.11709 +instruct testP_reg( eFlagsReg cr, eRegP src, immP0 zero ) %{
 1.11710 +  match(Set cr (CmpP src zero));
 1.11711 +
 1.11712 +  format %{ "TEST   $src,$src" %}
 1.11713 +  opcode(0x85);
 1.11714 +  ins_encode( OpcP, RegReg( src, src ) );
 1.11715 +  ins_pipe( ialu_cr_reg_imm );
 1.11716 +%}
 1.11717 +
 1.11718 +// Cisc-spilled version of testP_reg
 1.11719 +// This will generate a signed flags result. This should be ok
 1.11720 +// since any compare to a zero should be eq/neq.
 1.11721 +instruct testP_Reg_mem( eFlagsReg cr, memory op, immI0 zero ) %{
 1.11722 +  match(Set cr (CmpP (LoadP op) zero));
 1.11723 +
 1.11724 +  format %{ "TEST   $op,0xFFFFFFFF" %}
 1.11725 +  ins_cost(500);
 1.11726 +  opcode(0xF7);               /* Opcode F7 /0 */
 1.11727 +  ins_encode( OpcP, RMopc_Mem(0x00,op), Con_d32(0xFFFFFFFF) );
 1.11728 +  ins_pipe( ialu_cr_reg_imm );
 1.11729 +%}
 1.11730 +
 1.11731 +// Yanked all unsigned pointer compare operations.
 1.11732 +// Pointer compares are done with CmpP which is already unsigned.
 1.11733 +
 1.11734 +//----------Max and Min--------------------------------------------------------
 1.11735 +// Min Instructions
 1.11736 +////
 1.11737 +//   *** Min and Max using the conditional move are slower than the
 1.11738 +//   *** branch version on a Pentium III.
 1.11739 +// // Conditional move for min
 1.11740 +//instruct cmovI_reg_lt( eRegI op2, eRegI op1, eFlagsReg cr ) %{
 1.11741 +//  effect( USE_DEF op2, USE op1, USE cr );
 1.11742 +//  format %{ "CMOVlt $op2,$op1\t! min" %}
 1.11743 +//  opcode(0x4C,0x0F);
 1.11744 +//  ins_encode( OpcS, OpcP, RegReg( op2, op1 ) );
 1.11745 +//  ins_pipe( pipe_cmov_reg );
 1.11746 +//%}
 1.11747 +//
 1.11748 +//// Min Register with Register (P6 version)
 1.11749 +//instruct minI_eReg_p6( eRegI op1, eRegI op2 ) %{
 1.11750 +//  predicate(VM_Version::supports_cmov() );
 1.11751 +//  match(Set op2 (MinI op1 op2));
 1.11752 +//  ins_cost(200);
 1.11753 +//  expand %{
 1.11754 +//    eFlagsReg cr;
 1.11755 +//    compI_eReg(cr,op1,op2);
 1.11756 +//    cmovI_reg_lt(op2,op1,cr);
 1.11757 +//  %}
 1.11758 +//%}
 1.11759 +
 1.11760 +// Min Register with Register (generic version)
 1.11761 +instruct minI_eReg(eRegI dst, eRegI src, eFlagsReg flags) %{
 1.11762 +  match(Set dst (MinI dst src));
 1.11763 +  effect(KILL flags);
 1.11764 +  ins_cost(300);
 1.11765 +
 1.11766 +  format %{ "MIN    $dst,$src" %}
 1.11767 +  opcode(0xCC);
 1.11768 +  ins_encode( min_enc(dst,src) );
 1.11769 +  ins_pipe( pipe_slow );
 1.11770 +%}
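         +// For illustration, the semantics being matched are just
         +//
         +//   int min(int dst, int src) { return src < dst ? src : dst; }
         +//
         +// min_enc supplies the actual instruction sequence; per the note above, the
         +// branch form beat the conditional-move form on the Pentium III.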
 1.11771 +
 1.11772 +// Max Register with Register
 1.11773 +//   *** Min and Max using the conditional move are slower than the
 1.11774 +//   *** branch version on a Pentium III.
 1.11775 +// // Conditional move for max
 1.11776 +//instruct cmovI_reg_gt( eRegI op2, eRegI op1, eFlagsReg cr ) %{
 1.11777 +//  effect( USE_DEF op2, USE op1, USE cr );
 1.11778 +//  format %{ "CMOVgt $op2,$op1\t! max" %}
 1.11779 +//  opcode(0x4F,0x0F);
 1.11780 +//  ins_encode( OpcS, OpcP, RegReg( op2, op1 ) );
 1.11781 +//  ins_pipe( pipe_cmov_reg );
 1.11782 +//%}
 1.11783 +//
 1.11784 +// // Max Register with Register (P6 version)
 1.11785 +//instruct maxI_eReg_p6( eRegI op1, eRegI op2 ) %{
 1.11786 +//  predicate(VM_Version::supports_cmov() );
 1.11787 +//  match(Set op2 (MaxI op1 op2));
 1.11788 +//  ins_cost(200);
 1.11789 +//  expand %{
 1.11790 +//    eFlagsReg cr;
 1.11791 +//    compI_eReg(cr,op1,op2);
 1.11792 +//    cmovI_reg_gt(op2,op1,cr);
 1.11793 +//  %}
 1.11794 +//%}
 1.11795 +
 1.11796 +// Max Register with Register (generic version)
 1.11797 +instruct maxI_eReg(eRegI dst, eRegI src, eFlagsReg flags) %{
 1.11798 +  match(Set dst (MaxI dst src));
 1.11799 +  effect(KILL flags);
 1.11800 +  ins_cost(300);
 1.11801 +
 1.11802 +  format %{ "MAX    $dst,$src" %}
 1.11803 +  opcode(0xCC);
 1.11804 +  ins_encode( max_enc(dst,src) );
 1.11805 +  ins_pipe( pipe_slow );
 1.11806 +%}
 1.11807 +
 1.11808 +// ============================================================================
 1.11809 +// Branch Instructions
 1.11810 +// Jump Table
 1.11811 +instruct jumpXtnd(eRegI switch_val) %{
 1.11812 +  match(Jump switch_val);
 1.11813 +  ins_cost(350);
 1.11814 +
 1.11815 +  format %{  "JMP    [table_base](,$switch_val,1)\n\t" %}
 1.11816 +
 1.11817 +  ins_encode %{
 1.11818 +    address table_base  = __ address_table_constant(_index2label);
 1.11819 +
 1.11820 +    // Jump to Address(table_base + switch_reg)
 1.11821 +    InternalAddress table(table_base);
 1.11822 +    Address index(noreg, $switch_val$$Register, Address::times_1);
 1.11823 +    __ jump(ArrayAddress(table, index));
 1.11824 +  %}
 1.11825 +  ins_pc_relative(1);
 1.11826 +  ins_pipe(pipe_jmp);
 1.11827 +%}
 1.11828 +
 1.11829 +// Jump Direct - Label defines a relative address from JMP+1
 1.11830 +instruct jmpDir(label labl) %{
 1.11831 +  match(Goto);
 1.11832 +  effect(USE labl);
 1.11833 +
 1.11834 +  ins_cost(300);
 1.11835 +  format %{ "JMP    $labl" %}
 1.11836 +  size(5);
 1.11837 +  opcode(0xE9);
 1.11838 +  ins_encode( OpcP, Lbl( labl ) );
 1.11839 +  ins_pipe( pipe_jmp );
 1.11840 +  ins_pc_relative(1);
 1.11841 +%}
 1.11842 +
 1.11843 +// Jump Direct Conditional - Label defines a relative address from Jcc+1
 1.11844 +instruct jmpCon(cmpOp cop, eFlagsReg cr, label labl) %{
 1.11845 +  match(If cop cr);
 1.11846 +  effect(USE labl);
 1.11847 +
 1.11848 +  ins_cost(300);
 1.11849 +  format %{ "J$cop    $labl" %}
 1.11850 +  size(6);
 1.11851 +  opcode(0x0F, 0x80);
 1.11852 +  ins_encode( Jcc( cop, labl) );
 1.11853 +  ins_pipe( pipe_jcc );
 1.11854 +  ins_pc_relative(1);
 1.11855 +%}
 1.11856 +
 1.11857 +// Jump Direct Conditional - Label defines a relative address from Jcc+1
 1.11858 +instruct jmpLoopEnd(cmpOp cop, eFlagsReg cr, label labl) %{
 1.11859 +  match(CountedLoopEnd cop cr);
 1.11860 +  effect(USE labl);
 1.11861 +
 1.11862 +  ins_cost(300);
 1.11863 +  format %{ "J$cop    $labl\t# Loop end" %}
 1.11864 +  size(6);
 1.11865 +  opcode(0x0F, 0x80);
 1.11866 +  ins_encode( Jcc( cop, labl) );
 1.11867 +  ins_pipe( pipe_jcc );
 1.11868 +  ins_pc_relative(1);
 1.11869 +%}
 1.11870 +
 1.11871 +// Jump Direct Conditional - Label defines a relative address from Jcc+1
 1.11872 +instruct jmpLoopEndU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
 1.11873 +  match(CountedLoopEnd cop cmp);
 1.11874 +  effect(USE labl);
 1.11875 +
 1.11876 +  ins_cost(300);
 1.11877 +  format %{ "J$cop,u  $labl\t# Loop end" %}
 1.11878 +  size(6);
 1.11879 +  opcode(0x0F, 0x80);
 1.11880 +  ins_encode( Jcc( cop, labl) );
 1.11881 +  ins_pipe( pipe_jcc );
 1.11882 +  ins_pc_relative(1);
 1.11883 +%}
 1.11884 +
 1.11885 +// Jump Direct Conditional - using unsigned comparison
 1.11886 +instruct jmpConU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
 1.11887 +  match(If cop cmp);
 1.11888 +  effect(USE labl);
 1.11889 +
 1.11890 +  ins_cost(300);
 1.11891 +  format %{ "J$cop,u  $labl" %}
 1.11892 +  size(6);
 1.11893 +  opcode(0x0F, 0x80);
 1.11894 +  ins_encode( Jcc( cop, labl) );
 1.11895 +  ins_pipe( pipe_jcc );
 1.11896 +  ins_pc_relative(1);
 1.11897 +%}
 1.11898 +
 1.11899 +// ============================================================================
 1.11900 +// The 2nd slow-half of a subtype check.  Scan the subklass's 2ndary superklass
 1.11901 +// array for an instance of the superklass.  Set a hidden internal cache on a
 1.11902 +// hit (cache is checked with exposed code in gen_subtype_check()).  Return
 1.11903 +// NZ for a miss or zero for a hit.  The encoding ALSO sets flags.
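         +//
         +// For illustration only, the scan the REPNE SCASD sequence performs is roughly
         +// the following (parameter names are descriptive only, not real VM interfaces):
         +//
         +//   int partial_subtype_check(void* secondary_supers[], int len,
         +//                             void* super, void** cache_slot) {
         +//     for (int i = 0; i < len; i++) {
         +//       if (secondary_supers[i] == super) {
         +//         *cache_slot = super;   // update secondary_super_cache on a hit
         +//         return 0;              // zero == hit
         +//       }
         +//     }
         +//     return 1;                  // non-zero == miss
         +//   }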
 1.11904 +instruct partialSubtypeCheck( eDIRegP result, eSIRegP sub, eAXRegP super, eCXRegI rcx, eFlagsReg cr ) %{
 1.11905 +  match(Set result (PartialSubtypeCheck sub super));
 1.11906 +  effect( KILL rcx, KILL cr );
 1.11907 +
 1.11908 +  ins_cost(1100);  // slightly larger than the next version
 1.11909 +  format %{ "CMPL   EAX,ESI\n\t"
 1.11910 +            "JEQ,s  hit\n\t"
 1.11911 +            "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
 1.11912 +            "MOV    ECX,[EDI+arrayKlass::length]\t# length to scan\n\t"
 1.11913 +            "ADD    EDI,arrayKlass::base_offset\t# Skip to start of data; set NZ in case count is zero\n\t"
 1.11914 +            "REPNE SCASD\t# Scan *EDI++ for a match with EAX while CX-- != 0\n\t"
 1.11915 +            "JNE,s  miss\t\t# Missed: EDI not-zero\n\t"
 1.11916 +            "MOV    [$sub+Klass::secondary_super_cache],$super\t# Hit: update cache\n\t"
 1.11917 +     "hit:\n\t"
 1.11918 +            "XOR    $result,$result\t\t# Hit: EDI zero\n\t"
 1.11919 +     "miss:\t" %}
 1.11920 +
 1.11921 +  opcode(0x1); // Force a XOR of EDI
 1.11922 +  ins_encode( enc_PartialSubtypeCheck() );
 1.11923 +  ins_pipe( pipe_slow );
 1.11924 +%}
 1.11925 +
 1.11926 +instruct partialSubtypeCheck_vs_Zero( eFlagsReg cr, eSIRegP sub, eAXRegP super, eCXRegI rcx, eDIRegP result, immP0 zero ) %{
 1.11927 +  match(Set cr (CmpP (PartialSubtypeCheck sub super) zero));
 1.11928 +  effect( KILL rcx, KILL result );
 1.11929 +
 1.11930 +  ins_cost(1000);
 1.11931 +  format %{ "CMPL   EAX,ESI\n\t"
 1.11932 +            "JEQ,s  miss\t# Actually a hit; we are done.\n\t"
 1.11933 +            "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
 1.11934 +            "MOV    ECX,[EDI+arrayKlass::length]\t# length to scan\n\t"
 1.11935 +            "ADD    EDI,arrayKlass::base_offset\t# Skip to start of data; set NZ in case count is zero\n\t"
 1.11936 +            "REPNE SCASD\t# Scan *EDI++ for a match with EAX while CX-- != 0\n\t"
 1.11937 +            "JNE,s  miss\t\t# Missed: flags NZ\n\t"
 1.11938 +            "MOV    [$sub+Klass::secondary_super_cache],$super\t# Hit: update cache, flags Z\n\t"
 1.11939 +     "miss:\t" %}
 1.11940 +
 1.11941 +  opcode(0x0);  // No need to XOR EDI
 1.11942 +  ins_encode( enc_PartialSubtypeCheck() );
 1.11943 +  ins_pipe( pipe_slow );
 1.11944 +%}
 1.11945 +
 1.11946 +// ============================================================================
 1.11947 +// Branch Instructions -- short offset versions
 1.11948 +//
 1.11949 +// These instructions are used to replace jumps of a long offset (the default
 1.11950 +// match) with jumps of a shorter offset.  These instructions are all tagged
 1.11951 +// with the ins_short_branch attribute, which causes the ADLC to suppress the
 1.11952 +// match rules in general matching.  Instead, the ADLC generates a conversion
 1.11953 +// method in the MachNode which can be used to do in-place replacement of the
 1.11954 +// long variant with the shorter variant.  The compiler determines whether a
 1.11955 +// branch can use the short form via the is_short_branch_offset() predicate in
 1.11956 +// the machine-specific code section of the file.
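         +// For illustration: the long conditional form is 0F 8x + rel32 (6 bytes) while
         +// the short form is 7x + rel8 (2 bytes), so the substitution is only legal when
         +// the displacement fits in a signed byte, roughly
         +//
         +//   bool fits_in_short_branch(int disp) { return -128 <= disp && disp <= 127; }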
 1.11957 +
 1.11958 +// Jump Direct - Label defines a relative address from JMP+1
 1.11959 +instruct jmpDir_short(label labl) %{
 1.11960 +  match(Goto);
 1.11961 +  effect(USE labl);
 1.11962 +
 1.11963 +  ins_cost(300);
 1.11964 +  format %{ "JMP,s  $labl" %}
 1.11965 +  size(2);
 1.11966 +  opcode(0xEB);
 1.11967 +  ins_encode( OpcP, LblShort( labl ) );
 1.11968 +  ins_pipe( pipe_jmp );
 1.11969 +  ins_pc_relative(1);
 1.11970 +  ins_short_branch(1);
 1.11971 +%}
 1.11972 +
 1.11973 +// Jump Direct Conditional - Label defines a relative address from Jcc+1
 1.11974 +instruct jmpCon_short(cmpOp cop, eFlagsReg cr, label labl) %{
 1.11975 +  match(If cop cr);
 1.11976 +  effect(USE labl);
 1.11977 +
 1.11978 +  ins_cost(300);
 1.11979 +  format %{ "J$cop,s  $labl" %}
 1.11980 +  size(2);
 1.11981 +  opcode(0x70);
 1.11982 +  ins_encode( JccShort( cop, labl) );
 1.11983 +  ins_pipe( pipe_jcc );
 1.11984 +  ins_pc_relative(1);
 1.11985 +  ins_short_branch(1);
 1.11986 +%}
 1.11987 +
 1.11988 +// Jump Direct Conditional - Label defines a relative address from Jcc+1
 1.11989 +instruct jmpLoopEnd_short(cmpOp cop, eFlagsReg cr, label labl) %{
 1.11990 +  match(CountedLoopEnd cop cr);
 1.11991 +  effect(USE labl);
 1.11992 +
 1.11993 +  ins_cost(300);
 1.11994 +  format %{ "J$cop,s  $labl" %}
 1.11995 +  size(2);
 1.11996 +  opcode(0x70);
 1.11997 +  ins_encode( JccShort( cop, labl) );
 1.11998 +  ins_pipe( pipe_jcc );
 1.11999 +  ins_pc_relative(1);
 1.12000 +  ins_short_branch(1);
 1.12001 +%}
 1.12002 +
 1.12003 +// Jump Direct Conditional - Label defines a relative address from Jcc+1
 1.12004 +instruct jmpLoopEndU_short(cmpOpU cop, eFlagsRegU cmp, label labl) %{
 1.12005 +  match(CountedLoopEnd cop cmp);
 1.12006 +  effect(USE labl);
 1.12007 +
 1.12008 +  ins_cost(300);
 1.12009 +  format %{ "J$cop,us $labl" %}
 1.12010 +  size(2);
 1.12011 +  opcode(0x70);
 1.12012 +  ins_encode( JccShort( cop, labl) );
 1.12013 +  ins_pipe( pipe_jcc );
 1.12014 +  ins_pc_relative(1);
 1.12015 +  ins_short_branch(1);
 1.12016 +%}
 1.12017 +
 1.12018 +// Jump Direct Conditional - using unsigned comparison
 1.12019 +instruct jmpConU_short(cmpOpU cop, eFlagsRegU cmp, label labl) %{
 1.12020 +  match(If cop cmp);
 1.12021 +  effect(USE labl);
 1.12022 +
 1.12023 +  ins_cost(300);
 1.12024 +  format %{ "J$cop,us $labl" %}
 1.12025 +  size(2);
 1.12026 +  opcode(0x70);
 1.12027 +  ins_encode( JccShort( cop, labl) );
 1.12028 +  ins_pipe( pipe_jcc );
 1.12029 +  ins_pc_relative(1);
 1.12030 +  ins_short_branch(1);
 1.12031 +%}
 1.12032 +
 1.12033 +// ============================================================================
 1.12034 +// Long Compare
 1.12035 +//
 1.12036 +// Currently we hold longs in 2 registers.  Comparing such values efficiently
 1.12037 +// is tricky.  The flavor of compare used depends on whether we are testing
 1.12038 +// for LT, LE, or EQ.  For a simple LT test we can check just the sign bit.
 1.12039 +// The GE test is the negated LT test.  The LE test can be had by commuting
 1.12040 +// the operands (a GE test on the swapped values); negating that yields the
 1.12041 +// GT test.  The EQ test is done by ORcc'ing the high and low halves, and the
 1.12042 +// NE test is negated from that.
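//
// A rough sketch of the flag-producing sequences used by the instructions
// below (illustrative only; see the format strings for the exact code):
//
//   LT/GE:       CMP a.lo,b.lo;  MOV tmp,a.hi;  SBB tmp,b.hi
//                -- a following JL/JGE tests a <  b  /  a >= b
//   EQ/NE vs 0:  MOV tmp,a.lo;   OR  tmp,a.hi
//                -- a following JE/JNE tests a == 0  /  a != 0
//   EQ/NE:       CMP a.lo,b.lo;  JNE skip;  CMP a.hi,b.hi
//                -- a following JE/JNE tests a == b  /  a != b
//   LE/GT:       the LT/GE sequence with a and b swapped
//                -- JGE/JL on the swapped flags test a <= b  /  a >  b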
 1.12043 +
 1.12044 +// Due to a shortcoming in the ADLC, it mixes up expressions like:
 1.12045 +// (foo (CmpI (CmpL X Y) 0)) and (bar (CmpI (CmpL X 0L) 0)).  Note the
 1.12046 +// difference between 'Y' and '0L'.  The tree-matches for the CmpI sections
 1.12047 +// are collapsed internally in the ADLC's dfa-gen code.  The match for
 1.12048 +// (CmpI (CmpL X Y) 0) is silently replaced with (CmpI (CmpL X 0L) 0) and the
 1.12049 +// foo match ends up with the wrong leaf.  One fix is to not match both
 1.12050 +// reg-reg and reg-zero forms of long-compare.  This is unfortunate because
 1.12051 +// both forms beat the trinary form of long-compare and both are very useful
 1.12052 +// on Intel which has so few registers.
 1.12053 +
 1.12054 +// Manifest a CmpL result in an integer register.  Very painful.
 1.12055 +// This is the test to avoid.
 1.12056 +instruct cmpL3_reg_reg(eSIRegI dst, eRegL src1, eRegL src2, eFlagsReg flags ) %{
 1.12057 +  match(Set dst (CmpL3 src1 src2));
 1.12058 +  effect( KILL flags );
 1.12059 +  ins_cost(1000);
 1.12060 +  format %{ "XOR    $dst,$dst\n\t"
 1.12061 +            "CMP    $src1.hi,$src2.hi\n\t"
 1.12062 +            "JLT,s  m_one\n\t"
 1.12063 +            "JGT,s  p_one\n\t"
 1.12064 +            "CMP    $src1.lo,$src2.lo\n\t"
 1.12065 +            "JB,s   m_one\n\t"
 1.12066 +            "JEQ,s  done\n"
 1.12067 +    "p_one:\tINC    $dst\n\t"
 1.12068 +            "JMP,s  done\n"
 1.12069 +    "m_one:\tDEC    $dst\n"
 1.12070 +     "done:" %}
 1.12071 +  ins_encode %{
 1.12072 +    Label p_one, m_one, done;
 1.12073 +    __ xorl($dst$$Register, $dst$$Register);
 1.12074 +    __ cmpl(HIGH_FROM_LOW($src1$$Register), HIGH_FROM_LOW($src2$$Register));
 1.12075 +    __ jccb(Assembler::less,    m_one);
 1.12076 +    __ jccb(Assembler::greater, p_one);
 1.12077 +    __ cmpl($src1$$Register, $src2$$Register);
 1.12078 +    __ jccb(Assembler::below,   m_one);
 1.12079 +    __ jccb(Assembler::equal,   done);
 1.12080 +    __ bind(p_one);
 1.12081 +    __ increment($dst$$Register);
 1.12082 +    __ jmpb(done);
 1.12083 +    __ bind(m_one);
 1.12084 +    __ decrement($dst$$Register);
 1.12085 +    __ bind(done);
 1.12086 +  %}
 1.12087 +  ins_pipe( pipe_slow );
 1.12088 +%}
 1.12089 +
 1.12090 +//======
 1.12091 +// Manifest a CmpL result in the normal flags.  Only good for LT or GE
 1.12092 +// compares.  Can be used for LE or GT compares by reversing arguments.
 1.12093 +// NOT GOOD FOR EQ/NE tests.
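// (Against zero only the high half needs testing: a two-register long is
// negative exactly when its high half is negative.)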
 1.12094 +instruct cmpL_zero_flags_LTGE( flagsReg_long_LTGE flags, eRegL src, immL0 zero ) %{
 1.12095 +  match( Set flags (CmpL src zero ));
 1.12096 +  ins_cost(100);
 1.12097 +  format %{ "TEST   $src.hi,$src.hi" %}
 1.12098 +  opcode(0x85);
 1.12099 +  ins_encode( OpcP, RegReg_Hi2( src, src ) );
 1.12100 +  ins_pipe( ialu_cr_reg_reg );
 1.12101 +%}
 1.12102 +
 1.12103 +// Manifest a CmpL result in the normal flags.  Only good for LT or GE
 1.12104 +// compares.  Can be used for LE or GT compares by reversing arguments.
 1.12105 +// NOT GOOD FOR EQ/NE tests.
 1.12106 +instruct cmpL_reg_flags_LTGE( flagsReg_long_LTGE flags, eRegL src1, eRegL src2, eRegI tmp ) %{
 1.12107 +  match( Set flags (CmpL src1 src2 ));
 1.12108 +  effect( TEMP tmp );
 1.12109 +  ins_cost(300);
 1.12110 +  format %{ "CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits\n\t"
 1.12111 +            "MOV    $tmp,$src1.hi\n\t"
 1.12112 +            "SBB    $tmp,$src2.hi\t! Compute flags for long compare" %}
 1.12113 +  ins_encode( long_cmp_flags2( src1, src2, tmp ) );
 1.12114 +  ins_pipe( ialu_cr_reg_reg );
 1.12115 +%}
 1.12116 +
 1.12117 +// Long compares reg < zero/reg OR reg >= zero/reg.
 1.12118 +// Just a wrapper for a normal branch, plus the predicate test.
 1.12119 +instruct cmpL_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, label labl) %{
 1.12120 +  match(If cmp flags);
 1.12121 +  effect(USE labl);
 1.12122 +  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
 1.12123 +  expand %{
 1.12124 +    jmpCon(cmp,flags,labl);    // JLT or JGE...
 1.12125 +  %}
 1.12126 +%}
 1.12127 +
 1.12128 +// Compare 2 longs and CMOVE longs.
 1.12129 +instruct cmovLL_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegL dst, eRegL src) %{
 1.12130 +  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
 1.12131 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
 1.12132 +  ins_cost(400);
 1.12133 +  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
 1.12134 +            "CMOV$cmp $dst.hi,$src.hi" %}
 1.12135 +  opcode(0x0F,0x40);
 1.12136 +  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) );
 1.12137 +  ins_pipe( pipe_cmov_reg_long );
 1.12138 +%}
 1.12139 +
 1.12140 +instruct cmovLL_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegL dst, load_long_memory src) %{
 1.12141 +  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
 1.12142 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
 1.12143 +  ins_cost(500);
 1.12144 +  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
 1.12145 +            "CMOV$cmp $dst.hi,$src.hi" %}
 1.12146 +  opcode(0x0F,0x40);
 1.12147 +  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) );
 1.12148 +  ins_pipe( pipe_cmov_reg_long );
 1.12149 +%}
 1.12150 +
 1.12151 +// Compare 2 longs and CMOVE ints.
 1.12152 +instruct cmovII_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegI dst, eRegI src) %{
 1.12153 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
 1.12154 +  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
 1.12155 +  ins_cost(200);
 1.12156 +  format %{ "CMOV$cmp $dst,$src" %}
 1.12157 +  opcode(0x0F,0x40);
 1.12158 +  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
 1.12159 +  ins_pipe( pipe_cmov_reg );
 1.12160 +%}
 1.12161 +
 1.12162 +instruct cmovII_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegI dst, memory src) %{
 1.12163 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
 1.12164 +  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
 1.12165 +  ins_cost(250);
 1.12166 +  format %{ "CMOV$cmp $dst,$src" %}
 1.12167 +  opcode(0x0F,0x40);
 1.12168 +  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
 1.12169 +  ins_pipe( pipe_cmov_mem );
 1.12170 +%}
 1.12171 +
 1.12172 +// Compare 2 longs and CMOVE ptrs.
 1.12173 +instruct cmovPP_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegP dst, eRegP src) %{
 1.12174 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
 1.12175 +  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
 1.12176 +  ins_cost(200);
 1.12177 +  format %{ "CMOV$cmp $dst,$src" %}
 1.12178 +  opcode(0x0F,0x40);
 1.12179 +  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
 1.12180 +  ins_pipe( pipe_cmov_reg );
 1.12181 +%}
 1.12182 +
 1.12183 +// Compare 2 longs and CMOVE doubles
 1.12184 +instruct cmovDD_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regD dst, regD src) %{
 1.12185 +  predicate( UseSSE<=1 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge) );
 1.12186 +  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
 1.12187 +  ins_cost(200);
 1.12188 +  expand %{
 1.12189 +    fcmovD_regS(cmp,flags,dst,src);
 1.12190 +  %}
 1.12191 +%}
 1.12192 +
 1.12193 +// Compare 2 longs and CMOVE doubles
 1.12194 +instruct cmovXDD_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regXD dst, regXD src) %{
 1.12195 +  predicate( UseSSE>=2 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge) );
 1.12196 +  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
 1.12197 +  ins_cost(200);
 1.12198 +  expand %{
 1.12199 +    fcmovXD_regS(cmp,flags,dst,src);
 1.12200 +  %}
 1.12201 +%}
 1.12202 +
 1.12203 +instruct cmovFF_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regF dst, regF src) %{
 1.12204 +  predicate( UseSSE==0 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge) );
 1.12205 +  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
 1.12206 +  ins_cost(200);
 1.12207 +  expand %{
 1.12208 +    fcmovF_regS(cmp,flags,dst,src);
 1.12209 +  %}
 1.12210 +%}
 1.12211 +
 1.12212 +instruct cmovXX_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regX dst, regX src) %{
 1.12213 +  predicate( UseSSE>=1 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge) );
 1.12214 +  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
 1.12215 +  ins_cost(200);
 1.12216 +  expand %{
 1.12217 +    fcmovX_regS(cmp,flags,dst,src);
 1.12218 +  %}
 1.12219 +%}
 1.12220 +
 1.12221 +//======
 1.12222 +// Manifest a CmpL result in the normal flags.  Only good for EQ/NE compares.
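// (Against zero a single OR of the two halves suffices: the long is zero
// exactly when the OR of its halves is zero.)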
 1.12223 +instruct cmpL_zero_flags_EQNE( flagsReg_long_EQNE flags, eRegL src, immL0 zero, eRegI tmp ) %{
 1.12224 +  match( Set flags (CmpL src zero ));
 1.12225 +  effect(TEMP tmp);
 1.12226 +  ins_cost(200);
 1.12227 +  format %{ "MOV    $tmp,$src.lo\n\t"
 1.12228 +            "OR     $tmp,$src.hi\t! Long is EQ/NE 0?" %}
 1.12229 +  ins_encode( long_cmp_flags0( src, tmp ) );
 1.12230 +  ins_pipe( ialu_reg_reg_long );
 1.12231 +%}
 1.12232 +
 1.12233 +// Manifest a CmpL result in the normal flags.  Only good for EQ/NE compares.
 1.12234 +instruct cmpL_reg_flags_EQNE( flagsReg_long_EQNE flags, eRegL src1, eRegL src2 ) %{
 1.12235 +  match( Set flags (CmpL src1 src2 ));
 1.12236 +  ins_cost(200+300);
 1.12237 +  format %{ "CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits\n\t"
 1.12238 +            "JNE,s  skip\n\t"
 1.12239 +            "CMP    $src1.hi,$src2.hi\n\t"
 1.12240 +     "skip:\t" %}
 1.12241 +  ins_encode( long_cmp_flags1( src1, src2 ) );
 1.12242 +  ins_pipe( ialu_cr_reg_reg );
 1.12243 +%}
 1.12244 +
 1.12245 +// Long compare reg == zero/reg OR reg != zero/reg
 1.12246 +// Just a wrapper for a normal branch, plus the predicate test.
 1.12247 +instruct cmpL_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, label labl) %{
 1.12248 +  match(If cmp flags);
 1.12249 +  effect(USE labl);
 1.12250 +  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
 1.12251 +  expand %{
 1.12252 +    jmpCon(cmp,flags,labl);    // JEQ or JNE...
 1.12253 +  %}
 1.12254 +%}
 1.12255 +
 1.12256 +// Compare 2 longs and CMOVE longs.
 1.12257 +instruct cmovLL_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegL dst, eRegL src) %{
 1.12258 +  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
 1.12259 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
 1.12260 +  ins_cost(400);
 1.12261 +  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
 1.12262 +            "CMOV$cmp $dst.hi,$src.hi" %}
 1.12263 +  opcode(0x0F,0x40);
 1.12264 +  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) );
 1.12265 +  ins_pipe( pipe_cmov_reg_long );
 1.12266 +%}
 1.12267 +
 1.12268 +instruct cmovLL_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegL dst, load_long_memory src) %{
 1.12269 +  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
 1.12270 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
 1.12271 +  ins_cost(500);
 1.12272 +  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
 1.12273 +            "CMOV$cmp $dst.hi,$src.hi" %}
 1.12274 +  opcode(0x0F,0x40);
 1.12275 +  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) );
 1.12276 +  ins_pipe( pipe_cmov_reg_long );
 1.12277 +%}
 1.12278 +
 1.12279 +// Compare 2 longs and CMOVE ints.
 1.12280 +instruct cmovII_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegI dst, eRegI src) %{
 1.12281 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
 1.12282 +  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
 1.12283 +  ins_cost(200);
 1.12284 +  format %{ "CMOV$cmp $dst,$src" %}
 1.12285 +  opcode(0x0F,0x40);
 1.12286 +  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
 1.12287 +  ins_pipe( pipe_cmov_reg );
 1.12288 +%}
 1.12289 +
 1.12290 +instruct cmovII_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegI dst, memory src) %{
 1.12291 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
 1.12292 +  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
 1.12293 +  ins_cost(250);
 1.12294 +  format %{ "CMOV$cmp $dst,$src" %}
 1.12295 +  opcode(0x0F,0x40);
 1.12296 +  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
 1.12297 +  ins_pipe( pipe_cmov_mem );
 1.12298 +%}
 1.12299 +
 1.12300 +// Compare 2 longs and CMOVE ptrs.
 1.12301 +instruct cmovPP_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegP dst, eRegP src) %{
 1.12302 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
 1.12303 +  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
 1.12304 +  ins_cost(200);
 1.12305 +  format %{ "CMOV$cmp $dst,$src" %}
 1.12306 +  opcode(0x0F,0x40);
 1.12307 +  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
 1.12308 +  ins_pipe( pipe_cmov_reg );
 1.12309 +%}
 1.12310 +
 1.12311 +// Compare 2 longs and CMOVE doubles
 1.12312 +instruct cmovDD_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regD dst, regD src) %{
 1.12313 +  predicate( UseSSE<=1 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne) );
 1.12314 +  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
 1.12315 +  ins_cost(200);
 1.12316 +  expand %{
 1.12317 +    fcmovD_regS(cmp,flags,dst,src);
 1.12318 +  %}
 1.12319 +%}
 1.12320 +
 1.12321 +// Compare 2 longs and CMOVE doubles
 1.12322 +instruct cmovXDD_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regXD dst, regXD src) %{
 1.12323 +  predicate( UseSSE>=2 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne) );
 1.12324 +  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
 1.12325 +  ins_cost(200);
 1.12326 +  expand %{
 1.12327 +    fcmovXD_regS(cmp,flags,dst,src);
 1.12328 +  %}
 1.12329 +%}
 1.12330 +
 1.12331 +instruct cmovFF_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regF dst, regF src) %{
 1.12332 +  predicate( UseSSE==0 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne) );
 1.12333 +  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
 1.12334 +  ins_cost(200);
 1.12335 +  expand %{
 1.12336 +    fcmovF_regS(cmp,flags,dst,src);
 1.12337 +  %}
 1.12338 +%}
 1.12339 +
 1.12340 +instruct cmovXX_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regX dst, regX src) %{
 1.12341 +  predicate( UseSSE>=1 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne) );
 1.12342 +  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
 1.12343 +  ins_cost(200);
 1.12344 +  expand %{
 1.12345 +    fcmovX_regS(cmp,flags,dst,src);
 1.12346 +  %}
 1.12347 +%}
 1.12348 +
 1.12349 +//======
 1.12350 +// Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
 1.12351 +// Same as cmpL_reg_flags_LEGT except must negate src
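// (The XOR/CMP/SBB sequence below computes 0 - src, i.e. it compares zero
// against src; the commuted condition codes then give the original test,
// e.g. a commuted GT becomes JL, taken exactly when 0 < src, i.e. src > 0.)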
 1.12352 +instruct cmpL_zero_flags_LEGT( flagsReg_long_LEGT flags, eRegL src, immL0 zero, eRegI tmp ) %{
 1.12353 +  match( Set flags (CmpL src zero ));
 1.12354 +  effect( TEMP tmp );
 1.12355 +  ins_cost(300);
 1.12356 +  format %{ "XOR    $tmp,$tmp\t# Long compare for -$src < 0, use commuted test\n\t"
 1.12357 +            "CMP    $tmp,$src.lo\n\t"
 1.12358 +            "SBB    $tmp,$src.hi" %}
 1.12359 +  ins_encode( long_cmp_flags3(src, tmp) );
 1.12360 +  ins_pipe( ialu_reg_reg_long );
 1.12361 +%}
 1.12362 +
 1.12363 +// Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
 1.12364 +// Same as cmpL_reg_flags_LTGE except operands swapped.  Swapping operands
 1.12365 +// requires a commuted test to get the same result.
 1.12366 +instruct cmpL_reg_flags_LEGT( flagsReg_long_LEGT flags, eRegL src1, eRegL src2, eRegI tmp ) %{
 1.12367 +  match( Set flags (CmpL src1 src2 ));
 1.12368 +  effect( TEMP tmp );
 1.12369 +  ins_cost(300);
 1.12370 +  format %{ "CMP    $src2.lo,$src1.lo\t! Long compare, swapped operands, use with commuted test\n\t"
 1.12371 +            "MOV    $tmp,$src2.hi\n\t"
 1.12372 +            "SBB    $tmp,$src1.hi\t! Compute flags for long compare" %}
 1.12373 +  ins_encode( long_cmp_flags2( src2, src1, tmp ) );
 1.12374 +  ins_pipe( ialu_cr_reg_reg );
 1.12375 +%}
 1.12376 +
 1.12377 +// Long compares reg <= zero/reg OR reg > zero/reg.
 1.12378 +// Just a wrapper for a normal branch, plus the predicate test.
 1.12379 +instruct cmpL_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, label labl) %{
 1.12380 +  match(If cmp flags);
 1.12381 +  effect(USE labl);
 1.12382 +  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le );
 1.12383 +  ins_cost(300);
 1.12384 +  expand %{
 1.12385 +    jmpCon(cmp,flags,labl);    // JGT or JLE...
 1.12386 +  %}
 1.12387 +%}
 1.12388 +
 1.12389 +// Compare 2 longs and CMOVE longs.
 1.12390 +instruct cmovLL_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegL dst, eRegL src) %{
 1.12391 +  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
 1.12392 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
 1.12393 +  ins_cost(400);
 1.12394 +  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
 1.12395 +            "CMOV$cmp $dst.hi,$src.hi" %}
 1.12396 +  opcode(0x0F,0x40);
 1.12397 +  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) );
 1.12398 +  ins_pipe( pipe_cmov_reg_long );
 1.12399 +%}
 1.12400 +
 1.12401 +instruct cmovLL_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegL dst, load_long_memory src) %{
 1.12402 +  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
 1.12403 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
 1.12404 +  ins_cost(500);
 1.12405 +  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
 1.12406 +            "CMOV$cmp $dst.hi,$src.hi" %}
 1.12407 +  opcode(0x0F,0x40);
 1.12408 +  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) );
 1.12409 +  ins_pipe( pipe_cmov_reg_long );
 1.12410 +%}
 1.12411 +
 1.12412 +// Compare 2 longs and CMOVE ints.
 1.12413 +instruct cmovII_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegI dst, eRegI src) %{
 1.12414 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
 1.12415 +  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
 1.12416 +  ins_cost(200);
 1.12417 +  format %{ "CMOV$cmp $dst,$src" %}
 1.12418 +  opcode(0x0F,0x40);
 1.12419 +  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
 1.12420 +  ins_pipe( pipe_cmov_reg );
 1.12421 +%}
 1.12422 +
 1.12423 +instruct cmovII_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegI dst, memory src) %{
 1.12424 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
 1.12425 +  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
 1.12426 +  ins_cost(250);
 1.12427 +  format %{ "CMOV$cmp $dst,$src" %}
 1.12428 +  opcode(0x0F,0x40);
 1.12429 +  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
 1.12430 +  ins_pipe( pipe_cmov_mem );
 1.12431 +%}
 1.12432 +
 1.12433 +// Compare 2 longs and CMOVE ptrs.
 1.12434 +instruct cmovPP_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegP dst, eRegP src) %{
 1.12435 +  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
 1.12436 +  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
 1.12437 +  ins_cost(200);
 1.12438 +  format %{ "CMOV$cmp $dst,$src" %}
 1.12439 +  opcode(0x0F,0x40);
 1.12440 +  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
 1.12441 +  ins_pipe( pipe_cmov_reg );
 1.12442 +%}
 1.12443 +
 1.12444 +// Compare 2 longs and CMOVE doubles
 1.12445 +instruct cmovDD_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regD dst, regD src) %{
 1.12446 +  predicate( UseSSE<=1 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt) );
 1.12447 +  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
 1.12448 +  ins_cost(200);
 1.12449 +  expand %{
 1.12450 +    fcmovD_regS(cmp,flags,dst,src);
 1.12451 +  %}
 1.12452 +%}
 1.12453 +
 1.12454 +// Compare 2 longs and CMOVE doubles
 1.12455 +instruct cmovXDD_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regXD dst, regXD src) %{
 1.12456 +  predicate( UseSSE>=2 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt) );
 1.12457 +  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
 1.12458 +  ins_cost(200);
 1.12459 +  expand %{
 1.12460 +    fcmovXD_regS(cmp,flags,dst,src);
 1.12461 +  %}
 1.12462 +%}
 1.12463 +
 1.12464 +instruct cmovFF_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regF dst, regF src) %{
 1.12465 +  predicate( UseSSE==0 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt) );
 1.12466 +  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
 1.12467 +  ins_cost(200);
 1.12468 +  expand %{
 1.12469 +    fcmovF_regS(cmp,flags,dst,src);
 1.12470 +  %}
 1.12471 +%}
 1.12472 +
 1.12473 +
 1.12474 +instruct cmovXX_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regX dst, regX src) %{
 1.12475 +  predicate( UseSSE>=1 && (_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt) );
 1.12476 +  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
 1.12477 +  ins_cost(200);
 1.12478 +  expand %{
 1.12479 +    fcmovX_regS(cmp,flags,dst,src);
 1.12480 +  %}
 1.12481 +%}
 1.12482 +
 1.12483 +
 1.12484 +// ============================================================================
 1.12485 +// Procedure Call/Return Instructions
 1.12486 +// Call Java Static Instruction
 1.12487 +// Note: If this code changes, the corresponding ret_addr_offset() and
 1.12488 +//       compute_padding() functions will have to be adjusted.
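// (ret_addr_offset() must report the distance from the start of the emitted
// code to the return address, i.e. to the byte just past the CALL; the
// padding requested via ins_alignment(4) presumably keeps the 32-bit call
// displacement patchable atomically.)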
 1.12489 +instruct CallStaticJavaDirect(method meth) %{
 1.12490 +  match(CallStaticJava);
 1.12491 +  effect(USE meth);
 1.12492 +
 1.12493 +  ins_cost(300);
 1.12494 +  format %{ "CALL,static " %}
 1.12495 +  opcode(0xE8); /* E8 cd */
 1.12496 +  ins_encode( pre_call_FPU,
 1.12497 +              Java_Static_Call( meth ),
 1.12498 +              call_epilog,
 1.12499 +              post_call_FPU );
 1.12500 +  ins_pipe( pipe_slow );
 1.12501 +  ins_pc_relative(1);
 1.12502 +  ins_alignment(4);
 1.12503 +%}
 1.12504 +
 1.12505 +// Call Java Dynamic Instruction
 1.12506 +// Note: If this code changes, the corresponding ret_addr_offset() and
 1.12507 +//       compute_padding() functions will have to be adjusted.
 1.12508 +instruct CallDynamicJavaDirect(method meth) %{
 1.12509 +  match(CallDynamicJava);
 1.12510 +  effect(USE meth);
 1.12511 +
 1.12512 +  ins_cost(300);
 1.12513 +  format %{ "MOV    EAX,(oop)-1\n\t"
 1.12514 +            "CALL,dynamic" %}
 1.12515 +  opcode(0xE8); /* E8 cd */
 1.12516 +  ins_encode( pre_call_FPU,
 1.12517 +              Java_Dynamic_Call( meth ),
 1.12518 +              call_epilog,
 1.12519 +              post_call_FPU );
 1.12520 +  ins_pipe( pipe_slow );
 1.12521 +  ins_pc_relative(1);
 1.12522 +  ins_alignment(4);
 1.12523 +%}
 1.12524 +
 1.12525 +// Call Runtime Instruction
 1.12526 +instruct CallRuntimeDirect(method meth) %{
 1.12527 +  match(CallRuntime );
 1.12528 +  effect(USE meth);
 1.12529 +
 1.12530 +  ins_cost(300);
 1.12531 +  format %{ "CALL,runtime " %}
 1.12532 +  opcode(0xE8); /* E8 cd */
 1.12533 +  // Use FFREEs to clear entries in float stack
 1.12534 +  ins_encode( pre_call_FPU,
 1.12535 +              FFree_Float_Stack_All,
 1.12536 +              Java_To_Runtime( meth ),
 1.12537 +              post_call_FPU );
 1.12538 +  ins_pipe( pipe_slow );
 1.12539 +  ins_pc_relative(1);
 1.12540 +%}
 1.12541 +
 1.12542 +// Call runtime without safepoint
 1.12543 +instruct CallLeafDirect(method meth) %{
 1.12544 +  match(CallLeaf);
 1.12545 +  effect(USE meth);
 1.12546 +
 1.12547 +  ins_cost(300);
 1.12548 +  format %{ "CALL_LEAF,runtime " %}
 1.12549 +  opcode(0xE8); /* E8 cd */
 1.12550 +  ins_encode( pre_call_FPU,
 1.12551 +              FFree_Float_Stack_All,
 1.12552 +              Java_To_Runtime( meth ),
 1.12553 +              Verify_FPU_For_Leaf, post_call_FPU );
 1.12554 +  ins_pipe( pipe_slow );
 1.12555 +  ins_pc_relative(1);
 1.12556 +%}
 1.12557 +
 1.12558 +instruct CallLeafNoFPDirect(method meth) %{
 1.12559 +  match(CallLeafNoFP);
 1.12560 +  effect(USE meth);
 1.12561 +
 1.12562 +  ins_cost(300);
 1.12563 +  format %{ "CALL_LEAF_NOFP,runtime " %}
 1.12564 +  opcode(0xE8); /* E8 cd */
 1.12565 +  ins_encode(Java_To_Runtime(meth));
 1.12566 +  ins_pipe( pipe_slow );
 1.12567 +  ins_pc_relative(1);
 1.12568 +%}
 1.12569 +
 1.12570 +
 1.12571 +// Return Instruction
 1.12572 +// Remove the return address & jump to it.
 1.12573 +instruct Ret() %{
 1.12574 +  match(Return);
 1.12575 +  format %{ "RET" %}
 1.12576 +  opcode(0xC3);
 1.12577 +  ins_encode(OpcP);
 1.12578 +  ins_pipe( pipe_jmp );
 1.12579 +%}
 1.12580 +
 1.12581 +// Tail Call; Jump from runtime stub to Java code.
 1.12582 +// Also known as an 'interprocedural jump'.
 1.12583 +// Target of jump will eventually return to caller.
 1.12584 +// TailJump below removes the return address.
 1.12585 +instruct TailCalljmpInd(eRegP_no_EBP jump_target, eBXRegP method_oop) %{
 1.12586 +  match(TailCall jump_target method_oop );
 1.12587 +  ins_cost(300);
 1.12588 +  format %{ "JMP    $jump_target \t# EBX holds method oop" %}
 1.12589 +  opcode(0xFF, 0x4);  /* Opcode FF /4 */
 1.12590 +  ins_encode( OpcP, RegOpc(jump_target) );
 1.12591 +  ins_pipe( pipe_jmp );
 1.12592 +%}
 1.12593 +
 1.12594 +
 1.12595 +// Tail Jump; remove the return address; jump to target.
 1.12596 +// TailCall above leaves the return address around.
 1.12597 +instruct tailjmpInd(eRegP_no_EBP jump_target, eAXRegP ex_oop) %{
 1.12598 +  match( TailJump jump_target ex_oop );
 1.12599 +  ins_cost(300);
 1.12600 +  format %{ "POP    EDX\t# pop return address into dummy\n\t"
 1.12601 +            "JMP    $jump_target " %}
 1.12602 +  opcode(0xFF, 0x4);  /* Opcode FF /4 */
 1.12603 +  ins_encode( enc_pop_rdx,
 1.12604 +              OpcP, RegOpc(jump_target) );
 1.12605 +  ins_pipe( pipe_jmp );
 1.12606 +%}
 1.12607 +
 1.12608 +// Create exception oop: created by stack-crawling runtime code.
 1.12609 +// Created exception is now available to this handler, and is setup
 1.12610 +// just prior to jumping to this handler.  No code emitted.
 1.12611 +instruct CreateException( eAXRegP ex_oop )
 1.12612 +%{
 1.12613 +  match(Set ex_oop (CreateEx));
 1.12614 +
 1.12615 +  size(0);
 1.12616 +  // use the following format syntax
 1.12617 +  format %{ "# exception oop is in EAX; no code emitted" %}
 1.12618 +  ins_encode();
 1.12619 +  ins_pipe( empty );
 1.12620 +%}
 1.12621 +
 1.12622 +
 1.12623 +// Rethrow exception:
 1.12624 +// The exception oop will come in the first argument position.
 1.12625 +// Then JUMP (not call) to the rethrow stub code.
 1.12626 +instruct RethrowException()
 1.12627 +%{
 1.12628 +  match(Rethrow);
 1.12629 +
 1.12630 +  // use the following format syntax
 1.12631 +  format %{ "JMP    rethrow_stub" %}
 1.12632 +  ins_encode(enc_rethrow);
 1.12633 +  ins_pipe( pipe_jmp );
 1.12634 +%}
 1.12635 +
 1.12636 +// inlined locking and unlocking
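//
// The Fast_Lock/Fast_Unlock encodings emit the inlined fast path for
// monitorenter/monitorexit and leave the outcome in the flags (EQ means
// the fast path succeeded), so compiled code only branches to the runtime
// slow path when the inline attempt fails.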
 1.12637 +
 1.12638 +
 1.12639 +instruct cmpFastLock( eFlagsReg cr, eRegP object, eRegP box, eAXRegI tmp, eRegP scr) %{
 1.12640 +  match( Set cr (FastLock object box) );
 1.12641 +  effect( TEMP tmp, TEMP scr );
 1.12642 +  ins_cost(300);
 1.12643 +  format %{ "FASTLOCK $object, $box KILLS $tmp,$scr" %}
 1.12644 +  ins_encode( Fast_Lock(object,box,tmp,scr) );
 1.12645 +  ins_pipe( pipe_slow );
 1.12646 +  ins_pc_relative(1);
 1.12647 +%}
 1.12648 +
 1.12649 +instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
 1.12650 +  match( Set cr (FastUnlock object box) );
 1.12651 +  effect( TEMP tmp );
 1.12652 +  ins_cost(300);
 1.12653 +  format %{ "FASTUNLOCK $object, $box, $tmp" %}
 1.12654 +  ins_encode( Fast_Unlock(object,box,tmp) );
 1.12655 +  ins_pipe( pipe_slow );
 1.12656 +  ins_pc_relative(1);
 1.12657 +%}
 1.12658 +
 1.12659 +
 1.12660 +
 1.12661 +// ============================================================================
 1.12662 +// Safepoint Instruction
 1.12663 +instruct safePoint_poll(eFlagsReg cr) %{
 1.12664 +  match(SafePoint);
 1.12665 +  effect(KILL cr);
 1.12666 +
 1.12667 +  // TODO-FIXME: we currently poll at offset 0 of the safepoint polling page.
 1.12668 +  // On SPARC that might be acceptable as we can generate the address with
 1.12669 +  // just a sethi, saving an or.  By polling at offset 0 we can end up
 1.12670 +  // putting additional pressure on the index-0 in the D$.  Because of
 1.12671 +  // alignment (just like the situation at hand) the lower indices tend
 1.12672 +  // to see more traffic.  It'd be better to change the polling address
 1.12673 +  // to offset 0 of the last $line in the polling page.
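//
// The poll itself is just a harmless read of the polling page; when the VM
// needs a safepoint it protects that page, the read traps, and the signal
// handler brings this thread to a stop at the safepoint.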
 1.12674 +
 1.12675 +  format %{ "TSTL   #polladdr,EAX\t! Safepoint: poll for GC" %}
 1.12676 +  ins_cost(125);
 1.12677 +  size(6) ;
 1.12678 +  ins_encode( Safepoint_Poll() );
 1.12679 +  ins_pipe( ialu_reg_mem );
 1.12680 +%}
 1.12681 +
 1.12682 +//----------PEEPHOLE RULES-----------------------------------------------------
 1.12683 +// These must follow all instruction definitions as they use the names
 1.12684 +// defined in the instructions definitions.
 1.12685 +//
 1.12686 +// peepmatch ( root_instr_name [preceding_instruction]* );
 1.12687 +//
 1.12688 +// peepconstraint %{
 1.12689 +// (instruction_number.operand_name relational_op instruction_number.operand_name
 1.12690 +//  [, ...] );
 1.12691 +// // instruction numbers are zero-based using left to right order in peepmatch
 1.12692 +//
 1.12693 +// peepreplace ( instr_name  ( [instruction_number.operand_name]* ) );
 1.12694 +// // provide an instruction_number.operand_name for each operand that appears
 1.12695 +// // in the replacement instruction's match rule
 1.12696 +//
 1.12697 +// ---------VM FLAGS---------------------------------------------------------
 1.12698 +//
 1.12699 +// All peephole optimizations can be turned off using -XX:-OptoPeephole
 1.12700 +//
 1.12701 +// Each peephole rule is given an identifying number starting with zero and
 1.12702 +// increasing by one in the order seen by the parser.  An individual peephole
 1.12703 +// can be enabled, and all others disabled, by using -XX:OptoPeepholeAt=#
 1.12704 +// on the command-line.
 1.12705 +//
 1.12706 +// ---------CURRENT LIMITATIONS----------------------------------------------
 1.12707 +//
 1.12708 +// Only match adjacent instructions in same basic block
 1.12709 +// Only equality constraints
 1.12710 +// Only constraints between operands, not (0.dest_reg == EAX_enc)
 1.12711 +// Only one replacement instruction
 1.12712 +//
 1.12713 +// ---------EXAMPLE----------------------------------------------------------
 1.12714 +//
 1.12715 +// // pertinent parts of existing instructions in architecture description
 1.12716 +// instruct movI(eRegI dst, eRegI src) %{
 1.12717 +//   match(Set dst (CopyI src));
 1.12718 +// %}
 1.12719 +//
 1.12720 +// instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
 1.12721 +//   match(Set dst (AddI dst src));
 1.12722 +//   effect(KILL cr);
 1.12723 +// %}
 1.12724 +//
 1.12725 +// // Change (inc mov) to lea
 1.12726 +// peephole %{
 1.12727 +//   // increment preceded by register-register move
 1.12728 +//   peepmatch ( incI_eReg movI );
 1.12729 +//   // require that the destination register of the increment
 1.12730 +//   // match the destination register of the move
 1.12731 +//   peepconstraint ( 0.dst == 1.dst );
 1.12732 +//   // construct a replacement instruction that sets
 1.12733 +//   // the destination to ( move's source register + one )
 1.12734 +//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
 1.12735 +// %}
 1.12736 +//
 1.12737 +// Implementation no longer uses movX instructions since
 1.12738 +// machine-independent system no longer uses CopyX nodes.
 1.12739 +//
 1.12740 +// peephole %{
 1.12741 +//   peepmatch ( incI_eReg movI );
 1.12742 +//   peepconstraint ( 0.dst == 1.dst );
 1.12743 +//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
 1.12744 +// %}
 1.12745 +//
 1.12746 +// peephole %{
 1.12747 +//   peepmatch ( decI_eReg movI );
 1.12748 +//   peepconstraint ( 0.dst == 1.dst );
 1.12749 +//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
 1.12750 +// %}
 1.12751 +//
 1.12752 +// peephole %{
 1.12753 +//   peepmatch ( addI_eReg_imm movI );
 1.12754 +//   peepconstraint ( 0.dst == 1.dst );
 1.12755 +//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
 1.12756 +// %}
 1.12757 +//
 1.12758 +// peephole %{
 1.12759 +//   peepmatch ( addP_eReg_imm movP );
 1.12760 +//   peepconstraint ( 0.dst == 1.dst );
 1.12761 +//   peepreplace ( leaP_eReg_immI( 0.dst 1.src 0.src ) );
 1.12762 +// %}
 1.12763 +
 1.12764 +// // Change load of spilled value to only a spill
 1.12765 +// instruct storeI(memory mem, eRegI src) %{
 1.12766 +//   match(Set mem (StoreI mem src));
 1.12767 +// %}
 1.12768 +//
 1.12769 +// instruct loadI(eRegI dst, memory mem) %{
 1.12770 +//   match(Set dst (LoadI mem));
 1.12771 +// %}
 1.12772 +//
 1.12773 +peephole %{
 1.12774 +  peepmatch ( loadI storeI );
 1.12775 +  peepconstraint ( 1.src == 0.dst, 1.mem == 0.mem );
 1.12776 +  peepreplace ( storeI( 1.mem 1.mem 1.src ) );
 1.12777 +%}
 1.12778 +
 1.12779 +//----------SMARTSPILL RULES---------------------------------------------------
 1.12780 +// These must follow all instruction definitions as they use the names
 1.12781 +// defined in the instructions definitions.
