src/cpu/x86/vm/assembler_x86.hpp

changeset 0: f90c822e73f8
child 6876: 710a3c8b516e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/cpu/x86/vm/assembler_x86.hpp	Wed Apr 27 01:25:04 2016 +0800
     1.3 @@ -0,0 +1,1859 @@
     1.4 +/*
     1.5 + * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.7 + *
     1.8 + * This code is free software; you can redistribute it and/or modify it
     1.9 + * under the terms of the GNU General Public License version 2 only, as
    1.10 + * published by the Free Software Foundation.
    1.11 + *
    1.12 + * This code is distributed in the hope that it will be useful, but WITHOUT
    1.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    1.14 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    1.15 + * version 2 for more details (a copy is included in the LICENSE file that
    1.16 + * accompanied this code).
    1.17 + *
    1.18 + * You should have received a copy of the GNU General Public License version
    1.19 + * 2 along with this work; if not, write to the Free Software Foundation,
    1.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    1.21 + *
    1.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    1.23 + * or visit www.oracle.com if you need additional information or have any
    1.24 + * questions.
    1.25 + *
    1.26 + */
    1.27 +
    1.28 +#ifndef CPU_X86_VM_ASSEMBLER_X86_HPP
    1.29 +#define CPU_X86_VM_ASSEMBLER_X86_HPP
    1.30 +
    1.31 +#include "asm/register.hpp"
    1.32 +
    1.33 +class BiasedLockingCounters;
    1.34 +
    1.35 +// Contains all the definitions needed for x86 assembly code generation.
    1.36 +
    1.37 +// Calling convention
    1.38 +class Argument VALUE_OBJ_CLASS_SPEC {
    1.39 + public:
    1.40 +  enum {
    1.41 +#ifdef _LP64
    1.42 +#ifdef _WIN64
    1.43 +    n_int_register_parameters_c   = 4, // rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    1.44 +    n_float_register_parameters_c = 4,  // xmm0 - xmm3 (c_farg0, c_farg1, ... )
    1.45 +#else
    1.46 +    n_int_register_parameters_c   = 6, // rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    1.47 +    n_float_register_parameters_c = 8,  // xmm0 - xmm7 (c_farg0, c_farg1, ... )
    1.48 +#endif // _WIN64
    1.49 +    n_int_register_parameters_j   = 6, // j_rarg0, j_rarg1, ...
    1.50 +    n_float_register_parameters_j = 8  // j_farg0, j_farg1, ...
    1.51 +#else
    1.52 +    n_register_parameters = 0   // 0 registers used to pass arguments
    1.53 +#endif // _LP64
    1.54 +  };
    1.55 +};
    1.56 +
    1.57 +
    1.58 +#ifdef _LP64
    1.59 +// Symbolically name the register arguments used by the c calling convention.
    1.60 +// Windows is different from linux/solaris. So much for standards...
    1.61 +
    1.62 +#ifdef _WIN64
    1.63 +
    1.64 +REGISTER_DECLARATION(Register, c_rarg0, rcx);
    1.65 +REGISTER_DECLARATION(Register, c_rarg1, rdx);
    1.66 +REGISTER_DECLARATION(Register, c_rarg2, r8);
    1.67 +REGISTER_DECLARATION(Register, c_rarg3, r9);
    1.68 +
    1.69 +REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
    1.70 +REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
    1.71 +REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
    1.72 +REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);
    1.73 +
    1.74 +#else
    1.75 +
    1.76 +REGISTER_DECLARATION(Register, c_rarg0, rdi);
    1.77 +REGISTER_DECLARATION(Register, c_rarg1, rsi);
    1.78 +REGISTER_DECLARATION(Register, c_rarg2, rdx);
    1.79 +REGISTER_DECLARATION(Register, c_rarg3, rcx);
    1.80 +REGISTER_DECLARATION(Register, c_rarg4, r8);
    1.81 +REGISTER_DECLARATION(Register, c_rarg5, r9);
    1.82 +
    1.83 +REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
    1.84 +REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
    1.85 +REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
    1.86 +REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);
    1.87 +REGISTER_DECLARATION(XMMRegister, c_farg4, xmm4);
    1.88 +REGISTER_DECLARATION(XMMRegister, c_farg5, xmm5);
    1.89 +REGISTER_DECLARATION(XMMRegister, c_farg6, xmm6);
    1.90 +REGISTER_DECLARATION(XMMRegister, c_farg7, xmm7);
    1.91 +
    1.92 +#endif // _WIN64
    1.93 +
    1.94 +// Symbolically name the register arguments used by the Java calling convention.
    1.95 +// We have control over the convention for java so we can do what we please.
    1.96 +// What pleases us is to offset the java calling convention so that when
    1.97 +// we call a suitable jni method the arguments are lined up and we don't
     1.98 +// have to do much shuffling. A suitable jni method is non-static and takes
     1.99 +// a small number of arguments (two fewer register args on windows).
   1.100 +//
   1.101 +//        |-------------------------------------------------------|
   1.102 +//        | c_rarg0   c_rarg1  c_rarg2 c_rarg3 c_rarg4 c_rarg5    |
   1.103 +//        |-------------------------------------------------------|
   1.104 +//        | rcx       rdx      r8      r9      rdi*    rsi*       | windows (* not a c_rarg)
   1.105 +//        | rdi       rsi      rdx     rcx     r8      r9         | solaris/linux
   1.106 +//        |-------------------------------------------------------|
   1.107 +//        | j_rarg5   j_rarg0  j_rarg1 j_rarg2 j_rarg3 j_rarg4    |
   1.108 +//        |-------------------------------------------------------|
   1.109 +
   1.110 +REGISTER_DECLARATION(Register, j_rarg0, c_rarg1);
   1.111 +REGISTER_DECLARATION(Register, j_rarg1, c_rarg2);
   1.112 +REGISTER_DECLARATION(Register, j_rarg2, c_rarg3);
   1.113 +// Windows runs out of register args here
   1.114 +#ifdef _WIN64
   1.115 +REGISTER_DECLARATION(Register, j_rarg3, rdi);
   1.116 +REGISTER_DECLARATION(Register, j_rarg4, rsi);
   1.117 +#else
   1.118 +REGISTER_DECLARATION(Register, j_rarg3, c_rarg4);
   1.119 +REGISTER_DECLARATION(Register, j_rarg4, c_rarg5);
   1.120 +#endif /* _WIN64 */
   1.121 +REGISTER_DECLARATION(Register, j_rarg5, c_rarg0);
   1.122 +
   1.123 +REGISTER_DECLARATION(XMMRegister, j_farg0, xmm0);
   1.124 +REGISTER_DECLARATION(XMMRegister, j_farg1, xmm1);
   1.125 +REGISTER_DECLARATION(XMMRegister, j_farg2, xmm2);
   1.126 +REGISTER_DECLARATION(XMMRegister, j_farg3, xmm3);
   1.127 +REGISTER_DECLARATION(XMMRegister, j_farg4, xmm4);
   1.128 +REGISTER_DECLARATION(XMMRegister, j_farg5, xmm5);
   1.129 +REGISTER_DECLARATION(XMMRegister, j_farg6, xmm6);
   1.130 +REGISTER_DECLARATION(XMMRegister, j_farg7, xmm7);
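To make the offset concrete, here is a hedged sketch of how a non-static JNI call lines up on linux/solaris given the declarations above (this is implied by the table; nothing below is a declaration from this file):

    // Java convention on entry:            C/JNI convention needed for the call:
    //   receiver in j_rarg0 (== c_rarg1)     c_rarg0 = JNIEnv*  (the only value to load)
    //   1st arg  in j_rarg1 (== c_rarg2)     c_rarg1 = receiver (already in place)
    //   2nd arg  in j_rarg2 (== c_rarg3)     c_rarg2 = 1st arg  (already in place)
    // so a wrapper only has to materialize the JNIEnv* in c_rarg0 -- no shuffling needed.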
   1.131 +
   1.132 +REGISTER_DECLARATION(Register, rscratch1, r10);  // volatile
   1.133 +REGISTER_DECLARATION(Register, rscratch2, r11);  // volatile
   1.134 +
   1.135 +REGISTER_DECLARATION(Register, r12_heapbase, r12); // callee-saved
   1.136 +REGISTER_DECLARATION(Register, r15_thread, r15); // callee-saved
   1.137 +
   1.138 +#else
    1.139 +// rscratch1 will appear in 32bit code that is dead but of course must compile
   1.140 +// Using noreg ensures if the dead code is incorrectly live and executed it
   1.141 +// will cause an assertion failure
   1.142 +#define rscratch1 noreg
   1.143 +#define rscratch2 noreg
   1.144 +
   1.145 +#endif // _LP64
   1.146 +
   1.147 +// JSR 292 fixed register usages:
   1.148 +REGISTER_DECLARATION(Register, rbp_mh_SP_save, rbp);
   1.149 +
   1.150 +// Address is an abstraction used to represent a memory location
   1.151 +// using any of the amd64 addressing modes with one object.
   1.152 +//
   1.153 +// Note: A register location is represented via a Register, not
   1.154 +//       via an address for efficiency & simplicity reasons.
   1.155 +
   1.156 +class ArrayAddress;
   1.157 +
   1.158 +class Address VALUE_OBJ_CLASS_SPEC {
   1.159 + public:
   1.160 +  enum ScaleFactor {
   1.161 +    no_scale = -1,
   1.162 +    times_1  =  0,
   1.163 +    times_2  =  1,
   1.164 +    times_4  =  2,
   1.165 +    times_8  =  3,
   1.166 +    times_ptr = LP64_ONLY(times_8) NOT_LP64(times_4)
   1.167 +  };
   1.168 +  static ScaleFactor times(int size) {
   1.169 +    assert(size >= 1 && size <= 8 && is_power_of_2(size), "bad scale size");
   1.170 +    if (size == 8)  return times_8;
   1.171 +    if (size == 4)  return times_4;
   1.172 +    if (size == 2)  return times_2;
   1.173 +    return times_1;
   1.174 +  }
   1.175 +  static int scale_size(ScaleFactor scale) {
   1.176 +    assert(scale != no_scale, "");
   1.177 +    assert(((1 << (int)times_1) == 1 &&
   1.178 +            (1 << (int)times_2) == 2 &&
   1.179 +            (1 << (int)times_4) == 4 &&
   1.180 +            (1 << (int)times_8) == 8), "");
   1.181 +    return (1 << (int)scale);
   1.182 +  }
   1.183 +
   1.184 + private:
   1.185 +  Register         _base;
   1.186 +  Register         _index;
   1.187 +  ScaleFactor      _scale;
   1.188 +  int              _disp;
   1.189 +  RelocationHolder _rspec;
   1.190 +
   1.191 +  // Easily misused constructors make them private
   1.192 +  // %%% can we make these go away?
   1.193 +  NOT_LP64(Address(address loc, RelocationHolder spec);)
   1.194 +  Address(int disp, address loc, relocInfo::relocType rtype);
   1.195 +  Address(int disp, address loc, RelocationHolder spec);
   1.196 +
   1.197 + public:
   1.198 +
    1.199 +  int disp() { return _disp; }
   1.200 +  // creation
   1.201 +  Address()
   1.202 +    : _base(noreg),
   1.203 +      _index(noreg),
   1.204 +      _scale(no_scale),
   1.205 +      _disp(0) {
   1.206 +  }
   1.207 +
   1.208 +  // No default displacement otherwise Register can be implicitly
   1.209 +  // converted to 0(Register) which is quite a different animal.
   1.210 +
   1.211 +  Address(Register base, int disp)
   1.212 +    : _base(base),
   1.213 +      _index(noreg),
   1.214 +      _scale(no_scale),
   1.215 +      _disp(disp) {
   1.216 +  }
   1.217 +
   1.218 +  Address(Register base, Register index, ScaleFactor scale, int disp = 0)
   1.219 +    : _base (base),
   1.220 +      _index(index),
   1.221 +      _scale(scale),
   1.222 +      _disp (disp) {
   1.223 +    assert(!index->is_valid() == (scale == Address::no_scale),
   1.224 +           "inconsistent address");
   1.225 +  }
   1.226 +
   1.227 +  Address(Register base, RegisterOrConstant index, ScaleFactor scale = times_1, int disp = 0)
   1.228 +    : _base (base),
   1.229 +      _index(index.register_or_noreg()),
   1.230 +      _scale(scale),
   1.231 +      _disp (disp + (index.constant_or_zero() * scale_size(scale))) {
   1.232 +    if (!index.is_register())  scale = Address::no_scale;
   1.233 +    assert(!_index->is_valid() == (scale == Address::no_scale),
   1.234 +           "inconsistent address");
   1.235 +  }
   1.236 +
   1.237 +  Address plus_disp(int disp) const {
   1.238 +    Address a = (*this);
   1.239 +    a._disp += disp;
   1.240 +    return a;
   1.241 +  }
   1.242 +  Address plus_disp(RegisterOrConstant disp, ScaleFactor scale = times_1) const {
   1.243 +    Address a = (*this);
   1.244 +    a._disp += disp.constant_or_zero() * scale_size(scale);
   1.245 +    if (disp.is_register()) {
   1.246 +      assert(!a.index()->is_valid(), "competing indexes");
   1.247 +      a._index = disp.as_register();
   1.248 +      a._scale = scale;
   1.249 +    }
   1.250 +    return a;
   1.251 +  }
   1.252 +  bool is_same_address(Address a) const {
   1.253 +    // disregard _rspec
   1.254 +    return _base == a._base && _disp == a._disp && _index == a._index && _scale == a._scale;
   1.255 +  }
   1.256 +
   1.257 +  // The following two overloads are used in connection with the
   1.258 +  // ByteSize type (see sizes.hpp).  They simplify the use of
   1.259 +  // ByteSize'd arguments in assembly code. Note that their equivalent
   1.260 +  // for the optimized build are the member functions with int disp
   1.261 +  // argument since ByteSize is mapped to an int type in that case.
   1.262 +  //
   1.263 +  // Note: DO NOT introduce similar overloaded functions for WordSize
   1.264 +  // arguments as in the optimized mode, both ByteSize and WordSize
   1.265 +  // are mapped to the same type and thus the compiler cannot make a
   1.266 +  // distinction anymore (=> compiler errors).
   1.267 +
   1.268 +#ifdef ASSERT
   1.269 +  Address(Register base, ByteSize disp)
   1.270 +    : _base(base),
   1.271 +      _index(noreg),
   1.272 +      _scale(no_scale),
   1.273 +      _disp(in_bytes(disp)) {
   1.274 +  }
   1.275 +
   1.276 +  Address(Register base, Register index, ScaleFactor scale, ByteSize disp)
   1.277 +    : _base(base),
   1.278 +      _index(index),
   1.279 +      _scale(scale),
   1.280 +      _disp(in_bytes(disp)) {
   1.281 +    assert(!index->is_valid() == (scale == Address::no_scale),
   1.282 +           "inconsistent address");
   1.283 +  }
   1.284 +
   1.285 +  Address(Register base, RegisterOrConstant index, ScaleFactor scale, ByteSize disp)
   1.286 +    : _base (base),
   1.287 +      _index(index.register_or_noreg()),
   1.288 +      _scale(scale),
   1.289 +      _disp (in_bytes(disp) + (index.constant_or_zero() * scale_size(scale))) {
   1.290 +    if (!index.is_register())  scale = Address::no_scale;
   1.291 +    assert(!_index->is_valid() == (scale == Address::no_scale),
   1.292 +           "inconsistent address");
   1.293 +  }
   1.294 +
   1.295 +#endif // ASSERT
   1.296 +
   1.297 +  // accessors
   1.298 +  bool        uses(Register reg) const { return _base == reg || _index == reg; }
   1.299 +  Register    base()             const { return _base;  }
   1.300 +  Register    index()            const { return _index; }
   1.301 +  ScaleFactor scale()            const { return _scale; }
   1.302 +  int         disp()             const { return _disp;  }
   1.303 +
   1.304 +  // Convert the raw encoding form into the form expected by the constructor for
   1.305 +  // Address.  An index of 4 (rsp) corresponds to having no index, so convert
   1.306 +  // that to noreg for the Address constructor.
   1.307 +  static Address make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc);
   1.308 +
   1.309 +  static Address make_array(ArrayAddress);
   1.310 +
   1.311 + private:
   1.312 +  bool base_needs_rex() const {
   1.313 +    return _base != noreg && _base->encoding() >= 8;
   1.314 +  }
   1.315 +
   1.316 +  bool index_needs_rex() const {
   1.317 +    return _index != noreg &&_index->encoding() >= 8;
   1.318 +  }
   1.319 +
   1.320 +  relocInfo::relocType reloc() const { return _rspec.type(); }
   1.321 +
   1.322 +  friend class Assembler;
   1.323 +  friend class MacroAssembler;
   1.324 +  friend class LIR_Assembler; // base/index/scale/disp
   1.325 +};
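A minimal usage sketch of the Address constructors above; the registers and displacements are arbitrary illustrations, not values taken from this header:

    Address field_addr(rbx, 8);                          // [rbx + 8]
    Address elem_addr (rax, rcx, Address::times_4, 16);  // [rax + rcx*4 + 16]
    Address next_elem = elem_addr.plus_disp(4);          // [rax + rcx*4 + 20]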
   1.326 +
   1.327 +//
   1.328 +// AddressLiteral has been split out from Address because operands of this type
   1.329 +// need to be treated specially on 32bit vs. 64bit platforms. By splitting it out
   1.330 +// the few instructions that need to deal with address literals are unique and the
   1.331 +// MacroAssembler does not have to implement every instruction in the Assembler
   1.332 +// in order to search for address literals that may need special handling depending
    1.333 +// on the instruction and the platform. As a small step on the way to merging i486/amd64
   1.334 +// directories.
   1.335 +//
   1.336 +class AddressLiteral VALUE_OBJ_CLASS_SPEC {
   1.337 +  friend class ArrayAddress;
   1.338 +  RelocationHolder _rspec;
    1.339 +  // Typically when we use AddressLiterals we want to use their rval.
    1.340 +  // However in some situations we want the lval (effective address) of the item.
   1.341 +  // We provide a special factory for making those lvals.
   1.342 +  bool _is_lval;
   1.343 +
   1.344 +  // If the target is far we'll need to load the ea of this to
   1.345 +  // a register to reach it. Otherwise if near we can do rip
   1.346 +  // relative addressing.
   1.347 +
   1.348 +  address          _target;
   1.349 +
   1.350 + protected:
   1.351 +  // creation
   1.352 +  AddressLiteral()
   1.353 +    : _is_lval(false),
   1.354 +      _target(NULL)
   1.355 +  {}
   1.356 +
   1.357 +  public:
   1.358 +
   1.359 +
   1.360 +  AddressLiteral(address target, relocInfo::relocType rtype);
   1.361 +
   1.362 +  AddressLiteral(address target, RelocationHolder const& rspec)
   1.363 +    : _rspec(rspec),
   1.364 +      _is_lval(false),
   1.365 +      _target(target)
   1.366 +  {}
   1.367 +
   1.368 +  AddressLiteral addr() {
   1.369 +    AddressLiteral ret = *this;
   1.370 +    ret._is_lval = true;
   1.371 +    return ret;
   1.372 +  }
   1.373 +
   1.374 +
   1.375 + private:
   1.376 +
   1.377 +  address target() { return _target; }
   1.378 +  bool is_lval() { return _is_lval; }
   1.379 +
   1.380 +  relocInfo::relocType reloc() const { return _rspec.type(); }
   1.381 +  const RelocationHolder& rspec() const { return _rspec; }
   1.382 +
   1.383 +  friend class Assembler;
   1.384 +  friend class MacroAssembler;
   1.385 +  friend class Address;
   1.386 +  friend class LIR_Assembler;
   1.387 +};
   1.388 +
    1.389 +// Convenience classes
   1.390 +class RuntimeAddress: public AddressLiteral {
   1.391 +
   1.392 +  public:
   1.393 +
   1.394 +  RuntimeAddress(address target) : AddressLiteral(target, relocInfo::runtime_call_type) {}
   1.395 +
   1.396 +};
   1.397 +
   1.398 +class ExternalAddress: public AddressLiteral {
   1.399 + private:
   1.400 +  static relocInfo::relocType reloc_for_target(address target) {
   1.401 +    // Sometimes ExternalAddress is used for values which aren't
   1.402 +    // exactly addresses, like the card table base.
   1.403 +    // external_word_type can't be used for values in the first page
   1.404 +    // so just skip the reloc in that case.
   1.405 +    return external_word_Relocation::can_be_relocated(target) ? relocInfo::external_word_type : relocInfo::none;
   1.406 +  }
   1.407 +
   1.408 + public:
   1.409 +
   1.410 +  ExternalAddress(address target) : AddressLiteral(target, reloc_for_target(target)) {}
   1.411 +
   1.412 +};
   1.413 +
   1.414 +class InternalAddress: public AddressLiteral {
   1.415 +
   1.416 +  public:
   1.417 +
   1.418 +  InternalAddress(address target) : AddressLiteral(target, relocInfo::internal_word_type) {}
   1.419 +
   1.420 +};
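A hedged sketch of how these wrappers are typically consumed; the AddressLiteral-taking overloads live in the MacroAssembler rather than in the plain Assembler, and the stub/counter symbols below are illustrative assumptions:

    //   call(RuntimeAddress(some_stub_entry));          // call a runtime stub by absolute address
    //   lea(rscratch1, ExternalAddress(counter_addr));  // take the lval (effective address) of external data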
   1.421 +
    1.422 +// x86 can do array addressing as a single operation since disp can be an absolute
    1.423 +// address; amd64 can't. We create a class that expresses the concept but does extra
    1.424 +// magic on amd64 to get the final result.
   1.425 +
   1.426 +class ArrayAddress VALUE_OBJ_CLASS_SPEC {
   1.427 +  private:
   1.428 +
   1.429 +  AddressLiteral _base;
   1.430 +  Address        _index;
   1.431 +
   1.432 +  public:
   1.433 +
   1.434 +  ArrayAddress() {};
   1.435 +  ArrayAddress(AddressLiteral base, Address index): _base(base), _index(index) {};
   1.436 +  AddressLiteral base() { return _base; }
   1.437 +  Address index() { return _index; }
   1.438 +
   1.439 +};
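A hedged sketch of the idea: the absolute table base comes from an AddressLiteral and the scaled index from an Address; on amd64 the macro assembler materializes the base in a register first. The jump_table symbol and index register below are illustrative only:

    //   ArrayAddress slot(ExternalAddress((address)jump_table),
    //                     Address(noreg, rbx, Address::times_8));
    //   // 32bit: encodable directly as [jump_table + rbx*8]; 64bit: extra magic as noted above.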
   1.440 +
   1.441 +const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512 / wordSize);
   1.442 +
    1.443 +// The Intel x86/AMD64 Assembler: a pure assembler doing NO optimizations on the instruction
    1.444 +// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
    1.445 +// is what you get. The Assembler generates code into a CodeBuffer.
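For example (illustrative usage only; the CodeBuffer is assumed to be set up elsewhere):

    Assembler masm(&code_buffer);
    masm.movl(rax, 0);   // emitted verbatim; never rewritten to xor eax, eax
    masm.addl(rax, 1);   // likewise, no strength reduction or instruction fusion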
   1.446 +
   1.447 +class Assembler : public AbstractAssembler  {
   1.448 +  friend class AbstractAssembler; // for the non-virtual hack
   1.449 +  friend class LIR_Assembler; // as_Address()
   1.450 +  friend class StubGenerator;
   1.451 +
   1.452 + public:
   1.453 +  enum Condition {                     // The x86 condition codes used for conditional jumps/moves.
   1.454 +    zero          = 0x4,
   1.455 +    notZero       = 0x5,
   1.456 +    equal         = 0x4,
   1.457 +    notEqual      = 0x5,
   1.458 +    less          = 0xc,
   1.459 +    lessEqual     = 0xe,
   1.460 +    greater       = 0xf,
   1.461 +    greaterEqual  = 0xd,
   1.462 +    below         = 0x2,
   1.463 +    belowEqual    = 0x6,
   1.464 +    above         = 0x7,
   1.465 +    aboveEqual    = 0x3,
   1.466 +    overflow      = 0x0,
   1.467 +    noOverflow    = 0x1,
   1.468 +    carrySet      = 0x2,
   1.469 +    carryClear    = 0x3,
   1.470 +    negative      = 0x8,
   1.471 +    positive      = 0x9,
   1.472 +    parity        = 0xa,
   1.473 +    noParity      = 0xb
   1.474 +  };
   1.475 +
   1.476 +  enum Prefix {
   1.477 +    // segment overrides
   1.478 +    CS_segment = 0x2e,
   1.479 +    SS_segment = 0x36,
   1.480 +    DS_segment = 0x3e,
   1.481 +    ES_segment = 0x26,
   1.482 +    FS_segment = 0x64,
   1.483 +    GS_segment = 0x65,
   1.484 +
   1.485 +    REX        = 0x40,
   1.486 +
   1.487 +    REX_B      = 0x41,
   1.488 +    REX_X      = 0x42,
   1.489 +    REX_XB     = 0x43,
   1.490 +    REX_R      = 0x44,
   1.491 +    REX_RB     = 0x45,
   1.492 +    REX_RX     = 0x46,
   1.493 +    REX_RXB    = 0x47,
   1.494 +
   1.495 +    REX_W      = 0x48,
   1.496 +
   1.497 +    REX_WB     = 0x49,
   1.498 +    REX_WX     = 0x4A,
   1.499 +    REX_WXB    = 0x4B,
   1.500 +    REX_WR     = 0x4C,
   1.501 +    REX_WRB    = 0x4D,
   1.502 +    REX_WRX    = 0x4E,
   1.503 +    REX_WRXB   = 0x4F,
   1.504 +
   1.505 +    VEX_3bytes = 0xC4,
   1.506 +    VEX_2bytes = 0xC5
   1.507 +  };
   1.508 +
   1.509 +  enum VexPrefix {
   1.510 +    VEX_B = 0x20,
   1.511 +    VEX_X = 0x40,
   1.512 +    VEX_R = 0x80,
   1.513 +    VEX_W = 0x80
   1.514 +  };
   1.515 +
   1.516 +  enum VexSimdPrefix {
   1.517 +    VEX_SIMD_NONE = 0x0,
   1.518 +    VEX_SIMD_66   = 0x1,
   1.519 +    VEX_SIMD_F3   = 0x2,
   1.520 +    VEX_SIMD_F2   = 0x3
   1.521 +  };
   1.522 +
   1.523 +  enum VexOpcode {
   1.524 +    VEX_OPCODE_NONE  = 0x0,
   1.525 +    VEX_OPCODE_0F    = 0x1,
   1.526 +    VEX_OPCODE_0F_38 = 0x2,
   1.527 +    VEX_OPCODE_0F_3A = 0x3
   1.528 +  };
   1.529 +
   1.530 +  enum WhichOperand {
   1.531 +    // input to locate_operand, and format code for relocations
   1.532 +    imm_operand  = 0,            // embedded 32-bit|64-bit immediate operand
   1.533 +    disp32_operand = 1,          // embedded 32-bit displacement or address
   1.534 +    call32_operand = 2,          // embedded 32-bit self-relative displacement
   1.535 +#ifndef _LP64
   1.536 +    _WhichOperand_limit = 3
   1.537 +#else
   1.538 +     narrow_oop_operand = 3,     // embedded 32-bit immediate narrow oop
   1.539 +    _WhichOperand_limit = 4
   1.540 +#endif
   1.541 +  };
   1.542 +
   1.543 +
   1.544 +
    1.545 +  // NOTE: The general philosophy of the declarations here is that 64bit versions
    1.546 +  // of instructions are freely declared without the need for wrapping them in an ifdef.
    1.547 +  // (Some dangerous instructions are ifdef'd out of inappropriate jvms.)
   1.548 +  // In the .cpp file the implementations are wrapped so that they are dropped out
   1.549 +  // of the resulting jvm. This is done mostly to keep the footprint of MINIMAL
   1.550 +  // to the size it was prior to merging up the 32bit and 64bit assemblers.
   1.551 +  //
   1.552 +  // This does mean you'll get a linker/runtime error if you use a 64bit only instruction
   1.553 +  // in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down.
   1.554 +
   1.555 +private:
   1.556 +
   1.557 +
   1.558 +  // 64bit prefixes
   1.559 +  int prefix_and_encode(int reg_enc, bool byteinst = false);
   1.560 +  int prefixq_and_encode(int reg_enc);
   1.561 +
   1.562 +  int prefix_and_encode(int dst_enc, int src_enc, bool byteinst = false);
   1.563 +  int prefixq_and_encode(int dst_enc, int src_enc);
   1.564 +
   1.565 +  void prefix(Register reg);
   1.566 +  void prefix(Address adr);
   1.567 +  void prefixq(Address adr);
   1.568 +
   1.569 +  void prefix(Address adr, Register reg,  bool byteinst = false);
   1.570 +  void prefix(Address adr, XMMRegister reg);
   1.571 +  void prefixq(Address adr, Register reg);
   1.572 +  void prefixq(Address adr, XMMRegister reg);
   1.573 +
   1.574 +  void prefetch_prefix(Address src);
   1.575 +
   1.576 +  void rex_prefix(Address adr, XMMRegister xreg,
   1.577 +                  VexSimdPrefix pre, VexOpcode opc, bool rex_w);
   1.578 +  int  rex_prefix_and_encode(int dst_enc, int src_enc,
   1.579 +                             VexSimdPrefix pre, VexOpcode opc, bool rex_w);
   1.580 +
   1.581 +  void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
   1.582 +                  int nds_enc, VexSimdPrefix pre, VexOpcode opc,
   1.583 +                  bool vector256);
   1.584 +
   1.585 +  void vex_prefix(Address adr, int nds_enc, int xreg_enc,
   1.586 +                  VexSimdPrefix pre, VexOpcode opc,
   1.587 +                  bool vex_w, bool vector256);
   1.588 +
   1.589 +  void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
   1.590 +                  VexSimdPrefix pre, bool vector256 = false) {
   1.591 +    int dst_enc = dst->encoding();
   1.592 +    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   1.593 +    vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);
   1.594 +  }
   1.595 +
   1.596 +  void vex_prefix_0F38(Register dst, Register nds, Address src) {
   1.597 +    bool vex_w = false;
   1.598 +    bool vector256 = false;
   1.599 +    vex_prefix(src, nds->encoding(), dst->encoding(),
   1.600 +               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
   1.601 +  }
   1.602 +
   1.603 +  void vex_prefix_0F38_q(Register dst, Register nds, Address src) {
   1.604 +    bool vex_w = true;
   1.605 +    bool vector256 = false;
   1.606 +    vex_prefix(src, nds->encoding(), dst->encoding(),
   1.607 +               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
   1.608 +  }
   1.609 +  int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
   1.610 +                             VexSimdPrefix pre, VexOpcode opc,
   1.611 +                             bool vex_w, bool vector256);
   1.612 +
   1.613 +  int  vex_prefix_0F38_and_encode(Register dst, Register nds, Register src) {
   1.614 +    bool vex_w = false;
   1.615 +    bool vector256 = false;
   1.616 +    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
   1.617 +                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
   1.618 +  }
   1.619 +  int  vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src) {
   1.620 +    bool vex_w = true;
   1.621 +    bool vector256 = false;
   1.622 +    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
   1.623 +                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
   1.624 +  }
   1.625 +  int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
   1.626 +                             VexSimdPrefix pre, bool vector256 = false,
   1.627 +                             VexOpcode opc = VEX_OPCODE_0F) {
   1.628 +    int src_enc = src->encoding();
   1.629 +    int dst_enc = dst->encoding();
   1.630 +    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   1.631 +    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256);
   1.632 +  }
   1.633 +
   1.634 +  void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
   1.635 +                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
   1.636 +                   bool rex_w = false, bool vector256 = false);
   1.637 +
   1.638 +  void simd_prefix(XMMRegister dst, Address src,
   1.639 +                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
   1.640 +    simd_prefix(dst, xnoreg, src, pre, opc);
   1.641 +  }
   1.642 +
   1.643 +  void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
   1.644 +    simd_prefix(src, dst, pre);
   1.645 +  }
   1.646 +  void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
   1.647 +                     VexSimdPrefix pre) {
   1.648 +    bool rex_w = true;
   1.649 +    simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
   1.650 +  }
   1.651 +
   1.652 +  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
   1.653 +                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
   1.654 +                             bool rex_w = false, bool vector256 = false);
   1.655 +
   1.656 +  // Move/convert 32-bit integer value.
   1.657 +  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
   1.658 +                             VexSimdPrefix pre) {
   1.659 +    // It is OK to cast from Register to XMMRegister to pass argument here
   1.660 +    // since only encoding is used in simd_prefix_and_encode() and number of
   1.661 +    // Gen and Xmm registers are the same.
   1.662 +    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre);
   1.663 +  }
   1.664 +  int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) {
   1.665 +    return simd_prefix_and_encode(dst, xnoreg, src, pre);
   1.666 +  }
   1.667 +  int simd_prefix_and_encode(Register dst, XMMRegister src,
   1.668 +                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
   1.669 +    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc);
   1.670 +  }
   1.671 +
   1.672 +  // Move/convert 64-bit integer value.
   1.673 +  int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
   1.674 +                               VexSimdPrefix pre) {
   1.675 +    bool rex_w = true;
   1.676 +    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w);
   1.677 +  }
   1.678 +  int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) {
   1.679 +    return simd_prefix_and_encode_q(dst, xnoreg, src, pre);
   1.680 +  }
   1.681 +  int simd_prefix_and_encode_q(Register dst, XMMRegister src,
   1.682 +                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
   1.683 +    bool rex_w = true;
   1.684 +    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w);
   1.685 +  }
   1.686 +
   1.687 +  // Helper functions for groups of instructions
   1.688 +  void emit_arith_b(int op1, int op2, Register dst, int imm8);
   1.689 +
   1.690 +  void emit_arith(int op1, int op2, Register dst, int32_t imm32);
   1.691 +  // Force generation of a 4 byte immediate value even if it fits into 8bit
   1.692 +  void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
   1.693 +  void emit_arith(int op1, int op2, Register dst, Register src);
   1.694 +
   1.695 +  void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
   1.696 +  void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
   1.697 +  void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
   1.698 +  void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
   1.699 +  void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
   1.700 +                      Address src, VexSimdPrefix pre, bool vector256);
   1.701 +  void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
   1.702 +                      XMMRegister src, VexSimdPrefix pre, bool vector256);
   1.703 +
   1.704 +  void emit_operand(Register reg,
   1.705 +                    Register base, Register index, Address::ScaleFactor scale,
   1.706 +                    int disp,
   1.707 +                    RelocationHolder const& rspec,
   1.708 +                    int rip_relative_correction = 0);
   1.709 +
   1.710 +  void emit_operand(Register reg, Address adr, int rip_relative_correction = 0);
   1.711 +
   1.712 +  // operands that only take the original 32bit registers
   1.713 +  void emit_operand32(Register reg, Address adr);
   1.714 +
   1.715 +  void emit_operand(XMMRegister reg,
   1.716 +                    Register base, Register index, Address::ScaleFactor scale,
   1.717 +                    int disp,
   1.718 +                    RelocationHolder const& rspec);
   1.719 +
   1.720 +  void emit_operand(XMMRegister reg, Address adr);
   1.721 +
   1.722 +  void emit_operand(MMXRegister reg, Address adr);
   1.723 +
   1.724 +  // workaround gcc (3.2.1-7) bug
   1.725 +  void emit_operand(Address adr, MMXRegister reg);
   1.726 +
   1.727 +
   1.728 +  // Immediate-to-memory forms
   1.729 +  void emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32);
   1.730 +
   1.731 +  void emit_farith(int b1, int b2, int i);
   1.732 +
   1.733 +
   1.734 + protected:
   1.735 +  #ifdef ASSERT
   1.736 +  void check_relocation(RelocationHolder const& rspec, int format);
   1.737 +  #endif
   1.738 +
   1.739 +  void emit_data(jint data, relocInfo::relocType    rtype, int format);
   1.740 +  void emit_data(jint data, RelocationHolder const& rspec, int format);
   1.741 +  void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
   1.742 +  void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);
   1.743 +
   1.744 +  bool reachable(AddressLiteral adr) NOT_LP64({ return true;});
   1.745 +
   1.746 +  // These are all easily abused and hence protected
   1.747 +
   1.748 +  // 32BIT ONLY SECTION
   1.749 +#ifndef _LP64
   1.750 +  // Make these disappear in 64bit mode since they would never be correct
   1.751 +  void cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec);   // 32BIT ONLY
   1.752 +  void cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY
   1.753 +
   1.754 +  void mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY
   1.755 +  void mov_literal32(Address dst, int32_t imm32, RelocationHolder const& rspec);     // 32BIT ONLY
   1.756 +
   1.757 +  void push_literal32(int32_t imm32, RelocationHolder const& rspec);                 // 32BIT ONLY
   1.758 +#else
   1.759 +  // 64BIT ONLY SECTION
   1.760 +  void mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec);   // 64BIT ONLY
   1.761 +
   1.762 +  void cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec);
   1.763 +  void cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec);
   1.764 +
   1.765 +  void mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec);
   1.766 +  void mov_narrow_oop(Address dst, int32_t imm32, RelocationHolder const& rspec);
   1.767 +#endif // _LP64
   1.768 +
   1.769 +  // These are unique in that we are ensured by the caller that the 32bit
   1.770 +  // relative in these instructions will always be able to reach the potentially
   1.771 +  // 64bit address described by entry. Since they can take a 64bit address they
   1.772 +  // don't have the 32 suffix like the other instructions in this class.
   1.773 +
   1.774 +  void call_literal(address entry, RelocationHolder const& rspec);
   1.775 +  void jmp_literal(address entry, RelocationHolder const& rspec);
   1.776 +
   1.777 +  // Avoid using directly section
   1.778 +  // Instructions in this section are actually usable by anyone without danger
    1.779 +  // of failure but have performance issues that are addressed by enhanced
    1.780 +  // instructions which will do the proper thing based on the particular cpu.
   1.781 +  // We protect them because we don't trust you...
   1.782 +
    1.783 +  // Don't use the inc() and dec() methods below directly. INC & DEC instructions
    1.784 +  // can cause a partial flag stall since they don't set the CF flag.
   1.785 +  // Use MacroAssembler::decrement() & MacroAssembler::increment() methods
   1.786 +  // which call inc() & dec() or add() & sub() in accordance with
   1.787 +  // the product flag UseIncDec value.
   1.788 +
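A minimal sketch of the selection described above; the real MacroAssembler::increment()/decrement() may differ in detail, and UseIncDec is the product flag named in the comment:

    void increment_sketch(Register reg) {
      if (UseIncDec) { incl(reg);    }  // compact encoding, but leaves CF untouched
      else           { addl(reg, 1); }  // updates all flags, avoiding the partial flag stall
    }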
   1.789 +  void decl(Register dst);
   1.790 +  void decl(Address dst);
   1.791 +  void decq(Register dst);
   1.792 +  void decq(Address dst);
   1.793 +
   1.794 +  void incl(Register dst);
   1.795 +  void incl(Address dst);
   1.796 +  void incq(Register dst);
   1.797 +  void incq(Address dst);
   1.798 +
   1.799 +  // New cpus require use of movsd and movss to avoid partial register stall
   1.800 +  // when loading from memory. But for old Opteron use movlpd instead of movsd.
   1.801 +  // The selection is done in MacroAssembler::movdbl() and movflt().
   1.802 +
   1.803 +  // Move Scalar Single-Precision Floating-Point Values
   1.804 +  void movss(XMMRegister dst, Address src);
   1.805 +  void movss(XMMRegister dst, XMMRegister src);
   1.806 +  void movss(Address dst, XMMRegister src);
   1.807 +
   1.808 +  // Move Scalar Double-Precision Floating-Point Values
   1.809 +  void movsd(XMMRegister dst, Address src);
   1.810 +  void movsd(XMMRegister dst, XMMRegister src);
   1.811 +  void movsd(Address dst, XMMRegister src);
   1.812 +  void movlpd(XMMRegister dst, Address src);
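A hedged sketch of the movdbl() selection mentioned above; the feature test shown is a placeholder, not a real VM flag:

    void movdbl_sketch(XMMRegister dst, Address src) {
      if (is_old_opteron) { movlpd(dst, src); }  // avoid the movsd load penalty on old Opteron
      else                { movsd(dst, src);  }  // preferred on newer cpus
    }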
   1.813 +
   1.814 +  // New cpus require use of movaps and movapd to avoid partial register stall
   1.815 +  // when moving between registers.
   1.816 +  void movaps(XMMRegister dst, XMMRegister src);
   1.817 +  void movapd(XMMRegister dst, XMMRegister src);
   1.818 +
   1.819 +  // End avoid using directly
   1.820 +
   1.821 +
   1.822 +  // Instruction prefixes
   1.823 +  void prefix(Prefix p);
   1.824 +
   1.825 +  public:
   1.826 +
   1.827 +  // Creation
   1.828 +  Assembler(CodeBuffer* code) : AbstractAssembler(code) {}
   1.829 +
   1.830 +  // Decoding
   1.831 +  static address locate_operand(address inst, WhichOperand which);
   1.832 +  static address locate_next_instruction(address inst);
   1.833 +
   1.834 +  // Utilities
   1.835 +  static bool is_polling_page_far() NOT_LP64({ return false;});
   1.836 +
   1.837 +  // Generic instructions
    1.838 +  // Does 32bit or 64bit as needed for the platform. In some sense these
    1.839 +  // belong in the macro assembler but there is no need for both varieties to exist.
   1.840 +
   1.841 +  void lea(Register dst, Address src);
   1.842 +
   1.843 +  void mov(Register dst, Register src);
   1.844 +
   1.845 +  void pusha();
   1.846 +  void popa();
   1.847 +
   1.848 +  void pushf();
   1.849 +  void popf();
   1.850 +
   1.851 +  void push(int32_t imm32);
   1.852 +
   1.853 +  void push(Register src);
   1.854 +
   1.855 +  void pop(Register dst);
   1.856 +
   1.857 +  // These are dummies to prevent surprise implicit conversions to Register
   1.858 +  void push(void* v);
   1.859 +  void pop(void* v);
   1.860 +
   1.861 +  // These do register sized moves/scans
   1.862 +  void rep_mov();
   1.863 +  void rep_stos();
   1.864 +  void rep_stosb();
   1.865 +  void repne_scan();
   1.866 +#ifdef _LP64
   1.867 +  void repne_scanl();
   1.868 +#endif
   1.869 +
   1.870 +  // Vanilla instructions in lexical order
   1.871 +
   1.872 +  void adcl(Address dst, int32_t imm32);
   1.873 +  void adcl(Address dst, Register src);
   1.874 +  void adcl(Register dst, int32_t imm32);
   1.875 +  void adcl(Register dst, Address src);
   1.876 +  void adcl(Register dst, Register src);
   1.877 +
   1.878 +  void adcq(Register dst, int32_t imm32);
   1.879 +  void adcq(Register dst, Address src);
   1.880 +  void adcq(Register dst, Register src);
   1.881 +
   1.882 +  void addl(Address dst, int32_t imm32);
   1.883 +  void addl(Address dst, Register src);
   1.884 +  void addl(Register dst, int32_t imm32);
   1.885 +  void addl(Register dst, Address src);
   1.886 +  void addl(Register dst, Register src);
   1.887 +
   1.888 +  void addq(Address dst, int32_t imm32);
   1.889 +  void addq(Address dst, Register src);
   1.890 +  void addq(Register dst, int32_t imm32);
   1.891 +  void addq(Register dst, Address src);
   1.892 +  void addq(Register dst, Register src);
   1.893 +
   1.894 +  void addr_nop_4();
   1.895 +  void addr_nop_5();
   1.896 +  void addr_nop_7();
   1.897 +  void addr_nop_8();
   1.898 +
   1.899 +  // Add Scalar Double-Precision Floating-Point Values
   1.900 +  void addsd(XMMRegister dst, Address src);
   1.901 +  void addsd(XMMRegister dst, XMMRegister src);
   1.902 +
   1.903 +  // Add Scalar Single-Precision Floating-Point Values
   1.904 +  void addss(XMMRegister dst, Address src);
   1.905 +  void addss(XMMRegister dst, XMMRegister src);
   1.906 +
   1.907 +  // AES instructions
   1.908 +  void aesdec(XMMRegister dst, Address src);
   1.909 +  void aesdec(XMMRegister dst, XMMRegister src);
   1.910 +  void aesdeclast(XMMRegister dst, Address src);
   1.911 +  void aesdeclast(XMMRegister dst, XMMRegister src);
   1.912 +  void aesenc(XMMRegister dst, Address src);
   1.913 +  void aesenc(XMMRegister dst, XMMRegister src);
   1.914 +  void aesenclast(XMMRegister dst, Address src);
   1.915 +  void aesenclast(XMMRegister dst, XMMRegister src);
   1.916 +
   1.917 +
   1.918 +  void andl(Address  dst, int32_t imm32);
   1.919 +  void andl(Register dst, int32_t imm32);
   1.920 +  void andl(Register dst, Address src);
   1.921 +  void andl(Register dst, Register src);
   1.922 +
   1.923 +  void andq(Address  dst, int32_t imm32);
   1.924 +  void andq(Register dst, int32_t imm32);
   1.925 +  void andq(Register dst, Address src);
   1.926 +  void andq(Register dst, Register src);
   1.927 +
   1.928 +  // BMI instructions
   1.929 +  void andnl(Register dst, Register src1, Register src2);
   1.930 +  void andnl(Register dst, Register src1, Address src2);
   1.931 +  void andnq(Register dst, Register src1, Register src2);
   1.932 +  void andnq(Register dst, Register src1, Address src2);
   1.933 +
   1.934 +  void blsil(Register dst, Register src);
   1.935 +  void blsil(Register dst, Address src);
   1.936 +  void blsiq(Register dst, Register src);
   1.937 +  void blsiq(Register dst, Address src);
   1.938 +
   1.939 +  void blsmskl(Register dst, Register src);
   1.940 +  void blsmskl(Register dst, Address src);
   1.941 +  void blsmskq(Register dst, Register src);
   1.942 +  void blsmskq(Register dst, Address src);
   1.943 +
   1.944 +  void blsrl(Register dst, Register src);
   1.945 +  void blsrl(Register dst, Address src);
   1.946 +  void blsrq(Register dst, Register src);
   1.947 +  void blsrq(Register dst, Address src);
   1.948 +
   1.949 +  void bsfl(Register dst, Register src);
   1.950 +  void bsrl(Register dst, Register src);
   1.951 +
   1.952 +#ifdef _LP64
   1.953 +  void bsfq(Register dst, Register src);
   1.954 +  void bsrq(Register dst, Register src);
   1.955 +#endif
   1.956 +
   1.957 +  void bswapl(Register reg);
   1.958 +
   1.959 +  void bswapq(Register reg);
   1.960 +
   1.961 +  void call(Label& L, relocInfo::relocType rtype);
   1.962 +  void call(Register reg);  // push pc; pc <- reg
   1.963 +  void call(Address adr);   // push pc; pc <- adr
   1.964 +
   1.965 +  void cdql();
   1.966 +
   1.967 +  void cdqq();
   1.968 +
   1.969 +  void cld();
   1.970 +
   1.971 +  void clflush(Address adr);
   1.972 +
   1.973 +  void cmovl(Condition cc, Register dst, Register src);
   1.974 +  void cmovl(Condition cc, Register dst, Address src);
   1.975 +
   1.976 +  void cmovq(Condition cc, Register dst, Register src);
   1.977 +  void cmovq(Condition cc, Register dst, Address src);
   1.978 +
   1.979 +
   1.980 +  void cmpb(Address dst, int imm8);
   1.981 +
   1.982 +  void cmpl(Address dst, int32_t imm32);
   1.983 +
   1.984 +  void cmpl(Register dst, int32_t imm32);
   1.985 +  void cmpl(Register dst, Register src);
   1.986 +  void cmpl(Register dst, Address src);
   1.987 +
   1.988 +  void cmpq(Address dst, int32_t imm32);
   1.989 +  void cmpq(Address dst, Register src);
   1.990 +
   1.991 +  void cmpq(Register dst, int32_t imm32);
   1.992 +  void cmpq(Register dst, Register src);
   1.993 +  void cmpq(Register dst, Address src);
   1.994 +
    1.995 +  // these are dummies used to catch attempts to convert NULL to Register
   1.996 +  void cmpl(Register dst, void* junk); // dummy
   1.997 +  void cmpq(Register dst, void* junk); // dummy
   1.998 +
   1.999 +  void cmpw(Address dst, int imm16);
  1.1000 +
  1.1001 +  void cmpxchg8 (Address adr);
  1.1002 +
  1.1003 +  void cmpxchgl(Register reg, Address adr);
  1.1004 +
  1.1005 +  void cmpxchgq(Register reg, Address adr);
  1.1006 +
  1.1007 +  // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
  1.1008 +  void comisd(XMMRegister dst, Address src);
  1.1009 +  void comisd(XMMRegister dst, XMMRegister src);
  1.1010 +
  1.1011 +  // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
  1.1012 +  void comiss(XMMRegister dst, Address src);
  1.1013 +  void comiss(XMMRegister dst, XMMRegister src);
  1.1014 +
  1.1015 +  // Identify processor type and features
  1.1016 +  void cpuid();
  1.1017 +
  1.1018 +  // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
  1.1019 +  void cvtsd2ss(XMMRegister dst, XMMRegister src);
  1.1020 +  void cvtsd2ss(XMMRegister dst, Address src);
  1.1021 +
  1.1022 +  // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
  1.1023 +  void cvtsi2sdl(XMMRegister dst, Register src);
  1.1024 +  void cvtsi2sdl(XMMRegister dst, Address src);
  1.1025 +  void cvtsi2sdq(XMMRegister dst, Register src);
  1.1026 +  void cvtsi2sdq(XMMRegister dst, Address src);
  1.1027 +
  1.1028 +  // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
  1.1029 +  void cvtsi2ssl(XMMRegister dst, Register src);
  1.1030 +  void cvtsi2ssl(XMMRegister dst, Address src);
  1.1031 +  void cvtsi2ssq(XMMRegister dst, Register src);
  1.1032 +  void cvtsi2ssq(XMMRegister dst, Address src);
  1.1033 +
  1.1034 +  // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
  1.1035 +  void cvtdq2pd(XMMRegister dst, XMMRegister src);
  1.1036 +
  1.1037 +  // Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value
  1.1038 +  void cvtdq2ps(XMMRegister dst, XMMRegister src);
  1.1039 +
  1.1040 +  // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
  1.1041 +  void cvtss2sd(XMMRegister dst, XMMRegister src);
  1.1042 +  void cvtss2sd(XMMRegister dst, Address src);
  1.1043 +
  1.1044 +  // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
  1.1045 +  void cvttsd2sil(Register dst, Address src);
  1.1046 +  void cvttsd2sil(Register dst, XMMRegister src);
  1.1047 +  void cvttsd2siq(Register dst, XMMRegister src);
  1.1048 +
  1.1049 +  // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer
  1.1050 +  void cvttss2sil(Register dst, XMMRegister src);
  1.1051 +  void cvttss2siq(Register dst, XMMRegister src);
  1.1052 +
  1.1053 +  // Divide Scalar Double-Precision Floating-Point Values
  1.1054 +  void divsd(XMMRegister dst, Address src);
  1.1055 +  void divsd(XMMRegister dst, XMMRegister src);
  1.1056 +
  1.1057 +  // Divide Scalar Single-Precision Floating-Point Values
  1.1058 +  void divss(XMMRegister dst, Address src);
  1.1059 +  void divss(XMMRegister dst, XMMRegister src);
  1.1060 +
  1.1061 +  void emms();
  1.1062 +
  1.1063 +  void fabs();
  1.1064 +
  1.1065 +  void fadd(int i);
  1.1066 +
  1.1067 +  void fadd_d(Address src);
  1.1068 +  void fadd_s(Address src);
  1.1069 +
  1.1070 +  // "Alternate" versions of x87 instructions place result down in FPU
  1.1071 +  // stack instead of on TOS
  1.1072 +
  1.1073 +  void fadda(int i); // "alternate" fadd
  1.1074 +  void faddp(int i = 1);
  1.1075 +
  1.1076 +  void fchs();
  1.1077 +
  1.1078 +  void fcom(int i);
  1.1079 +
  1.1080 +  void fcomp(int i = 1);
  1.1081 +  void fcomp_d(Address src);
  1.1082 +  void fcomp_s(Address src);
  1.1083 +
  1.1084 +  void fcompp();
  1.1085 +
  1.1086 +  void fcos();
  1.1087 +
  1.1088 +  void fdecstp();
  1.1089 +
  1.1090 +  void fdiv(int i);
  1.1091 +  void fdiv_d(Address src);
  1.1092 +  void fdivr_s(Address src);
  1.1093 +  void fdiva(int i);  // "alternate" fdiv
  1.1094 +  void fdivp(int i = 1);
  1.1095 +
  1.1096 +  void fdivr(int i);
  1.1097 +  void fdivr_d(Address src);
  1.1098 +  void fdiv_s(Address src);
  1.1099 +
  1.1100 +  void fdivra(int i); // "alternate" reversed fdiv
  1.1101 +
  1.1102 +  void fdivrp(int i = 1);
  1.1103 +
  1.1104 +  void ffree(int i = 0);
  1.1105 +
  1.1106 +  void fild_d(Address adr);
  1.1107 +  void fild_s(Address adr);
  1.1108 +
  1.1109 +  void fincstp();
  1.1110 +
  1.1111 +  void finit();
  1.1112 +
  1.1113 +  void fist_s (Address adr);
  1.1114 +  void fistp_d(Address adr);
  1.1115 +  void fistp_s(Address adr);
  1.1116 +
  1.1117 +  void fld1();
  1.1118 +
  1.1119 +  void fld_d(Address adr);
  1.1120 +  void fld_s(Address adr);
  1.1121 +  void fld_s(int index);
  1.1122 +  void fld_x(Address adr);  // extended-precision (80-bit) format
  1.1123 +
  1.1124 +  void fldcw(Address src);
  1.1125 +
  1.1126 +  void fldenv(Address src);
  1.1127 +
  1.1128 +  void fldlg2();
  1.1129 +
  1.1130 +  void fldln2();
  1.1131 +
  1.1132 +  void fldz();
  1.1133 +
  1.1134 +  void flog();
  1.1135 +  void flog10();
  1.1136 +
  1.1137 +  void fmul(int i);
  1.1138 +
  1.1139 +  void fmul_d(Address src);
  1.1140 +  void fmul_s(Address src);
  1.1141 +
  1.1142 +  void fmula(int i);  // "alternate" fmul
  1.1143 +
  1.1144 +  void fmulp(int i = 1);
  1.1145 +
  1.1146 +  void fnsave(Address dst);
  1.1147 +
  1.1148 +  void fnstcw(Address src);
  1.1149 +
  1.1150 +  void fnstsw_ax();
  1.1151 +
  1.1152 +  void fprem();
  1.1153 +  void fprem1();
  1.1154 +
  1.1155 +  void frstor(Address src);
  1.1156 +
  1.1157 +  void fsin();
  1.1158 +
  1.1159 +  void fsqrt();
  1.1160 +
  1.1161 +  void fst_d(Address adr);
  1.1162 +  void fst_s(Address adr);
  1.1163 +
  1.1164 +  void fstp_d(Address adr);
  1.1165 +  void fstp_d(int index);
  1.1166 +  void fstp_s(Address adr);
  1.1167 +  void fstp_x(Address adr); // extended-precision (80-bit) format
  1.1168 +
  1.1169 +  void fsub(int i);
  1.1170 +  void fsub_d(Address src);
  1.1171 +  void fsub_s(Address src);
  1.1172 +
  1.1173 +  void fsuba(int i);  // "alternate" fsub
  1.1174 +
  1.1175 +  void fsubp(int i = 1);
  1.1176 +
  1.1177 +  void fsubr(int i);
  1.1178 +  void fsubr_d(Address src);
  1.1179 +  void fsubr_s(Address src);
  1.1180 +
  1.1181 +  void fsubra(int i); // "alternate" reversed fsub
  1.1182 +
  1.1183 +  void fsubrp(int i = 1);
  1.1184 +
  1.1185 +  void ftan();
  1.1186 +
  1.1187 +  void ftst();
  1.1188 +
  1.1189 +  void fucomi(int i = 1);
  1.1190 +  void fucomip(int i = 1);
  1.1191 +
  1.1192 +  void fwait();
  1.1193 +
  1.1194 +  void fxch(int i = 1);
  1.1195 +
  1.1196 +  void fxrstor(Address src);
  1.1197 +
  1.1198 +  void fxsave(Address dst);
  1.1199 +
  1.1200 +  void fyl2x();
  1.1201 +  void frndint();
  1.1202 +  void f2xm1();
  1.1203 +  void fldl2e();
  1.1204 +
  1.1205 +  void hlt();
  1.1206 +
  1.1207 +  void idivl(Register src);
  1.1208 +  void divl(Register src); // Unsigned division
  1.1209 +
  1.1210 +  void idivq(Register src);
  1.1211 +
  1.1212 +  void imull(Register dst, Register src);
  1.1213 +  void imull(Register dst, Register src, int value);
  1.1214 +  void imull(Register dst, Address src);
  1.1215 +
  1.1216 +  void imulq(Register dst, Register src);
  1.1217 +  void imulq(Register dst, Register src, int value);
  1.1218 +#ifdef _LP64
  1.1219 +  void imulq(Register dst, Address src);
  1.1220 +#endif
  1.1221 +
  1.1222 +
   1.1223 +  // jcc is the generic conditional branch generator to run-
   1.1224 +  // time routines; jcc is used for branches to labels. jcc
  1.1225 +  // takes a branch opcode (cc) and a label (L) and generates
  1.1226 +  // either a backward branch or a forward branch and links it
  1.1227 +  // to the label fixup chain. Usage:
  1.1228 +  //
  1.1229 +  // Label L;      // unbound label
  1.1230 +  // jcc(cc, L);   // forward branch to unbound label
  1.1231 +  // bind(L);      // bind label to the current pc
  1.1232 +  // jcc(cc, L);   // backward branch to bound label
  1.1233 +  // bind(L);      // illegal: a label may be bound only once
  1.1234 +  //
  1.1235 +  // Note: The same Label can be used for forward and backward branches
  1.1236 +  // but it may be bound only once.
  1.1237 +
  1.1238 +  void jcc(Condition cc, Label& L, bool maybe_short = true);
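A concrete (illustrative) shape of the pattern above, counting a register down to zero; the register and label names are arbitrary:

    //   Label loop_top;
    //   bind(loop_top);                      // backward target
    //   addl(rcx, -1);
    //   jcc(Assembler::notZero, loop_top);   // branch back while rcx != 0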
  1.1239 +
   1.1240 +  // Conditional jump to an 8-bit offset to L.
  1.1241 +  // WARNING: be very careful using this for forward jumps.  If the label is
  1.1242 +  // not bound within an 8-bit offset of this instruction, a run-time error
  1.1243 +  // will occur.
  1.1244 +  void jccb(Condition cc, Label& L);
  1.1245 +
  1.1246 +  void jmp(Address entry);    // pc <- entry
  1.1247 +
  1.1248 +  // Label operations & relative jumps (PPUM Appendix D)
  1.1249 +  void jmp(Label& L, bool maybe_short = true);   // unconditional jump to L
  1.1250 +
  1.1251 +  void jmp(Register entry); // pc <- entry
  1.1252 +
  1.1253 +  // Unconditional 8-bit offset jump to L.
  1.1254 +  // WARNING: be very careful using this for forward jumps.  If the label is
  1.1255 +  // not bound within an 8-bit offset of this instruction, a run-time error
  1.1256 +  // will occur.
  1.1257 +  void jmpb(Label& L);
  1.1258 +
  1.1259 +  void ldmxcsr( Address src );
  1.1260 +
  1.1261 +  void leal(Register dst, Address src);
  1.1262 +
  1.1263 +  void leaq(Register dst, Address src);
  1.1264 +
  1.1265 +  void lfence();
  1.1266 +
  1.1267 +  void lock();
  1.1268 +
  1.1269 +  void lzcntl(Register dst, Register src);
  1.1270 +
  1.1271 +#ifdef _LP64
  1.1272 +  void lzcntq(Register dst, Register src);
  1.1273 +#endif
  1.1274 +
  1.1275 +  enum Membar_mask_bits {
  1.1276 +    StoreStore = 1 << 3,
  1.1277 +    LoadStore  = 1 << 2,
  1.1278 +    StoreLoad  = 1 << 1,
  1.1279 +    LoadLoad   = 1 << 0
  1.1280 +  };
  1.1281 +
  1.1282 +  // Serializes memory and blows flags
  1.1283 +  void membar(Membar_mask_bits order_constraint) {
  1.1284 +    if (os::is_MP()) {
  1.1285 +      // We only have to handle StoreLoad
  1.1286 +      if (order_constraint & StoreLoad) {
  1.1287 +        // All usable chips support "locked" instructions which suffice
  1.1288 +        // as barriers, and are much faster than the alternative of
   1.1289 +        // using the cpuid instruction. Here we use a locked add [esp],0.
  1.1290 +        // This is conveniently otherwise a no-op except for blowing
  1.1291 +        // flags.
  1.1292 +        // Any change to this code may need to revisit other places in
  1.1293 +        // the code where this idiom is used, in particular the
  1.1294 +        // orderAccess code.
  1.1295 +        lock();
   1.1296 +        addl(Address(rsp, 0), 0); // Assert the lock# signal here
  1.1297 +      }
  1.1298 +    }
  1.1299 +  }
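Usage sketch (illustrative): callers pass a mask built from the bits above, e.g. a StoreLoad fence; the other bits need no code on x86, as the implementation above shows:

    //   membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));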
  1.1300 +
  1.1301 +  void mfence();
  1.1302 +
  1.1303 +  // Moves
  1.1304 +
  1.1305 +  void mov64(Register dst, int64_t imm64);
  1.1306 +
  1.1307 +  void movb(Address dst, Register src);
  1.1308 +  void movb(Address dst, int imm8);
  1.1309 +  void movb(Register dst, Address src);
  1.1310 +
  1.1311 +  void movdl(XMMRegister dst, Register src);
  1.1312 +  void movdl(Register dst, XMMRegister src);
  1.1313 +  void movdl(XMMRegister dst, Address src);
  1.1314 +  void movdl(Address dst, XMMRegister src);
  1.1315 +
  1.1316 +  // Move Double Quadword
  1.1317 +  void movdq(XMMRegister dst, Register src);
  1.1318 +  void movdq(Register dst, XMMRegister src);
  1.1319 +
  1.1320 +  // Move Aligned Double Quadword
  1.1321 +  void movdqa(XMMRegister dst, XMMRegister src);
  1.1322 +  void movdqa(XMMRegister dst, Address src);
  1.1323 +
  1.1324 +  // Move Unaligned Double Quadword
  1.1325 +  void movdqu(Address     dst, XMMRegister src);
  1.1326 +  void movdqu(XMMRegister dst, Address src);
  1.1327 +  void movdqu(XMMRegister dst, XMMRegister src);
  1.1328 +
  1.1329 +  // Move Unaligned 256bit Vector
  1.1330 +  void vmovdqu(Address dst, XMMRegister src);
  1.1331 +  void vmovdqu(XMMRegister dst, Address src);
  1.1332 +  void vmovdqu(XMMRegister dst, XMMRegister src);
  1.1333 +
  1.1334 +  // Move lower 64bit to high 64bit in 128bit register
  1.1335 +  void movlhps(XMMRegister dst, XMMRegister src);
  1.1336 +
  1.1337 +  void movl(Register dst, int32_t imm32);
  1.1338 +  void movl(Address dst, int32_t imm32);
  1.1339 +  void movl(Register dst, Register src);
  1.1340 +  void movl(Register dst, Address src);
  1.1341 +  void movl(Address dst, Register src);
  1.1342 +
  1.1343 +  // These dummies prevent a zero argument (like NULL) from being converted
  1.1344 +  // into a Register, by giving the compiler two choices it can't resolve
  1.1345 +
  1.1346 +  void movl(Address  dst, void* junk);
  1.1347 +  void movl(Register dst, void* junk);
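         +  // Sketch of the ambiguity this relies on (illustrative only):
         +  //   __ movl(rax, 0);     // fine: matches movl(Register, int32_t)
         +  //   __ movl(rax, NULL);  // rejected: NULL also matches the void* dummy,
         +  //                        // so overload resolution is ambiguous and a zero
         +  //                        // cannot silently become a register operand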
  1.1348 +
  1.1349 +#ifdef _LP64
  1.1350 +  void movq(Register dst, Register src);
  1.1351 +  void movq(Register dst, Address src);
  1.1352 +  void movq(Address  dst, Register src);
  1.1353 +#endif
  1.1354 +
  1.1355 +  void movq(Address     dst, MMXRegister src );
  1.1356 +  void movq(MMXRegister dst, Address src );
  1.1357 +
  1.1358 +#ifdef _LP64
  1.1359 +  // These dummies prevent a zero argument (like NULL) from being converted
  1.1360 +  // into a Register, by giving the compiler two choices it can't resolve
  1.1361 +
  1.1362 +  void movq(Address  dst, void* dummy);
  1.1363 +  void movq(Register dst, void* dummy);
  1.1364 +#endif
  1.1365 +
  1.1366 +  // Move Quadword
  1.1367 +  void movq(Address     dst, XMMRegister src);
  1.1368 +  void movq(XMMRegister dst, Address src);
  1.1369 +
  1.1370 +  void movsbl(Register dst, Address src);
  1.1371 +  void movsbl(Register dst, Register src);
  1.1372 +
  1.1373 +#ifdef _LP64
  1.1374 +  void movsbq(Register dst, Address src);
  1.1375 +  void movsbq(Register dst, Register src);
  1.1376 +
  1.1377 +  // Move a signed 32bit immediate to a 64bit destination, sign-extending
  1.1378 +  void movslq(Address  dst, int32_t imm64);
  1.1379 +  void movslq(Register dst, int32_t imm64);
  1.1380 +
  1.1381 +  void movslq(Register dst, Address src);
  1.1382 +  void movslq(Register dst, Register src);
  1.1383 +  void movslq(Register dst, void* src); // Dummy declaration to cause NULL to be ambiguous
  1.1384 +#endif
  1.1385 +
  1.1386 +  void movswl(Register dst, Address src);
  1.1387 +  void movswl(Register dst, Register src);
  1.1388 +
  1.1389 +#ifdef _LP64
  1.1390 +  void movswq(Register dst, Address src);
  1.1391 +  void movswq(Register dst, Register src);
  1.1392 +#endif
  1.1393 +
  1.1394 +  void movw(Address dst, int imm16);
  1.1395 +  void movw(Register dst, Address src);
  1.1396 +  void movw(Address dst, Register src);
  1.1397 +
  1.1398 +  void movzbl(Register dst, Address src);
  1.1399 +  void movzbl(Register dst, Register src);
  1.1400 +
  1.1401 +#ifdef _LP64
  1.1402 +  void movzbq(Register dst, Address src);
  1.1403 +  void movzbq(Register dst, Register src);
  1.1404 +#endif
  1.1405 +
  1.1406 +  void movzwl(Register dst, Address src);
  1.1407 +  void movzwl(Register dst, Register src);
  1.1408 +
  1.1409 +#ifdef _LP64
  1.1410 +  void movzwq(Register dst, Address src);
  1.1411 +  void movzwq(Register dst, Register src);
  1.1412 +#endif
  1.1413 +
  1.1414 +  void mull(Address src);
  1.1415 +  void mull(Register src);
  1.1416 +
  1.1417 +  // Multiply Scalar Double-Precision Floating-Point Values
  1.1418 +  void mulsd(XMMRegister dst, Address src);
  1.1419 +  void mulsd(XMMRegister dst, XMMRegister src);
  1.1420 +
  1.1421 +  // Multiply Scalar Single-Precision Floating-Point Values
  1.1422 +  void mulss(XMMRegister dst, Address src);
  1.1423 +  void mulss(XMMRegister dst, XMMRegister src);
  1.1424 +
  1.1425 +  void negl(Register dst);
  1.1426 +
  1.1427 +#ifdef _LP64
  1.1428 +  void negq(Register dst);
  1.1429 +#endif
  1.1430 +
  1.1431 +  void nop(int i = 1);
  1.1432 +
  1.1433 +  void notl(Register dst);
  1.1434 +
  1.1435 +#ifdef _LP64
  1.1436 +  void notq(Register dst);
  1.1437 +#endif
  1.1438 +
  1.1439 +  void orl(Address dst, int32_t imm32);
  1.1440 +  void orl(Register dst, int32_t imm32);
  1.1441 +  void orl(Register dst, Address src);
  1.1442 +  void orl(Register dst, Register src);
  1.1443 +
  1.1444 +  void orq(Address dst, int32_t imm32);
  1.1445 +  void orq(Register dst, int32_t imm32);
  1.1446 +  void orq(Register dst, Address src);
  1.1447 +  void orq(Register dst, Register src);
  1.1448 +
  1.1449 +  // Pack with unsigned saturation
  1.1450 +  void packuswb(XMMRegister dst, XMMRegister src);
  1.1451 +  void packuswb(XMMRegister dst, Address src);
  1.1452 +  void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1453 +
  1.1454 +  // Permutation of 64bit words
  1.1455 +  void vpermq(XMMRegister dst, XMMRegister src, int imm8, bool vector256);
  1.1456 +
  1.1457 +  void pause();
  1.1458 +
  1.1459 +  // SSE4.2 string instructions
  1.1460 +  void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
  1.1461 +  void pcmpestri(XMMRegister xmm1, Address src, int imm8);
  1.1462 +
  1.1463 +  // SSE 4.1 extract
  1.1464 +  void pextrd(Register dst, XMMRegister src, int imm8);
  1.1465 +  void pextrq(Register dst, XMMRegister src, int imm8);
  1.1466 +
  1.1467 +  // SSE 4.1 insert
  1.1468 +  void pinsrd(XMMRegister dst, Register src, int imm8);
  1.1469 +  void pinsrq(XMMRegister dst, Register src, int imm8);
  1.1470 +
  1.1471 +  // SSE4.1 packed move
  1.1472 +  void pmovzxbw(XMMRegister dst, XMMRegister src);
  1.1473 +  void pmovzxbw(XMMRegister dst, Address src);
  1.1474 +
  1.1475 +#ifndef _LP64 // no 32bit push/pop on amd64
  1.1476 +  void popl(Address dst);
  1.1477 +#endif
  1.1478 +
  1.1479 +#ifdef _LP64
  1.1480 +  void popq(Address dst);
  1.1481 +#endif
  1.1482 +
  1.1483 +  void popcntl(Register dst, Address src);
  1.1484 +  void popcntl(Register dst, Register src);
  1.1485 +
  1.1486 +#ifdef _LP64
  1.1487 +  void popcntq(Register dst, Address src);
  1.1488 +  void popcntq(Register dst, Register src);
  1.1489 +#endif
  1.1490 +
  1.1491 +  // Prefetches (SSE, SSE2, 3DNOW only)
  1.1492 +
  1.1493 +  void prefetchnta(Address src);
  1.1494 +  void prefetchr(Address src);
  1.1495 +  void prefetcht0(Address src);
  1.1496 +  void prefetcht1(Address src);
  1.1497 +  void prefetcht2(Address src);
  1.1498 +  void prefetchw(Address src);
  1.1499 +
  1.1500 +  // Shuffle Bytes
  1.1501 +  void pshufb(XMMRegister dst, XMMRegister src);
  1.1502 +  void pshufb(XMMRegister dst, Address src);
  1.1503 +
  1.1504 +  // Shuffle Packed Doublewords
  1.1505 +  void pshufd(XMMRegister dst, XMMRegister src, int mode);
  1.1506 +  void pshufd(XMMRegister dst, Address src,     int mode);
  1.1507 +
  1.1508 +  // Shuffle Packed Low Words
  1.1509 +  void pshuflw(XMMRegister dst, XMMRegister src, int mode);
  1.1510 +  void pshuflw(XMMRegister dst, Address src,     int mode);
  1.1511 +
  1.1512 +  // Shift Logical DoubleQuadword Right by the given number of bytes (immediate)
  1.1513 +  void psrldq(XMMRegister dst, int shift);
  1.1514 +
  1.1515 +  // Logical Compare 128bit
  1.1516 +  void ptest(XMMRegister dst, XMMRegister src);
  1.1517 +  void ptest(XMMRegister dst, Address src);
  1.1518 +  // Logical Compare 256bit
  1.1519 +  void vptest(XMMRegister dst, XMMRegister src);
  1.1520 +  void vptest(XMMRegister dst, Address src);
  1.1521 +
  1.1522 +  // Interleave Low Bytes
  1.1523 +  void punpcklbw(XMMRegister dst, XMMRegister src);
  1.1524 +  void punpcklbw(XMMRegister dst, Address src);
  1.1525 +
  1.1526 +  // Interleave Low Doublewords
  1.1527 +  void punpckldq(XMMRegister dst, XMMRegister src);
  1.1528 +  void punpckldq(XMMRegister dst, Address src);
  1.1529 +
  1.1530 +  // Interleave Low Quadwords
  1.1531 +  void punpcklqdq(XMMRegister dst, XMMRegister src);
  1.1532 +
  1.1533 +#ifndef _LP64 // no 32bit push/pop on amd64
  1.1534 +  void pushl(Address src);
  1.1535 +#endif
  1.1536 +
  1.1537 +  void pushq(Address src);
  1.1538 +
  1.1539 +  void rcll(Register dst, int imm8);
  1.1540 +
  1.1541 +  void rclq(Register dst, int imm8);
  1.1542 +
  1.1543 +  void rdtsc();
  1.1544 +
  1.1545 +  void ret(int imm16);
  1.1546 +
  1.1547 +  void sahf();
  1.1548 +
  1.1549 +  void sarl(Register dst, int imm8);
  1.1550 +  void sarl(Register dst);
  1.1551 +
  1.1552 +  void sarq(Register dst, int imm8);
  1.1553 +  void sarq(Register dst);
  1.1554 +
  1.1555 +  void sbbl(Address dst, int32_t imm32);
  1.1556 +  void sbbl(Register dst, int32_t imm32);
  1.1557 +  void sbbl(Register dst, Address src);
  1.1558 +  void sbbl(Register dst, Register src);
  1.1559 +
  1.1560 +  void sbbq(Address dst, int32_t imm32);
  1.1561 +  void sbbq(Register dst, int32_t imm32);
  1.1562 +  void sbbq(Register dst, Address src);
  1.1563 +  void sbbq(Register dst, Register src);
  1.1564 +
  1.1565 +  void setb(Condition cc, Register dst);
  1.1566 +
  1.1567 +  void shldl(Register dst, Register src);
  1.1568 +
  1.1569 +  void shll(Register dst, int imm8);
  1.1570 +  void shll(Register dst);
  1.1571 +
  1.1572 +  void shlq(Register dst, int imm8);
  1.1573 +  void shlq(Register dst);
  1.1574 +
  1.1575 +  void shrdl(Register dst, Register src);
  1.1576 +
  1.1577 +  void shrl(Register dst, int imm8);
  1.1578 +  void shrl(Register dst);
  1.1579 +
  1.1580 +  void shrq(Register dst, int imm8);
  1.1581 +  void shrq(Register dst);
  1.1582 +
  1.1583 +  void smovl(); // QQQ generic?
  1.1584 +
  1.1585 +  // Compute Square Root of Scalar Double-Precision Floating-Point Value
  1.1586 +  void sqrtsd(XMMRegister dst, Address src);
  1.1587 +  void sqrtsd(XMMRegister dst, XMMRegister src);
  1.1588 +
  1.1589 +  // Compute Square Root of Scalar Single-Precision Floating-Point Value
  1.1590 +  void sqrtss(XMMRegister dst, Address src);
  1.1591 +  void sqrtss(XMMRegister dst, XMMRegister src);
  1.1592 +
  1.1593 +  void std();
  1.1594 +
  1.1595 +  void stmxcsr( Address dst );
  1.1596 +
  1.1597 +  void subl(Address dst, int32_t imm32);
  1.1598 +  void subl(Address dst, Register src);
  1.1599 +  void subl(Register dst, int32_t imm32);
  1.1600 +  void subl(Register dst, Address src);
  1.1601 +  void subl(Register dst, Register src);
  1.1602 +
  1.1603 +  void subq(Address dst, int32_t imm32);
  1.1604 +  void subq(Address dst, Register src);
  1.1605 +  void subq(Register dst, int32_t imm32);
  1.1606 +  void subq(Register dst, Address src);
  1.1607 +  void subq(Register dst, Register src);
  1.1608 +
  1.1609 +  // Force generation of a 4-byte immediate value even if it fits into 8 bits
  1.1610 +  void subl_imm32(Register dst, int32_t imm32);
  1.1611 +  void subq_imm32(Register dst, int32_t imm32);
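         +  // Illustrative use (assumption, not from this changeset): forcing the
         +  // 32-bit form keeps the instruction length independent of the operand
         +  // value, which matters when the emitted code must have a fixed size, e.g.
         +  //   __ subq_imm32(rsp, frame_size_in_bytes);  // hypothetical value; the
         +  //                                             // encoding length no longer
         +  //                                             // depends on whether it fits
         +  //                                             // in 8 bits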
  1.1612 +
  1.1613 +  // Subtract Scalar Double-Precision Floating-Point Values
  1.1614 +  void subsd(XMMRegister dst, Address src);
  1.1615 +  void subsd(XMMRegister dst, XMMRegister src);
  1.1616 +
  1.1617 +  // Subtract Scalar Single-Precision Floating-Point Values
  1.1618 +  void subss(XMMRegister dst, Address src);
  1.1619 +  void subss(XMMRegister dst, XMMRegister src);
  1.1620 +
  1.1621 +  void testb(Register dst, int imm8);
  1.1622 +
  1.1623 +  void testl(Register dst, int32_t imm32);
  1.1624 +  void testl(Register dst, Register src);
  1.1625 +  void testl(Register dst, Address src);
  1.1626 +
  1.1627 +  void testq(Register dst, int32_t imm32);
  1.1628 +  void testq(Register dst, Register src);
  1.1629 +
  1.1630 +  // BMI - count trailing zeros
  1.1631 +  void tzcntl(Register dst, Register src);
  1.1632 +  void tzcntq(Register dst, Register src);
  1.1633 +
  1.1634 +  // Unordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
  1.1635 +  void ucomisd(XMMRegister dst, Address src);
  1.1636 +  void ucomisd(XMMRegister dst, XMMRegister src);
  1.1637 +
  1.1638 +  // Unordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
  1.1639 +  void ucomiss(XMMRegister dst, Address src);
  1.1640 +  void ucomiss(XMMRegister dst, XMMRegister src);
  1.1641 +
  1.1642 +  void xabort(int8_t imm8);
  1.1643 +
  1.1644 +  void xaddl(Address dst, Register src);
  1.1645 +
  1.1646 +  void xaddq(Address dst, Register src);
  1.1647 +
  1.1648 +  void xbegin(Label& abort, relocInfo::relocType rtype = relocInfo::none);
  1.1649 +
  1.1650 +  void xchgl(Register reg, Address adr);
  1.1651 +  void xchgl(Register dst, Register src);
  1.1652 +
  1.1653 +  void xchgq(Register reg, Address adr);
  1.1654 +  void xchgq(Register dst, Register src);
  1.1655 +
  1.1656 +  void xend();
  1.1657 +
  1.1658 +  // Get Value of Extended Control Register
  1.1659 +  void xgetbv();
  1.1660 +
  1.1661 +  void xorl(Register dst, int32_t imm32);
  1.1662 +  void xorl(Register dst, Address src);
  1.1663 +  void xorl(Register dst, Register src);
  1.1664 +
  1.1665 +  void xorq(Register dst, Address src);
  1.1666 +  void xorq(Register dst, Register src);
  1.1667 +
  1.1668 +  void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
  1.1669 +
  1.1670 +  // AVX 3-operands scalar instructions (encoded with VEX prefix)
  1.1671 +
  1.1672 +  void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
  1.1673 +  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  1.1674 +  void vaddss(XMMRegister dst, XMMRegister nds, Address src);
  1.1675 +  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  1.1676 +  void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
  1.1677 +  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  1.1678 +  void vdivss(XMMRegister dst, XMMRegister nds, Address src);
  1.1679 +  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  1.1680 +  void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
  1.1681 +  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  1.1682 +  void vmulss(XMMRegister dst, XMMRegister nds, Address src);
  1.1683 +  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  1.1684 +  void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
  1.1685 +  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  1.1686 +  void vsubss(XMMRegister dst, XMMRegister nds, Address src);
  1.1687 +  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  1.1688 +
  1.1689 +
  1.1690 +  //====================VECTOR ARITHMETIC=====================================
  1.1691 +
  1.1692 +  // Add Packed Floating-Point Values
  1.1693 +  void addpd(XMMRegister dst, XMMRegister src);
  1.1694 +  void addps(XMMRegister dst, XMMRegister src);
  1.1695 +  void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1696 +  void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1697 +  void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1698 +  void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1699 +
  1.1700 +  // Subtract Packed Floating-Point Values
  1.1701 +  void subpd(XMMRegister dst, XMMRegister src);
  1.1702 +  void subps(XMMRegister dst, XMMRegister src);
  1.1703 +  void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1704 +  void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1705 +  void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1706 +  void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1707 +
  1.1708 +  // Multiply Packed Floating-Point Values
  1.1709 +  void mulpd(XMMRegister dst, XMMRegister src);
  1.1710 +  void mulps(XMMRegister dst, XMMRegister src);
  1.1711 +  void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1712 +  void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1713 +  void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1714 +  void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1715 +
  1.1716 +  // Divide Packed Floating-Point Values
  1.1717 +  void divpd(XMMRegister dst, XMMRegister src);
  1.1718 +  void divps(XMMRegister dst, XMMRegister src);
  1.1719 +  void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1720 +  void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1721 +  void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1722 +  void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1723 +
  1.1724 +  // Bitwise Logical AND of Packed Floating-Point Values
  1.1725 +  void andpd(XMMRegister dst, XMMRegister src);
  1.1726 +  void andps(XMMRegister dst, XMMRegister src);
  1.1727 +  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1728 +  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1729 +  void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1730 +  void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1731 +
  1.1732 +  // Bitwise Logical XOR of Packed Floating-Point Values
  1.1733 +  void xorpd(XMMRegister dst, XMMRegister src);
  1.1734 +  void xorps(XMMRegister dst, XMMRegister src);
  1.1735 +  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1736 +  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1737 +  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1738 +  void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1739 +
  1.1740 +  // Add packed integers
  1.1741 +  void paddb(XMMRegister dst, XMMRegister src);
  1.1742 +  void paddw(XMMRegister dst, XMMRegister src);
  1.1743 +  void paddd(XMMRegister dst, XMMRegister src);
  1.1744 +  void paddq(XMMRegister dst, XMMRegister src);
  1.1745 +  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1746 +  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1747 +  void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1748 +  void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1749 +  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1750 +  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1751 +  void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1752 +  void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1753 +
  1.1754 +  // Sub packed integers
  1.1755 +  void psubb(XMMRegister dst, XMMRegister src);
  1.1756 +  void psubw(XMMRegister dst, XMMRegister src);
  1.1757 +  void psubd(XMMRegister dst, XMMRegister src);
  1.1758 +  void psubq(XMMRegister dst, XMMRegister src);
  1.1759 +  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1760 +  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1761 +  void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1762 +  void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1763 +  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1764 +  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1765 +  void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1766 +  void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1767 +
  1.1768 +  // Multiply packed integers (only shorts and ints)
  1.1769 +  void pmullw(XMMRegister dst, XMMRegister src);
  1.1770 +  void pmulld(XMMRegister dst, XMMRegister src);
  1.1771 +  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1772 +  void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1773 +  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1774 +  void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1775 +
  1.1776 +  // Shift left packed integers
  1.1777 +  void psllw(XMMRegister dst, int shift);
  1.1778 +  void pslld(XMMRegister dst, int shift);
  1.1779 +  void psllq(XMMRegister dst, int shift);
  1.1780 +  void psllw(XMMRegister dst, XMMRegister shift);
  1.1781 +  void pslld(XMMRegister dst, XMMRegister shift);
  1.1782 +  void psllq(XMMRegister dst, XMMRegister shift);
  1.1783 +  void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
  1.1784 +  void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
  1.1785 +  void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
  1.1786 +  void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
  1.1787 +  void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
  1.1788 +  void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
  1.1789 +
  1.1790 +  // Logical shift right packed integers
  1.1791 +  void psrlw(XMMRegister dst, int shift);
  1.1792 +  void psrld(XMMRegister dst, int shift);
  1.1793 +  void psrlq(XMMRegister dst, int shift);
  1.1794 +  void psrlw(XMMRegister dst, XMMRegister shift);
  1.1795 +  void psrld(XMMRegister dst, XMMRegister shift);
  1.1796 +  void psrlq(XMMRegister dst, XMMRegister shift);
  1.1797 +  void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
  1.1798 +  void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
  1.1799 +  void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
  1.1800 +  void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
  1.1801 +  void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
  1.1802 +  void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
  1.1803 +
  1.1804 +  // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
  1.1805 +  void psraw(XMMRegister dst, int shift);
  1.1806 +  void psrad(XMMRegister dst, int shift);
  1.1807 +  void psraw(XMMRegister dst, XMMRegister shift);
  1.1808 +  void psrad(XMMRegister dst, XMMRegister shift);
  1.1809 +  void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
  1.1810 +  void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256);
  1.1811 +  void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
  1.1812 +  void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
  1.1813 +
  1.1814 +  // And packed integers
  1.1815 +  void pand(XMMRegister dst, XMMRegister src);
  1.1816 +  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1817 +  void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1818 +
  1.1819 +  // Or packed integers
  1.1820 +  void por(XMMRegister dst, XMMRegister src);
  1.1821 +  void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1822 +  void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1823 +
  1.1824 +  // Xor packed integers
  1.1825 +  void pxor(XMMRegister dst, XMMRegister src);
  1.1826 +  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
  1.1827 +  void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  1.1828 +
  1.1829 +  // Copy low 128bit into high 128bit of YMM registers.
  1.1830 +  void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
  1.1831 +  void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
  1.1832 +
  1.1833 +  // Load/store the high 128bit of a YMM register without destroying the other half.
  1.1834 +  void vinsertf128h(XMMRegister dst, Address src);
  1.1835 +  void vinserti128h(XMMRegister dst, Address src);
  1.1836 +  void vextractf128h(Address dst, XMMRegister src);
  1.1837 +  void vextracti128h(Address dst, XMMRegister src);
  1.1838 +
  1.1839 +  // Duplicate the 4-byte integer in src into 8 locations in dst
  1.1840 +  void vpbroadcastd(XMMRegister dst, XMMRegister src);
  1.1841 +
  1.1842 +  // Carry-Less Multiplication Quadword
  1.1843 +  void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
  1.1844 +
  1.1845 +  // AVX instruction which is used to clear upper 128 bits of YMM registers and
  1.1846 +  // to avoid the transition penalty between AVX and SSE states. There is no
  1.1847 +  // penalty if legacy SSE instructions are encoded using VEX prefix because
  1.1848 +  // they always clear upper 128 bits. It should be used before calling
  1.1849 +  // runtime code and native libraries.
  1.1850 +  void vzeroupper();
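         +  // Typical call-site sketch (assumes the UseAVX flag and a MacroAssembler
         +  // context; illustrative only):
         +  //   if (UseAVX > 0) {
         +  //     __ vzeroupper();               // clear dirty upper YMM state
         +  //   }
         +  //   __ call(RuntimeAddress(entry));  // 'entry' is a placeholder target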
  1.1851 +
  1.1852 + protected:
  1.1853 +  // The following instructions require 16-byte address alignment in SSE mode.
  1.1854 +  // They should be called only from corresponding MacroAssembler instructions.
  1.1855 +  void andpd(XMMRegister dst, Address src);
  1.1856 +  void andps(XMMRegister dst, Address src);
  1.1857 +  void xorpd(XMMRegister dst, Address src);
  1.1858 +  void xorps(XMMRegister dst, Address src);
  1.1859 +
  1.1860 +};
  1.1861 +
  1.1862 +#endif // CPU_X86_VM_ASSEMBLER_X86_HPP
