src/cpu/x86/vm/x86.ad

changeset 3882:8c92982cbbc4
parent    3577:9b8ce46870df
child     3886:6f8f439e247d
     1.1 --- a/src/cpu/x86/vm/x86.ad	Thu Jun 14 14:59:52 2012 -0700
     1.2 +++ b/src/cpu/x86/vm/x86.ad	Fri Jun 15 01:25:19 2012 -0700
     1.3 @@ -24,6 +24,456 @@
     1.4  
     1.5  // X86 Common Architecture Description File
     1.6  
     1.7 +//----------REGISTER DEFINITION BLOCK------------------------------------------
     1.8 +// This information is used by the matcher and the register allocator to
     1.9 +// describe individual registers and classes of registers within the target
     1.10 +// architecture.
    1.11 +
    1.12 +register %{
    1.13 +//----------Architecture Description Register Definitions----------------------
    1.14 +// General Registers
    1.15 +// "reg_def"  name ( register save type, C convention save type,
    1.16 +//                   ideal register type, encoding );
    1.17 +// Register Save Types:
    1.18 +//
    1.19 +// NS  = No-Save:       The register allocator assumes that these registers
    1.20 +//                      can be used without saving upon entry to the method, &
    1.21 +//                      that they do not need to be saved at call sites.
    1.22 +//
    1.23 +// SOC = Save-On-Call:  The register allocator assumes that these registers
    1.24 +//                      can be used without saving upon entry to the method,
    1.25 +//                      but that they must be saved at call sites.
    1.26 +//
    1.27 +// SOE = Save-On-Entry: The register allocator assumes that these registers
    1.28 +//                      must be saved before using them upon entry to the
    1.29 +//                      method, but they do not need to be saved at call
    1.30 +//                      sites.
    1.31 +//
    1.32 +// AS  = Always-Save:   The register allocator assumes that these registers
    1.33 +//                      must be saved before using them upon entry to the
    1.34 +//                      method, & that they must be saved at call sites.
    1.35 +//
    1.36 +// Ideal Register Type is used to determine how to save & restore a
    1.37 +// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
    1.38 +// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
    1.39 +//
    1.40 +// The encoding number is the actual bit-pattern placed into the opcodes.
    1.41 +
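As a reading aid (editorial, not part of the patch), the first XMM definition below
maps onto the reg_def signature described above as follows:

    reg_def XMM0 ( SOC,  SOC,  Op_RegF,  0,  xmm0->as_VMReg() );
    //             |     |     |         |   `- concrete VMReg backing this register slot
    //             |     |     |         `----- encoding bits placed into the opcodes
    //             |     |     `--------------- ideal register type (spilled as a Float)
    //             |     `--------------------- C calling convention save type (save-on-call)
    //             `--------------------------- register allocator save type (save-on-call)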
     1.42 +// XMM registers.  256-bit registers of 8 words each, labeled (a)-h.
    1.43 +// Word a in each register holds a Float, words ab hold a Double.
    1.44 +// The whole registers are used in SSE4.2 version intrinsics,
    1.45 +// array copy stubs and superword operations (see UseSSE42Intrinsics,
     1.46 +// UseXMMForArrayCopy and UseSuperWord flags).
    1.47 +// XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
    1.48 +// Linux ABI:   No register preserved across function calls
    1.49 +//              XMM0-XMM7 might hold parameters
    1.50 +// Windows ABI: XMM6-XMM15 preserved across function calls
    1.51 +//              XMM0-XMM3 might hold parameters
    1.52 +
    1.53 +reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
    1.54 +reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next());
    1.55 +reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next());
    1.56 +reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next());
    1.57 +reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next());
    1.58 +reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next());
    1.59 +reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next()->next());
    1.60 +reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
    1.61 +
    1.62 +reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
    1.63 +reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next());
    1.64 +reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next());
    1.65 +reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next());
    1.66 +reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next());
    1.67 +reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next());
    1.68 +reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next()->next());
    1.69 +reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
    1.70 +
    1.71 +reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
    1.72 +reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next());
    1.73 +reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next());
    1.74 +reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next());
    1.75 +reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next());
    1.76 +reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next());
    1.77 +reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next()->next());
    1.78 +reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
    1.79 +
    1.80 +reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
    1.81 +reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next());
    1.82 +reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next());
    1.83 +reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next());
    1.84 +reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next());
    1.85 +reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next());
    1.86 +reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next()->next());
    1.87 +reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
    1.88 +
    1.89 +reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
    1.90 +reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next());
    1.91 +reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next());
    1.92 +reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next());
    1.93 +reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next());
    1.94 +reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next());
    1.95 +reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next()->next());
    1.96 +reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
    1.97 +
    1.98 +reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
    1.99 +reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next());
   1.100 +reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next());
   1.101 +reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next());
   1.102 +reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next());
   1.103 +reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next());
   1.104 +reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.105 +reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.106 +
   1.107 +#ifdef _WIN64
   1.108 +
   1.109 +reg_def XMM6 ( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg());
   1.110 +reg_def XMM6b( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next());
   1.111 +reg_def XMM6c( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next());
   1.112 +reg_def XMM6d( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next());
   1.113 +reg_def XMM6e( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next());
   1.114 +reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next());
   1.115 +reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.116 +reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.117 +
   1.118 +reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
   1.119 +reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next());
   1.120 +reg_def XMM7c( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next());
   1.121 +reg_def XMM7d( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next());
   1.122 +reg_def XMM7e( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next());
   1.123 +reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next());
   1.124 +reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.125 +reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.126 +
   1.127 +reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
   1.128 +reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next());
   1.129 +reg_def XMM8c( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next());
   1.130 +reg_def XMM8d( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next());
   1.131 +reg_def XMM8e( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next());
   1.132 +reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next());
   1.133 +reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.134 +reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.135 +
   1.136 +reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
   1.137 +reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next());
   1.138 +reg_def XMM9c( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next());
   1.139 +reg_def XMM9d( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next());
   1.140 +reg_def XMM9e( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next());
   1.141 +reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next());
   1.142 +reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.143 +reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.144 +
   1.145 +reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
   1.146 +reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next());
   1.147 +reg_def XMM10c( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next());
   1.148 +reg_def XMM10d( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next());
   1.149 +reg_def XMM10e( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next());
   1.150 +reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next());
   1.151 +reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.152 +reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.153 +
   1.154 +reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
   1.155 +reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next());
   1.156 +reg_def XMM11c( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next());
   1.157 +reg_def XMM11d( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next());
   1.158 +reg_def XMM11e( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next());
   1.159 +reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next());
   1.160 +reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.161 +reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.162 +
   1.163 +reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
   1.164 +reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next());
   1.165 +reg_def XMM12c( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next());
   1.166 +reg_def XMM12d( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next());
   1.167 +reg_def XMM12e( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next());
   1.168 +reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next());
   1.169 +reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.170 +reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.171 +
   1.172 +reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
   1.173 +reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next());
   1.174 +reg_def XMM13c( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next());
   1.175 +reg_def XMM13d( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next());
   1.176 +reg_def XMM13e( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next());
   1.177 +reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next());
   1.178 +reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.179 +reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.180 +
   1.181 +reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
   1.182 +reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next());
   1.183 +reg_def XMM14c( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next());
   1.184 +reg_def XMM14d( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next());
   1.185 +reg_def XMM14e( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next());
   1.186 +reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next());
   1.187 +reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.188 +reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.189 +
   1.190 +reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
   1.191 +reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next());
   1.192 +reg_def XMM15c( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next());
   1.193 +reg_def XMM15d( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next());
   1.194 +reg_def XMM15e( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next());
   1.195 +reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next());
   1.196 +reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.197 +reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.198 +
   1.199 +#else // _WIN64
   1.200 +
   1.201 +reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
   1.202 +reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next());
   1.203 +reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next());
   1.204 +reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next());
   1.205 +reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next());
   1.206 +reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next());
   1.207 +reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.208 +reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.209 +
   1.210 +reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
   1.211 +reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next());
   1.212 +reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next());
   1.213 +reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next());
   1.214 +reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next());
   1.215 +reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next());
   1.216 +reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.217 +reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.218 +
   1.219 +#ifdef _LP64
   1.220 +
   1.221 +reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
   1.222 +reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next());
   1.223 +reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next());
   1.224 +reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next());
   1.225 +reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next());
   1.226 +reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next());
   1.227 +reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.228 +reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.229 +
   1.230 +reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
   1.231 +reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next());
   1.232 +reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next());
   1.233 +reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next());
   1.234 +reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next());
   1.235 +reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next());
   1.236 +reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.237 +reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.238 +
   1.239 +reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
   1.240 +reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next());
   1.241 +reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next());
   1.242 +reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next());
   1.243 +reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next());
   1.244 +reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next());
   1.245 +reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.246 +reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.247 +
   1.248 +reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
   1.249 +reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next());
   1.250 +reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next());
   1.251 +reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next());
   1.252 +reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next());
   1.253 +reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next());
   1.254 +reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.255 +reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.256 +
   1.257 +reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
   1.258 +reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next());
   1.259 +reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next());
   1.260 +reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next());
   1.261 +reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next());
   1.262 +reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next());
   1.263 +reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.264 +reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.265 +
   1.266 +reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
   1.267 +reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next());
   1.268 +reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next());
   1.269 +reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next());
   1.270 +reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next());
   1.271 +reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next());
   1.272 +reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.273 +reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.274 +
   1.275 +reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
   1.276 +reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next());
   1.277 +reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next());
   1.278 +reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next());
   1.279 +reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next());
   1.280 +reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next());
   1.281 +reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.282 +reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.283 +
   1.284 +reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
   1.285 +reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next());
   1.286 +reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next());
   1.287 +reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next());
   1.288 +reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next());
   1.289 +reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next());
   1.290 +reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next());
   1.291 +reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
   1.292 +
   1.293 +#endif // _LP64
   1.294 +
   1.295 +#endif // _WIN64
   1.296 +
   1.297 +#ifdef _LP64
   1.298 +reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
   1.299 +#else
   1.300 +reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
   1.301 +#endif // _LP64
   1.302 +
   1.303 +alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
   1.304 +                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
   1.305 +                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
   1.306 +                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
   1.307 +                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
   1.308 +                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
   1.309 +                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
   1.310 +                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
   1.311 +#ifdef _LP64
   1.312 +                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
   1.313 +                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
   1.314 +                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
   1.315 +                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
   1.316 +                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
   1.317 +                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
   1.318 +                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
   1.319 +                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
   1.320 +#endif
   1.321 +                   );
   1.322 +
   1.323 +// flags allocation class should be last.
   1.324 +alloc_class chunk2(RFLAGS);
   1.325 +
   1.326 +// Singleton class for condition codes
   1.327 +reg_class int_flags(RFLAGS);
   1.328 +
   1.329 +// Class for all float registers
   1.330 +reg_class float_reg(XMM0,
   1.331 +                    XMM1,
   1.332 +                    XMM2,
   1.333 +                    XMM3,
   1.334 +                    XMM4,
   1.335 +                    XMM5,
   1.336 +                    XMM6,
   1.337 +                    XMM7
   1.338 +#ifdef _LP64
   1.339 +                   ,XMM8,
   1.340 +                    XMM9,
   1.341 +                    XMM10,
   1.342 +                    XMM11,
   1.343 +                    XMM12,
   1.344 +                    XMM13,
   1.345 +                    XMM14,
   1.346 +                    XMM15
   1.347 +#endif
   1.348 +                    );
   1.349 +
   1.350 +// Class for all double registers
   1.351 +reg_class double_reg(XMM0,  XMM0b,
   1.352 +                     XMM1,  XMM1b,
   1.353 +                     XMM2,  XMM2b,
   1.354 +                     XMM3,  XMM3b,
   1.355 +                     XMM4,  XMM4b,
   1.356 +                     XMM5,  XMM5b,
   1.357 +                     XMM6,  XMM6b,
   1.358 +                     XMM7,  XMM7b
   1.359 +#ifdef _LP64
   1.360 +                    ,XMM8,  XMM8b,
   1.361 +                     XMM9,  XMM9b,
   1.362 +                     XMM10, XMM10b,
   1.363 +                     XMM11, XMM11b,
   1.364 +                     XMM12, XMM12b,
   1.365 +                     XMM13, XMM13b,
   1.366 +                     XMM14, XMM14b,
   1.367 +                     XMM15, XMM15b
   1.368 +#endif
   1.369 +                     );
   1.370 +
   1.371 +// Class for all 32bit vector registers
   1.372 +reg_class vectors_reg(XMM0,
   1.373 +                      XMM1,
   1.374 +                      XMM2,
   1.375 +                      XMM3,
   1.376 +                      XMM4,
   1.377 +                      XMM5,
   1.378 +                      XMM6,
   1.379 +                      XMM7
   1.380 +#ifdef _LP64
   1.381 +                     ,XMM8,
   1.382 +                      XMM9,
   1.383 +                      XMM10,
   1.384 +                      XMM11,
   1.385 +                      XMM12,
   1.386 +                      XMM13,
   1.387 +                      XMM14,
   1.388 +                      XMM15
   1.389 +#endif
   1.390 +                      );
   1.391 +
   1.392 +// Class for all 64bit vector registers
   1.393 +reg_class vectord_reg(XMM0,  XMM0b,
   1.394 +                      XMM1,  XMM1b,
   1.395 +                      XMM2,  XMM2b,
   1.396 +                      XMM3,  XMM3b,
   1.397 +                      XMM4,  XMM4b,
   1.398 +                      XMM5,  XMM5b,
   1.399 +                      XMM6,  XMM6b,
   1.400 +                      XMM7,  XMM7b
   1.401 +#ifdef _LP64
   1.402 +                     ,XMM8,  XMM8b,
   1.403 +                      XMM9,  XMM9b,
   1.404 +                      XMM10, XMM10b,
   1.405 +                      XMM11, XMM11b,
   1.406 +                      XMM12, XMM12b,
   1.407 +                      XMM13, XMM13b,
   1.408 +                      XMM14, XMM14b,
   1.409 +                      XMM15, XMM15b
   1.410 +#endif
   1.411 +                      );
   1.412 +
   1.413 +// Class for all 128bit vector registers
   1.414 +reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
   1.415 +                      XMM1,  XMM1b,  XMM1c,  XMM1d,
   1.416 +                      XMM2,  XMM2b,  XMM2c,  XMM2d,
   1.417 +                      XMM3,  XMM3b,  XMM3c,  XMM3d,
   1.418 +                      XMM4,  XMM4b,  XMM4c,  XMM4d,
   1.419 +                      XMM5,  XMM5b,  XMM5c,  XMM5d,
   1.420 +                      XMM6,  XMM6b,  XMM6c,  XMM6d,
   1.421 +                      XMM7,  XMM7b,  XMM7c,  XMM7d
   1.422 +#ifdef _LP64
   1.423 +                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
   1.424 +                      XMM9,  XMM9b,  XMM9c,  XMM9d,
   1.425 +                      XMM10, XMM10b, XMM10c, XMM10d,
   1.426 +                      XMM11, XMM11b, XMM11c, XMM11d,
   1.427 +                      XMM12, XMM12b, XMM12c, XMM12d,
   1.428 +                      XMM13, XMM13b, XMM13c, XMM13d,
   1.429 +                      XMM14, XMM14b, XMM14c, XMM14d,
   1.430 +                      XMM15, XMM15b, XMM15c, XMM15d
   1.431 +#endif
   1.432 +                      );
   1.433 +
   1.434 +// Class for all 256bit vector registers
   1.435 +reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
   1.436 +                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
   1.437 +                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
   1.438 +                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
   1.439 +                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
   1.440 +                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
   1.441 +                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
   1.442 +                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
   1.443 +#ifdef _LP64
   1.444 +                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
   1.445 +                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
   1.446 +                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
   1.447 +                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
   1.448 +                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
   1.449 +                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
   1.450 +                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
   1.451 +                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
   1.452 +#endif
   1.453 +                      );
   1.454 +
   1.455 +%}
   1.456 +
   1.457  source %{
   1.458    // Float masks come from different places depending on platform.
   1.459  #ifdef _LP64
   1.460 @@ -38,6 +488,252 @@
   1.461    static address double_signflip() { return (address)double_signflip_pool; }
   1.462  #endif
   1.463  
   1.464 +// Map Types to machine register types
   1.465 +const int Matcher::base2reg[Type::lastype] = {
   1.466 +  Node::NotAMachineReg,0,0, Op_RegI, Op_RegL, 0, Op_RegN,
   1.467 +  Node::NotAMachineReg, Node::NotAMachineReg, /* tuple, array */
   1.468 +  Op_VecS, Op_VecD, Op_VecX, Op_VecY, /* Vectors */
   1.469 +  Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, /* the pointers */
   1.470 +  0, 0/*abio*/,
   1.471 +  Op_RegP /* Return address */, 0, /* the memories */
   1.472 +  Op_RegF, Op_RegF, Op_RegF, Op_RegD, Op_RegD, Op_RegD,
   1.473 +  0  /*bottom*/
   1.474 +};
   1.475 +
   1.476 +// Max vector size in bytes. 0 if not supported.
   1.477 +const int Matcher::vector_width_in_bytes(BasicType bt) {
   1.478 +  assert(is_java_primitive(bt), "only primitive type vectors");
   1.479 +  if (UseSSE < 2) return 0;
   1.480 +  // SSE2 supports 128bit vectors for all types.
   1.481 +  // AVX2 supports 256bit vectors for all types.
   1.482 +  int size = (UseAVX > 1) ? 32 : 16;
   1.483 +  // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
   1.484 +  if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
   1.485 +    size = 32;
   1.486 +  // Use flag to limit vector size.
   1.487 +  size = MIN2(size,(int)MaxVectorSize);
   1.488 +  // Minimum 2 values in vector (or 4 for bytes).
   1.489 +  switch (bt) {
   1.490 +  case T_DOUBLE:
   1.491 +  case T_LONG:
   1.492 +    if (size < 16) return 0;
   1.493 +  case T_FLOAT:
   1.494 +  case T_INT:
   1.495 +    if (size < 8) return 0;
   1.496 +  case T_BOOLEAN:
   1.497 +  case T_BYTE:
   1.498 +  case T_CHAR:
   1.499 +  case T_SHORT:
   1.500 +    if (size < 4) return 0;
   1.501 +    break;
   1.502 +  default:
   1.503 +    ShouldNotReachHere();
   1.504 +  }
   1.505 +  return size;
   1.506 +}
   1.507 +
   1.508 +// Limits on vector size (number of elements) loaded into vector.
   1.509 +const int Matcher::max_vector_size(const BasicType bt) {
   1.510 +  return vector_width_in_bytes(bt)/type2aelembytes(bt);
   1.511 +}
   1.512 +const int Matcher::min_vector_size(const BasicType bt) {
   1.513 +  int max_size = max_vector_size(bt);
   1.514 +  // Min size which can be loaded into vector is 4 bytes.
   1.515 +  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
   1.516 +  return MIN2(size,max_size);
   1.517 +}
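A few worked cases of the sizing rules above, assuming MaxVectorSize does not limit
the hardware width further (editorial illustration, not part of the patch):

    // UseSSE >= 2, UseAVX == 0 : vector_width_in_bytes(T_INT)   == 16
    //                            max_vector_size(T_INT)         == 16/4 == 4
    //                            max_vector_size(T_BYTE)        == 16/1 == 16
    // UseAVX == 1              : vector_width_in_bytes(T_FLOAT) == 32 (FLOAT/DOUBLE only)
    //                            vector_width_in_bytes(T_INT)   == 16
    // UseAVX >= 2              : vector_width_in_bytes(T_INT)   == 32
    // min_vector_size(T_BYTE)  == 4 (4-byte minimum load); all other element types == 2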
   1.518 +
    1.519 +// Vector ideal reg corresponding to specified size in bytes
   1.520 +const int Matcher::vector_ideal_reg(int size) {
   1.521 +  assert(MaxVectorSize >= size, "");
   1.522 +  switch(size) {
   1.523 +    case  4: return Op_VecS;
   1.524 +    case  8: return Op_VecD;
   1.525 +    case 16: return Op_VecX;
   1.526 +    case 32: return Op_VecY;
   1.527 +  }
   1.528 +  ShouldNotReachHere();
   1.529 +  return 0;
   1.530 +}
   1.531 +
    1.532 +// x86 supports misaligned vector stores/loads.
   1.533 +const bool Matcher::misaligned_vectors_ok() {
   1.534 +  return !AlignVector; // can be changed by flag
   1.535 +}
   1.536 +
   1.537 +// Helper methods for MachSpillCopyNode::implementation().
   1.538 +static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
   1.539 +                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
    1.540 +  // In the 64-bit VM the size calculation is complex, so the size is obtained by
    1.541 +  // emitting the instructions into a scratch buffer; do_size is only used in the 32-bit VM.
   1.542 +  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
   1.543 +  assert(ireg == Op_VecS || // 32bit vector
   1.544 +         (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
   1.545 +         (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
   1.546 +         "no non-adjacent vector moves" );
   1.547 +  if (cbuf) {
   1.548 +    MacroAssembler _masm(cbuf);
   1.549 +    int offset = __ offset();
   1.550 +    switch (ireg) {
   1.551 +    case Op_VecS: // copy whole register
   1.552 +    case Op_VecD:
   1.553 +    case Op_VecX:
   1.554 +      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
   1.555 +      break;
   1.556 +    case Op_VecY:
   1.557 +      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
   1.558 +      break;
   1.559 +    default:
   1.560 +      ShouldNotReachHere();
   1.561 +    }
   1.562 +    int size = __ offset() - offset;
   1.563 +#ifdef ASSERT
   1.564 +    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    1.565 +    assert(!do_size || size == 4, "incorrect size calculation");
   1.566 +#endif
   1.567 +    return size;
   1.568 +#ifndef PRODUCT
   1.569 +  } else if (!do_size) {
   1.570 +    switch (ireg) {
   1.571 +    case Op_VecS:
   1.572 +    case Op_VecD:
   1.573 +    case Op_VecX:
   1.574 +      st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
   1.575 +      break;
   1.576 +    case Op_VecY:
   1.577 +      st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
   1.578 +      break;
   1.579 +    default:
   1.580 +      ShouldNotReachHere();
   1.581 +    }
   1.582 +#endif
   1.583 +  }
   1.584 +  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
   1.585 +  return 4;
   1.586 +}
   1.587 +
   1.588 +static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
   1.589 +                            int stack_offset, int reg, uint ireg, outputStream* st) {
    1.590 +  // In the 64-bit VM the size calculation is complex, so the size is obtained by
    1.591 +  // emitting the instructions into a scratch buffer; do_size is only used in the 32-bit VM.
   1.592 +  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
   1.593 +  if (cbuf) {
   1.594 +    MacroAssembler _masm(cbuf);
   1.595 +    int offset = __ offset();
   1.596 +    if (is_load) {
   1.597 +      switch (ireg) {
   1.598 +      case Op_VecS:
   1.599 +        __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
   1.600 +        break;
   1.601 +      case Op_VecD:
   1.602 +        __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
   1.603 +        break;
   1.604 +      case Op_VecX:
   1.605 +        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
   1.606 +        break;
   1.607 +      case Op_VecY:
   1.608 +        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
   1.609 +        break;
   1.610 +      default:
   1.611 +        ShouldNotReachHere();
   1.612 +      }
   1.613 +    } else { // store
   1.614 +      switch (ireg) {
   1.615 +      case Op_VecS:
   1.616 +        __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
   1.617 +        break;
   1.618 +      case Op_VecD:
   1.619 +        __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
   1.620 +        break;
   1.621 +      case Op_VecX:
   1.622 +        __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
   1.623 +        break;
   1.624 +      case Op_VecY:
   1.625 +        __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
   1.626 +        break;
   1.627 +      default:
   1.628 +        ShouldNotReachHere();
   1.629 +      }
   1.630 +    }
   1.631 +    int size = __ offset() - offset;
   1.632 +#ifdef ASSERT
   1.633 +    int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
   1.634 +    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    1.635 +    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
   1.636 +#endif
   1.637 +    return size;
   1.638 +#ifndef PRODUCT
   1.639 +  } else if (!do_size) {
   1.640 +    if (is_load) {
   1.641 +      switch (ireg) {
   1.642 +      case Op_VecS:
   1.643 +        st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
   1.644 +        break;
   1.645 +      case Op_VecD:
   1.646 +        st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
   1.647 +        break;
   1.648 +       case Op_VecX:
   1.649 +        st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
   1.650 +        break;
   1.651 +      case Op_VecY:
   1.652 +        st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
   1.653 +        break;
   1.654 +      default:
   1.655 +        ShouldNotReachHere();
   1.656 +      }
   1.657 +    } else { // store
   1.658 +      switch (ireg) {
   1.659 +      case Op_VecS:
   1.660 +        st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
   1.661 +        break;
   1.662 +      case Op_VecD:
   1.663 +        st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
   1.664 +        break;
   1.665 +       case Op_VecX:
   1.666 +        st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
   1.667 +        break;
   1.668 +      case Op_VecY:
   1.669 +        st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
   1.670 +        break;
   1.671 +      default:
   1.672 +        ShouldNotReachHere();
   1.673 +      }
   1.674 +    }
   1.675 +#endif
   1.676 +  }
   1.677 +  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
   1.678 +  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
   1.679 +  return 5+offset_size;
   1.680 +}
   1.681 +
   1.682 +static inline jfloat replicate4_imm(int con, int width) {
    1.683 +  // Load a constant of "width" (in bytes) and replicate it to fill 32 bits.
   1.684 +  assert(width == 1 || width == 2, "only byte or short types here");
   1.685 +  int bit_width = width * 8;
   1.686 +  jint val = con;
   1.687 +  val &= (1 << bit_width) - 1;  // mask off sign bits
   1.688 +  while(bit_width < 32) {
   1.689 +    val |= (val << bit_width);
   1.690 +    bit_width <<= 1;
   1.691 +  }
   1.692 +  jfloat fval = *((jfloat*) &val);  // coerce to float type
   1.693 +  return fval;
   1.694 +}
   1.695 +
   1.696 +static inline jdouble replicate8_imm(int con, int width) {
    1.697 +  // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
   1.698 +  assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
   1.699 +  int bit_width = width * 8;
   1.700 +  jlong val = con;
   1.701 +  val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
   1.702 +  while(bit_width < 64) {
   1.703 +    val |= (val << bit_width);
   1.704 +    bit_width <<= 1;
   1.705 +  }
   1.706 +  jdouble dval = *((jdouble*) &val);  // coerce to double type
   1.707 +  return dval;
   1.708 +}
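A worked trace of the replication helpers above (editorial, not part of the patch):

    // replicate4_imm(0x1F, 1)  : bit_width 8,  val 0x1F   -> 0x1F1F     -> 0x1F1F1F1F
    // replicate8_imm(0xABCD, 2): bit_width 16, val 0xABCD -> 0xABCDABCD -> 0xABCDABCDABCDABCD
    // The result is only a bit pattern destined for the constant table; it is returned as
    // jfloat/jdouble solely so it can be handed to $constantaddress(), never used as an FP value.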
   1.709 +
   1.710  #ifndef PRODUCT
   1.711    void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
   1.712      st->print("nop \t# %d bytes pad for loops and calls", _count);
   1.713 @@ -103,6 +799,46 @@
   1.714  
   1.715  %}
   1.716  
   1.717 +
   1.718 +//----------OPERANDS-----------------------------------------------------------
   1.719 +// Operand definitions must precede instruction definitions for correct parsing
   1.720 +// in the ADLC because operands constitute user defined types which are used in
   1.721 +// instruction definitions.
   1.722 +
   1.723 +// Vectors
   1.724 +operand vecS() %{
   1.725 +  constraint(ALLOC_IN_RC(vectors_reg));
   1.726 +  match(VecS);
   1.727 +
   1.728 +  format %{ %}
   1.729 +  interface(REG_INTER);
   1.730 +%}
   1.731 +
   1.732 +operand vecD() %{
   1.733 +  constraint(ALLOC_IN_RC(vectord_reg));
   1.734 +  match(VecD);
   1.735 +
   1.736 +  format %{ %}
   1.737 +  interface(REG_INTER);
   1.738 +%}
   1.739 +
   1.740 +operand vecX() %{
   1.741 +  constraint(ALLOC_IN_RC(vectorx_reg));
   1.742 +  match(VecX);
   1.743 +
   1.744 +  format %{ %}
   1.745 +  interface(REG_INTER);
   1.746 +%}
   1.747 +
   1.748 +operand vecY() %{
   1.749 +  constraint(ALLOC_IN_RC(vectory_reg));
   1.750 +  match(VecY);
   1.751 +
   1.752 +  format %{ %}
   1.753 +  interface(REG_INTER);
   1.754 +%}
   1.755 +
   1.756 +
   1.757  // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
   1.758  
   1.759  // ============================================================================
   1.760 @@ -852,3 +1588,797 @@
   1.761    ins_pipe(pipe_slow);
   1.762  %}
   1.763  
   1.764 +
   1.765 +// ====================VECTOR INSTRUCTIONS=====================================
   1.766 +
   1.767 +// Load vectors (4 bytes long)
   1.768 +instruct loadV4(vecS dst, memory mem) %{
   1.769 +  predicate(n->as_LoadVector()->memory_size() == 4);
   1.770 +  match(Set dst (LoadVector mem));
   1.771 +  ins_cost(125);
   1.772 +  format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
   1.773 +  ins_encode %{
   1.774 +    __ movdl($dst$$XMMRegister, $mem$$Address);
   1.775 +  %}
   1.776 +  ins_pipe( pipe_slow );
   1.777 +%}
   1.778 +
   1.779 +// Load vectors (8 bytes long)
   1.780 +instruct loadV8(vecD dst, memory mem) %{
   1.781 +  predicate(n->as_LoadVector()->memory_size() == 8);
   1.782 +  match(Set dst (LoadVector mem));
   1.783 +  ins_cost(125);
   1.784 +  format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
   1.785 +  ins_encode %{
   1.786 +    __ movq($dst$$XMMRegister, $mem$$Address);
   1.787 +  %}
   1.788 +  ins_pipe( pipe_slow );
   1.789 +%}
   1.790 +
   1.791 +// Load vectors (16 bytes long)
   1.792 +instruct loadV16(vecX dst, memory mem) %{
   1.793 +  predicate(n->as_LoadVector()->memory_size() == 16);
   1.794 +  match(Set dst (LoadVector mem));
   1.795 +  ins_cost(125);
   1.796 +  format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
   1.797 +  ins_encode %{
   1.798 +    __ movdqu($dst$$XMMRegister, $mem$$Address);
   1.799 +  %}
   1.800 +  ins_pipe( pipe_slow );
   1.801 +%}
   1.802 +
   1.803 +// Load vectors (32 bytes long)
   1.804 +instruct loadV32(vecY dst, memory mem) %{
   1.805 +  predicate(n->as_LoadVector()->memory_size() == 32);
   1.806 +  match(Set dst (LoadVector mem));
   1.807 +  ins_cost(125);
   1.808 +  format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
   1.809 +  ins_encode %{
   1.810 +    __ vmovdqu($dst$$XMMRegister, $mem$$Address);
   1.811 +  %}
   1.812 +  ins_pipe( pipe_slow );
   1.813 +%}
   1.814 +
   1.815 +// Store vectors
   1.816 +instruct storeV4(memory mem, vecS src) %{
   1.817 +  predicate(n->as_StoreVector()->memory_size() == 4);
   1.818 +  match(Set mem (StoreVector mem src));
   1.819 +  ins_cost(145);
   1.820 +  format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
   1.821 +  ins_encode %{
   1.822 +    __ movdl($mem$$Address, $src$$XMMRegister);
   1.823 +  %}
   1.824 +  ins_pipe( pipe_slow );
   1.825 +%}
   1.826 +
   1.827 +instruct storeV8(memory mem, vecD src) %{
   1.828 +  predicate(n->as_StoreVector()->memory_size() == 8);
   1.829 +  match(Set mem (StoreVector mem src));
   1.830 +  ins_cost(145);
   1.831 +  format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
   1.832 +  ins_encode %{
   1.833 +    __ movq($mem$$Address, $src$$XMMRegister);
   1.834 +  %}
   1.835 +  ins_pipe( pipe_slow );
   1.836 +%}
   1.837 +
   1.838 +instruct storeV16(memory mem, vecX src) %{
   1.839 +  predicate(n->as_StoreVector()->memory_size() == 16);
   1.840 +  match(Set mem (StoreVector mem src));
   1.841 +  ins_cost(145);
   1.842 +  format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
   1.843 +  ins_encode %{
   1.844 +    __ movdqu($mem$$Address, $src$$XMMRegister);
   1.845 +  %}
   1.846 +  ins_pipe( pipe_slow );
   1.847 +%}
   1.848 +
   1.849 +instruct storeV32(memory mem, vecY src) %{
   1.850 +  predicate(n->as_StoreVector()->memory_size() == 32);
   1.851 +  match(Set mem (StoreVector mem src));
   1.852 +  ins_cost(145);
   1.853 +  format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
   1.854 +  ins_encode %{
   1.855 +    __ vmovdqu($mem$$Address, $src$$XMMRegister);
   1.856 +  %}
   1.857 +  ins_pipe( pipe_slow );
   1.858 +%}
   1.859 +
   1.860 +// Replicate byte scalar to be vector
   1.861 +instruct Repl4B(vecS dst, rRegI src) %{
   1.862 +  predicate(n->as_Vector()->length() == 4);
   1.863 +  match(Set dst (ReplicateB src));
   1.864 +  format %{ "movd    $dst,$src\n\t"
   1.865 +            "punpcklbw $dst,$dst\n\t"
   1.866 +            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
   1.867 +  ins_encode %{
   1.868 +    __ movdl($dst$$XMMRegister, $src$$Register);
   1.869 +    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
   1.870 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
   1.871 +  %}
   1.872 +  ins_pipe( pipe_slow );
   1.873 +%}
   1.874 +
   1.875 +instruct Repl8B(vecD dst, rRegI src) %{
   1.876 +  predicate(n->as_Vector()->length() == 8);
   1.877 +  match(Set dst (ReplicateB src));
   1.878 +  format %{ "movd    $dst,$src\n\t"
   1.879 +            "punpcklbw $dst,$dst\n\t"
   1.880 +            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
   1.881 +  ins_encode %{
   1.882 +    __ movdl($dst$$XMMRegister, $src$$Register);
   1.883 +    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
   1.884 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
   1.885 +  %}
   1.886 +  ins_pipe( pipe_slow );
   1.887 +%}
   1.888 +
   1.889 +instruct Repl16B(vecX dst, rRegI src) %{
   1.890 +  predicate(n->as_Vector()->length() == 16);
   1.891 +  match(Set dst (ReplicateB src));
   1.892 +  format %{ "movd    $dst,$src\n\t"
   1.893 +            "punpcklbw $dst,$dst\n\t"
   1.894 +            "pshuflw $dst,$dst,0x00\n\t"
   1.895 +            "movlhps $dst,$dst\t! replicate16B" %}
   1.896 +  ins_encode %{
   1.897 +    __ movdl($dst$$XMMRegister, $src$$Register);
   1.898 +    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
   1.899 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
   1.900 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
   1.901 +  %}
   1.902 +  ins_pipe( pipe_slow );
   1.903 +%}
   1.904 +
   1.905 +instruct Repl32B(vecY dst, rRegI src) %{
   1.906 +  predicate(n->as_Vector()->length() == 32);
   1.907 +  match(Set dst (ReplicateB src));
   1.908 +  format %{ "movd    $dst,$src\n\t"
   1.909 +            "punpcklbw $dst,$dst\n\t"
   1.910 +            "pshuflw $dst,$dst,0x00\n\t"
   1.911 +            "movlhps $dst,$dst\n\t"
   1.912 +            "vinsertf128h $dst,$dst,$dst\t! replicate32B" %}
   1.913 +  ins_encode %{
   1.914 +    __ movdl($dst$$XMMRegister, $src$$Register);
   1.915 +    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
   1.916 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
   1.917 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
   1.918 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
   1.919 +  %}
   1.920 +  ins_pipe( pipe_slow );
   1.921 +%}
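How the Repl*B sequences above build up the byte broadcast (editorial trace of the
instructions already listed in the formats):

    // movdl        xmm,reg      : 1 copy of the byte in the low dword
    // punpcklbw    xmm,xmm      : 2 copies in the low 16-bit word
    // pshuflw      xmm,xmm,0x00 : low word broadcast -> 8 copies in the low 64 bits
    // movlhps      xmm,xmm      : low 64 bits copied high -> 16 copies (128 bits)
    // vinsertf128h xmm,xmm,xmm  : 128-bit lane duplicated -> 32 copies (256 bits)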
   1.922 +
   1.923 +// Replicate byte scalar immediate to be vector by loading from const table.
   1.924 +instruct Repl4B_imm(vecS dst, immI con) %{
   1.925 +  predicate(n->as_Vector()->length() == 4);
   1.926 +  match(Set dst (ReplicateB con));
   1.927 +  format %{ "movss   $dst,[$constantaddress]\t! replicate4B($con)" %}
   1.928 +  ins_encode %{
   1.929 +    __ movflt($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
   1.930 +  %}
   1.931 +  ins_pipe( pipe_slow );
   1.932 +%}
   1.933 +
   1.934 +instruct Repl8B_imm(vecD dst, immI con) %{
   1.935 +  predicate(n->as_Vector()->length() == 8);
   1.936 +  match(Set dst (ReplicateB con));
   1.937 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate8B($con)" %}
   1.938 +  ins_encode %{
   1.939 +    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
   1.940 +  %}
   1.941 +  ins_pipe( pipe_slow );
   1.942 +%}
   1.943 +
   1.944 +instruct Repl16B_imm(vecX dst, immI con) %{
   1.945 +  predicate(n->as_Vector()->length() == 16);
   1.946 +  match(Set dst (ReplicateB con));
   1.947 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate16B($con)\n\t"
   1.948 +            "movlhps $dst,$dst" %}
   1.949 +  ins_encode %{
   1.950 +    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
   1.951 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
   1.952 +  %}
   1.953 +  ins_pipe( pipe_slow );
   1.954 +%}
   1.955 +
   1.956 +instruct Repl32B_imm(vecY dst, immI con) %{
   1.957 +  predicate(n->as_Vector()->length() == 32);
   1.958 +  match(Set dst (ReplicateB con));
    1.959 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate32B($con)\n\t"
   1.960 +            "movlhps $dst,$dst\n\t"
   1.961 +            "vinsertf128h $dst,$dst,$dst" %}
   1.962 +  ins_encode %{
   1.963 +    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
   1.964 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
   1.965 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
   1.966 +  %}
   1.967 +  ins_pipe( pipe_slow );
   1.968 +%}
   1.969 +
   1.970 +// Replicate byte scalar zero to be vector
   1.971 +instruct Repl4B_zero(vecS dst, immI0 zero) %{
   1.972 +  predicate(n->as_Vector()->length() == 4);
   1.973 +  match(Set dst (ReplicateB zero));
   1.974 +  format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
   1.975 +  ins_encode %{
   1.976 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
   1.977 +  %}
   1.978 +  ins_pipe( fpu_reg_reg );
   1.979 +%}
   1.980 +
   1.981 +instruct Repl8B_zero(vecD dst, immI0 zero) %{
   1.982 +  predicate(n->as_Vector()->length() == 8);
   1.983 +  match(Set dst (ReplicateB zero));
   1.984 +  format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
   1.985 +  ins_encode %{
   1.986 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
   1.987 +  %}
   1.988 +  ins_pipe( fpu_reg_reg );
   1.989 +%}
   1.990 +
   1.991 +instruct Repl16B_zero(vecX dst, immI0 zero) %{
   1.992 +  predicate(n->as_Vector()->length() == 16);
   1.993 +  match(Set dst (ReplicateB zero));
   1.994 +  format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
   1.995 +  ins_encode %{
   1.996 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
   1.997 +  %}
   1.998 +  ins_pipe( fpu_reg_reg );
   1.999 +%}
  1.1000 +
  1.1001 +instruct Repl32B_zero(vecY dst, immI0 zero) %{
  1.1002 +  predicate(n->as_Vector()->length() == 32);
  1.1003 +  match(Set dst (ReplicateB zero));
  1.1004 +  format %{ "vxorpd  $dst,$dst,$dst\t! replicate32B zero" %}
  1.1005 +  ins_encode %{
  1.1006 +    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
  1.1007 +    bool vector256 = true;
  1.1008 +    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.1009 +  %}
  1.1010 +  ins_pipe( fpu_reg_reg );
  1.1011 +%}
  1.1012 +
  1.1013 +// Replicate char/short (2 byte) scalar to be vector
  1.1014 +instruct Repl2S(vecS dst, rRegI src) %{
  1.1015 +  predicate(n->as_Vector()->length() == 2);
  1.1016 +  match(Set dst (ReplicateS src));
  1.1017 +  format %{ "movd    $dst,$src\n\t"
  1.1018 +            "pshuflw $dst,$dst,0x00\t! replicate2S" %}
  1.1019 +  ins_encode %{
  1.1020 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1021 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1022 +  %}
  1.1023 +  ins_pipe( fpu_reg_reg );
  1.1024 +%}
  1.1025 +
  1.1026 +instruct Repl4S(vecD dst, rRegI src) %{
  1.1027 +  predicate(n->as_Vector()->length() == 4);
  1.1028 +  match(Set dst (ReplicateS src));
  1.1029 +  format %{ "movd    $dst,$src\n\t"
  1.1030 +            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
  1.1031 +  ins_encode %{
  1.1032 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1033 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1034 +  %}
  1.1035 +  ins_pipe( fpu_reg_reg );
  1.1036 +%}
  1.1037 +
  1.1038 +instruct Repl8S(vecX dst, rRegI src) %{
  1.1039 +  predicate(n->as_Vector()->length() == 8);
  1.1040 +  match(Set dst (ReplicateS src));
  1.1041 +  format %{ "movd    $dst,$src\n\t"
  1.1042 +            "pshuflw $dst,$dst,0x00\n\t"
  1.1043 +            "movlhps $dst,$dst\t! replicate8S" %}
  1.1044 +  ins_encode %{
  1.1045 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1046 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1047 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1048 +  %}
  1.1049 +  ins_pipe( pipe_slow );
  1.1050 +%}
  1.1051 +
  1.1052 +instruct Repl16S(vecY dst, rRegI src) %{
  1.1053 +  predicate(n->as_Vector()->length() == 16);
  1.1054 +  match(Set dst (ReplicateS src));
  1.1055 +  format %{ "movd    $dst,$src\n\t"
  1.1056 +            "pshuflw $dst,$dst,0x00\n\t"
  1.1057 +            "movlhps $dst,$dst\n\t"
  1.1058 +            "vinsertf128h $dst,$dst,$dst\t! replicate16S" %}
  1.1059 +  ins_encode %{
  1.1060 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1061 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1062 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1063 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1064 +  %}
  1.1065 +  ins_pipe( pipe_slow );
  1.1066 +%}
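For readers tracing the shuffles, here is a minimal host-side C++ model of the Repl16S sequence above. It is an illustration only, not part of the AD file; lane behaviour follows the Intel SDM for movd, pshuflw and movlhps, and it assumes vinsertf128h is the MacroAssembler helper that inserts a 128-bit value into the upper half of a 256-bit register.

    #include <cstdint>
    #include <cstring>

    // Models the Repl16S broadcast: movd, pshuflw 0x00, movlhps, vinsertf128h.
    static void repl16s_model(int16_t src, int16_t ymm[16]) {
      int16_t xmm[8] = {0};
      std::memcpy(&xmm[0], &src, sizeof(src));          // movd: scalar lands in word 0
      for (int i = 1; i < 4; i++) xmm[i] = xmm[0];      // pshuflw 0x00: word 0 -> words 0..3
      for (int i = 0; i < 4; i++) xmm[4 + i] = xmm[i];  // movlhps: low qword -> high qword
      for (int i = 0; i < 8; i++) {                     // vinsertf128h: low 128 bits -> both halves
        ymm[i]     = xmm[i];
        ymm[8 + i] = xmm[i];
      }
    }

After these four steps every 16-bit lane of the 256-bit register holds the original scalar, which is exactly what the replicate16S format string advertises.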
  1.1067 +
   1.1068 +// Replicate char/short (2 byte) scalar immediate to be vector by loading it from the constant table.
  1.1069 +instruct Repl2S_imm(vecS dst, immI con) %{
  1.1070 +  predicate(n->as_Vector()->length() == 2);
  1.1071 +  match(Set dst (ReplicateS con));
  1.1072 +  format %{ "movss   $dst,[$constantaddress]\t! replicate2S($con)" %}
  1.1073 +  ins_encode %{
  1.1074 +    __ movflt($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
  1.1075 +  %}
  1.1076 +  ins_pipe( fpu_reg_reg );
  1.1077 +%}
  1.1078 +
  1.1079 +instruct Repl4S_imm(vecD dst, immI con) %{
  1.1080 +  predicate(n->as_Vector()->length() == 4);
  1.1081 +  match(Set dst (ReplicateS con));
  1.1082 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate4S($con)" %}
  1.1083 +  ins_encode %{
  1.1084 +    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
  1.1085 +  %}
  1.1086 +  ins_pipe( fpu_reg_reg );
  1.1087 +%}
  1.1088 +
  1.1089 +instruct Repl8S_imm(vecX dst, immI con) %{
  1.1090 +  predicate(n->as_Vector()->length() == 8);
  1.1091 +  match(Set dst (ReplicateS con));
  1.1092 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate8S($con)\n\t"
  1.1093 +            "movlhps $dst,$dst" %}
  1.1094 +  ins_encode %{
  1.1095 +    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
  1.1096 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1097 +  %}
  1.1098 +  ins_pipe( pipe_slow );
  1.1099 +%}
  1.1100 +
  1.1101 +instruct Repl16S_imm(vecY dst, immI con) %{
  1.1102 +  predicate(n->as_Vector()->length() == 16);
  1.1103 +  match(Set dst (ReplicateS con));
  1.1104 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate16S($con)\n\t"
  1.1105 +            "movlhps $dst,$dst\n\t"
  1.1106 +            "vinsertf128h $dst,$dst,$dst" %}
  1.1107 +  ins_encode %{
  1.1108 +    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
  1.1109 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1110 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1111 +  %}
  1.1112 +  ins_pipe( pipe_slow );
  1.1113 +%}
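The replicate4_imm/replicate8_imm helpers referenced by these *_imm rules are defined elsewhere in this file. A hedged sketch of what they are assumed to compute: keep the low `width` bytes of the immediate and double the pattern up until it fills 32 bits (replicate4_imm) or 64 bits (replicate8_imm), producing the value that is emitted into the constant table and loaded with movflt/movdbl.

    // Sketch only: assumed behaviour of replicate8_imm(con, width).
    static inline int64_t replicate8_imm_sketch(int con, int width) {
      int bit_width = width * 8;                                // bits in the scalar
      int64_t val = con & (((int64_t)1 << bit_width) - 1);      // drop sign extension
      while (bit_width < 64) {                                  // double up to 64 bits
        val |= (val << bit_width);
        bit_width <<= 1;
      }
      return val;
    }
    // e.g. replicate8_imm_sketch(0x1234, 2) == 0x1234123412341234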
  1.1114 +
  1.1115 +// Replicate char/short (2 byte) scalar zero to be vector
  1.1116 +instruct Repl2S_zero(vecS dst, immI0 zero) %{
  1.1117 +  predicate(n->as_Vector()->length() == 2);
  1.1118 +  match(Set dst (ReplicateS zero));
  1.1119 +  format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
  1.1120 +  ins_encode %{
  1.1121 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.1122 +  %}
  1.1123 +  ins_pipe( fpu_reg_reg );
  1.1124 +%}
  1.1125 +
  1.1126 +instruct Repl4S_zero(vecD dst, immI0 zero) %{
  1.1127 +  predicate(n->as_Vector()->length() == 4);
  1.1128 +  match(Set dst (ReplicateS zero));
  1.1129 +  format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
  1.1130 +  ins_encode %{
  1.1131 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.1132 +  %}
  1.1133 +  ins_pipe( fpu_reg_reg );
  1.1134 +%}
  1.1135 +
  1.1136 +instruct Repl8S_zero(vecX dst, immI0 zero) %{
  1.1137 +  predicate(n->as_Vector()->length() == 8);
  1.1138 +  match(Set dst (ReplicateS zero));
  1.1139 +  format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
  1.1140 +  ins_encode %{
  1.1141 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.1142 +  %}
  1.1143 +  ins_pipe( fpu_reg_reg );
  1.1144 +%}
  1.1145 +
  1.1146 +instruct Repl16S_zero(vecY dst, immI0 zero) %{
  1.1147 +  predicate(n->as_Vector()->length() == 16);
  1.1148 +  match(Set dst (ReplicateS zero));
  1.1149 +  format %{ "vxorpd  $dst,$dst,$dst\t! replicate16S zero" %}
  1.1150 +  ins_encode %{
  1.1151 +    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
  1.1152 +    bool vector256 = true;
  1.1153 +    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.1154 +  %}
  1.1155 +  ins_pipe( fpu_reg_reg );
  1.1156 +%}
  1.1157 +
  1.1158 +// Replicate integer (4 byte) scalar to be vector
  1.1159 +instruct Repl2I(vecD dst, rRegI src) %{
  1.1160 +  predicate(n->as_Vector()->length() == 2);
  1.1161 +  match(Set dst (ReplicateI src));
  1.1162 +  format %{ "movd    $dst,$src\n\t"
  1.1163 +            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
  1.1164 +  ins_encode %{
  1.1165 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1166 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1167 +  %}
  1.1168 +  ins_pipe( fpu_reg_reg );
  1.1169 +%}
  1.1170 +
  1.1171 +instruct Repl4I(vecX dst, rRegI src) %{
  1.1172 +  predicate(n->as_Vector()->length() == 4);
  1.1173 +  match(Set dst (ReplicateI src));
  1.1174 +  format %{ "movd    $dst,$src\n\t"
  1.1175 +            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
  1.1176 +  ins_encode %{
  1.1177 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1178 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1179 +  %}
  1.1180 +  ins_pipe( pipe_slow );
  1.1181 +%}
  1.1182 +
  1.1183 +instruct Repl8I(vecY dst, rRegI src) %{
  1.1184 +  predicate(n->as_Vector()->length() == 8);
  1.1185 +  match(Set dst (ReplicateI src));
  1.1186 +  format %{ "movd    $dst,$src\n\t"
  1.1187 +            "pshufd  $dst,$dst,0x00\n\t"
  1.1188 +            "vinsertf128h $dst,$dst,$dst\t! replicate8I" %}
  1.1189 +  ins_encode %{
  1.1190 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1191 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1192 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1193 +  %}
  1.1194 +  ins_pipe( pipe_slow );
  1.1195 +%}
  1.1196 +
   1.1197 +// Replicate integer (4 byte) scalar immediate to be vector by loading it from the constant table.
  1.1198 +instruct Repl2I_imm(vecD dst, immI con) %{
  1.1199 +  predicate(n->as_Vector()->length() == 2);
  1.1200 +  match(Set dst (ReplicateI con));
  1.1201 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate2I($con)" %}
  1.1202 +  ins_encode %{
  1.1203 +    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
  1.1204 +  %}
  1.1205 +  ins_pipe( fpu_reg_reg );
  1.1206 +%}
  1.1207 +
  1.1208 +instruct Repl4I_imm(vecX dst, immI con) %{
  1.1209 +  predicate(n->as_Vector()->length() == 4);
  1.1210 +  match(Set dst (ReplicateI con));
  1.1211 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate4I($con)\n\t"
  1.1212 +            "movlhps $dst,$dst" %}
  1.1213 +  ins_encode %{
  1.1214 +    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
  1.1215 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1216 +  %}
  1.1217 +  ins_pipe( pipe_slow );
  1.1218 +%}
  1.1219 +
  1.1220 +instruct Repl8I_imm(vecY dst, immI con) %{
  1.1221 +  predicate(n->as_Vector()->length() == 8);
  1.1222 +  match(Set dst (ReplicateI con));
  1.1223 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate8I($con)\n\t"
  1.1224 +            "movlhps $dst,$dst\n\t"
  1.1225 +            "vinsertf128h $dst,$dst,$dst" %}
  1.1226 +  ins_encode %{
  1.1227 +    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
  1.1228 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1229 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1230 +  %}
  1.1231 +  ins_pipe( pipe_slow );
  1.1232 +%}
  1.1233 +
   1.1234 +// An integer can be loaded into an xmm register directly from memory.
  1.1235 +instruct Repl2I_mem(vecD dst, memory mem) %{
  1.1236 +  predicate(n->as_Vector()->length() == 2);
  1.1237 +  match(Set dst (ReplicateI mem));
  1.1238 +  format %{ "movd    $dst,$mem\n\t"
  1.1239 +            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
  1.1240 +  ins_encode %{
  1.1241 +    __ movdl($dst$$XMMRegister, $mem$$Address);
  1.1242 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1243 +  %}
  1.1244 +  ins_pipe( fpu_reg_reg );
  1.1245 +%}
  1.1246 +
  1.1247 +instruct Repl4I_mem(vecX dst, memory mem) %{
  1.1248 +  predicate(n->as_Vector()->length() == 4);
  1.1249 +  match(Set dst (ReplicateI mem));
  1.1250 +  format %{ "movd    $dst,$mem\n\t"
  1.1251 +            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
  1.1252 +  ins_encode %{
  1.1253 +    __ movdl($dst$$XMMRegister, $mem$$Address);
  1.1254 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1255 +  %}
  1.1256 +  ins_pipe( pipe_slow );
  1.1257 +%}
  1.1258 +
  1.1259 +instruct Repl8I_mem(vecY dst, memory mem) %{
  1.1260 +  predicate(n->as_Vector()->length() == 8);
  1.1261 +  match(Set dst (ReplicateI mem));
  1.1262 +  format %{ "movd    $dst,$mem\n\t"
  1.1263 +            "pshufd  $dst,$dst,0x00\n\t"
  1.1264 +            "vinsertf128h $dst,$dst,$dst\t! replicate8I" %}
  1.1265 +  ins_encode %{
  1.1266 +    __ movdl($dst$$XMMRegister, $mem$$Address);
  1.1267 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1268 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1269 +  %}
  1.1270 +  ins_pipe( pipe_slow );
  1.1271 +%}
  1.1272 +
  1.1273 +// Replicate integer (4 byte) scalar zero to be vector
  1.1274 +instruct Repl2I_zero(vecD dst, immI0 zero) %{
  1.1275 +  predicate(n->as_Vector()->length() == 2);
  1.1276 +  match(Set dst (ReplicateI zero));
   1.1277 +  format %{ "pxor    $dst,$dst\t! replicate2I zero" %}
  1.1278 +  ins_encode %{
  1.1279 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.1280 +  %}
  1.1281 +  ins_pipe( fpu_reg_reg );
  1.1282 +%}
  1.1283 +
  1.1284 +instruct Repl4I_zero(vecX dst, immI0 zero) %{
  1.1285 +  predicate(n->as_Vector()->length() == 4);
  1.1286 +  match(Set dst (ReplicateI zero));
   1.1287 +  format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
  1.1288 +  ins_encode %{
  1.1289 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.1290 +  %}
  1.1291 +  ins_pipe( fpu_reg_reg );
  1.1292 +%}
  1.1293 +
  1.1294 +instruct Repl8I_zero(vecY dst, immI0 zero) %{
  1.1295 +  predicate(n->as_Vector()->length() == 8);
  1.1296 +  match(Set dst (ReplicateI zero));
  1.1297 +  format %{ "vxorpd  $dst,$dst,$dst\t! replicate8I zero" %}
  1.1298 +  ins_encode %{
  1.1299 +    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
  1.1300 +    bool vector256 = true;
  1.1301 +    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.1302 +  %}
  1.1303 +  ins_pipe( fpu_reg_reg );
  1.1304 +%}
  1.1305 +
  1.1306 +// Replicate long (8 byte) scalar to be vector
  1.1307 +#ifdef _LP64
  1.1308 +instruct Repl2L(vecX dst, rRegL src) %{
  1.1309 +  predicate(n->as_Vector()->length() == 2);
  1.1310 +  match(Set dst (ReplicateL src));
  1.1311 +  format %{ "movdq   $dst,$src\n\t"
  1.1312 +            "movlhps $dst,$dst\t! replicate2L" %}
  1.1313 +  ins_encode %{
  1.1314 +    __ movdq($dst$$XMMRegister, $src$$Register);
  1.1315 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1316 +  %}
  1.1317 +  ins_pipe( pipe_slow );
  1.1318 +%}
  1.1319 +
  1.1320 +instruct Repl4L(vecY dst, rRegL src) %{
  1.1321 +  predicate(n->as_Vector()->length() == 4);
  1.1322 +  match(Set dst (ReplicateL src));
  1.1323 +  format %{ "movdq   $dst,$src\n\t"
  1.1324 +            "movlhps $dst,$dst\n\t"
  1.1325 +            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
  1.1326 +  ins_encode %{
  1.1327 +    __ movdq($dst$$XMMRegister, $src$$Register);
  1.1328 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1329 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1330 +  %}
  1.1331 +  ins_pipe( pipe_slow );
  1.1332 +%}
  1.1333 +#else // _LP64
  1.1334 +instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
  1.1335 +  predicate(n->as_Vector()->length() == 2);
  1.1336 +  match(Set dst (ReplicateL src));
  1.1337 +  effect(TEMP dst, USE src, TEMP tmp);
  1.1338 +  format %{ "movdl   $dst,$src.lo\n\t"
  1.1339 +            "movdl   $tmp,$src.hi\n\t"
  1.1340 +            "punpckldq $dst,$tmp\n\t"
  1.1341 +            "movlhps $dst,$dst\t! replicate2L"%}
  1.1342 +  ins_encode %{
  1.1343 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1344 +    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
  1.1345 +    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
  1.1346 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1347 +  %}
  1.1348 +  ins_pipe( pipe_slow );
  1.1349 +%}
  1.1350 +
  1.1351 +instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
  1.1352 +  predicate(n->as_Vector()->length() == 4);
  1.1353 +  match(Set dst (ReplicateL src));
  1.1354 +  effect(TEMP dst, USE src, TEMP tmp);
  1.1355 +  format %{ "movdl   $dst,$src.lo\n\t"
  1.1356 +            "movdl   $tmp,$src.hi\n\t"
  1.1357 +            "punpckldq $dst,$tmp\n\t"
  1.1358 +            "movlhps $dst,$dst\n\t"
  1.1359 +            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
  1.1360 +  ins_encode %{
  1.1361 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1362 +    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
  1.1363 +    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
  1.1364 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1365 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1366 +  %}
  1.1367 +  ins_pipe( pipe_slow );
  1.1368 +%}
  1.1369 +#endif // _LP64
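On the 32-bit (non-LP64) path the long arrives as an eRegL register pair, so its halves have to be reassembled inside the XMM register before the usual broadcast. A minimal host-side model of the movdl/movdl/punpckldq/movlhps sequence (illustration only; HIGH_FROM_LOW is the existing macro naming the high half of the pair):

    #include <cstdint>

    // Models the 32-bit Repl2L sequence: the lo/hi halves are moved into two
    // registers, punpckldq interleaves their low dwords into one 64-bit lane,
    // and movlhps duplicates that lane.
    static void repl2l_model_32bit(uint32_t lo, uint32_t hi, uint64_t dst[2]) {
      uint32_t dst_dw[4] = { lo, 0, 0, 0 };   // movdl   dst, src.lo
      uint32_t tmp_dw[4] = { hi, 0, 0, 0 };   // movdl   tmp, src.hi
      dst_dw[1] = tmp_dw[0];                  // punpckldq dst, tmp -> dwords {lo, hi, 0, 0}
      uint64_t q = (uint64_t)dst_dw[0] | ((uint64_t)dst_dw[1] << 32);
      dst[0] = q;                             // low lane now holds the original long
      dst[1] = q;                             // movlhps dst, dst duplicates it
    }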
  1.1370 +
   1.1371 +// Replicate long (8 byte) scalar immediate to be vector by loading it from the constant table.
  1.1372 +instruct Repl2L_imm(vecX dst, immL con) %{
  1.1373 +  predicate(n->as_Vector()->length() == 2);
  1.1374 +  match(Set dst (ReplicateL con));
  1.1375 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate2L($con)\n\t"
  1.1376 +            "movlhps $dst,$dst" %}
  1.1377 +  ins_encode %{
  1.1378 +    __ movdbl($dst$$XMMRegister, $constantaddress($con));
  1.1379 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1380 +  %}
  1.1381 +  ins_pipe( pipe_slow );
  1.1382 +%}
  1.1383 +
  1.1384 +instruct Repl4L_imm(vecY dst, immL con) %{
  1.1385 +  predicate(n->as_Vector()->length() == 4);
  1.1386 +  match(Set dst (ReplicateL con));
  1.1387 +  format %{ "movsd   $dst,[$constantaddress]\t! replicate4L($con)\n\t"
  1.1388 +            "movlhps $dst,$dst\n\t"
  1.1389 +            "vinsertf128h $dst,$dst,$dst" %}
  1.1390 +  ins_encode %{
  1.1391 +    __ movdbl($dst$$XMMRegister, $constantaddress($con));
  1.1392 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1393 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1394 +  %}
  1.1395 +  ins_pipe( pipe_slow );
  1.1396 +%}
  1.1397 +
   1.1398 +// A long can be loaded into an xmm register directly from memory.
  1.1399 +instruct Repl2L_mem(vecX dst, memory mem) %{
  1.1400 +  predicate(n->as_Vector()->length() == 2);
  1.1401 +  match(Set dst (ReplicateL mem));
  1.1402 +  format %{ "movq    $dst,$mem\n\t"
  1.1403 +            "movlhps $dst,$dst\t! replicate2L" %}
  1.1404 +  ins_encode %{
  1.1405 +    __ movq($dst$$XMMRegister, $mem$$Address);
  1.1406 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1407 +  %}
  1.1408 +  ins_pipe( pipe_slow );
  1.1409 +%}
  1.1410 +
  1.1411 +instruct Repl4L_mem(vecY dst, memory mem) %{
  1.1412 +  predicate(n->as_Vector()->length() == 4);
  1.1413 +  match(Set dst (ReplicateL mem));
  1.1414 +  format %{ "movq    $dst,$mem\n\t"
  1.1415 +            "movlhps $dst,$dst\n\t"
  1.1416 +            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
  1.1417 +  ins_encode %{
  1.1418 +    __ movq($dst$$XMMRegister, $mem$$Address);
  1.1419 +    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1420 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1421 +  %}
  1.1422 +  ins_pipe( pipe_slow );
  1.1423 +%}
  1.1424 +
  1.1425 +// Replicate long (8 byte) scalar zero to be vector
  1.1426 +instruct Repl2L_zero(vecX dst, immL0 zero) %{
  1.1427 +  predicate(n->as_Vector()->length() == 2);
  1.1428 +  match(Set dst (ReplicateL zero));
  1.1429 +  format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
  1.1430 +  ins_encode %{
  1.1431 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.1432 +  %}
  1.1433 +  ins_pipe( fpu_reg_reg );
  1.1434 +%}
  1.1435 +
  1.1436 +instruct Repl4L_zero(vecY dst, immL0 zero) %{
  1.1437 +  predicate(n->as_Vector()->length() == 4);
  1.1438 +  match(Set dst (ReplicateL zero));
  1.1439 +  format %{ "vxorpd  $dst,$dst,$dst\t! replicate4L zero" %}
  1.1440 +  ins_encode %{
  1.1441 +    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
  1.1442 +    bool vector256 = true;
  1.1443 +    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.1444 +  %}
  1.1445 +  ins_pipe( fpu_reg_reg );
  1.1446 +%}
  1.1447 +
  1.1448 +// Replicate float (4 byte) scalar to be vector
  1.1449 +instruct Repl2F(vecD dst, regF src) %{
  1.1450 +  predicate(n->as_Vector()->length() == 2);
  1.1451 +  match(Set dst (ReplicateF src));
   1.1452 +  format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
  1.1453 +  ins_encode %{
  1.1454 +    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  1.1455 +  %}
  1.1456 +  ins_pipe( fpu_reg_reg );
  1.1457 +%}
  1.1458 +
  1.1459 +instruct Repl4F(vecX dst, regF src) %{
  1.1460 +  predicate(n->as_Vector()->length() == 4);
  1.1461 +  match(Set dst (ReplicateF src));
   1.1462 +  format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
  1.1463 +  ins_encode %{
  1.1464 +    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  1.1465 +  %}
  1.1466 +  ins_pipe( pipe_slow );
  1.1467 +%}
  1.1468 +
  1.1469 +instruct Repl8F(vecY dst, regF src) %{
  1.1470 +  predicate(n->as_Vector()->length() == 8);
  1.1471 +  match(Set dst (ReplicateF src));
  1.1472 +  format %{ "pshufd  $dst,$src,0x00\n\t"
  1.1473 +            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
  1.1474 +  ins_encode %{
  1.1475 +    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  1.1476 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1477 +  %}
  1.1478 +  ins_pipe( pipe_slow );
  1.1479 +%}
  1.1480 +
  1.1481 +// Replicate float (4 byte) scalar zero to be vector
  1.1482 +instruct Repl2F_zero(vecD dst, immF0 zero) %{
  1.1483 +  predicate(n->as_Vector()->length() == 2);
  1.1484 +  match(Set dst (ReplicateF zero));
  1.1485 +  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
  1.1486 +  ins_encode %{
  1.1487 +    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1488 +  %}
  1.1489 +  ins_pipe( fpu_reg_reg );
  1.1490 +%}
  1.1491 +
  1.1492 +instruct Repl4F_zero(vecX dst, immF0 zero) %{
  1.1493 +  predicate(n->as_Vector()->length() == 4);
  1.1494 +  match(Set dst (ReplicateF zero));
  1.1495 +  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
  1.1496 +  ins_encode %{
  1.1497 +    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
  1.1498 +  %}
  1.1499 +  ins_pipe( fpu_reg_reg );
  1.1500 +%}
  1.1501 +
  1.1502 +instruct Repl8F_zero(vecY dst, immF0 zero) %{
  1.1503 +  predicate(n->as_Vector()->length() == 8);
  1.1504 +  match(Set dst (ReplicateF zero));
  1.1505 +  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
  1.1506 +  ins_encode %{
  1.1507 +    bool vector256 = true;
  1.1508 +    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.1509 +  %}
  1.1510 +  ins_pipe( fpu_reg_reg );
  1.1511 +%}
  1.1512 +
  1.1513 +// Replicate double (8 bytes) scalar to be vector
  1.1514 +instruct Repl2D(vecX dst, regD src) %{
  1.1515 +  predicate(n->as_Vector()->length() == 2);
  1.1516 +  match(Set dst (ReplicateD src));
  1.1517 +  format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
  1.1518 +  ins_encode %{
  1.1519 +    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
  1.1520 +  %}
  1.1521 +  ins_pipe( pipe_slow );
  1.1522 +%}
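The 0x44 selector deserves a word: pshufd's immediate is four 2-bit fields, each picking a source dword for the corresponding destination dword. A small model of that selection (semantics per the Intel SDM; illustration only, not part of the AD file):

    #include <cstdint>

    // dst dword i = src dword selected by bits [2i+1:2i] of imm8.
    static void pshufd_model(const uint32_t src[4], uint32_t dst[4], uint8_t imm8) {
      for (int i = 0; i < 4; i++)
        dst[i] = src[(imm8 >> (2 * i)) & 3];
    }
    // 0x44 = 0b01000100 selects dwords {0,1,0,1}, so both 64-bit halves of
    // $dst end up holding the double from the low lane of $src. The 0x00
    // immediate used by the integer and float replicates above is the
    // degenerate case that broadcasts dword 0 everywhere.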
  1.1523 +
  1.1524 +instruct Repl4D(vecY dst, regD src) %{
  1.1525 +  predicate(n->as_Vector()->length() == 4);
  1.1526 +  match(Set dst (ReplicateD src));
  1.1527 +  format %{ "pshufd  $dst,$src,0x44\n\t"
  1.1528 +            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
  1.1529 +  ins_encode %{
  1.1530 +    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
  1.1531 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1532 +  %}
  1.1533 +  ins_pipe( pipe_slow );
  1.1534 +%}
  1.1535 +
  1.1536 +// Replicate double (8 byte) scalar zero to be vector
  1.1537 +instruct Repl2D_zero(vecX dst, immD0 zero) %{
  1.1538 +  predicate(n->as_Vector()->length() == 2);
  1.1539 +  match(Set dst (ReplicateD zero));
  1.1540 +  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
  1.1541 +  ins_encode %{
  1.1542 +    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
  1.1543 +  %}
  1.1544 +  ins_pipe( fpu_reg_reg );
  1.1545 +%}
  1.1546 +
  1.1547 +instruct Repl4D_zero(vecY dst, immD0 zero) %{
  1.1548 +  predicate(n->as_Vector()->length() == 4);
  1.1549 +  match(Set dst (ReplicateD zero));
   1.1550 +  format %{ "vxorpd  $dst,$dst,$dst\t! replicate4D zero" %}
  1.1551 +  ins_encode %{
  1.1552 +    bool vector256 = true;
  1.1553 +    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.1554 +  %}
  1.1555 +  ins_pipe( fpu_reg_reg );
  1.1556 +%}
  1.1557 +
