src/cpu/x86/vm/x86.ad

changeset 0:f90c822e73f8
child 6876:710a3c8b516e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/cpu/x86/vm/x86.ad	Wed Apr 27 01:25:04 2016 +0800
     1.3 @@ -0,0 +1,5108 @@
     1.4 +//
     1.5 +// Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
     1.6 +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.7 +//
     1.8 +// This code is free software; you can redistribute it and/or modify it
     1.9 +// under the terms of the GNU General Public License version 2 only, as
    1.10 +// published by the Free Software Foundation.
    1.11 +//
    1.12 +// This code is distributed in the hope that it will be useful, but WITHOUT
    1.13 +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    1.14 +// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    1.15 +// version 2 for more details (a copy is included in the LICENSE file that
    1.16 +// accompanied this code).
    1.17 +//
    1.18 +// You should have received a copy of the GNU General Public License version
    1.19 +// 2 along with this work; if not, write to the Free Software Foundation,
    1.20 +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    1.21 +//
    1.22 +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    1.23 +// or visit www.oracle.com if you need additional information or have any
    1.24 +// questions.
    1.25 +//
    1.26 +//
    1.27 +
    1.28 +// X86 Common Architecture Description File
    1.29 +
    1.30 +//----------REGISTER DEFINITION BLOCK------------------------------------------
    1.31 +// This information is used by the matcher and the register allocator to
    1.32 +// describe individual registers and classes of registers within the target
     1.33 +// architecture.
    1.34 +
    1.35 +register %{
    1.36 +//----------Architecture Description Register Definitions----------------------
    1.37 +// General Registers
    1.38 +// "reg_def"  name ( register save type, C convention save type,
    1.39 +//                   ideal register type, encoding );
    1.40 +// Register Save Types:
    1.41 +//
    1.42 +// NS  = No-Save:       The register allocator assumes that these registers
    1.43 +//                      can be used without saving upon entry to the method, &
    1.44 +//                      that they do not need to be saved at call sites.
    1.45 +//
    1.46 +// SOC = Save-On-Call:  The register allocator assumes that these registers
    1.47 +//                      can be used without saving upon entry to the method,
    1.48 +//                      but that they must be saved at call sites.
    1.49 +//
    1.50 +// SOE = Save-On-Entry: The register allocator assumes that these registers
    1.51 +//                      must be saved before using them upon entry to the
    1.52 +//                      method, but they do not need to be saved at call
    1.53 +//                      sites.
    1.54 +//
    1.55 +// AS  = Always-Save:   The register allocator assumes that these registers
    1.56 +//                      must be saved before using them upon entry to the
    1.57 +//                      method, & that they must be saved at call sites.
    1.58 +//
    1.59 +// Ideal Register Type is used to determine how to save & restore a
    1.60 +// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
    1.61 +// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
    1.62 +//
    1.63 +// The encoding number is the actual bit-pattern placed into the opcodes.
    1.64 +
     1.65 +// XMM registers.  256-bit registers of 8 words each, labeled (a)-h.
    1.66 +// Word a in each register holds a Float, words ab hold a Double.
    1.67 +// The whole registers are used in SSE4.2 version intrinsics,
    1.68 +// array copy stubs and superword operations (see UseSSE42Intrinsics,
    1.69 +// UseXMMForArrayCopy and UseSuperword flags).
    1.70 +// XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
    1.71 +// Linux ABI:   No register preserved across function calls
    1.72 +//              XMM0-XMM7 might hold parameters
    1.73 +// Windows ABI: XMM6-XMM15 preserved across function calls
    1.74 +//              XMM0-XMM3 might hold parameters
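// For illustration, a reading guide for the definitions below: the entry
// reg_def XMM0 (SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()) declares the first
// word of XMM0 as Save-On-Call for both the allocator and the C convention,
// with ideal type Op_RegF, opcode encoding 0, and the corresponding VMReg;
// XMM0b..XMM0h name the remaining words of the same 256-bit register via
// ->next(1) .. ->next(7).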
    1.75 +
    1.76 +reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
    1.77 +reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
    1.78 +reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
    1.79 +reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
    1.80 +reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
    1.81 +reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
    1.82 +reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
    1.83 +reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
    1.84 +
    1.85 +reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
    1.86 +reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
    1.87 +reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
    1.88 +reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
    1.89 +reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
    1.90 +reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
    1.91 +reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
    1.92 +reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
    1.93 +
    1.94 +reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
    1.95 +reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
    1.96 +reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
    1.97 +reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
    1.98 +reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
    1.99 +reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
   1.100 +reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
   1.101 +reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
   1.102 +
   1.103 +reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
   1.104 +reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
   1.105 +reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
   1.106 +reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
   1.107 +reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
   1.108 +reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
   1.109 +reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
   1.110 +reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
   1.111 +
   1.112 +reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
   1.113 +reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
   1.114 +reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
   1.115 +reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
   1.116 +reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
   1.117 +reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
   1.118 +reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
   1.119 +reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
   1.120 +
   1.121 +reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
   1.122 +reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
   1.123 +reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
   1.124 +reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
   1.125 +reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
   1.126 +reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
   1.127 +reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
   1.128 +reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
   1.129 +
   1.130 +#ifdef _WIN64
   1.131 +
   1.132 +reg_def XMM6 ( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg());
   1.133 +reg_def XMM6b( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(1));
   1.134 +reg_def XMM6c( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(2));
   1.135 +reg_def XMM6d( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(3));
   1.136 +reg_def XMM6e( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(4));
   1.137 +reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(5));
   1.138 +reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(6));
   1.139 +reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(7));
   1.140 +
   1.141 +reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
   1.142 +reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(1));
   1.143 +reg_def XMM7c( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(2));
   1.144 +reg_def XMM7d( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(3));
   1.145 +reg_def XMM7e( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(4));
   1.146 +reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(5));
   1.147 +reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(6));
   1.148 +reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(7));
   1.149 +
   1.150 +reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
   1.151 +reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(1));
   1.152 +reg_def XMM8c( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(2));
   1.153 +reg_def XMM8d( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(3));
   1.154 +reg_def XMM8e( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(4));
   1.155 +reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(5));
   1.156 +reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(6));
   1.157 +reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(7));
   1.158 +
   1.159 +reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
   1.160 +reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(1));
   1.161 +reg_def XMM9c( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(2));
   1.162 +reg_def XMM9d( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(3));
   1.163 +reg_def XMM9e( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(4));
   1.164 +reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(5));
   1.165 +reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(6));
   1.166 +reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(7));
   1.167 +
   1.168 +reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
   1.169 +reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(1));
   1.170 +reg_def XMM10c( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(2));
   1.171 +reg_def XMM10d( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(3));
   1.172 +reg_def XMM10e( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(4));
   1.173 +reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(5));
   1.174 +reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(6));
   1.175 +reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(7));
   1.176 +
   1.177 +reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
   1.178 +reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(1));
   1.179 +reg_def XMM11c( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(2));
   1.180 +reg_def XMM11d( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(3));
   1.181 +reg_def XMM11e( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(4));
   1.182 +reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(5));
   1.183 +reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(6));
   1.184 +reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(7));
   1.185 +
   1.186 +reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
   1.187 +reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(1));
   1.188 +reg_def XMM12c( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(2));
   1.189 +reg_def XMM12d( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(3));
   1.190 +reg_def XMM12e( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(4));
   1.191 +reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(5));
   1.192 +reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(6));
   1.193 +reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(7));
   1.194 +
   1.195 +reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
   1.196 +reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(1));
   1.197 +reg_def XMM13c( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(2));
   1.198 +reg_def XMM13d( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(3));
   1.199 +reg_def XMM13e( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(4));
   1.200 +reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(5));
   1.201 +reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(6));
   1.202 +reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(7));
   1.203 +
   1.204 +reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
   1.205 +reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(1));
   1.206 +reg_def XMM14c( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(2));
   1.207 +reg_def XMM14d( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(3));
   1.208 +reg_def XMM14e( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(4));
   1.209 +reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(5));
   1.210 +reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(6));
   1.211 +reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(7));
   1.212 +
   1.213 +reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
   1.214 +reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(1));
   1.215 +reg_def XMM15c( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(2));
   1.216 +reg_def XMM15d( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(3));
   1.217 +reg_def XMM15e( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(4));
   1.218 +reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(5));
   1.219 +reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(6));
   1.220 +reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(7));
   1.221 +
   1.222 +#else // _WIN64
   1.223 +
   1.224 +reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
   1.225 +reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
   1.226 +reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
   1.227 +reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
   1.228 +reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
   1.229 +reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
   1.230 +reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
   1.231 +reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
   1.232 +
   1.233 +reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
   1.234 +reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
   1.235 +reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
   1.236 +reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
   1.237 +reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
   1.238 +reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
   1.239 +reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
   1.240 +reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
   1.241 +
   1.242 +#ifdef _LP64
   1.243 +
   1.244 +reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
   1.245 +reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
   1.246 +reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
   1.247 +reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
   1.248 +reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
   1.249 +reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
   1.250 +reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
   1.251 +reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
   1.252 +
   1.253 +reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
   1.254 +reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
   1.255 +reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
   1.256 +reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
   1.257 +reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
   1.258 +reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
   1.259 +reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
   1.260 +reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
   1.261 +
   1.262 +reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
   1.263 +reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
   1.264 +reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
   1.265 +reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
   1.266 +reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
   1.267 +reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
   1.268 +reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
   1.269 +reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
   1.270 +
   1.271 +reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
   1.272 +reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
   1.273 +reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
   1.274 +reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
   1.275 +reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
   1.276 +reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
   1.277 +reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
   1.278 +reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
   1.279 +
   1.280 +reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
   1.281 +reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
   1.282 +reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
   1.283 +reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
   1.284 +reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
   1.285 +reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
   1.286 +reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
   1.287 +reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
   1.288 +
   1.289 +reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
   1.290 +reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
   1.291 +reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
   1.292 +reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
   1.293 +reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
   1.294 +reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
   1.295 +reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
   1.296 +reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
   1.297 +
   1.298 +reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
   1.299 +reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
   1.300 +reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
   1.301 +reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
   1.302 +reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
   1.303 +reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
   1.304 +reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
   1.305 +reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
   1.306 +
   1.307 +reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
   1.308 +reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
   1.309 +reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
   1.310 +reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
   1.311 +reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
   1.312 +reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
   1.313 +reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
   1.314 +reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
   1.315 +
   1.316 +#endif // _LP64
   1.317 +
   1.318 +#endif // _WIN64
   1.319 +
   1.320 +#ifdef _LP64
   1.321 +reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
   1.322 +#else
   1.323 +reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
   1.324 +#endif // _LP64
   1.325 +
   1.326 +alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
   1.327 +                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
   1.328 +                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
   1.329 +                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
   1.330 +                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
   1.331 +                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
   1.332 +                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
   1.333 +                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
   1.334 +#ifdef _LP64
   1.335 +                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
   1.336 +                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
   1.337 +                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
   1.338 +                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
   1.339 +                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
   1.340 +                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
   1.341 +                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
   1.342 +                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
   1.343 +#endif
   1.344 +                   );
   1.345 +
   1.346 +// flags allocation class should be last.
   1.347 +alloc_class chunk2(RFLAGS);
   1.348 +
   1.349 +// Singleton class for condition codes
   1.350 +reg_class int_flags(RFLAGS);
   1.351 +
   1.352 +// Class for all float registers
   1.353 +reg_class float_reg(XMM0,
   1.354 +                    XMM1,
   1.355 +                    XMM2,
   1.356 +                    XMM3,
   1.357 +                    XMM4,
   1.358 +                    XMM5,
   1.359 +                    XMM6,
   1.360 +                    XMM7
   1.361 +#ifdef _LP64
   1.362 +                   ,XMM8,
   1.363 +                    XMM9,
   1.364 +                    XMM10,
   1.365 +                    XMM11,
   1.366 +                    XMM12,
   1.367 +                    XMM13,
   1.368 +                    XMM14,
   1.369 +                    XMM15
   1.370 +#endif
   1.371 +                    );
   1.372 +
   1.373 +// Class for all double registers
   1.374 +reg_class double_reg(XMM0,  XMM0b,
   1.375 +                     XMM1,  XMM1b,
   1.376 +                     XMM2,  XMM2b,
   1.377 +                     XMM3,  XMM3b,
   1.378 +                     XMM4,  XMM4b,
   1.379 +                     XMM5,  XMM5b,
   1.380 +                     XMM6,  XMM6b,
   1.381 +                     XMM7,  XMM7b
   1.382 +#ifdef _LP64
   1.383 +                    ,XMM8,  XMM8b,
   1.384 +                     XMM9,  XMM9b,
   1.385 +                     XMM10, XMM10b,
   1.386 +                     XMM11, XMM11b,
   1.387 +                     XMM12, XMM12b,
   1.388 +                     XMM13, XMM13b,
   1.389 +                     XMM14, XMM14b,
   1.390 +                     XMM15, XMM15b
   1.391 +#endif
   1.392 +                     );
   1.393 +
   1.394 +// Class for all 32bit vector registers
   1.395 +reg_class vectors_reg(XMM0,
   1.396 +                      XMM1,
   1.397 +                      XMM2,
   1.398 +                      XMM3,
   1.399 +                      XMM4,
   1.400 +                      XMM5,
   1.401 +                      XMM6,
   1.402 +                      XMM7
   1.403 +#ifdef _LP64
   1.404 +                     ,XMM8,
   1.405 +                      XMM9,
   1.406 +                      XMM10,
   1.407 +                      XMM11,
   1.408 +                      XMM12,
   1.409 +                      XMM13,
   1.410 +                      XMM14,
   1.411 +                      XMM15
   1.412 +#endif
   1.413 +                      );
   1.414 +
   1.415 +// Class for all 64bit vector registers
   1.416 +reg_class vectord_reg(XMM0,  XMM0b,
   1.417 +                      XMM1,  XMM1b,
   1.418 +                      XMM2,  XMM2b,
   1.419 +                      XMM3,  XMM3b,
   1.420 +                      XMM4,  XMM4b,
   1.421 +                      XMM5,  XMM5b,
   1.422 +                      XMM6,  XMM6b,
   1.423 +                      XMM7,  XMM7b
   1.424 +#ifdef _LP64
   1.425 +                     ,XMM8,  XMM8b,
   1.426 +                      XMM9,  XMM9b,
   1.427 +                      XMM10, XMM10b,
   1.428 +                      XMM11, XMM11b,
   1.429 +                      XMM12, XMM12b,
   1.430 +                      XMM13, XMM13b,
   1.431 +                      XMM14, XMM14b,
   1.432 +                      XMM15, XMM15b
   1.433 +#endif
   1.434 +                      );
   1.435 +
   1.436 +// Class for all 128bit vector registers
   1.437 +reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
   1.438 +                      XMM1,  XMM1b,  XMM1c,  XMM1d,
   1.439 +                      XMM2,  XMM2b,  XMM2c,  XMM2d,
   1.440 +                      XMM3,  XMM3b,  XMM3c,  XMM3d,
   1.441 +                      XMM4,  XMM4b,  XMM4c,  XMM4d,
   1.442 +                      XMM5,  XMM5b,  XMM5c,  XMM5d,
   1.443 +                      XMM6,  XMM6b,  XMM6c,  XMM6d,
   1.444 +                      XMM7,  XMM7b,  XMM7c,  XMM7d
   1.445 +#ifdef _LP64
   1.446 +                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
   1.447 +                      XMM9,  XMM9b,  XMM9c,  XMM9d,
   1.448 +                      XMM10, XMM10b, XMM10c, XMM10d,
   1.449 +                      XMM11, XMM11b, XMM11c, XMM11d,
   1.450 +                      XMM12, XMM12b, XMM12c, XMM12d,
   1.451 +                      XMM13, XMM13b, XMM13c, XMM13d,
   1.452 +                      XMM14, XMM14b, XMM14c, XMM14d,
   1.453 +                      XMM15, XMM15b, XMM15c, XMM15d
   1.454 +#endif
   1.455 +                      );
   1.456 +
   1.457 +// Class for all 256bit vector registers
   1.458 +reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
   1.459 +                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
   1.460 +                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
   1.461 +                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
   1.462 +                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
   1.463 +                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
   1.464 +                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
   1.465 +                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
   1.466 +#ifdef _LP64
   1.467 +                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
   1.468 +                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
   1.469 +                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
   1.470 +                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
   1.471 +                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
   1.472 +                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
   1.473 +                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
   1.474 +                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
   1.475 +#endif
   1.476 +                      );
   1.477 +
   1.478 +%}
   1.479 +
   1.480 +
   1.481 +//----------SOURCE BLOCK-------------------------------------------------------
   1.482 +// This is a block of C++ code which provides values, functions, and
   1.483 +// definitions necessary in the rest of the architecture description
   1.484 +
   1.485 +source_hpp %{
   1.486 +// Header information of the source block.
   1.487 +// Method declarations/definitions which are used outside
   1.488 +// the ad-scope can conveniently be defined here.
   1.489 +//
   1.490 +// To keep related declarations/definitions/uses close together,
   1.491 +// we switch between source %{ }% and source_hpp %{ }% freely as needed.
   1.492 +
   1.493 +class CallStubImpl {
   1.494 + 
   1.495 +  //--------------------------------------------------------------
   1.496 +  //---<  Used for optimization in Compile::shorten_branches  >---
   1.497 +  //--------------------------------------------------------------
   1.498 +
   1.499 + public:
   1.500 +  // Size of call trampoline stub.
   1.501 +  static uint size_call_trampoline() {
   1.502 +    return 0; // no call trampolines on this platform
   1.503 +  }
   1.504 +  
   1.505 +  // number of relocations needed by a call trampoline stub
   1.506 +  static uint reloc_call_trampoline() { 
   1.507 +    return 0; // no call trampolines on this platform
   1.508 +  }
   1.509 +};
   1.510 +
   1.511 +class HandlerImpl {
   1.512 +
   1.513 + public:
   1.514 +
   1.515 +  static int emit_exception_handler(CodeBuffer &cbuf);
   1.516 +  static int emit_deopt_handler(CodeBuffer& cbuf);
   1.517 +
   1.518 +  static uint size_exception_handler() {
   1.519 +    // NativeCall instruction size is the same as NativeJump.
    1.520 +    // The exception handler starts out as a jump and can be patched to
    1.521 +    // a call by deoptimization.  (4932387)
   1.522 +    // Note that this value is also credited (in output.cpp) to
   1.523 +    // the size of the code section.
   1.524 +    return NativeJump::instruction_size;
   1.525 +  }
   1.526 +
   1.527 +#ifdef _LP64
   1.528 +  static uint size_deopt_handler() {
   1.529 +    // three 5 byte instructions
   1.530 +    return 15;
   1.531 +  }
   1.532 +#else
   1.533 +  static uint size_deopt_handler() {
   1.534 +    // NativeCall instruction size is the same as NativeJump.
    1.535 +    // The exception handler starts out as a jump and can be patched to
    1.536 +    // a call by deoptimization.  (4932387)
   1.537 +    // Note that this value is also credited (in output.cpp) to
   1.538 +    // the size of the code section.
   1.539 +    return 5 + NativeJump::instruction_size; // pushl(); jmp;
   1.540 +  }
   1.541 +#endif
   1.542 +};
   1.543 +
   1.544 +%} // end source_hpp
   1.545 +
   1.546 +source %{
   1.547 +
   1.548 +// Emit exception handler code.
   1.549 +// Stuff framesize into a register and call a VM stub routine.
   1.550 +int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
   1.551 +
   1.552 +  // Note that the code buffer's insts_mark is always relative to insts.
   1.553 +  // That's why we must use the macroassembler to generate a handler.
   1.554 +  MacroAssembler _masm(&cbuf);
   1.555 +  address base = __ start_a_stub(size_exception_handler());
   1.556 +  if (base == NULL)  return 0;  // CodeBuffer::expand failed
   1.557 +  int offset = __ offset();
   1.558 +  __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
   1.559 +  assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
   1.560 +  __ end_a_stub();
   1.561 +  return offset;
   1.562 +}
   1.563 +
   1.564 +// Emit deopt handler code.
   1.565 +int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
   1.566 +
   1.567 +  // Note that the code buffer's insts_mark is always relative to insts.
   1.568 +  // That's why we must use the macroassembler to generate a handler.
   1.569 +  MacroAssembler _masm(&cbuf);
   1.570 +  address base = __ start_a_stub(size_deopt_handler());
   1.571 +  if (base == NULL)  return 0;  // CodeBuffer::expand failed
   1.572 +  int offset = __ offset();
   1.573 +
   1.574 +#ifdef _LP64
   1.575 +  address the_pc = (address) __ pc();
   1.576 +  Label next;
   1.577 +  // push a "the_pc" on the stack without destroying any registers
   1.578 +  // as they all may be live.
   1.579 +
   1.580 +  // push address of "next"
   1.581 +  __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
   1.582 +  __ bind(next);
   1.583 +  // adjust it so it matches "the_pc"
   1.584 +  __ subptr(Address(rsp, 0), __ offset() - offset);
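  // (The call above pushed the address of "next"; __ offset() - offset is the
  // size of that call instruction, so after the subtraction the stack slot
  // holds exactly "the_pc".)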
   1.585 +#else
   1.586 +  InternalAddress here(__ pc());
   1.587 +  __ pushptr(here.addr());
   1.588 +#endif
   1.589 +
   1.590 +  __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
   1.591 +  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
   1.592 +  __ end_a_stub();
   1.593 +  return offset;
   1.594 +}
   1.595 +
   1.596 +
   1.597 +//=============================================================================
   1.598 +
   1.599 +  // Float masks come from different places depending on platform.
   1.600 +#ifdef _LP64
   1.601 +  static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
   1.602 +  static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
   1.603 +  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
   1.604 +  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
   1.605 +#else
   1.606 +  static address float_signmask()  { return (address)float_signmask_pool; }
   1.607 +  static address float_signflip()  { return (address)float_signflip_pool; }
   1.608 +  static address double_signmask() { return (address)double_signmask_pool; }
   1.609 +  static address double_signflip() { return (address)double_signflip_pool; }
   1.610 +#endif
   1.611 +
   1.612 +
   1.613 +const bool Matcher::match_rule_supported(int opcode) {
   1.614 +  if (!has_match_rule(opcode))
   1.615 +    return false;
   1.616 +
   1.617 +  switch (opcode) {
   1.618 +    case Op_PopCountI:
   1.619 +    case Op_PopCountL:
   1.620 +      if (!UsePopCountInstruction)
   1.621 +        return false;
   1.622 +    break;
   1.623 +    case Op_MulVI:
   1.624 +      if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
   1.625 +        return false;
   1.626 +    break;
   1.627 +    case Op_CompareAndSwapL:
   1.628 +#ifdef _LP64
   1.629 +    case Op_CompareAndSwapP:
   1.630 +#endif
   1.631 +      if (!VM_Version::supports_cx8())
   1.632 +        return false;
   1.633 +    break;
   1.634 +  }
   1.635 +
    1.636 +  return true;  // By default, match rules are supported.
   1.637 +}
   1.638 +
   1.639 +// Max vector size in bytes. 0 if not supported.
   1.640 +const int Matcher::vector_width_in_bytes(BasicType bt) {
   1.641 +  assert(is_java_primitive(bt), "only primitive type vectors");
   1.642 +  if (UseSSE < 2) return 0;
   1.643 +  // SSE2 supports 128bit vectors for all types.
   1.644 +  // AVX2 supports 256bit vectors for all types.
   1.645 +  int size = (UseAVX > 1) ? 32 : 16;
   1.646 +  // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
   1.647 +  if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
   1.648 +    size = 32;
   1.649 +  // Use flag to limit vector size.
   1.650 +  size = MIN2(size,(int)MaxVectorSize);
   1.651 +  // Minimum 2 values in vector (or 4 for bytes).
   1.652 +  switch (bt) {
   1.653 +  case T_DOUBLE:
   1.654 +  case T_LONG:
   1.655 +    if (size < 16) return 0;
   1.656 +  case T_FLOAT:
   1.657 +  case T_INT:
   1.658 +    if (size < 8) return 0;
   1.659 +  case T_BOOLEAN:
   1.660 +  case T_BYTE:
   1.661 +  case T_CHAR:
   1.662 +  case T_SHORT:
   1.663 +    if (size < 4) return 0;
   1.664 +    break;
   1.665 +  default:
   1.666 +    ShouldNotReachHere();
   1.667 +  }
   1.668 +  return size;
   1.669 +}
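// Worked example of the sizing above: with UseAVX == 2 and MaxVectorSize == 32,
// T_INT vectors are 32 bytes wide; with UseSSE >= 2 and UseAVX == 0 they are
// 16 bytes; and if MaxVectorSize were 8, T_DOUBLE would return 0 because a
// vector must hold at least two elements.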
   1.670 +
   1.671 +// Limits on vector size (number of elements) loaded into vector.
   1.672 +const int Matcher::max_vector_size(const BasicType bt) {
   1.673 +  return vector_width_in_bytes(bt)/type2aelembytes(bt);
   1.674 +}
   1.675 +const int Matcher::min_vector_size(const BasicType bt) {
   1.676 +  int max_size = max_vector_size(bt);
   1.677 +  // Min size which can be loaded into vector is 4 bytes.
   1.678 +  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
   1.679 +  return MIN2(size,max_size);
   1.680 +}
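// Example: if vector_width_in_bytes(T_INT) is 32, max_vector_size(T_INT) is
// 32/4 = 8 elements and min_vector_size(T_INT) is 2; for T_BYTE the minimum
// is 4 elements, since 4 bytes is the smallest vector that can be loaded.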
   1.681 +
    1.682 +// Vector ideal reg corresponding to specified size in bytes
   1.683 +const int Matcher::vector_ideal_reg(int size) {
   1.684 +  assert(MaxVectorSize >= size, "");
   1.685 +  switch(size) {
   1.686 +    case  4: return Op_VecS;
   1.687 +    case  8: return Op_VecD;
   1.688 +    case 16: return Op_VecX;
   1.689 +    case 32: return Op_VecY;
   1.690 +  }
   1.691 +  ShouldNotReachHere();
   1.692 +  return 0;
   1.693 +}
   1.694 +
   1.695 +// Only lowest bits of xmm reg are used for vector shift count.
   1.696 +const int Matcher::vector_shift_count_ideal_reg(int size) {
   1.697 +  return Op_VecS;
   1.698 +}
   1.699 +
    1.700 +// x86 supports misaligned vector store/load.
   1.701 +const bool Matcher::misaligned_vectors_ok() {
   1.702 +  return !AlignVector; // can be changed by flag
   1.703 +}
   1.704 +
   1.705 +// x86 AES instructions are compatible with SunJCE expanded
   1.706 +// keys, hence we do not need to pass the original key to stubs
   1.707 +const bool Matcher::pass_original_key_for_aes() {
   1.708 +  return false;
   1.709 +}
   1.710 +
   1.711 +// Helper methods for MachSpillCopyNode::implementation().
   1.712 +static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
   1.713 +                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
    1.714 +  // In the 64-bit VM, size calculation is very complex, so instructions are
    1.715 +  // emitted into a scratch buffer to determine the size.
   1.716 +  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
   1.717 +  assert(ireg == Op_VecS || // 32bit vector
   1.718 +         (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
   1.719 +         (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
   1.720 +         "no non-adjacent vector moves" );
   1.721 +  if (cbuf) {
   1.722 +    MacroAssembler _masm(cbuf);
   1.723 +    int offset = __ offset();
   1.724 +    switch (ireg) {
   1.725 +    case Op_VecS: // copy whole register
   1.726 +    case Op_VecD:
   1.727 +    case Op_VecX:
   1.728 +      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
   1.729 +      break;
   1.730 +    case Op_VecY:
   1.731 +      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
   1.732 +      break;
   1.733 +    default:
   1.734 +      ShouldNotReachHere();
   1.735 +    }
   1.736 +    int size = __ offset() - offset;
   1.737 +#ifdef ASSERT
   1.738 +    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    1.739 +    assert(!do_size || size == 4, "incorrect size calculation");
   1.740 +#endif
   1.741 +    return size;
   1.742 +#ifndef PRODUCT
   1.743 +  } else if (!do_size) {
   1.744 +    switch (ireg) {
   1.745 +    case Op_VecS:
   1.746 +    case Op_VecD:
   1.747 +    case Op_VecX:
   1.748 +      st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
   1.749 +      break;
   1.750 +    case Op_VecY:
   1.751 +      st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
   1.752 +      break;
   1.753 +    default:
   1.754 +      ShouldNotReachHere();
   1.755 +    }
   1.756 +#endif
   1.757 +  }
   1.758 +  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
   1.759 +  return 4;
   1.760 +}
   1.761 +
   1.762 +static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
   1.763 +                            int stack_offset, int reg, uint ireg, outputStream* st) {
    1.764 +  // In the 64-bit VM, size calculation is very complex, so instructions are
    1.765 +  // emitted into a scratch buffer to determine the size.
   1.766 +  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
   1.767 +  if (cbuf) {
   1.768 +    MacroAssembler _masm(cbuf);
   1.769 +    int offset = __ offset();
   1.770 +    if (is_load) {
   1.771 +      switch (ireg) {
   1.772 +      case Op_VecS:
   1.773 +        __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
   1.774 +        break;
   1.775 +      case Op_VecD:
   1.776 +        __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
   1.777 +        break;
   1.778 +      case Op_VecX:
   1.779 +        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
   1.780 +        break;
   1.781 +      case Op_VecY:
   1.782 +        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
   1.783 +        break;
   1.784 +      default:
   1.785 +        ShouldNotReachHere();
   1.786 +      }
   1.787 +    } else { // store
   1.788 +      switch (ireg) {
   1.789 +      case Op_VecS:
   1.790 +        __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
   1.791 +        break;
   1.792 +      case Op_VecD:
   1.793 +        __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
   1.794 +        break;
   1.795 +      case Op_VecX:
   1.796 +        __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
   1.797 +        break;
   1.798 +      case Op_VecY:
   1.799 +        __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
   1.800 +        break;
   1.801 +      default:
   1.802 +        ShouldNotReachHere();
   1.803 +      }
   1.804 +    }
   1.805 +    int size = __ offset() - offset;
   1.806 +#ifdef ASSERT
   1.807 +    int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
   1.808 +    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    1.809 +    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
   1.810 +#endif
   1.811 +    return size;
   1.812 +#ifndef PRODUCT
   1.813 +  } else if (!do_size) {
   1.814 +    if (is_load) {
   1.815 +      switch (ireg) {
   1.816 +      case Op_VecS:
   1.817 +        st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
   1.818 +        break;
   1.819 +      case Op_VecD:
   1.820 +        st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
   1.821 +        break;
   1.822 +       case Op_VecX:
   1.823 +        st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
   1.824 +        break;
   1.825 +      case Op_VecY:
   1.826 +        st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
   1.827 +        break;
   1.828 +      default:
   1.829 +        ShouldNotReachHere();
   1.830 +      }
   1.831 +    } else { // store
   1.832 +      switch (ireg) {
   1.833 +      case Op_VecS:
   1.834 +        st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
   1.835 +        break;
   1.836 +      case Op_VecD:
   1.837 +        st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
   1.838 +        break;
   1.839 +       case Op_VecX:
   1.840 +        st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
   1.841 +        break;
   1.842 +      case Op_VecY:
   1.843 +        st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
   1.844 +        break;
   1.845 +      default:
   1.846 +        ShouldNotReachHere();
   1.847 +      }
   1.848 +    }
   1.849 +#endif
   1.850 +  }
   1.851 +  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
   1.852 +  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
   1.853 +  return 5+offset_size;
   1.854 +}
   1.855 +
   1.856 +static inline jfloat replicate4_imm(int con, int width) {
    1.857 +  // Load a constant of "width" (in bytes) and replicate it to fill 32 bits.
   1.858 +  assert(width == 1 || width == 2, "only byte or short types here");
   1.859 +  int bit_width = width * 8;
   1.860 +  jint val = con;
   1.861 +  val &= (1 << bit_width) - 1;  // mask off sign bits
   1.862 +  while(bit_width < 32) {
   1.863 +    val |= (val << bit_width);
   1.864 +    bit_width <<= 1;
   1.865 +  }
   1.866 +  jfloat fval = *((jfloat*) &val);  // coerce to float type
   1.867 +  return fval;
   1.868 +}
   1.869 +
   1.870 +static inline jdouble replicate8_imm(int con, int width) {
    1.871 +  // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
   1.872 +  assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
   1.873 +  int bit_width = width * 8;
   1.874 +  jlong val = con;
   1.875 +  val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
   1.876 +  while(bit_width < 64) {
   1.877 +    val |= (val << bit_width);
   1.878 +    bit_width <<= 1;
   1.879 +  }
   1.880 +  jdouble dval = *((jdouble*) &val);  // coerce to double type
   1.881 +  return dval;
   1.882 +}
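// Example: replicate4_imm(0x12, 1) produces the bit pattern 0x12121212 carried
// in a jfloat, and replicate8_imm(0x1234, 2) produces 0x1234123412341234
// carried in a jdouble; the float/double types here are only bit carriers.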
   1.883 +
   1.884 +#ifndef PRODUCT
   1.885 +  void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
   1.886 +    st->print("nop \t# %d bytes pad for loops and calls", _count);
   1.887 +  }
   1.888 +#endif
   1.889 +
   1.890 +  void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
   1.891 +    MacroAssembler _masm(&cbuf);
   1.892 +    __ nop(_count);
   1.893 +  }
   1.894 +
   1.895 +  uint MachNopNode::size(PhaseRegAlloc*) const {
   1.896 +    return _count;
   1.897 +  }
   1.898 +
   1.899 +#ifndef PRODUCT
   1.900 +  void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
   1.901 +    st->print("# breakpoint");
   1.902 +  }
   1.903 +#endif
   1.904 +
   1.905 +  void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
   1.906 +    MacroAssembler _masm(&cbuf);
   1.907 +    __ int3();
   1.908 +  }
   1.909 +
   1.910 +  uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
   1.911 +    return MachNode::size(ra_);
   1.912 +  }
   1.913 +
   1.914 +%}
   1.915 +
   1.916 +encode %{
   1.917 +
   1.918 +  enc_class preserve_SP %{
   1.919 +    debug_only(int off0 = cbuf.insts_size());
   1.920 +    MacroAssembler _masm(&cbuf);
   1.921 +    // RBP is preserved across all calls, even compiled calls.
   1.922 +    // Use it to preserve RSP in places where the callee might change the SP.
   1.923 +    __ movptr(rbp_mh_SP_save, rsp);
   1.924 +    debug_only(int off1 = cbuf.insts_size());
   1.925 +    assert(off1 - off0 == preserve_SP_size(), "correct size prediction");
   1.926 +  %}
   1.927 +
   1.928 +  enc_class restore_SP %{
   1.929 +    MacroAssembler _masm(&cbuf);
   1.930 +    __ movptr(rsp, rbp_mh_SP_save);
   1.931 +  %}
   1.932 +
   1.933 +  enc_class call_epilog %{
   1.934 +    if (VerifyStackAtCalls) {
   1.935 +      // Check that stack depth is unchanged: find majik cookie on stack
   1.936 +      int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
   1.937 +      MacroAssembler _masm(&cbuf);
   1.938 +      Label L;
   1.939 +      __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
   1.940 +      __ jccb(Assembler::equal, L);
   1.941 +      // Die if stack mismatch
   1.942 +      __ int3();
   1.943 +      __ bind(L);
   1.944 +    }
   1.945 +  %}
   1.946 +
   1.947 +%}
   1.948 +
   1.949 +
   1.950 +//----------OPERANDS-----------------------------------------------------------
   1.951 +// Operand definitions must precede instruction definitions for correct parsing
   1.952 +// in the ADLC because operands constitute user defined types which are used in
   1.953 +// instruction definitions.
   1.954 +
   1.955 +// Vectors
   1.956 +operand vecS() %{
   1.957 +  constraint(ALLOC_IN_RC(vectors_reg));
   1.958 +  match(VecS);
   1.959 +
   1.960 +  format %{ %}
   1.961 +  interface(REG_INTER);
   1.962 +%}
   1.963 +
   1.964 +operand vecD() %{
   1.965 +  constraint(ALLOC_IN_RC(vectord_reg));
   1.966 +  match(VecD);
   1.967 +
   1.968 +  format %{ %}
   1.969 +  interface(REG_INTER);
   1.970 +%}
   1.971 +
   1.972 +operand vecX() %{
   1.973 +  constraint(ALLOC_IN_RC(vectorx_reg));
   1.974 +  match(VecX);
   1.975 +
   1.976 +  format %{ %}
   1.977 +  interface(REG_INTER);
   1.978 +%}
   1.979 +
   1.980 +operand vecY() %{
   1.981 +  constraint(ALLOC_IN_RC(vectory_reg));
   1.982 +  match(VecY);
   1.983 +
   1.984 +  format %{ %}
   1.985 +  interface(REG_INTER);
   1.986 +%}
   1.987 +
   1.988 +
   1.989 +// INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
   1.990 +
   1.991 +// ============================================================================
   1.992 +
   1.993 +instruct ShouldNotReachHere() %{
   1.994 +  match(Halt);
   1.995 +  format %{ "int3\t# ShouldNotReachHere" %}
   1.996 +  ins_encode %{
   1.997 +    __ int3();
   1.998 +  %}
   1.999 +  ins_pipe(pipe_slow);
  1.1000 +%}
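// (Most instructs below follow the same shape: a predicate() guarding when the
//  rule may be used, a match() rule against an ideal-graph expression, a format
//  string for disassembly output, an ins_cost estimate, an ins_encode block of
//  MacroAssembler calls, and an ins_pipe class.)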
  1.1001 +
  1.1002 +// ============================================================================
  1.1003 +
  1.1004 +instruct addF_reg(regF dst, regF src) %{
  1.1005 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1006 +  match(Set dst (AddF dst src));
  1.1007 +
  1.1008 +  format %{ "addss   $dst, $src" %}
  1.1009 +  ins_cost(150);
  1.1010 +  ins_encode %{
  1.1011 +    __ addss($dst$$XMMRegister, $src$$XMMRegister);
  1.1012 +  %}
  1.1013 +  ins_pipe(pipe_slow);
  1.1014 +%}
  1.1015 +
  1.1016 +instruct addF_mem(regF dst, memory src) %{
  1.1017 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1018 +  match(Set dst (AddF dst (LoadF src)));
  1.1019 +
  1.1020 +  format %{ "addss   $dst, $src" %}
  1.1021 +  ins_cost(150);
  1.1022 +  ins_encode %{
  1.1023 +    __ addss($dst$$XMMRegister, $src$$Address);
  1.1024 +  %}
  1.1025 +  ins_pipe(pipe_slow);
  1.1026 +%}
  1.1027 +
  1.1028 +instruct addF_imm(regF dst, immF con) %{
  1.1029 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1030 +  match(Set dst (AddF dst con));
  1.1031 +  format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  1.1032 +  ins_cost(150);
  1.1033 +  ins_encode %{
  1.1034 +    __ addss($dst$$XMMRegister, $constantaddress($con));
  1.1035 +  %}
  1.1036 +  ins_pipe(pipe_slow);
  1.1037 +%}
  1.1038 +
  1.1039 +instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
  1.1040 +  predicate(UseAVX > 0);
  1.1041 +  match(Set dst (AddF src1 src2));
  1.1042 +
  1.1043 +  format %{ "vaddss  $dst, $src1, $src2" %}
  1.1044 +  ins_cost(150);
  1.1045 +  ins_encode %{
  1.1046 +    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  1.1047 +  %}
  1.1048 +  ins_pipe(pipe_slow);
  1.1049 +%}
  1.1050 +
  1.1051 +instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
  1.1052 +  predicate(UseAVX > 0);
  1.1053 +  match(Set dst (AddF src1 (LoadF src2)));
  1.1054 +
  1.1055 +  format %{ "vaddss  $dst, $src1, $src2" %}
  1.1056 +  ins_cost(150);
  1.1057 +  ins_encode %{
  1.1058 +    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  1.1059 +  %}
  1.1060 +  ins_pipe(pipe_slow);
  1.1061 +%}
  1.1062 +
  1.1063 +instruct addF_reg_imm(regF dst, regF src, immF con) %{
  1.1064 +  predicate(UseAVX > 0);
  1.1065 +  match(Set dst (AddF src con));
  1.1066 +
  1.1067 +  format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  1.1068 +  ins_cost(150);
  1.1069 +  ins_encode %{
  1.1070 +    __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  1.1071 +  %}
  1.1072 +  ins_pipe(pipe_slow);
  1.1073 +%}
  1.1074 +
  1.1075 +instruct addD_reg(regD dst, regD src) %{
  1.1076 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1077 +  match(Set dst (AddD dst src));
  1.1078 +
  1.1079 +  format %{ "addsd   $dst, $src" %}
  1.1080 +  ins_cost(150);
  1.1081 +  ins_encode %{
  1.1082 +    __ addsd($dst$$XMMRegister, $src$$XMMRegister);
  1.1083 +  %}
  1.1084 +  ins_pipe(pipe_slow);
  1.1085 +%}
  1.1086 +
  1.1087 +instruct addD_mem(regD dst, memory src) %{
  1.1088 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1089 +  match(Set dst (AddD dst (LoadD src)));
  1.1090 +
  1.1091 +  format %{ "addsd   $dst, $src" %}
  1.1092 +  ins_cost(150);
  1.1093 +  ins_encode %{
  1.1094 +    __ addsd($dst$$XMMRegister, $src$$Address);
  1.1095 +  %}
  1.1096 +  ins_pipe(pipe_slow);
  1.1097 +%}
  1.1098 +
  1.1099 +instruct addD_imm(regD dst, immD con) %{
  1.1100 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1101 +  match(Set dst (AddD dst con));
  1.1102 +  format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  1.1103 +  ins_cost(150);
  1.1104 +  ins_encode %{
  1.1105 +    __ addsd($dst$$XMMRegister, $constantaddress($con));
  1.1106 +  %}
  1.1107 +  ins_pipe(pipe_slow);
  1.1108 +%}
  1.1109 +
  1.1110 +instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
  1.1111 +  predicate(UseAVX > 0);
  1.1112 +  match(Set dst (AddD src1 src2));
  1.1113 +
  1.1114 +  format %{ "vaddsd  $dst, $src1, $src2" %}
  1.1115 +  ins_cost(150);
  1.1116 +  ins_encode %{
  1.1117 +    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  1.1118 +  %}
  1.1119 +  ins_pipe(pipe_slow);
  1.1120 +%}
  1.1121 +
  1.1122 +instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
  1.1123 +  predicate(UseAVX > 0);
  1.1124 +  match(Set dst (AddD src1 (LoadD src2)));
  1.1125 +
  1.1126 +  format %{ "vaddsd  $dst, $src1, $src2" %}
  1.1127 +  ins_cost(150);
  1.1128 +  ins_encode %{
  1.1129 +    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  1.1130 +  %}
  1.1131 +  ins_pipe(pipe_slow);
  1.1132 +%}
  1.1133 +
  1.1134 +instruct addD_reg_imm(regD dst, regD src, immD con) %{
  1.1135 +  predicate(UseAVX > 0);
  1.1136 +  match(Set dst (AddD src con));
  1.1137 +
  1.1138 +  format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  1.1139 +  ins_cost(150);
  1.1140 +  ins_encode %{
  1.1141 +    __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  1.1142 +  %}
  1.1143 +  ins_pipe(pipe_slow);
  1.1144 +%}
  1.1145 +
  1.1146 +instruct subF_reg(regF dst, regF src) %{
  1.1147 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1148 +  match(Set dst (SubF dst src));
  1.1149 +
  1.1150 +  format %{ "subss   $dst, $src" %}
  1.1151 +  ins_cost(150);
  1.1152 +  ins_encode %{
  1.1153 +    __ subss($dst$$XMMRegister, $src$$XMMRegister);
  1.1154 +  %}
  1.1155 +  ins_pipe(pipe_slow);
  1.1156 +%}
  1.1157 +
  1.1158 +instruct subF_mem(regF dst, memory src) %{
  1.1159 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1160 +  match(Set dst (SubF dst (LoadF src)));
  1.1161 +
  1.1162 +  format %{ "subss   $dst, $src" %}
  1.1163 +  ins_cost(150);
  1.1164 +  ins_encode %{
  1.1165 +    __ subss($dst$$XMMRegister, $src$$Address);
  1.1166 +  %}
  1.1167 +  ins_pipe(pipe_slow);
  1.1168 +%}
  1.1169 +
  1.1170 +instruct subF_imm(regF dst, immF con) %{
  1.1171 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1172 +  match(Set dst (SubF dst con));
  1.1173 +  format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  1.1174 +  ins_cost(150);
  1.1175 +  ins_encode %{
  1.1176 +    __ subss($dst$$XMMRegister, $constantaddress($con));
  1.1177 +  %}
  1.1178 +  ins_pipe(pipe_slow);
  1.1179 +%}
  1.1180 +
  1.1181 +instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
  1.1182 +  predicate(UseAVX > 0);
  1.1183 +  match(Set dst (SubF src1 src2));
  1.1184 +
  1.1185 +  format %{ "vsubss  $dst, $src1, $src2" %}
  1.1186 +  ins_cost(150);
  1.1187 +  ins_encode %{
  1.1188 +    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  1.1189 +  %}
  1.1190 +  ins_pipe(pipe_slow);
  1.1191 +%}
  1.1192 +
  1.1193 +instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
  1.1194 +  predicate(UseAVX > 0);
  1.1195 +  match(Set dst (SubF src1 (LoadF src2)));
  1.1196 +
  1.1197 +  format %{ "vsubss  $dst, $src1, $src2" %}
  1.1198 +  ins_cost(150);
  1.1199 +  ins_encode %{
  1.1200 +    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  1.1201 +  %}
  1.1202 +  ins_pipe(pipe_slow);
  1.1203 +%}
  1.1204 +
  1.1205 +instruct subF_reg_imm(regF dst, regF src, immF con) %{
  1.1206 +  predicate(UseAVX > 0);
  1.1207 +  match(Set dst (SubF src con));
  1.1208 +
  1.1209 +  format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  1.1210 +  ins_cost(150);
  1.1211 +  ins_encode %{
  1.1212 +    __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  1.1213 +  %}
  1.1214 +  ins_pipe(pipe_slow);
  1.1215 +%}
  1.1216 +
  1.1217 +instruct subD_reg(regD dst, regD src) %{
  1.1218 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1219 +  match(Set dst (SubD dst src));
  1.1220 +
  1.1221 +  format %{ "subsd   $dst, $src" %}
  1.1222 +  ins_cost(150);
  1.1223 +  ins_encode %{
  1.1224 +    __ subsd($dst$$XMMRegister, $src$$XMMRegister);
  1.1225 +  %}
  1.1226 +  ins_pipe(pipe_slow);
  1.1227 +%}
  1.1228 +
  1.1229 +instruct subD_mem(regD dst, memory src) %{
  1.1230 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1231 +  match(Set dst (SubD dst (LoadD src)));
  1.1232 +
  1.1233 +  format %{ "subsd   $dst, $src" %}
  1.1234 +  ins_cost(150);
  1.1235 +  ins_encode %{
  1.1236 +    __ subsd($dst$$XMMRegister, $src$$Address);
  1.1237 +  %}
  1.1238 +  ins_pipe(pipe_slow);
  1.1239 +%}
  1.1240 +
  1.1241 +instruct subD_imm(regD dst, immD con) %{
  1.1242 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1243 +  match(Set dst (SubD dst con));
  1.1244 +  format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  1.1245 +  ins_cost(150);
  1.1246 +  ins_encode %{
  1.1247 +    __ subsd($dst$$XMMRegister, $constantaddress($con));
  1.1248 +  %}
  1.1249 +  ins_pipe(pipe_slow);
  1.1250 +%}
  1.1251 +
  1.1252 +instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
  1.1253 +  predicate(UseAVX > 0);
  1.1254 +  match(Set dst (SubD src1 src2));
  1.1255 +
  1.1256 +  format %{ "vsubsd  $dst, $src1, $src2" %}
  1.1257 +  ins_cost(150);
  1.1258 +  ins_encode %{
  1.1259 +    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  1.1260 +  %}
  1.1261 +  ins_pipe(pipe_slow);
  1.1262 +%}
  1.1263 +
  1.1264 +instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
  1.1265 +  predicate(UseAVX > 0);
  1.1266 +  match(Set dst (SubD src1 (LoadD src2)));
  1.1267 +
  1.1268 +  format %{ "vsubsd  $dst, $src1, $src2" %}
  1.1269 +  ins_cost(150);
  1.1270 +  ins_encode %{
  1.1271 +    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  1.1272 +  %}
  1.1273 +  ins_pipe(pipe_slow);
  1.1274 +%}
  1.1275 +
  1.1276 +instruct subD_reg_imm(regD dst, regD src, immD con) %{
  1.1277 +  predicate(UseAVX > 0);
  1.1278 +  match(Set dst (SubD src con));
  1.1279 +
  1.1280 +  format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  1.1281 +  ins_cost(150);
  1.1282 +  ins_encode %{
  1.1283 +    __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  1.1284 +  %}
  1.1285 +  ins_pipe(pipe_slow);
  1.1286 +%}
  1.1287 +
  1.1288 +instruct mulF_reg(regF dst, regF src) %{
  1.1289 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1290 +  match(Set dst (MulF dst src));
  1.1291 +
  1.1292 +  format %{ "mulss   $dst, $src" %}
  1.1293 +  ins_cost(150);
  1.1294 +  ins_encode %{
  1.1295 +    __ mulss($dst$$XMMRegister, $src$$XMMRegister);
  1.1296 +  %}
  1.1297 +  ins_pipe(pipe_slow);
  1.1298 +%}
  1.1299 +
  1.1300 +instruct mulF_mem(regF dst, memory src) %{
  1.1301 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1302 +  match(Set dst (MulF dst (LoadF src)));
  1.1303 +
  1.1304 +  format %{ "mulss   $dst, $src" %}
  1.1305 +  ins_cost(150);
  1.1306 +  ins_encode %{
  1.1307 +    __ mulss($dst$$XMMRegister, $src$$Address);
  1.1308 +  %}
  1.1309 +  ins_pipe(pipe_slow);
  1.1310 +%}
  1.1311 +
  1.1312 +instruct mulF_imm(regF dst, immF con) %{
  1.1313 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1314 +  match(Set dst (MulF dst con));
  1.1315 +  format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  1.1316 +  ins_cost(150);
  1.1317 +  ins_encode %{
  1.1318 +    __ mulss($dst$$XMMRegister, $constantaddress($con));
  1.1319 +  %}
  1.1320 +  ins_pipe(pipe_slow);
  1.1321 +%}
  1.1322 +
  1.1323 +instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
  1.1324 +  predicate(UseAVX > 0);
  1.1325 +  match(Set dst (MulF src1 src2));
  1.1326 +
  1.1327 +  format %{ "vmulss  $dst, $src1, $src2" %}
  1.1328 +  ins_cost(150);
  1.1329 +  ins_encode %{
  1.1330 +    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  1.1331 +  %}
  1.1332 +  ins_pipe(pipe_slow);
  1.1333 +%}
  1.1334 +
  1.1335 +instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
  1.1336 +  predicate(UseAVX > 0);
  1.1337 +  match(Set dst (MulF src1 (LoadF src2)));
  1.1338 +
  1.1339 +  format %{ "vmulss  $dst, $src1, $src2" %}
  1.1340 +  ins_cost(150);
  1.1341 +  ins_encode %{
  1.1342 +    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  1.1343 +  %}
  1.1344 +  ins_pipe(pipe_slow);
  1.1345 +%}
  1.1346 +
  1.1347 +instruct mulF_reg_imm(regF dst, regF src, immF con) %{
  1.1348 +  predicate(UseAVX > 0);
  1.1349 +  match(Set dst (MulF src con));
  1.1350 +
  1.1351 +  format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  1.1352 +  ins_cost(150);
  1.1353 +  ins_encode %{
  1.1354 +    __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  1.1355 +  %}
  1.1356 +  ins_pipe(pipe_slow);
  1.1357 +%}
  1.1358 +
  1.1359 +instruct mulD_reg(regD dst, regD src) %{
  1.1360 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1361 +  match(Set dst (MulD dst src));
  1.1362 +
  1.1363 +  format %{ "mulsd   $dst, $src" %}
  1.1364 +  ins_cost(150);
  1.1365 +  ins_encode %{
  1.1366 +    __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
  1.1367 +  %}
  1.1368 +  ins_pipe(pipe_slow);
  1.1369 +%}
  1.1370 +
  1.1371 +instruct mulD_mem(regD dst, memory src) %{
  1.1372 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1373 +  match(Set dst (MulD dst (LoadD src)));
  1.1374 +
  1.1375 +  format %{ "mulsd   $dst, $src" %}
  1.1376 +  ins_cost(150);
  1.1377 +  ins_encode %{
  1.1378 +    __ mulsd($dst$$XMMRegister, $src$$Address);
  1.1379 +  %}
  1.1380 +  ins_pipe(pipe_slow);
  1.1381 +%}
  1.1382 +
  1.1383 +instruct mulD_imm(regD dst, immD con) %{
  1.1384 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1385 +  match(Set dst (MulD dst con));
  1.1386 +  format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  1.1387 +  ins_cost(150);
  1.1388 +  ins_encode %{
  1.1389 +    __ mulsd($dst$$XMMRegister, $constantaddress($con));
  1.1390 +  %}
  1.1391 +  ins_pipe(pipe_slow);
  1.1392 +%}
  1.1393 +
  1.1394 +instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
  1.1395 +  predicate(UseAVX > 0);
  1.1396 +  match(Set dst (MulD src1 src2));
  1.1397 +
  1.1398 +  format %{ "vmulsd  $dst, $src1, $src2" %}
  1.1399 +  ins_cost(150);
  1.1400 +  ins_encode %{
  1.1401 +    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  1.1402 +  %}
  1.1403 +  ins_pipe(pipe_slow);
  1.1404 +%}
  1.1405 +
  1.1406 +instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
  1.1407 +  predicate(UseAVX > 0);
  1.1408 +  match(Set dst (MulD src1 (LoadD src2)));
  1.1409 +
  1.1410 +  format %{ "vmulsd  $dst, $src1, $src2" %}
  1.1411 +  ins_cost(150);
  1.1412 +  ins_encode %{
  1.1413 +    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  1.1414 +  %}
  1.1415 +  ins_pipe(pipe_slow);
  1.1416 +%}
  1.1417 +
  1.1418 +instruct mulD_reg_imm(regD dst, regD src, immD con) %{
  1.1419 +  predicate(UseAVX > 0);
  1.1420 +  match(Set dst (MulD src con));
  1.1421 +
  1.1422 +  format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  1.1423 +  ins_cost(150);
  1.1424 +  ins_encode %{
  1.1425 +    __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  1.1426 +  %}
  1.1427 +  ins_pipe(pipe_slow);
  1.1428 +%}
  1.1429 +
  1.1430 +instruct divF_reg(regF dst, regF src) %{
  1.1431 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1432 +  match(Set dst (DivF dst src));
  1.1433 +
  1.1434 +  format %{ "divss   $dst, $src" %}
  1.1435 +  ins_cost(150);
  1.1436 +  ins_encode %{
  1.1437 +    __ divss($dst$$XMMRegister, $src$$XMMRegister);
  1.1438 +  %}
  1.1439 +  ins_pipe(pipe_slow);
  1.1440 +%}
  1.1441 +
  1.1442 +instruct divF_mem(regF dst, memory src) %{
  1.1443 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1444 +  match(Set dst (DivF dst (LoadF src)));
  1.1445 +
  1.1446 +  format %{ "divss   $dst, $src" %}
  1.1447 +  ins_cost(150);
  1.1448 +  ins_encode %{
  1.1449 +    __ divss($dst$$XMMRegister, $src$$Address);
  1.1450 +  %}
  1.1451 +  ins_pipe(pipe_slow);
  1.1452 +%}
  1.1453 +
  1.1454 +instruct divF_imm(regF dst, immF con) %{
  1.1455 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1456 +  match(Set dst (DivF dst con));
  1.1457 +  format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  1.1458 +  ins_cost(150);
  1.1459 +  ins_encode %{
  1.1460 +    __ divss($dst$$XMMRegister, $constantaddress($con));
  1.1461 +  %}
  1.1462 +  ins_pipe(pipe_slow);
  1.1463 +%}
  1.1464 +
  1.1465 +instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
  1.1466 +  predicate(UseAVX > 0);
  1.1467 +  match(Set dst (DivF src1 src2));
  1.1468 +
  1.1469 +  format %{ "vdivss  $dst, $src1, $src2" %}
  1.1470 +  ins_cost(150);
  1.1471 +  ins_encode %{
  1.1472 +    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  1.1473 +  %}
  1.1474 +  ins_pipe(pipe_slow);
  1.1475 +%}
  1.1476 +
  1.1477 +instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
  1.1478 +  predicate(UseAVX > 0);
  1.1479 +  match(Set dst (DivF src1 (LoadF src2)));
  1.1480 +
  1.1481 +  format %{ "vdivss  $dst, $src1, $src2" %}
  1.1482 +  ins_cost(150);
  1.1483 +  ins_encode %{
  1.1484 +    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  1.1485 +  %}
  1.1486 +  ins_pipe(pipe_slow);
  1.1487 +%}
  1.1488 +
  1.1489 +instruct divF_reg_imm(regF dst, regF src, immF con) %{
  1.1490 +  predicate(UseAVX > 0);
  1.1491 +  match(Set dst (DivF src con));
  1.1492 +
  1.1493 +  format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  1.1494 +  ins_cost(150);
  1.1495 +  ins_encode %{
  1.1496 +    __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  1.1497 +  %}
  1.1498 +  ins_pipe(pipe_slow);
  1.1499 +%}
  1.1500 +
  1.1501 +instruct divD_reg(regD dst, regD src) %{
  1.1502 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1503 +  match(Set dst (DivD dst src));
  1.1504 +
  1.1505 +  format %{ "divsd   $dst, $src" %}
  1.1506 +  ins_cost(150);
  1.1507 +  ins_encode %{
  1.1508 +    __ divsd($dst$$XMMRegister, $src$$XMMRegister);
  1.1509 +  %}
  1.1510 +  ins_pipe(pipe_slow);
  1.1511 +%}
  1.1512 +
  1.1513 +instruct divD_mem(regD dst, memory src) %{
  1.1514 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1515 +  match(Set dst (DivD dst (LoadD src)));
  1.1516 +
  1.1517 +  format %{ "divsd   $dst, $src" %}
  1.1518 +  ins_cost(150);
  1.1519 +  ins_encode %{
  1.1520 +    __ divsd($dst$$XMMRegister, $src$$Address);
  1.1521 +  %}
  1.1522 +  ins_pipe(pipe_slow);
  1.1523 +%}
  1.1524 +
  1.1525 +instruct divD_imm(regD dst, immD con) %{
  1.1526 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1527 +  match(Set dst (DivD dst con));
  1.1528 +  format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  1.1529 +  ins_cost(150);
  1.1530 +  ins_encode %{
  1.1531 +    __ divsd($dst$$XMMRegister, $constantaddress($con));
  1.1532 +  %}
  1.1533 +  ins_pipe(pipe_slow);
  1.1534 +%}
  1.1535 +
  1.1536 +instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
  1.1537 +  predicate(UseAVX > 0);
  1.1538 +  match(Set dst (DivD src1 src2));
  1.1539 +
  1.1540 +  format %{ "vdivsd  $dst, $src1, $src2" %}
  1.1541 +  ins_cost(150);
  1.1542 +  ins_encode %{
  1.1543 +    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  1.1544 +  %}
  1.1545 +  ins_pipe(pipe_slow);
  1.1546 +%}
  1.1547 +
  1.1548 +instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
  1.1549 +  predicate(UseAVX > 0);
  1.1550 +  match(Set dst (DivD src1 (LoadD src2)));
  1.1551 +
  1.1552 +  format %{ "vdivsd  $dst, $src1, $src2" %}
  1.1553 +  ins_cost(150);
  1.1554 +  ins_encode %{
  1.1555 +    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  1.1556 +  %}
  1.1557 +  ins_pipe(pipe_slow);
  1.1558 +%}
  1.1559 +
  1.1560 +instruct divD_reg_imm(regD dst, regD src, immD con) %{
  1.1561 +  predicate(UseAVX > 0);
  1.1562 +  match(Set dst (DivD src con));
  1.1563 +
  1.1564 +  format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  1.1565 +  ins_cost(150);
  1.1566 +  ins_encode %{
  1.1567 +    __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  1.1568 +  %}
  1.1569 +  ins_pipe(pipe_slow);
  1.1570 +%}
  1.1571 +
  1.1572 +instruct absF_reg(regF dst) %{
  1.1573 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1574 +  match(Set dst (AbsF dst));
  1.1575 +  ins_cost(150);
  1.1576 +  format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
  1.1577 +  ins_encode %{
  1.1578 +    __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
  1.1579 +  %}
  1.1580 +  ins_pipe(pipe_slow);
  1.1581 +%}
  1.1582 +
  1.1583 +instruct absF_reg_reg(regF dst, regF src) %{
  1.1584 +  predicate(UseAVX > 0);
  1.1585 +  match(Set dst (AbsF src));
  1.1586 +  ins_cost(150);
  1.1587 +  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
  1.1588 +  ins_encode %{
  1.1589 +    bool vector256 = false;
  1.1590 +    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
  1.1591 +              ExternalAddress(float_signmask()), vector256);
  1.1592 +  %}
  1.1593 +  ins_pipe(pipe_slow);
  1.1594 +%}
  1.1595 +
  1.1596 +instruct absD_reg(regD dst) %{
  1.1597 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1598 +  match(Set dst (AbsD dst));
  1.1599 +  ins_cost(150);
  1.1600 +  format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
  1.1601 +            "# abs double by sign masking" %}
  1.1602 +  ins_encode %{
  1.1603 +    __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
  1.1604 +  %}
  1.1605 +  ins_pipe(pipe_slow);
  1.1606 +%}
  1.1607 +
  1.1608 +instruct absD_reg_reg(regD dst, regD src) %{
  1.1609 +  predicate(UseAVX > 0);
  1.1610 +  match(Set dst (AbsD src));
  1.1611 +  ins_cost(150);
  1.1612 +  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
  1.1613 +            "# abs double by sign masking" %}
  1.1614 +  ins_encode %{
  1.1615 +    bool vector256 = false;
  1.1616 +    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
  1.1617 +              ExternalAddress(double_signmask()), vector256);
  1.1618 +  %}
  1.1619 +  ins_pipe(pipe_slow);
  1.1620 +%}
  1.1621 +
  1.1622 +instruct negF_reg(regF dst) %{
  1.1623 +  predicate((UseSSE>=1) && (UseAVX == 0));
  1.1624 +  match(Set dst (NegF dst));
  1.1625 +  ins_cost(150);
  1.1626 +  format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
  1.1627 +  ins_encode %{
  1.1628 +    __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
  1.1629 +  %}
  1.1630 +  ins_pipe(pipe_slow);
  1.1631 +%}
  1.1632 +
  1.1633 +instruct negF_reg_reg(regF dst, regF src) %{
  1.1634 +  predicate(UseAVX > 0);
  1.1635 +  match(Set dst (NegF src));
  1.1636 +  ins_cost(150);
  1.1637 +  format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
  1.1638 +  ins_encode %{
  1.1639 +    bool vector256 = false;
  1.1640 +    __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
  1.1641 +              ExternalAddress(float_signflip()), vector256);
  1.1642 +  %}
  1.1643 +  ins_pipe(pipe_slow);
  1.1644 +%}
  1.1645 +
  1.1646 +instruct negD_reg(regD dst) %{
  1.1647 +  predicate((UseSSE>=2) && (UseAVX == 0));
  1.1648 +  match(Set dst (NegD dst));
  1.1649 +  ins_cost(150);
  1.1650 +  format %{ "xorpd   $dst, [0x8000000000000000]\t"
  1.1651 +            "# neg double by sign flipping" %}
  1.1652 +  ins_encode %{
  1.1653 +    __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
  1.1654 +  %}
  1.1655 +  ins_pipe(pipe_slow);
  1.1656 +%}
  1.1657 +
  1.1658 +instruct negD_reg_reg(regD dst, regD src) %{
  1.1659 +  predicate(UseAVX > 0);
  1.1660 +  match(Set dst (NegD src));
  1.1661 +  ins_cost(150);
  1.1662 +  format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
  1.1663 +            "# neg double by sign flipping" %}
  1.1664 +  ins_encode %{
  1.1665 +    bool vector256 = false;
  1.1666 +    __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
  1.1667 +              ExternalAddress(double_signflip()), vector256);
  1.1668 +  %}
  1.1669 +  ins_pipe(pipe_slow);
  1.1670 +%}
  1.1671 +
  1.1672 +instruct sqrtF_reg(regF dst, regF src) %{
  1.1673 +  predicate(UseSSE>=1);
  1.1674 +  match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
  1.1675 +
  1.1676 +  format %{ "sqrtss  $dst, $src" %}
  1.1677 +  ins_cost(150);
  1.1678 +  ins_encode %{
  1.1679 +    __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
  1.1680 +  %}
  1.1681 +  ins_pipe(pipe_slow);
  1.1682 +%}
  1.1683 +
  1.1684 +instruct sqrtF_mem(regF dst, memory src) %{
  1.1685 +  predicate(UseSSE>=1);
  1.1686 +  match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
  1.1687 +
  1.1688 +  format %{ "sqrtss  $dst, $src" %}
  1.1689 +  ins_cost(150);
  1.1690 +  ins_encode %{
  1.1691 +    __ sqrtss($dst$$XMMRegister, $src$$Address);
  1.1692 +  %}
  1.1693 +  ins_pipe(pipe_slow);
  1.1694 +%}
  1.1695 +
  1.1696 +instruct sqrtF_imm(regF dst, immF con) %{
  1.1697 +  predicate(UseSSE>=1);
  1.1698 +  match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
  1.1699 +  format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  1.1700 +  ins_cost(150);
  1.1701 +  ins_encode %{
  1.1702 +    __ sqrtss($dst$$XMMRegister, $constantaddress($con));
  1.1703 +  %}
  1.1704 +  ins_pipe(pipe_slow);
  1.1705 +%}
  1.1706 +
  1.1707 +instruct sqrtD_reg(regD dst, regD src) %{
  1.1708 +  predicate(UseSSE>=2);
  1.1709 +  match(Set dst (SqrtD src));
  1.1710 +
  1.1711 +  format %{ "sqrtsd  $dst, $src" %}
  1.1712 +  ins_cost(150);
  1.1713 +  ins_encode %{
  1.1714 +    __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
  1.1715 +  %}
  1.1716 +  ins_pipe(pipe_slow);
  1.1717 +%}
  1.1718 +
  1.1719 +instruct sqrtD_mem(regD dst, memory src) %{
  1.1720 +  predicate(UseSSE>=2);
  1.1721 +  match(Set dst (SqrtD (LoadD src)));
  1.1722 +
  1.1723 +  format %{ "sqrtsd  $dst, $src" %}
  1.1724 +  ins_cost(150);
  1.1725 +  ins_encode %{
  1.1726 +    __ sqrtsd($dst$$XMMRegister, $src$$Address);
  1.1727 +  %}
  1.1728 +  ins_pipe(pipe_slow);
  1.1729 +%}
  1.1730 +
  1.1731 +instruct sqrtD_imm(regD dst, immD con) %{
  1.1732 +  predicate(UseSSE>=2);
  1.1733 +  match(Set dst (SqrtD con));
  1.1734 +  format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  1.1735 +  ins_cost(150);
  1.1736 +  ins_encode %{
  1.1737 +    __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
  1.1738 +  %}
  1.1739 +  ins_pipe(pipe_slow);
  1.1740 +%}
  1.1741 +
  1.1742 +
  1.1743 +// ====================VECTOR INSTRUCTIONS=====================================
  1.1744 +
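// Note: the vector operand classes used below map to vector widths as follows:
// vecS = 4 bytes, vecD = 8 bytes, vecX = 16 bytes (XMM), vecY = 32 bytes (YMM).
// Rules are selected by predicates on the vector size (memory_size()/length())
// and, where relevant, on the UseAVX level.
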
  1.1745 +// Load vectors (4 bytes long)
  1.1746 +instruct loadV4(vecS dst, memory mem) %{
  1.1747 +  predicate(n->as_LoadVector()->memory_size() == 4);
  1.1748 +  match(Set dst (LoadVector mem));
  1.1749 +  ins_cost(125);
  1.1750 +  format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
  1.1751 +  ins_encode %{
  1.1752 +    __ movdl($dst$$XMMRegister, $mem$$Address);
  1.1753 +  %}
  1.1754 +  ins_pipe( pipe_slow );
  1.1755 +%}
  1.1756 +
  1.1757 +// Load vectors (8 bytes long)
  1.1758 +instruct loadV8(vecD dst, memory mem) %{
  1.1759 +  predicate(n->as_LoadVector()->memory_size() == 8);
  1.1760 +  match(Set dst (LoadVector mem));
  1.1761 +  ins_cost(125);
  1.1762 +  format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
  1.1763 +  ins_encode %{
  1.1764 +    __ movq($dst$$XMMRegister, $mem$$Address);
  1.1765 +  %}
  1.1766 +  ins_pipe( pipe_slow );
  1.1767 +%}
  1.1768 +
  1.1769 +// Load vectors (16 bytes long)
  1.1770 +instruct loadV16(vecX dst, memory mem) %{
  1.1771 +  predicate(n->as_LoadVector()->memory_size() == 16);
  1.1772 +  match(Set dst (LoadVector mem));
  1.1773 +  ins_cost(125);
  1.1774 +  format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
  1.1775 +  ins_encode %{
  1.1776 +    __ movdqu($dst$$XMMRegister, $mem$$Address);
  1.1777 +  %}
  1.1778 +  ins_pipe( pipe_slow );
  1.1779 +%}
  1.1780 +
  1.1781 +// Load vectors (32 bytes long)
  1.1782 +instruct loadV32(vecY dst, memory mem) %{
  1.1783 +  predicate(n->as_LoadVector()->memory_size() == 32);
  1.1784 +  match(Set dst (LoadVector mem));
  1.1785 +  ins_cost(125);
  1.1786 +  format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
  1.1787 +  ins_encode %{
  1.1788 +    __ vmovdqu($dst$$XMMRegister, $mem$$Address);
  1.1789 +  %}
  1.1790 +  ins_pipe( pipe_slow );
  1.1791 +%}
  1.1792 +
  1.1793 +// Store vectors
  1.1794 +instruct storeV4(memory mem, vecS src) %{
  1.1795 +  predicate(n->as_StoreVector()->memory_size() == 4);
  1.1796 +  match(Set mem (StoreVector mem src));
  1.1797 +  ins_cost(145);
  1.1798 +  format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
  1.1799 +  ins_encode %{
  1.1800 +    __ movdl($mem$$Address, $src$$XMMRegister);
  1.1801 +  %}
  1.1802 +  ins_pipe( pipe_slow );
  1.1803 +%}
  1.1804 +
  1.1805 +instruct storeV8(memory mem, vecD src) %{
  1.1806 +  predicate(n->as_StoreVector()->memory_size() == 8);
  1.1807 +  match(Set mem (StoreVector mem src));
  1.1808 +  ins_cost(145);
  1.1809 +  format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
  1.1810 +  ins_encode %{
  1.1811 +    __ movq($mem$$Address, $src$$XMMRegister);
  1.1812 +  %}
  1.1813 +  ins_pipe( pipe_slow );
  1.1814 +%}
  1.1815 +
  1.1816 +instruct storeV16(memory mem, vecX src) %{
  1.1817 +  predicate(n->as_StoreVector()->memory_size() == 16);
  1.1818 +  match(Set mem (StoreVector mem src));
  1.1819 +  ins_cost(145);
  1.1820 +  format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
  1.1821 +  ins_encode %{
  1.1822 +    __ movdqu($mem$$Address, $src$$XMMRegister);
  1.1823 +  %}
  1.1824 +  ins_pipe( pipe_slow );
  1.1825 +%}
  1.1826 +
  1.1827 +instruct storeV32(memory mem, vecY src) %{
  1.1828 +  predicate(n->as_StoreVector()->memory_size() == 32);
  1.1829 +  match(Set mem (StoreVector mem src));
  1.1830 +  ins_cost(145);
  1.1831 +  format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
  1.1832 +  ins_encode %{
  1.1833 +    __ vmovdqu($mem$$Address, $src$$XMMRegister);
  1.1834 +  %}
  1.1835 +  ins_pipe( pipe_slow );
  1.1836 +%}
  1.1837 +
  1.1838 +// Replicate byte scalar to be vector
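// The broadcast below is built up in stages: movdl moves the GP register into the
// low dword of the XMM register, punpcklbw duplicates the byte into a word, pshuflw
// spreads that word across the low 64 bits, punpcklqdq copies the low quadword into
// the high quadword, and (for 256-bit forms) vinserti128h duplicates the low 128-bit
// lane into the upper lane.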
  1.1839 +instruct Repl4B(vecS dst, rRegI src) %{
  1.1840 +  predicate(n->as_Vector()->length() == 4);
  1.1841 +  match(Set dst (ReplicateB src));
  1.1842 +  format %{ "movd    $dst,$src\n\t"
  1.1843 +            "punpcklbw $dst,$dst\n\t"
  1.1844 +            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
  1.1845 +  ins_encode %{
  1.1846 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1847 +    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
  1.1848 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1849 +  %}
  1.1850 +  ins_pipe( pipe_slow );
  1.1851 +%}
  1.1852 +
  1.1853 +instruct Repl8B(vecD dst, rRegI src) %{
  1.1854 +  predicate(n->as_Vector()->length() == 8);
  1.1855 +  match(Set dst (ReplicateB src));
  1.1856 +  format %{ "movd    $dst,$src\n\t"
  1.1857 +            "punpcklbw $dst,$dst\n\t"
  1.1858 +            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
  1.1859 +  ins_encode %{
  1.1860 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1861 +    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
  1.1862 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1863 +  %}
  1.1864 +  ins_pipe( pipe_slow );
  1.1865 +%}
  1.1866 +
  1.1867 +instruct Repl16B(vecX dst, rRegI src) %{
  1.1868 +  predicate(n->as_Vector()->length() == 16);
  1.1869 +  match(Set dst (ReplicateB src));
  1.1870 +  format %{ "movd    $dst,$src\n\t"
  1.1871 +            "punpcklbw $dst,$dst\n\t"
  1.1872 +            "pshuflw $dst,$dst,0x00\n\t"
  1.1873 +            "punpcklqdq $dst,$dst\t! replicate16B" %}
  1.1874 +  ins_encode %{
  1.1875 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1876 +    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
  1.1877 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1878 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.1879 +  %}
  1.1880 +  ins_pipe( pipe_slow );
  1.1881 +%}
  1.1882 +
  1.1883 +instruct Repl32B(vecY dst, rRegI src) %{
  1.1884 +  predicate(n->as_Vector()->length() == 32);
  1.1885 +  match(Set dst (ReplicateB src));
  1.1886 +  format %{ "movd    $dst,$src\n\t"
  1.1887 +            "punpcklbw $dst,$dst\n\t"
  1.1888 +            "pshuflw $dst,$dst,0x00\n\t"
  1.1889 +            "punpcklqdq $dst,$dst\n\t"
  1.1890 +            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
  1.1891 +  ins_encode %{
  1.1892 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1893 +    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
  1.1894 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.1895 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.1896 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1897 +  %}
  1.1898 +  ins_pipe( pipe_slow );
  1.1899 +%}
  1.1900 +
  1.1901 +// Replicate byte scalar immediate to be vector by loading from const table.
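// As used here, replicate4_imm/replicate8_imm appear to fold the immediate into a
// 4-byte or 8-byte constant of the given element width, so the broadcast reduces to a
// single constant-table load (movdl/movq) followed by the usual quadword/lane copies.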
  1.1902 +instruct Repl4B_imm(vecS dst, immI con) %{
  1.1903 +  predicate(n->as_Vector()->length() == 4);
  1.1904 +  match(Set dst (ReplicateB con));
  1.1905 +  format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
  1.1906 +  ins_encode %{
  1.1907 +    __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
  1.1908 +  %}
  1.1909 +  ins_pipe( pipe_slow );
  1.1910 +%}
  1.1911 +
  1.1912 +instruct Repl8B_imm(vecD dst, immI con) %{
  1.1913 +  predicate(n->as_Vector()->length() == 8);
  1.1914 +  match(Set dst (ReplicateB con));
  1.1915 +  format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
  1.1916 +  ins_encode %{
  1.1917 +    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
  1.1918 +  %}
  1.1919 +  ins_pipe( pipe_slow );
  1.1920 +%}
  1.1921 +
  1.1922 +instruct Repl16B_imm(vecX dst, immI con) %{
  1.1923 +  predicate(n->as_Vector()->length() == 16);
  1.1924 +  match(Set dst (ReplicateB con));
  1.1925 +  format %{ "movq    $dst,[$constantaddress]\n\t"
  1.1926 +            "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
  1.1927 +  ins_encode %{
  1.1928 +    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
  1.1929 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.1930 +  %}
  1.1931 +  ins_pipe( pipe_slow );
  1.1932 +%}
  1.1933 +
  1.1934 +instruct Repl32B_imm(vecY dst, immI con) %{
  1.1935 +  predicate(n->as_Vector()->length() == 32);
  1.1936 +  match(Set dst (ReplicateB con));
  1.1937 +  format %{ "movq    $dst,[$constantaddress]\n\t"
  1.1938 +            "punpcklqdq $dst,$dst\n\t"
  1.1939 +            "vinserti128h $dst,$dst,$dst\t! replicate32B($con)" %}
  1.1940 +  ins_encode %{
  1.1941 +    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
  1.1942 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.1943 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.1944 +  %}
  1.1945 +  ins_pipe( pipe_slow );
  1.1946 +%}
  1.1947 +
  1.1948 +// Replicate byte scalar zero to be vector
  1.1949 +instruct Repl4B_zero(vecS dst, immI0 zero) %{
  1.1950 +  predicate(n->as_Vector()->length() == 4);
  1.1951 +  match(Set dst (ReplicateB zero));
  1.1952 +  format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
  1.1953 +  ins_encode %{
  1.1954 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.1955 +  %}
  1.1956 +  ins_pipe( fpu_reg_reg );
  1.1957 +%}
  1.1958 +
  1.1959 +instruct Repl8B_zero(vecD dst, immI0 zero) %{
  1.1960 +  predicate(n->as_Vector()->length() == 8);
  1.1961 +  match(Set dst (ReplicateB zero));
  1.1962 +  format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
  1.1963 +  ins_encode %{
  1.1964 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.1965 +  %}
  1.1966 +  ins_pipe( fpu_reg_reg );
  1.1967 +%}
  1.1968 +
  1.1969 +instruct Repl16B_zero(vecX dst, immI0 zero) %{
  1.1970 +  predicate(n->as_Vector()->length() == 16);
  1.1971 +  match(Set dst (ReplicateB zero));
  1.1972 +  format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
  1.1973 +  ins_encode %{
  1.1974 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.1975 +  %}
  1.1976 +  ins_pipe( fpu_reg_reg );
  1.1977 +%}
  1.1978 +
  1.1979 +instruct Repl32B_zero(vecY dst, immI0 zero) %{
  1.1980 +  predicate(n->as_Vector()->length() == 32);
  1.1981 +  match(Set dst (ReplicateB zero));
  1.1982 +  format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
  1.1983 +  ins_encode %{
  1.1984 +    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
  1.1985 +    bool vector256 = true;
  1.1986 +    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.1987 +  %}
  1.1988 +  ins_pipe( fpu_reg_reg );
  1.1989 +%}
  1.1990 +
  1.1991 +// Replicate char/short (2 byte) scalar to be vector
  1.1992 +instruct Repl2S(vecS dst, rRegI src) %{
  1.1993 +  predicate(n->as_Vector()->length() == 2);
  1.1994 +  match(Set dst (ReplicateS src));
  1.1995 +  format %{ "movd    $dst,$src\n\t"
  1.1996 +            "pshuflw $dst,$dst,0x00\t! replicate2S" %}
  1.1997 +  ins_encode %{
  1.1998 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.1999 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.2000 +  %}
  1.2001 +  ins_pipe( fpu_reg_reg );
  1.2002 +%}
  1.2003 +
  1.2004 +instruct Repl4S(vecD dst, rRegI src) %{
  1.2005 +  predicate(n->as_Vector()->length() == 4);
  1.2006 +  match(Set dst (ReplicateS src));
  1.2007 +  format %{ "movd    $dst,$src\n\t"
  1.2008 +            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
  1.2009 +  ins_encode %{
  1.2010 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.2011 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.2012 +  %}
  1.2013 +  ins_pipe( fpu_reg_reg );
  1.2014 +%}
  1.2015 +
  1.2016 +instruct Repl8S(vecX dst, rRegI src) %{
  1.2017 +  predicate(n->as_Vector()->length() == 8);
  1.2018 +  match(Set dst (ReplicateS src));
  1.2019 +  format %{ "movd    $dst,$src\n\t"
  1.2020 +            "pshuflw $dst,$dst,0x00\n\t"
  1.2021 +            "punpcklqdq $dst,$dst\t! replicate8S" %}
  1.2022 +  ins_encode %{
  1.2023 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.2024 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.2025 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2026 +  %}
  1.2027 +  ins_pipe( pipe_slow );
  1.2028 +%}
  1.2029 +
  1.2030 +instruct Repl16S(vecY dst, rRegI src) %{
  1.2031 +  predicate(n->as_Vector()->length() == 16);
  1.2032 +  match(Set dst (ReplicateS src));
  1.2033 +  format %{ "movd    $dst,$src\n\t"
  1.2034 +            "pshuflw $dst,$dst,0x00\n\t"
  1.2035 +            "punpcklqdq $dst,$dst\n\t"
  1.2036 +            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
  1.2037 +  ins_encode %{
  1.2038 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.2039 +    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.2040 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2041 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2042 +  %}
  1.2043 +  ins_pipe( pipe_slow );
  1.2044 +%}
  1.2045 +
  1.2046 +// Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
  1.2047 +instruct Repl2S_imm(vecS dst, immI con) %{
  1.2048 +  predicate(n->as_Vector()->length() == 2);
  1.2049 +  match(Set dst (ReplicateS con));
  1.2050 +  format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
  1.2051 +  ins_encode %{
  1.2052 +    __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
  1.2053 +  %}
  1.2054 +  ins_pipe( fpu_reg_reg );
  1.2055 +%}
  1.2056 +
  1.2057 +instruct Repl4S_imm(vecD dst, immI con) %{
  1.2058 +  predicate(n->as_Vector()->length() == 4);
  1.2059 +  match(Set dst (ReplicateS con));
  1.2060 +  format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
  1.2061 +  ins_encode %{
  1.2062 +    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
  1.2063 +  %}
  1.2064 +  ins_pipe( fpu_reg_reg );
  1.2065 +%}
  1.2066 +
  1.2067 +instruct Repl8S_imm(vecX dst, immI con) %{
  1.2068 +  predicate(n->as_Vector()->length() == 8);
  1.2069 +  match(Set dst (ReplicateS con));
  1.2070 +  format %{ "movq    $dst,[$constantaddress]\n\t"
  1.2071 +            "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
  1.2072 +  ins_encode %{
  1.2073 +    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
  1.2074 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2075 +  %}
  1.2076 +  ins_pipe( pipe_slow );
  1.2077 +%}
  1.2078 +
  1.2079 +instruct Repl16S_imm(vecY dst, immI con) %{
  1.2080 +  predicate(n->as_Vector()->length() == 16);
  1.2081 +  match(Set dst (ReplicateS con));
  1.2082 +  format %{ "movq    $dst,[$constantaddress]\n\t"
  1.2083 +            "punpcklqdq $dst,$dst\n\t"
  1.2084 +            "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
  1.2085 +  ins_encode %{
  1.2086 +    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
  1.2087 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2088 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2089 +  %}
  1.2090 +  ins_pipe( pipe_slow );
  1.2091 +%}
  1.2092 +
  1.2093 +// Replicate char/short (2 byte) scalar zero to be vector
  1.2094 +instruct Repl2S_zero(vecS dst, immI0 zero) %{
  1.2095 +  predicate(n->as_Vector()->length() == 2);
  1.2096 +  match(Set dst (ReplicateS zero));
  1.2097 +  format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
  1.2098 +  ins_encode %{
  1.2099 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.2100 +  %}
  1.2101 +  ins_pipe( fpu_reg_reg );
  1.2102 +%}
  1.2103 +
  1.2104 +instruct Repl4S_zero(vecD dst, immI0 zero) %{
  1.2105 +  predicate(n->as_Vector()->length() == 4);
  1.2106 +  match(Set dst (ReplicateS zero));
  1.2107 +  format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
  1.2108 +  ins_encode %{
  1.2109 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.2110 +  %}
  1.2111 +  ins_pipe( fpu_reg_reg );
  1.2112 +%}
  1.2113 +
  1.2114 +instruct Repl8S_zero(vecX dst, immI0 zero) %{
  1.2115 +  predicate(n->as_Vector()->length() == 8);
  1.2116 +  match(Set dst (ReplicateS zero));
  1.2117 +  format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
  1.2118 +  ins_encode %{
  1.2119 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.2120 +  %}
  1.2121 +  ins_pipe( fpu_reg_reg );
  1.2122 +%}
  1.2123 +
  1.2124 +instruct Repl16S_zero(vecY dst, immI0 zero) %{
  1.2125 +  predicate(n->as_Vector()->length() == 16);
  1.2126 +  match(Set dst (ReplicateS zero));
  1.2127 +  format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
  1.2128 +  ins_encode %{
  1.2129 +    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
  1.2130 +    bool vector256 = true;
  1.2131 +    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.2132 +  %}
  1.2133 +  ins_pipe( fpu_reg_reg );
  1.2134 +%}
  1.2135 +
  1.2136 +// Replicate integer (4 byte) scalar to be vector
  1.2137 +instruct Repl2I(vecD dst, rRegI src) %{
  1.2138 +  predicate(n->as_Vector()->length() == 2);
  1.2139 +  match(Set dst (ReplicateI src));
  1.2140 +  format %{ "movd    $dst,$src\n\t"
  1.2141 +            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
  1.2142 +  ins_encode %{
  1.2143 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.2144 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.2145 +  %}
  1.2146 +  ins_pipe( fpu_reg_reg );
  1.2147 +%}
  1.2148 +
  1.2149 +instruct Repl4I(vecX dst, rRegI src) %{
  1.2150 +  predicate(n->as_Vector()->length() == 4);
  1.2151 +  match(Set dst (ReplicateI src));
  1.2152 +  format %{ "movd    $dst,$src\n\t"
  1.2153 +            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
  1.2154 +  ins_encode %{
  1.2155 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.2156 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.2157 +  %}
  1.2158 +  ins_pipe( pipe_slow );
  1.2159 +%}
  1.2160 +
  1.2161 +instruct Repl8I(vecY dst, rRegI src) %{
  1.2162 +  predicate(n->as_Vector()->length() == 8);
  1.2163 +  match(Set dst (ReplicateI src));
  1.2164 +  format %{ "movd    $dst,$src\n\t"
  1.2165 +            "pshufd  $dst,$dst,0x00\n\t"
  1.2166 +            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
  1.2167 +  ins_encode %{
  1.2168 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.2169 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.2170 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2171 +  %}
  1.2172 +  ins_pipe( pipe_slow );
  1.2173 +%}
  1.2174 +
  1.2175 +// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
  1.2176 +instruct Repl2I_imm(vecD dst, immI con) %{
  1.2177 +  predicate(n->as_Vector()->length() == 2);
  1.2178 +  match(Set dst (ReplicateI con));
  1.2179 +  format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
  1.2180 +  ins_encode %{
  1.2181 +    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
  1.2182 +  %}
  1.2183 +  ins_pipe( fpu_reg_reg );
  1.2184 +%}
  1.2185 +
  1.2186 +instruct Repl4I_imm(vecX dst, immI con) %{
  1.2187 +  predicate(n->as_Vector()->length() == 4);
  1.2188 +  match(Set dst (ReplicateI con));
  1.2189 +  format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
  1.2190 +            "punpcklqdq $dst,$dst" %}
  1.2191 +  ins_encode %{
  1.2192 +    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
  1.2193 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2194 +  %}
  1.2195 +  ins_pipe( pipe_slow );
  1.2196 +%}
  1.2197 +
  1.2198 +instruct Repl8I_imm(vecY dst, immI con) %{
  1.2199 +  predicate(n->as_Vector()->length() == 8);
  1.2200 +  match(Set dst (ReplicateI con));
  1.2201 +  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
  1.2202 +            "punpcklqdq $dst,$dst\n\t"
  1.2203 +            "vinserti128h $dst,$dst,$dst" %}
  1.2204 +  ins_encode %{
  1.2205 +    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
  1.2206 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2207 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2208 +  %}
  1.2209 +  ins_pipe( pipe_slow );
  1.2210 +%}
  1.2211 +
  1.2212 +// An integer can be loaded into an xmm register directly from memory.
  1.2213 +instruct Repl2I_mem(vecD dst, memory mem) %{
  1.2214 +  predicate(n->as_Vector()->length() == 2);
  1.2215 +  match(Set dst (ReplicateI (LoadI mem)));
  1.2216 +  format %{ "movd    $dst,$mem\n\t"
  1.2217 +            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
  1.2218 +  ins_encode %{
  1.2219 +    __ movdl($dst$$XMMRegister, $mem$$Address);
  1.2220 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.2221 +  %}
  1.2222 +  ins_pipe( fpu_reg_reg );
  1.2223 +%}
  1.2224 +
  1.2225 +instruct Repl4I_mem(vecX dst, memory mem) %{
  1.2226 +  predicate(n->as_Vector()->length() == 4);
  1.2227 +  match(Set dst (ReplicateI (LoadI mem)));
  1.2228 +  format %{ "movd    $dst,$mem\n\t"
  1.2229 +            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
  1.2230 +  ins_encode %{
  1.2231 +    __ movdl($dst$$XMMRegister, $mem$$Address);
  1.2232 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.2233 +  %}
  1.2234 +  ins_pipe( pipe_slow );
  1.2235 +%}
  1.2236 +
  1.2237 +instruct Repl8I_mem(vecY dst, memory mem) %{
  1.2238 +  predicate(n->as_Vector()->length() == 8);
  1.2239 +  match(Set dst (ReplicateI (LoadI mem)));
  1.2240 +  format %{ "movd    $dst,$mem\n\t"
  1.2241 +            "pshufd  $dst,$dst,0x00\n\t"
  1.2242 +            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
  1.2243 +  ins_encode %{
  1.2244 +    __ movdl($dst$$XMMRegister, $mem$$Address);
  1.2245 +    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  1.2246 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2247 +  %}
  1.2248 +  ins_pipe( pipe_slow );
  1.2249 +%}
  1.2250 +
  1.2251 +// Replicate integer (4 byte) scalar zero to be vector
  1.2252 +instruct Repl2I_zero(vecD dst, immI0 zero) %{
  1.2253 +  predicate(n->as_Vector()->length() == 2);
  1.2254 +  match(Set dst (ReplicateI zero));
  1.2255 +  format %{ "pxor    $dst,$dst\t! replicate2I" %}
  1.2256 +  ins_encode %{
  1.2257 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.2258 +  %}
  1.2259 +  ins_pipe( fpu_reg_reg );
  1.2260 +%}
  1.2261 +
  1.2262 +instruct Repl4I_zero(vecX dst, immI0 zero) %{
  1.2263 +  predicate(n->as_Vector()->length() == 4);
  1.2264 +  match(Set dst (ReplicateI zero));
  1.2265 +  format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
  1.2266 +  ins_encode %{
  1.2267 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.2268 +  %}
  1.2269 +  ins_pipe( fpu_reg_reg );
  1.2270 +%}
  1.2271 +
  1.2272 +instruct Repl8I_zero(vecY dst, immI0 zero) %{
  1.2273 +  predicate(n->as_Vector()->length() == 8);
  1.2274 +  match(Set dst (ReplicateI zero));
  1.2275 +  format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
  1.2276 +  ins_encode %{
  1.2277 +    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
  1.2278 +    bool vector256 = true;
  1.2279 +    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.2280 +  %}
  1.2281 +  ins_pipe( fpu_reg_reg );
  1.2282 +%}
  1.2283 +
  1.2284 +// Replicate long (8 byte) scalar to be vector
  1.2285 +#ifdef _LP64
  1.2286 +instruct Repl2L(vecX dst, rRegL src) %{
  1.2287 +  predicate(n->as_Vector()->length() == 2);
  1.2288 +  match(Set dst (ReplicateL src));
  1.2289 +  format %{ "movdq   $dst,$src\n\t"
  1.2290 +            "punpcklqdq $dst,$dst\t! replicate2L" %}
  1.2291 +  ins_encode %{
  1.2292 +    __ movdq($dst$$XMMRegister, $src$$Register);
  1.2293 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2294 +  %}
  1.2295 +  ins_pipe( pipe_slow );
  1.2296 +%}
  1.2297 +
  1.2298 +instruct Repl4L(vecY dst, rRegL src) %{
  1.2299 +  predicate(n->as_Vector()->length() == 4);
  1.2300 +  match(Set dst (ReplicateL src));
  1.2301 +  format %{ "movdq   $dst,$src\n\t"
  1.2302 +            "punpcklqdq $dst,$dst\n\t"
  1.2303 +            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
  1.2304 +  ins_encode %{
  1.2305 +    __ movdq($dst$$XMMRegister, $src$$Register);
  1.2306 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2307 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2308 +  %}
  1.2309 +  ins_pipe( pipe_slow );
  1.2310 +%}
  1.2311 +#else // _LP64
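// 32-bit variant: the long lives in a GP register pair, so the low and high halves
// are moved into XMM registers separately (HIGH_FROM_LOW selects the high half of the
// pair), recombined with punpckldq, and then the resulting quadword is replicated.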
  1.2312 +instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
  1.2313 +  predicate(n->as_Vector()->length() == 2);
  1.2314 +  match(Set dst (ReplicateL src));
  1.2315 +  effect(TEMP dst, USE src, TEMP tmp);
  1.2316 +  format %{ "movdl   $dst,$src.lo\n\t"
  1.2317 +            "movdl   $tmp,$src.hi\n\t"
  1.2318 +            "punpckldq $dst,$tmp\n\t"
  1.2319 +            "punpcklqdq $dst,$dst\t! replicate2L"%}
  1.2320 +  ins_encode %{
  1.2321 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.2322 +    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
  1.2323 +    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
  1.2324 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2325 +  %}
  1.2326 +  ins_pipe( pipe_slow );
  1.2327 +%}
  1.2328 +
  1.2329 +instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
  1.2330 +  predicate(n->as_Vector()->length() == 4);
  1.2331 +  match(Set dst (ReplicateL src));
  1.2332 +  effect(TEMP dst, USE src, TEMP tmp);
  1.2333 +  format %{ "movdl   $dst,$src.lo\n\t"
  1.2334 +            "movdl   $tmp,$src.hi\n\t"
  1.2335 +            "punpckldq $dst,$tmp\n\t"
  1.2336 +            "punpcklqdq $dst,$dst\n\t"
  1.2337 +            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
  1.2338 +  ins_encode %{
  1.2339 +    __ movdl($dst$$XMMRegister, $src$$Register);
  1.2340 +    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
  1.2341 +    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
  1.2342 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2343 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2344 +  %}
  1.2345 +  ins_pipe( pipe_slow );
  1.2346 +%}
  1.2347 +#endif // _LP64
  1.2348 +
  1.2349 +// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
  1.2350 +instruct Repl2L_imm(vecX dst, immL con) %{
  1.2351 +  predicate(n->as_Vector()->length() == 2);
  1.2352 +  match(Set dst (ReplicateL con));
  1.2353 +  format %{ "movq    $dst,[$constantaddress]\n\t"
  1.2354 +            "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
  1.2355 +  ins_encode %{
  1.2356 +    __ movq($dst$$XMMRegister, $constantaddress($con));
  1.2357 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2358 +  %}
  1.2359 +  ins_pipe( pipe_slow );
  1.2360 +%}
  1.2361 +
  1.2362 +instruct Repl4L_imm(vecY dst, immL con) %{
  1.2363 +  predicate(n->as_Vector()->length() == 4);
  1.2364 +  match(Set dst (ReplicateL con));
  1.2365 +  format %{ "movq    $dst,[$constantaddress]\n\t"
  1.2366 +            "punpcklqdq $dst,$dst\n\t"
  1.2367 +            "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
  1.2368 +  ins_encode %{
  1.2369 +    __ movq($dst$$XMMRegister, $constantaddress($con));
  1.2370 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2371 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2372 +  %}
  1.2373 +  ins_pipe( pipe_slow );
  1.2374 +%}
  1.2375 +
  1.2376 +// A long can be loaded into an xmm register directly from memory.
  1.2377 +instruct Repl2L_mem(vecX dst, memory mem) %{
  1.2378 +  predicate(n->as_Vector()->length() == 2);
  1.2379 +  match(Set dst (ReplicateL (LoadL mem)));
  1.2380 +  format %{ "movq    $dst,$mem\n\t"
  1.2381 +            "punpcklqdq $dst,$dst\t! replicate2L" %}
  1.2382 +  ins_encode %{
  1.2383 +    __ movq($dst$$XMMRegister, $mem$$Address);
  1.2384 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2385 +  %}
  1.2386 +  ins_pipe( pipe_slow );
  1.2387 +%}
  1.2388 +
  1.2389 +instruct Repl4L_mem(vecY dst, memory mem) %{
  1.2390 +  predicate(n->as_Vector()->length() == 4);
  1.2391 +  match(Set dst (ReplicateL (LoadL mem)));
  1.2392 +  format %{ "movq    $dst,$mem\n\t"
  1.2393 +            "punpcklqdq $dst,$dst\n\t"
  1.2394 +            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
  1.2395 +  ins_encode %{
  1.2396 +    __ movq($dst$$XMMRegister, $mem$$Address);
  1.2397 +    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  1.2398 +    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2399 +  %}
  1.2400 +  ins_pipe( pipe_slow );
  1.2401 +%}
  1.2402 +
  1.2403 +// Replicate long (8 byte) scalar zero to be vector
  1.2404 +instruct Repl2L_zero(vecX dst, immL0 zero) %{
  1.2405 +  predicate(n->as_Vector()->length() == 2);
  1.2406 +  match(Set dst (ReplicateL zero));
  1.2407 +  format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
  1.2408 +  ins_encode %{
  1.2409 +    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  1.2410 +  %}
  1.2411 +  ins_pipe( fpu_reg_reg );
  1.2412 +%}
  1.2413 +
  1.2414 +instruct Repl4L_zero(vecY dst, immL0 zero) %{
  1.2415 +  predicate(n->as_Vector()->length() == 4);
  1.2416 +  match(Set dst (ReplicateL zero));
  1.2417 +  format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
  1.2418 +  ins_encode %{
  1.2419 +    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
  1.2420 +    bool vector256 = true;
  1.2421 +    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.2422 +  %}
  1.2423 +  ins_pipe( fpu_reg_reg );
  1.2424 +%}
  1.2425 +
  1.2426 +// Replicate float (4 byte) scalar to be vector
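// The float scalar is already in an XMM register, so a single pshufd broadcast of the
// low dword suffices (no GP-to-XMM move is needed).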
  1.2427 +instruct Repl2F(vecD dst, regF src) %{
  1.2428 +  predicate(n->as_Vector()->length() == 2);
  1.2429 +  match(Set dst (ReplicateF src));
  1.2430 +  format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
  1.2431 +  ins_encode %{
  1.2432 +    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  1.2433 +  %}
  1.2434 +  ins_pipe( fpu_reg_reg );
  1.2435 +%}
  1.2436 +
  1.2437 +instruct Repl4F(vecX dst, regF src) %{
  1.2438 +  predicate(n->as_Vector()->length() == 4);
  1.2439 +  match(Set dst (ReplicateF src));
  1.2440 +  format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
  1.2441 +  ins_encode %{
  1.2442 +    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  1.2443 +  %}
  1.2444 +  ins_pipe( pipe_slow );
  1.2445 +%}
  1.2446 +
  1.2447 +instruct Repl8F(vecY dst, regF src) %{
  1.2448 +  predicate(n->as_Vector()->length() == 8);
  1.2449 +  match(Set dst (ReplicateF src));
  1.2450 +  format %{ "pshufd  $dst,$src,0x00\n\t"
  1.2451 +            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
  1.2452 +  ins_encode %{
  1.2453 +    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  1.2454 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2455 +  %}
  1.2456 +  ins_pipe( pipe_slow );
  1.2457 +%}
  1.2458 +
  1.2459 +// Replicate float (4 byte) scalar zero to be vector
  1.2460 +instruct Repl2F_zero(vecD dst, immF0 zero) %{
  1.2461 +  predicate(n->as_Vector()->length() == 2);
  1.2462 +  match(Set dst (ReplicateF zero));
  1.2463 +  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
  1.2464 +  ins_encode %{
  1.2465 +    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
  1.2466 +  %}
  1.2467 +  ins_pipe( fpu_reg_reg );
  1.2468 +%}
  1.2469 +
  1.2470 +instruct Repl4F_zero(vecX dst, immF0 zero) %{
  1.2471 +  predicate(n->as_Vector()->length() == 4);
  1.2472 +  match(Set dst (ReplicateF zero));
  1.2473 +  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
  1.2474 +  ins_encode %{
  1.2475 +    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
  1.2476 +  %}
  1.2477 +  ins_pipe( fpu_reg_reg );
  1.2478 +%}
  1.2479 +
  1.2480 +instruct Repl8F_zero(vecY dst, immF0 zero) %{
  1.2481 +  predicate(n->as_Vector()->length() == 8);
  1.2482 +  match(Set dst (ReplicateF zero));
  1.2483 +  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
  1.2484 +  ins_encode %{
  1.2485 +    bool vector256 = true;
  1.2486 +    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.2487 +  %}
  1.2488 +  ins_pipe( fpu_reg_reg );
  1.2489 +%}
  1.2490 +
  1.2491 +// Replicate double (8 byte) scalar to be vector
  1.2492 +instruct Repl2D(vecX dst, regD src) %{
  1.2493 +  predicate(n->as_Vector()->length() == 2);
  1.2494 +  match(Set dst (ReplicateD src));
  1.2495 +  format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
  1.2496 +  ins_encode %{
  1.2497 +    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
  1.2498 +  %}
  1.2499 +  ins_pipe( pipe_slow );
  1.2500 +%}
  1.2501 +
  1.2502 +instruct Repl4D(vecY dst, regD src) %{
  1.2503 +  predicate(n->as_Vector()->length() == 4);
  1.2504 +  match(Set dst (ReplicateD src));
  1.2505 +  format %{ "pshufd  $dst,$src,0x44\n\t"
  1.2506 +            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
  1.2507 +  ins_encode %{
  1.2508 +    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
  1.2509 +    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
  1.2510 +  %}
  1.2511 +  ins_pipe( pipe_slow );
  1.2512 +%}
  1.2513 +
  1.2514 +// Replicate double (8 byte) scalar zero to be vector
  1.2515 +instruct Repl2D_zero(vecX dst, immD0 zero) %{
  1.2516 +  predicate(n->as_Vector()->length() == 2);
  1.2517 +  match(Set dst (ReplicateD zero));
  1.2518 +  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
  1.2519 +  ins_encode %{
  1.2520 +    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
  1.2521 +  %}
  1.2522 +  ins_pipe( fpu_reg_reg );
  1.2523 +%}
  1.2524 +
  1.2525 +instruct Repl4D_zero(vecY dst, immD0 zero) %{
  1.2526 +  predicate(n->as_Vector()->length() == 4);
  1.2527 +  match(Set dst (ReplicateD zero));
  1.2528 +  format %{ "vxorpd  $dst,$dst,$dst\t! replicate4D zero" %}
  1.2529 +  ins_encode %{
  1.2530 +    bool vector256 = true;
  1.2531 +    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  1.2532 +  %}
  1.2533 +  ins_pipe( fpu_reg_reg );
  1.2534 +%}
  1.2535 +
  1.2536 +// ====================VECTOR ARITHMETIC=======================================
  1.2537 +
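// Convention for the arithmetic rules below: the SSE forms are two-operand and
// destructive (dst = dst op src), while the AVX forms (UseAVX > 0, or UseAVX > 1 for
// 256-bit integer vectors) are three-operand and also come in register-memory
// flavors that fold a LoadVector into the operation.
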
  1.2538 +// --------------------------------- ADD --------------------------------------
  1.2539 +
  1.2540 +// Bytes vector add
  1.2541 +instruct vadd4B(vecS dst, vecS src) %{
  1.2542 +  predicate(n->as_Vector()->length() == 4);
  1.2543 +  match(Set dst (AddVB dst src));
  1.2544 +  format %{ "paddb   $dst,$src\t! add packed4B" %}
  1.2545 +  ins_encode %{
  1.2546 +    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
  1.2547 +  %}
  1.2548 +  ins_pipe( pipe_slow );
  1.2549 +%}
  1.2550 +
  1.2551 +instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
  1.2552 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.2553 +  match(Set dst (AddVB src1 src2));
  1.2554 +  format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
  1.2555 +  ins_encode %{
  1.2556 +    bool vector256 = false;
  1.2557 +    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2558 +  %}
  1.2559 +  ins_pipe( pipe_slow );
  1.2560 +%}
  1.2561 +
  1.2562 +instruct vadd8B(vecD dst, vecD src) %{
  1.2563 +  predicate(n->as_Vector()->length() == 8);
  1.2564 +  match(Set dst (AddVB dst src));
  1.2565 +  format %{ "paddb   $dst,$src\t! add packed8B" %}
  1.2566 +  ins_encode %{
  1.2567 +    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
  1.2568 +  %}
  1.2569 +  ins_pipe( pipe_slow );
  1.2570 +%}
  1.2571 +
  1.2572 +instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
  1.2573 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.2574 +  match(Set dst (AddVB src1 src2));
  1.2575 +  format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
  1.2576 +  ins_encode %{
  1.2577 +    bool vector256 = false;
  1.2578 +    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2579 +  %}
  1.2580 +  ins_pipe( pipe_slow );
  1.2581 +%}
  1.2582 +
  1.2583 +instruct vadd16B(vecX dst, vecX src) %{
  1.2584 +  predicate(n->as_Vector()->length() == 16);
  1.2585 +  match(Set dst (AddVB dst src));
  1.2586 +  format %{ "paddb   $dst,$src\t! add packed16B" %}
  1.2587 +  ins_encode %{
  1.2588 +    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
  1.2589 +  %}
  1.2590 +  ins_pipe( pipe_slow );
  1.2591 +%}
  1.2592 +
  1.2593 +instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
  1.2594 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
  1.2595 +  match(Set dst (AddVB src1 src2));
  1.2596 +  format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
  1.2597 +  ins_encode %{
  1.2598 +    bool vector256 = false;
  1.2599 +    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2600 +  %}
  1.2601 +  ins_pipe( pipe_slow );
  1.2602 +%}
  1.2603 +
  1.2604 +instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
  1.2605 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
  1.2606 +  match(Set dst (AddVB src (LoadVector mem)));
  1.2607 +  format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
  1.2608 +  ins_encode %{
  1.2609 +    bool vector256 = false;
  1.2610 +    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2611 +  %}
  1.2612 +  ins_pipe( pipe_slow );
  1.2613 +%}
  1.2614 +
  1.2615 +instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
  1.2616 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
  1.2617 +  match(Set dst (AddVB src1 src2));
  1.2618 +  format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
  1.2619 +  ins_encode %{
  1.2620 +    bool vector256 = true;
  1.2621 +    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2622 +  %}
  1.2623 +  ins_pipe( pipe_slow );
  1.2624 +%}
  1.2625 +
  1.2626 +instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
  1.2627 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
  1.2628 +  match(Set dst (AddVB src (LoadVector mem)));
  1.2629 +  format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
  1.2630 +  ins_encode %{
  1.2631 +    bool vector256 = true;
  1.2632 +    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2633 +  %}
  1.2634 +  ins_pipe( pipe_slow );
  1.2635 +%}
  1.2636 +
  1.2637 +// Shorts/Chars vector add
  1.2638 +instruct vadd2S(vecS dst, vecS src) %{
  1.2639 +  predicate(n->as_Vector()->length() == 2);
  1.2640 +  match(Set dst (AddVS dst src));
  1.2641 +  format %{ "paddw   $dst,$src\t! add packed2S" %}
  1.2642 +  ins_encode %{
  1.2643 +    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
  1.2644 +  %}
  1.2645 +  ins_pipe( pipe_slow );
  1.2646 +%}
  1.2647 +
  1.2648 +instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
  1.2649 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.2650 +  match(Set dst (AddVS src1 src2));
  1.2651 +  format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
  1.2652 +  ins_encode %{
  1.2653 +    bool vector256 = false;
  1.2654 +    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2655 +  %}
  1.2656 +  ins_pipe( pipe_slow );
  1.2657 +%}
  1.2658 +
  1.2659 +instruct vadd4S(vecD dst, vecD src) %{
  1.2660 +  predicate(n->as_Vector()->length() == 4);
  1.2661 +  match(Set dst (AddVS dst src));
  1.2662 +  format %{ "paddw   $dst,$src\t! add packed4S" %}
  1.2663 +  ins_encode %{
  1.2664 +    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
  1.2665 +  %}
  1.2666 +  ins_pipe( pipe_slow );
  1.2667 +%}
  1.2668 +
  1.2669 +instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
  1.2670 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.2671 +  match(Set dst (AddVS src1 src2));
  1.2672 +  format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
  1.2673 +  ins_encode %{
  1.2674 +    bool vector256 = false;
  1.2675 +    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2676 +  %}
  1.2677 +  ins_pipe( pipe_slow );
  1.2678 +%}
  1.2679 +
  1.2680 +instruct vadd8S(vecX dst, vecX src) %{
  1.2681 +  predicate(n->as_Vector()->length() == 8);
  1.2682 +  match(Set dst (AddVS dst src));
  1.2683 +  format %{ "paddw   $dst,$src\t! add packed8S" %}
  1.2684 +  ins_encode %{
  1.2685 +    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
  1.2686 +  %}
  1.2687 +  ins_pipe( pipe_slow );
  1.2688 +%}
  1.2689 +
  1.2690 +instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
  1.2691 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.2692 +  match(Set dst (AddVS src1 src2));
  1.2693 +  format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
  1.2694 +  ins_encode %{
  1.2695 +    bool vector256 = false;
  1.2696 +    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2697 +  %}
  1.2698 +  ins_pipe( pipe_slow );
  1.2699 +%}
  1.2700 +
  1.2701 +instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
  1.2702 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.2703 +  match(Set dst (AddVS src (LoadVector mem)));
  1.2704 +  format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
  1.2705 +  ins_encode %{
  1.2706 +    bool vector256 = false;
  1.2707 +    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2708 +  %}
  1.2709 +  ins_pipe( pipe_slow );
  1.2710 +%}
  1.2711 +
  1.2712 +instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
  1.2713 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.2714 +  match(Set dst (AddVS src1 src2));
  1.2715 +  format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
  1.2716 +  ins_encode %{
  1.2717 +    bool vector256 = true;
  1.2718 +    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2719 +  %}
  1.2720 +  ins_pipe( pipe_slow );
  1.2721 +%}
  1.2722 +
  1.2723 +instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
  1.2724 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.2725 +  match(Set dst (AddVS src (LoadVector mem)));
  1.2726 +  format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
  1.2727 +  ins_encode %{
  1.2728 +    bool vector256 = true;
  1.2729 +    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2730 +  %}
  1.2731 +  ins_pipe( pipe_slow );
  1.2732 +%}
  1.2733 +
  1.2734 +// Integers vector add
  1.2735 +instruct vadd2I(vecD dst, vecD src) %{
  1.2736 +  predicate(n->as_Vector()->length() == 2);
  1.2737 +  match(Set dst (AddVI dst src));
  1.2738 +  format %{ "paddd   $dst,$src\t! add packed2I" %}
  1.2739 +  ins_encode %{
  1.2740 +    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
  1.2741 +  %}
  1.2742 +  ins_pipe( pipe_slow );
  1.2743 +%}
  1.2744 +
  1.2745 +instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
  1.2746 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.2747 +  match(Set dst (AddVI src1 src2));
  1.2748 +  format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
  1.2749 +  ins_encode %{
  1.2750 +    bool vector256 = false;
  1.2751 +    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2752 +  %}
  1.2753 +  ins_pipe( pipe_slow );
  1.2754 +%}
  1.2755 +
  1.2756 +instruct vadd4I(vecX dst, vecX src) %{
  1.2757 +  predicate(n->as_Vector()->length() == 4);
  1.2758 +  match(Set dst (AddVI dst src));
  1.2759 +  format %{ "paddd   $dst,$src\t! add packed4I" %}
  1.2760 +  ins_encode %{
  1.2761 +    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
  1.2762 +  %}
  1.2763 +  ins_pipe( pipe_slow );
  1.2764 +%}
  1.2765 +
  1.2766 +instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
  1.2767 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.2768 +  match(Set dst (AddVI src1 src2));
  1.2769 +  format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
  1.2770 +  ins_encode %{
  1.2771 +    bool vector256 = false;
  1.2772 +    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2773 +  %}
  1.2774 +  ins_pipe( pipe_slow );
  1.2775 +%}
  1.2776 +
  1.2777 +instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
  1.2778 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.2779 +  match(Set dst (AddVI src (LoadVector mem)));
  1.2780 +  format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
  1.2781 +  ins_encode %{
  1.2782 +    bool vector256 = false;
  1.2783 +    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2784 +  %}
  1.2785 +  ins_pipe( pipe_slow );
  1.2786 +%}
  1.2787 +
  1.2788 +instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
  1.2789 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.2790 +  match(Set dst (AddVI src1 src2));
  1.2791 +  format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
  1.2792 +  ins_encode %{
  1.2793 +    bool vector256 = true;
  1.2794 +    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2795 +  %}
  1.2796 +  ins_pipe( pipe_slow );
  1.2797 +%}
  1.2798 +
  1.2799 +instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
  1.2800 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.2801 +  match(Set dst (AddVI src (LoadVector mem)));
  1.2802 +  format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
  1.2803 +  ins_encode %{
  1.2804 +    bool vector256 = true;
  1.2805 +    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2806 +  %}
  1.2807 +  ins_pipe( pipe_slow );
  1.2808 +%}
  1.2809 +
  1.2810 +// Longs vector add
  1.2811 +instruct vadd2L(vecX dst, vecX src) %{
  1.2812 +  predicate(n->as_Vector()->length() == 2);
  1.2813 +  match(Set dst (AddVL dst src));
  1.2814 +  format %{ "paddq   $dst,$src\t! add packed2L" %}
  1.2815 +  ins_encode %{
  1.2816 +    __ paddq($dst$$XMMRegister, $src$$XMMRegister);
  1.2817 +  %}
  1.2818 +  ins_pipe( pipe_slow );
  1.2819 +%}
  1.2820 +
  1.2821 +instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
  1.2822 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.2823 +  match(Set dst (AddVL src1 src2));
  1.2824 +  format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
  1.2825 +  ins_encode %{
  1.2826 +    bool vector256 = false;
  1.2827 +    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2828 +  %}
  1.2829 +  ins_pipe( pipe_slow );
  1.2830 +%}
  1.2831 +
  1.2832 +instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
  1.2833 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.2834 +  match(Set dst (AddVL src (LoadVector mem)));
  1.2835 +  format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
  1.2836 +  ins_encode %{
  1.2837 +    bool vector256 = false;
  1.2838 +    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2839 +  %}
  1.2840 +  ins_pipe( pipe_slow );
  1.2841 +%}
  1.2842 +
  1.2843 +instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
  1.2844 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  1.2845 +  match(Set dst (AddVL src1 src2));
  1.2846 +  format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
  1.2847 +  ins_encode %{
  1.2848 +    bool vector256 = true;
  1.2849 +    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2850 +  %}
  1.2851 +  ins_pipe( pipe_slow );
  1.2852 +%}
  1.2853 +
  1.2854 +instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
  1.2855 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  1.2856 +  match(Set dst (AddVL src (LoadVector mem)));
  1.2857 +  format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
  1.2858 +  ins_encode %{
  1.2859 +    bool vector256 = true;
  1.2860 +    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2861 +  %}
  1.2862 +  ins_pipe( pipe_slow );
  1.2863 +%}
  1.2864 +
  1.2865 +// Floats vector add
  1.2866 +instruct vadd2F(vecD dst, vecD src) %{
  1.2867 +  predicate(n->as_Vector()->length() == 2);
  1.2868 +  match(Set dst (AddVF dst src));
  1.2869 +  format %{ "addps   $dst,$src\t! add packed2F" %}
  1.2870 +  ins_encode %{
  1.2871 +    __ addps($dst$$XMMRegister, $src$$XMMRegister);
  1.2872 +  %}
  1.2873 +  ins_pipe( pipe_slow );
  1.2874 +%}
  1.2875 +
  1.2876 +instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
  1.2877 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.2878 +  match(Set dst (AddVF src1 src2));
  1.2879 +  format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
  1.2880 +  ins_encode %{
  1.2881 +    bool vector256 = false;
  1.2882 +    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2883 +  %}
  1.2884 +  ins_pipe( pipe_slow );
  1.2885 +%}
  1.2886 +
  1.2887 +instruct vadd4F(vecX dst, vecX src) %{
  1.2888 +  predicate(n->as_Vector()->length() == 4);
  1.2889 +  match(Set dst (AddVF dst src));
  1.2890 +  format %{ "addps   $dst,$src\t! add packed4F" %}
  1.2891 +  ins_encode %{
  1.2892 +    __ addps($dst$$XMMRegister, $src$$XMMRegister);
  1.2893 +  %}
  1.2894 +  ins_pipe( pipe_slow );
  1.2895 +%}
  1.2896 +
  1.2897 +instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
  1.2898 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.2899 +  match(Set dst (AddVF src1 src2));
  1.2900 +  format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
  1.2901 +  ins_encode %{
  1.2902 +    bool vector256 = false;
  1.2903 +    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2904 +  %}
  1.2905 +  ins_pipe( pipe_slow );
  1.2906 +%}
  1.2907 +
  1.2908 +instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
  1.2909 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.2910 +  match(Set dst (AddVF src (LoadVector mem)));
  1.2911 +  format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
  1.2912 +  ins_encode %{
  1.2913 +    bool vector256 = false;
  1.2914 +    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2915 +  %}
  1.2916 +  ins_pipe( pipe_slow );
  1.2917 +%}
  1.2918 +
  1.2919 +instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
  1.2920 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.2921 +  match(Set dst (AddVF src1 src2));
  1.2922 +  format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
  1.2923 +  ins_encode %{
  1.2924 +    bool vector256 = true;
  1.2925 +    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2926 +  %}
  1.2927 +  ins_pipe( pipe_slow );
  1.2928 +%}
  1.2929 +
  1.2930 +instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
  1.2931 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.2932 +  match(Set dst (AddVF src (LoadVector mem)));
  1.2933 +  format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
  1.2934 +  ins_encode %{
  1.2935 +    bool vector256 = true;
  1.2936 +    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2937 +  %}
  1.2938 +  ins_pipe( pipe_slow );
  1.2939 +%}
  1.2940 +
  1.2941 +// Doubles vector add
  1.2942 +instruct vadd2D(vecX dst, vecX src) %{
  1.2943 +  predicate(n->as_Vector()->length() == 2);
  1.2944 +  match(Set dst (AddVD dst src));
  1.2945 +  format %{ "addpd   $dst,$src\t! add packed2D" %}
  1.2946 +  ins_encode %{
  1.2947 +    __ addpd($dst$$XMMRegister, $src$$XMMRegister);
  1.2948 +  %}
  1.2949 +  ins_pipe( pipe_slow );
  1.2950 +%}
  1.2951 +
  1.2952 +instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
  1.2953 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.2954 +  match(Set dst (AddVD src1 src2));
  1.2955 +  format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
  1.2956 +  ins_encode %{
  1.2957 +    bool vector256 = false;
  1.2958 +    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2959 +  %}
  1.2960 +  ins_pipe( pipe_slow );
  1.2961 +%}
  1.2962 +
  1.2963 +instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
  1.2964 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.2965 +  match(Set dst (AddVD src (LoadVector mem)));
  1.2966 +  format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
  1.2967 +  ins_encode %{
  1.2968 +    bool vector256 = false;
  1.2969 +    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2970 +  %}
  1.2971 +  ins_pipe( pipe_slow );
  1.2972 +%}
  1.2973 +
  1.2974 +instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
  1.2975 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.2976 +  match(Set dst (AddVD src1 src2));
  1.2977 +  format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
  1.2978 +  ins_encode %{
  1.2979 +    bool vector256 = true;
  1.2980 +    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.2981 +  %}
  1.2982 +  ins_pipe( pipe_slow );
  1.2983 +%}
  1.2984 +
  1.2985 +instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
  1.2986 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.2987 +  match(Set dst (AddVD src (LoadVector mem)));
  1.2988 +  format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
  1.2989 +  ins_encode %{
  1.2990 +    bool vector256 = true;
  1.2991 +    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.2992 +  %}
  1.2993 +  ins_pipe( pipe_slow );
  1.2994 +%}
  1.2995 +
  1.2996 +// --------------------------------- SUB --------------------------------------
  1.2997 +
  1.2998 +// Bytes vector sub
  1.2999 +instruct vsub4B(vecS dst, vecS src) %{
  1.3000 +  predicate(n->as_Vector()->length() == 4);
  1.3001 +  match(Set dst (SubVB dst src));
  1.3002 +  format %{ "psubb   $dst,$src\t! sub packed4B" %}
  1.3003 +  ins_encode %{
  1.3004 +    __ psubb($dst$$XMMRegister, $src$$XMMRegister);
  1.3005 +  %}
  1.3006 +  ins_pipe( pipe_slow );
  1.3007 +%}
  1.3008 +
  1.3009 +instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
  1.3010 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3011 +  match(Set dst (SubVB src1 src2));
  1.3012 +  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
  1.3013 +  ins_encode %{
  1.3014 +    bool vector256 = false;
  1.3015 +    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3016 +  %}
  1.3017 +  ins_pipe( pipe_slow );
  1.3018 +%}
  1.3019 +
  1.3020 +instruct vsub8B(vecD dst, vecD src) %{
  1.3021 +  predicate(n->as_Vector()->length() == 8);
  1.3022 +  match(Set dst (SubVB dst src));
  1.3023 +  format %{ "psubb   $dst,$src\t! sub packed8B" %}
  1.3024 +  ins_encode %{
  1.3025 +    __ psubb($dst$$XMMRegister, $src$$XMMRegister);
  1.3026 +  %}
  1.3027 +  ins_pipe( pipe_slow );
  1.3028 +%}
  1.3029 +
  1.3030 +instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
  1.3031 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3032 +  match(Set dst (SubVB src1 src2));
  1.3033 +  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
  1.3034 +  ins_encode %{
  1.3035 +    bool vector256 = false;
  1.3036 +    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3037 +  %}
  1.3038 +  ins_pipe( pipe_slow );
  1.3039 +%}
  1.3040 +
  1.3041 +instruct vsub16B(vecX dst, vecX src) %{
  1.3042 +  predicate(n->as_Vector()->length() == 16);
  1.3043 +  match(Set dst (SubVB dst src));
  1.3044 +  format %{ "psubb   $dst,$src\t! sub packed16B" %}
  1.3045 +  ins_encode %{
  1.3046 +    __ psubb($dst$$XMMRegister, $src$$XMMRegister);
  1.3047 +  %}
  1.3048 +  ins_pipe( pipe_slow );
  1.3049 +%}
  1.3050 +
  1.3051 +instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
  1.3052 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
  1.3053 +  match(Set dst (SubVB src1 src2));
  1.3054 +  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
  1.3055 +  ins_encode %{
  1.3056 +    bool vector256 = false;
  1.3057 +    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3058 +  %}
  1.3059 +  ins_pipe( pipe_slow );
  1.3060 +%}
  1.3061 +
  1.3062 +instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
  1.3063 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
  1.3064 +  match(Set dst (SubVB src (LoadVector mem)));
  1.3065 +  format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
  1.3066 +  ins_encode %{
  1.3067 +    bool vector256 = false;
  1.3068 +    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3069 +  %}
  1.3070 +  ins_pipe( pipe_slow );
  1.3071 +%}
  1.3072 +
  1.3073 +instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
  1.3074 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
  1.3075 +  match(Set dst (SubVB src1 src2));
  1.3076 +  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
  1.3077 +  ins_encode %{
  1.3078 +    bool vector256 = true;
  1.3079 +    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3080 +  %}
  1.3081 +  ins_pipe( pipe_slow );
  1.3082 +%}
  1.3083 +
  1.3084 +instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
  1.3085 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
  1.3086 +  match(Set dst (SubVB src (LoadVector mem)));
  1.3087 +  format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
  1.3088 +  ins_encode %{
  1.3089 +    bool vector256 = true;
  1.3090 +    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3091 +  %}
  1.3092 +  ins_pipe( pipe_slow );
  1.3093 +%}
  1.3094 +
  1.3095 +// Shorts/Chars vector sub
  1.3096 +instruct vsub2S(vecS dst, vecS src) %{
  1.3097 +  predicate(n->as_Vector()->length() == 2);
  1.3098 +  match(Set dst (SubVS dst src));
  1.3099 +  format %{ "psubw   $dst,$src\t! sub packed2S" %}
  1.3100 +  ins_encode %{
  1.3101 +    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
  1.3102 +  %}
  1.3103 +  ins_pipe( pipe_slow );
  1.3104 +%}
  1.3105 +
  1.3106 +instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
  1.3107 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3108 +  match(Set dst (SubVS src1 src2));
  1.3109 +  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
  1.3110 +  ins_encode %{
  1.3111 +    bool vector256 = false;
  1.3112 +    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3113 +  %}
  1.3114 +  ins_pipe( pipe_slow );
  1.3115 +%}
  1.3116 +
  1.3117 +instruct vsub4S(vecD dst, vecD src) %{
  1.3118 +  predicate(n->as_Vector()->length() == 4);
  1.3119 +  match(Set dst (SubVS dst src));
  1.3120 +  format %{ "psubw   $dst,$src\t! sub packed4S" %}
  1.3121 +  ins_encode %{
  1.3122 +    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
  1.3123 +  %}
  1.3124 +  ins_pipe( pipe_slow );
  1.3125 +%}
  1.3126 +
  1.3127 +instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
  1.3128 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3129 +  match(Set dst (SubVS src1 src2));
  1.3130 +  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
  1.3131 +  ins_encode %{
  1.3132 +    bool vector256 = false;
  1.3133 +    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3134 +  %}
  1.3135 +  ins_pipe( pipe_slow );
  1.3136 +%}
  1.3137 +
  1.3138 +instruct vsub8S(vecX dst, vecX src) %{
  1.3139 +  predicate(n->as_Vector()->length() == 8);
  1.3140 +  match(Set dst (SubVS dst src));
  1.3141 +  format %{ "psubw   $dst,$src\t! sub packed8S" %}
  1.3142 +  ins_encode %{
  1.3143 +    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
  1.3144 +  %}
  1.3145 +  ins_pipe( pipe_slow );
  1.3146 +%}
  1.3147 +
  1.3148 +instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
  1.3149 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3150 +  match(Set dst (SubVS src1 src2));
  1.3151 +  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
  1.3152 +  ins_encode %{
  1.3153 +    bool vector256 = false;
  1.3154 +    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3155 +  %}
  1.3156 +  ins_pipe( pipe_slow );
  1.3157 +%}
  1.3158 +
  1.3159 +instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
  1.3160 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3161 +  match(Set dst (SubVS src (LoadVector mem)));
  1.3162 +  format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
  1.3163 +  ins_encode %{
  1.3164 +    bool vector256 = false;
  1.3165 +    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3166 +  %}
  1.3167 +  ins_pipe( pipe_slow );
  1.3168 +%}
  1.3169 +
  1.3170 +instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
  1.3171 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.3172 +  match(Set dst (SubVS src1 src2));
  1.3173 +  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
  1.3174 +  ins_encode %{
  1.3175 +    bool vector256 = true;
  1.3176 +    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3177 +  %}
  1.3178 +  ins_pipe( pipe_slow );
  1.3179 +%}
  1.3180 +
  1.3181 +instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
  1.3182 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.3183 +  match(Set dst (SubVS src (LoadVector mem)));
  1.3184 +  format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
  1.3185 +  ins_encode %{
  1.3186 +    bool vector256 = true;
  1.3187 +    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3188 +  %}
  1.3189 +  ins_pipe( pipe_slow );
  1.3190 +%}
  1.3191 +
  1.3192 +// Integers vector sub
  1.3193 +instruct vsub2I(vecD dst, vecD src) %{
  1.3194 +  predicate(n->as_Vector()->length() == 2);
  1.3195 +  match(Set dst (SubVI dst src));
  1.3196 +  format %{ "psubd   $dst,$src\t! sub packed2I" %}
  1.3197 +  ins_encode %{
  1.3198 +    __ psubd($dst$$XMMRegister, $src$$XMMRegister);
  1.3199 +  %}
  1.3200 +  ins_pipe( pipe_slow );
  1.3201 +%}
  1.3202 +
  1.3203 +instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
  1.3204 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3205 +  match(Set dst (SubVI src1 src2));
  1.3206 +  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
  1.3207 +  ins_encode %{
  1.3208 +    bool vector256 = false;
  1.3209 +    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3210 +  %}
  1.3211 +  ins_pipe( pipe_slow );
  1.3212 +%}
  1.3213 +
  1.3214 +instruct vsub4I(vecX dst, vecX src) %{
  1.3215 +  predicate(n->as_Vector()->length() == 4);
  1.3216 +  match(Set dst (SubVI dst src));
  1.3217 +  format %{ "psubd   $dst,$src\t! sub packed4I" %}
  1.3218 +  ins_encode %{
  1.3219 +    __ psubd($dst$$XMMRegister, $src$$XMMRegister);
  1.3220 +  %}
  1.3221 +  ins_pipe( pipe_slow );
  1.3222 +%}
  1.3223 +
  1.3224 +instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
  1.3225 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3226 +  match(Set dst (SubVI src1 src2));
  1.3227 +  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
  1.3228 +  ins_encode %{
  1.3229 +    bool vector256 = false;
  1.3230 +    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3231 +  %}
  1.3232 +  ins_pipe( pipe_slow );
  1.3233 +%}
  1.3234 +
  1.3235 +instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
  1.3236 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3237 +  match(Set dst (SubVI src (LoadVector mem)));
  1.3238 +  format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
  1.3239 +  ins_encode %{
  1.3240 +    bool vector256 = false;
  1.3241 +    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3242 +  %}
  1.3243 +  ins_pipe( pipe_slow );
  1.3244 +%}
  1.3245 +
  1.3246 +instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
  1.3247 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.3248 +  match(Set dst (SubVI src1 src2));
  1.3249 +  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
  1.3250 +  ins_encode %{
  1.3251 +    bool vector256 = true;
  1.3252 +    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3253 +  %}
  1.3254 +  ins_pipe( pipe_slow );
  1.3255 +%}
  1.3256 +
  1.3257 +instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
  1.3258 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.3259 +  match(Set dst (SubVI src (LoadVector mem)));
  1.3260 +  format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
  1.3261 +  ins_encode %{
  1.3262 +    bool vector256 = true;
  1.3263 +    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3264 +  %}
  1.3265 +  ins_pipe( pipe_slow );
  1.3266 +%}
  1.3267 +
  1.3268 +// Longs vector sub
  1.3269 +instruct vsub2L(vecX dst, vecX src) %{
  1.3270 +  predicate(n->as_Vector()->length() == 2);
  1.3271 +  match(Set dst (SubVL dst src));
  1.3272 +  format %{ "psubq   $dst,$src\t! sub packed2L" %}
  1.3273 +  ins_encode %{
  1.3274 +    __ psubq($dst$$XMMRegister, $src$$XMMRegister);
  1.3275 +  %}
  1.3276 +  ins_pipe( pipe_slow );
  1.3277 +%}
  1.3278 +
  1.3279 +instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
  1.3280 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3281 +  match(Set dst (SubVL src1 src2));
  1.3282 +  format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
  1.3283 +  ins_encode %{
  1.3284 +    bool vector256 = false;
  1.3285 +    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3286 +  %}
  1.3287 +  ins_pipe( pipe_slow );
  1.3288 +%}
  1.3289 +
  1.3290 +instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
  1.3291 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3292 +  match(Set dst (SubVL src (LoadVector mem)));
  1.3293 +  format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
  1.3294 +  ins_encode %{
  1.3295 +    bool vector256 = false;
  1.3296 +    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3297 +  %}
  1.3298 +  ins_pipe( pipe_slow );
  1.3299 +%}
  1.3300 +
  1.3301 +instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
  1.3302 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  1.3303 +  match(Set dst (SubVL src1 src2));
  1.3304 +  format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
  1.3305 +  ins_encode %{
  1.3306 +    bool vector256 = true;
  1.3307 +    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3308 +  %}
  1.3309 +  ins_pipe( pipe_slow );
  1.3310 +%}
  1.3311 +
  1.3312 +instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
  1.3313 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  1.3314 +  match(Set dst (SubVL src (LoadVector mem)));
  1.3315 +  format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
  1.3316 +  ins_encode %{
  1.3317 +    bool vector256 = true;
  1.3318 +    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3319 +  %}
  1.3320 +  ins_pipe( pipe_slow );
  1.3321 +%}
  1.3322 +
  1.3323 +// Floats vector sub
  1.3324 +instruct vsub2F(vecD dst, vecD src) %{
  1.3325 +  predicate(n->as_Vector()->length() == 2);
  1.3326 +  match(Set dst (SubVF dst src));
  1.3327 +  format %{ "subps   $dst,$src\t! sub packed2F" %}
  1.3328 +  ins_encode %{
  1.3329 +    __ subps($dst$$XMMRegister, $src$$XMMRegister);
  1.3330 +  %}
  1.3331 +  ins_pipe( pipe_slow );
  1.3332 +%}
  1.3333 +
  1.3334 +instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
  1.3335 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3336 +  match(Set dst (SubVF src1 src2));
  1.3337 +  format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
  1.3338 +  ins_encode %{
  1.3339 +    bool vector256 = false;
  1.3340 +    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3341 +  %}
  1.3342 +  ins_pipe( pipe_slow );
  1.3343 +%}
  1.3344 +
  1.3345 +instruct vsub4F(vecX dst, vecX src) %{
  1.3346 +  predicate(n->as_Vector()->length() == 4);
  1.3347 +  match(Set dst (SubVF dst src));
  1.3348 +  format %{ "subps   $dst,$src\t! sub packed4F" %}
  1.3349 +  ins_encode %{
  1.3350 +    __ subps($dst$$XMMRegister, $src$$XMMRegister);
  1.3351 +  %}
  1.3352 +  ins_pipe( pipe_slow );
  1.3353 +%}
  1.3354 +
  1.3355 +instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
  1.3356 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3357 +  match(Set dst (SubVF src1 src2));
  1.3358 +  format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
  1.3359 +  ins_encode %{
  1.3360 +    bool vector256 = false;
  1.3361 +    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3362 +  %}
  1.3363 +  ins_pipe( pipe_slow );
  1.3364 +%}
  1.3365 +
  1.3366 +instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
  1.3367 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3368 +  match(Set dst (SubVF src (LoadVector mem)));
  1.3369 +  format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
  1.3370 +  ins_encode %{
  1.3371 +    bool vector256 = false;
  1.3372 +    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3373 +  %}
  1.3374 +  ins_pipe( pipe_slow );
  1.3375 +%}
  1.3376 +
  1.3377 +instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
  1.3378 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3379 +  match(Set dst (SubVF src1 src2));
  1.3380 +  format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
  1.3381 +  ins_encode %{
  1.3382 +    bool vector256 = true;
  1.3383 +    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3384 +  %}
  1.3385 +  ins_pipe( pipe_slow );
  1.3386 +%}
  1.3387 +
  1.3388 +instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
  1.3389 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3390 +  match(Set dst (SubVF src (LoadVector mem)));
  1.3391 +  format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
  1.3392 +  ins_encode %{
  1.3393 +    bool vector256 = true;
  1.3394 +    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3395 +  %}
  1.3396 +  ins_pipe( pipe_slow );
  1.3397 +%}
  1.3398 +
  1.3399 +// Doubles vector sub
  1.3400 +instruct vsub2D(vecX dst, vecX src) %{
  1.3401 +  predicate(n->as_Vector()->length() == 2);
  1.3402 +  match(Set dst (SubVD dst src));
  1.3403 +  format %{ "subpd   $dst,$src\t! sub packed2D" %}
  1.3404 +  ins_encode %{
  1.3405 +    __ subpd($dst$$XMMRegister, $src$$XMMRegister);
  1.3406 +  %}
  1.3407 +  ins_pipe( pipe_slow );
  1.3408 +%}
  1.3409 +
  1.3410 +instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
  1.3411 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3412 +  match(Set dst (SubVD src1 src2));
  1.3413 +  format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
  1.3414 +  ins_encode %{
  1.3415 +    bool vector256 = false;
  1.3416 +    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3417 +  %}
  1.3418 +  ins_pipe( pipe_slow );
  1.3419 +%}
  1.3420 +
  1.3421 +instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
  1.3422 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3423 +  match(Set dst (SubVD src (LoadVector mem)));
  1.3424 +  format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
  1.3425 +  ins_encode %{
  1.3426 +    bool vector256 = false;
  1.3427 +    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3428 +  %}
  1.3429 +  ins_pipe( pipe_slow );
  1.3430 +%}
  1.3431 +
  1.3432 +instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
  1.3433 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3434 +  match(Set dst (SubVD src1 src2));
  1.3435 +  format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
  1.3436 +  ins_encode %{
  1.3437 +    bool vector256 = true;
  1.3438 +    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3439 +  %}
  1.3440 +  ins_pipe( pipe_slow );
  1.3441 +%}
  1.3442 +
  1.3443 +instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
  1.3444 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3445 +  match(Set dst (SubVD src (LoadVector mem)));
  1.3446 +  format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
  1.3447 +  ins_encode %{
  1.3448 +    bool vector256 = true;
  1.3449 +    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3450 +  %}
  1.3451 +  ins_pipe( pipe_slow );
  1.3452 +%}
  1.3453 +
  1.3454 +// --------------------------------- MUL --------------------------------------
  1.3455 +
  1.3456 +// Shorts/Chars vector mul
  1.3457 +instruct vmul2S(vecS dst, vecS src) %{
  1.3458 +  predicate(n->as_Vector()->length() == 2);
  1.3459 +  match(Set dst (MulVS dst src));
  1.3460 +  format %{ "pmullw  $dst,$src\t! mul packed2S" %}
  1.3461 +  ins_encode %{
  1.3462 +    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
  1.3463 +  %}
  1.3464 +  ins_pipe( pipe_slow );
  1.3465 +%}
  1.3466 +
  1.3467 +instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
  1.3468 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3469 +  match(Set dst (MulVS src1 src2));
  1.3470 +  format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
  1.3471 +  ins_encode %{
  1.3472 +    bool vector256 = false;
  1.3473 +    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3474 +  %}
  1.3475 +  ins_pipe( pipe_slow );
  1.3476 +%}
  1.3477 +
  1.3478 +instruct vmul4S(vecD dst, vecD src) %{
  1.3479 +  predicate(n->as_Vector()->length() == 4);
  1.3480 +  match(Set dst (MulVS dst src));
  1.3481 +  format %{ "pmullw  $dst,$src\t! mul packed4S" %}
  1.3482 +  ins_encode %{
  1.3483 +    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
  1.3484 +  %}
  1.3485 +  ins_pipe( pipe_slow );
  1.3486 +%}
  1.3487 +
  1.3488 +instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
  1.3489 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3490 +  match(Set dst (MulVS src1 src2));
  1.3491 +  format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
  1.3492 +  ins_encode %{
  1.3493 +    bool vector256 = false;
  1.3494 +    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3495 +  %}
  1.3496 +  ins_pipe( pipe_slow );
  1.3497 +%}
  1.3498 +
  1.3499 +instruct vmul8S(vecX dst, vecX src) %{
  1.3500 +  predicate(n->as_Vector()->length() == 8);
  1.3501 +  match(Set dst (MulVS dst src));
  1.3502 +  format %{ "pmullw  $dst,$src\t! mul packed8S" %}
  1.3503 +  ins_encode %{
  1.3504 +    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
  1.3505 +  %}
  1.3506 +  ins_pipe( pipe_slow );
  1.3507 +%}
  1.3508 +
  1.3509 +instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
  1.3510 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3511 +  match(Set dst (MulVS src1 src2));
  1.3512 +  format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
  1.3513 +  ins_encode %{
  1.3514 +    bool vector256 = false;
  1.3515 +    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3516 +  %}
  1.3517 +  ins_pipe( pipe_slow );
  1.3518 +%}
  1.3519 +
  1.3520 +instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
  1.3521 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3522 +  match(Set dst (MulVS src (LoadVector mem)));
  1.3523 +  format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
  1.3524 +  ins_encode %{
  1.3525 +    bool vector256 = false;
  1.3526 +    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3527 +  %}
  1.3528 +  ins_pipe( pipe_slow );
  1.3529 +%}
  1.3530 +
  1.3531 +instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
  1.3532 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.3533 +  match(Set dst (MulVS src1 src2));
  1.3534 +  format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
  1.3535 +  ins_encode %{
  1.3536 +    bool vector256 = true;
  1.3537 +    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3538 +  %}
  1.3539 +  ins_pipe( pipe_slow );
  1.3540 +%}
  1.3541 +
  1.3542 +instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
  1.3543 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.3544 +  match(Set dst (MulVS src (LoadVector mem)));
  1.3545 +  format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
  1.3546 +  ins_encode %{
  1.3547 +    bool vector256 = true;
  1.3548 +    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3549 +  %}
  1.3550 +  ins_pipe( pipe_slow );
  1.3551 +%}
  1.3552 +
  1.3553 +// Integers vector mul (sse4_1)
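// pmulld (packed 32-bit multiply) was introduced with SSE4.1, hence the
// UseSSE > 3 predicate on the two-operand forms below; the three-operand
// and memory forms are gated by the usual UseAVX checks instead.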
  1.3554 +instruct vmul2I(vecD dst, vecD src) %{
  1.3555 +  predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
  1.3556 +  match(Set dst (MulVI dst src));
  1.3557 +  format %{ "pmulld  $dst,$src\t! mul packed2I" %}
  1.3558 +  ins_encode %{
  1.3559 +    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
  1.3560 +  %}
  1.3561 +  ins_pipe( pipe_slow );
  1.3562 +%}
  1.3563 +
  1.3564 +instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
  1.3565 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3566 +  match(Set dst (MulVI src1 src2));
  1.3567 +  format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
  1.3568 +  ins_encode %{
  1.3569 +    bool vector256 = false;
  1.3570 +    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3571 +  %}
  1.3572 +  ins_pipe( pipe_slow );
  1.3573 +%}
  1.3574 +
  1.3575 +instruct vmul4I(vecX dst, vecX src) %{
  1.3576 +  predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
  1.3577 +  match(Set dst (MulVI dst src));
  1.3578 +  format %{ "pmulld  $dst,$src\t! mul packed4I" %}
  1.3579 +  ins_encode %{
  1.3580 +    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
  1.3581 +  %}
  1.3582 +  ins_pipe( pipe_slow );
  1.3583 +%}
  1.3584 +
  1.3585 +instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
  1.3586 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3587 +  match(Set dst (MulVI src1 src2));
  1.3588 +  format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
  1.3589 +  ins_encode %{
  1.3590 +    bool vector256 = false;
  1.3591 +    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3592 +  %}
  1.3593 +  ins_pipe( pipe_slow );
  1.3594 +%}
  1.3595 +
  1.3596 +instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
  1.3597 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3598 +  match(Set dst (MulVI src (LoadVector mem)));
  1.3599 +  format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
  1.3600 +  ins_encode %{
  1.3601 +    bool vector256 = false;
  1.3602 +    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3603 +  %}
  1.3604 +  ins_pipe( pipe_slow );
  1.3605 +%}
  1.3606 +
  1.3607 +instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
  1.3608 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.3609 +  match(Set dst (MulVI src1 src2));
  1.3610 +  format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
  1.3611 +  ins_encode %{
  1.3612 +    bool vector256 = true;
  1.3613 +    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3614 +  %}
  1.3615 +  ins_pipe( pipe_slow );
  1.3616 +%}
  1.3617 +
  1.3618 +instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
  1.3619 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.3620 +  match(Set dst (MulVI src (LoadVector mem)));
  1.3621 +  format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
  1.3622 +  ins_encode %{
  1.3623 +    bool vector256 = true;
  1.3624 +    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3625 +  %}
  1.3626 +  ins_pipe( pipe_slow );
  1.3627 +%}
  1.3628 +
  1.3629 +// Floats vector mul
  1.3630 +instruct vmul2F(vecD dst, vecD src) %{
  1.3631 +  predicate(n->as_Vector()->length() == 2);
  1.3632 +  match(Set dst (MulVF dst src));
  1.3633 +  format %{ "mulps   $dst,$src\t! mul packed2F" %}
  1.3634 +  ins_encode %{
  1.3635 +    __ mulps($dst$$XMMRegister, $src$$XMMRegister);
  1.3636 +  %}
  1.3637 +  ins_pipe( pipe_slow );
  1.3638 +%}
  1.3639 +
  1.3640 +instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
  1.3641 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3642 +  match(Set dst (MulVF src1 src2));
  1.3643 +  format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
  1.3644 +  ins_encode %{
  1.3645 +    bool vector256 = false;
  1.3646 +    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3647 +  %}
  1.3648 +  ins_pipe( pipe_slow );
  1.3649 +%}
  1.3650 +
  1.3651 +instruct vmul4F(vecX dst, vecX src) %{
  1.3652 +  predicate(n->as_Vector()->length() == 4);
  1.3653 +  match(Set dst (MulVF dst src));
  1.3654 +  format %{ "mulps   $dst,$src\t! mul packed4F" %}
  1.3655 +  ins_encode %{
  1.3656 +    __ mulps($dst$$XMMRegister, $src$$XMMRegister);
  1.3657 +  %}
  1.3658 +  ins_pipe( pipe_slow );
  1.3659 +%}
  1.3660 +
  1.3661 +instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
  1.3662 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3663 +  match(Set dst (MulVF src1 src2));
  1.3664 +  format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
  1.3665 +  ins_encode %{
  1.3666 +    bool vector256 = false;
  1.3667 +    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3668 +  %}
  1.3669 +  ins_pipe( pipe_slow );
  1.3670 +%}
  1.3671 +
  1.3672 +instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
  1.3673 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3674 +  match(Set dst (MulVF src (LoadVector mem)));
  1.3675 +  format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
  1.3676 +  ins_encode %{
  1.3677 +    bool vector256 = false;
  1.3678 +    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3679 +  %}
  1.3680 +  ins_pipe( pipe_slow );
  1.3681 +%}
  1.3682 +
  1.3683 +instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
  1.3684 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3685 +  match(Set dst (MulVF src1 src2));
  1.3686 +  format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
  1.3687 +  ins_encode %{
  1.3688 +    bool vector256 = true;
  1.3689 +    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3690 +  %}
  1.3691 +  ins_pipe( pipe_slow );
  1.3692 +%}
  1.3693 +
  1.3694 +instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
  1.3695 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3696 +  match(Set dst (MulVF src (LoadVector mem)));
  1.3697 +  format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
  1.3698 +  ins_encode %{
  1.3699 +    bool vector256 = true;
  1.3700 +    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3701 +  %}
  1.3702 +  ins_pipe( pipe_slow );
  1.3703 +%}
  1.3704 +
  1.3705 +// Doubles vector mul
  1.3706 +instruct vmul2D(vecX dst, vecX src) %{
  1.3707 +  predicate(n->as_Vector()->length() == 2);
  1.3708 +  match(Set dst (MulVD dst src));
  1.3709 +  format %{ "mulpd   $dst,$src\t! mul packed2D" %}
  1.3710 +  ins_encode %{
  1.3711 +    __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
  1.3712 +  %}
  1.3713 +  ins_pipe( pipe_slow );
  1.3714 +%}
  1.3715 +
  1.3716 +instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
  1.3717 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3718 +  match(Set dst (MulVD src1 src2));
  1.3719 +  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
  1.3720 +  ins_encode %{
  1.3721 +    bool vector256 = false;
  1.3722 +    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3723 +  %}
  1.3724 +  ins_pipe( pipe_slow );
  1.3725 +%}
  1.3726 +
  1.3727 +instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
  1.3728 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3729 +  match(Set dst (MulVD src (LoadVector mem)));
  1.3730 +  format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
  1.3731 +  ins_encode %{
  1.3732 +    bool vector256 = false;
  1.3733 +    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3734 +  %}
  1.3735 +  ins_pipe( pipe_slow );
  1.3736 +%}
  1.3737 +
  1.3738 +instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
  1.3739 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3740 +  match(Set dst (MulVD src1 src2));
  1.3741 +  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
  1.3742 +  ins_encode %{
  1.3743 +    bool vector256 = true;
  1.3744 +    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3745 +  %}
  1.3746 +  ins_pipe( pipe_slow );
  1.3747 +%}
  1.3748 +
  1.3749 +instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
  1.3750 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3751 +  match(Set dst (MulVD src (LoadVector mem)));
  1.3752 +  format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
  1.3753 +  ins_encode %{
  1.3754 +    bool vector256 = true;
  1.3755 +    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3756 +  %}
  1.3757 +  ins_pipe( pipe_slow );
  1.3758 +%}
  1.3759 +
  1.3760 +// --------------------------------- DIV --------------------------------------
  1.3761 +
  1.3762 +// Floats vector div
  1.3763 +instruct vdiv2F(vecD dst, vecD src) %{
  1.3764 +  predicate(n->as_Vector()->length() == 2);
  1.3765 +  match(Set dst (DivVF dst src));
  1.3766 +  format %{ "divps   $dst,$src\t! div packed2F" %}
  1.3767 +  ins_encode %{
  1.3768 +    __ divps($dst$$XMMRegister, $src$$XMMRegister);
  1.3769 +  %}
  1.3770 +  ins_pipe( pipe_slow );
  1.3771 +%}
  1.3772 +
  1.3773 +instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
  1.3774 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3775 +  match(Set dst (DivVF src1 src2));
  1.3776 +  format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
  1.3777 +  ins_encode %{
  1.3778 +    bool vector256 = false;
  1.3779 +    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3780 +  %}
  1.3781 +  ins_pipe( pipe_slow );
  1.3782 +%}
  1.3783 +
  1.3784 +instruct vdiv4F(vecX dst, vecX src) %{
  1.3785 +  predicate(n->as_Vector()->length() == 4);
  1.3786 +  match(Set dst (DivVF dst src));
  1.3787 +  format %{ "divps   $dst,$src\t! div packed4F" %}
  1.3788 +  ins_encode %{
  1.3789 +    __ divps($dst$$XMMRegister, $src$$XMMRegister);
  1.3790 +  %}
  1.3791 +  ins_pipe( pipe_slow );
  1.3792 +%}
  1.3793 +
  1.3794 +instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
  1.3795 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3796 +  match(Set dst (DivVF src1 src2));
  1.3797 +  format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
  1.3798 +  ins_encode %{
  1.3799 +    bool vector256 = false;
  1.3800 +    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3801 +  %}
  1.3802 +  ins_pipe( pipe_slow );
  1.3803 +%}
  1.3804 +
  1.3805 +instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
  1.3806 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3807 +  match(Set dst (DivVF src (LoadVector mem)));
  1.3808 +  format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
  1.3809 +  ins_encode %{
  1.3810 +    bool vector256 = false;
  1.3811 +    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3812 +  %}
  1.3813 +  ins_pipe( pipe_slow );
  1.3814 +%}
  1.3815 +
  1.3816 +instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
  1.3817 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3818 +  match(Set dst (DivVF src1 src2));
  1.3819 +  format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
  1.3820 +  ins_encode %{
  1.3821 +    bool vector256 = true;
  1.3822 +    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3823 +  %}
  1.3824 +  ins_pipe( pipe_slow );
  1.3825 +%}
  1.3826 +
  1.3827 +instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
  1.3828 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.3829 +  match(Set dst (DivVF src (LoadVector mem)));
  1.3830 +  format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
  1.3831 +  ins_encode %{
  1.3832 +    bool vector256 = true;
  1.3833 +    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3834 +  %}
  1.3835 +  ins_pipe( pipe_slow );
  1.3836 +%}
  1.3837 +
  1.3838 +// Doubles vector div
  1.3839 +instruct vdiv2D(vecX dst, vecX src) %{
  1.3840 +  predicate(n->as_Vector()->length() == 2);
  1.3841 +  match(Set dst (DivVD dst src));
  1.3842 +  format %{ "divpd   $dst,$src\t! div packed2D" %}
  1.3843 +  ins_encode %{
  1.3844 +    __ divpd($dst$$XMMRegister, $src$$XMMRegister);
  1.3845 +  %}
  1.3846 +  ins_pipe( pipe_slow );
  1.3847 +%}
  1.3848 +
  1.3849 +instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
  1.3850 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3851 +  match(Set dst (DivVD src1 src2));
  1.3852 +  format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
  1.3853 +  ins_encode %{
  1.3854 +    bool vector256 = false;
  1.3855 +    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3856 +  %}
  1.3857 +  ins_pipe( pipe_slow );
  1.3858 +%}
  1.3859 +
  1.3860 +instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
  1.3861 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3862 +  match(Set dst (DivVD src (LoadVector mem)));
  1.3863 +  format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
  1.3864 +  ins_encode %{
  1.3865 +    bool vector256 = false;
  1.3866 +    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3867 +  %}
  1.3868 +  ins_pipe( pipe_slow );
  1.3869 +%}
  1.3870 +
  1.3871 +instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
  1.3872 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3873 +  match(Set dst (DivVD src1 src2));
  1.3874 +  format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
  1.3875 +  ins_encode %{
  1.3876 +    bool vector256 = true;
  1.3877 +    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.3878 +  %}
  1.3879 +  ins_pipe( pipe_slow );
  1.3880 +%}
  1.3881 +
  1.3882 +instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
  1.3883 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3884 +  match(Set dst (DivVD src (LoadVector mem)));
  1.3885 +  format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
  1.3886 +  ins_encode %{
  1.3887 +    bool vector256 = true;
  1.3888 +    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.3889 +  %}
  1.3890 +  ins_pipe( pipe_slow );
  1.3891 +%}
  1.3892 +
  1.3893 +// ------------------------------ Shift ---------------------------------------
  1.3894 +
  1.3895 +// Left and right shift count vectors are the same on x86
  1.3896 +// (only lowest bits of xmm reg are used for count).
  1.3897 +instruct vshiftcnt(vecS dst, rRegI cnt) %{
  1.3898 +  match(Set dst (LShiftCntV cnt));
  1.3899 +  match(Set dst (RShiftCntV cnt));
  1.3900 +  format %{ "movd    $dst,$cnt\t! load shift count" %}
  1.3901 +  ins_encode %{
  1.3902 +    __ movdl($dst$$XMMRegister, $cnt$$Register);
  1.3903 +  %}
  1.3904 +  ins_pipe( pipe_slow );
  1.3905 +%}
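// The movdl above zero-extends the scalar count into the low 64 bits of an
// XMM register, which is exactly the field the variable-count psllw/pslld/
// psllq forms (and their right-shift counterparts) read, so one vshiftcnt
// result can feed every vector shift rule that follows.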
  1.3906 +
  1.3907 +// ------------------------------ LeftShift -----------------------------------
  1.3908 +
  1.3909 +// Shorts/Chars vector left shift
  1.3910 +instruct vsll2S(vecS dst, vecS shift) %{
  1.3911 +  predicate(n->as_Vector()->length() == 2);
  1.3912 +  match(Set dst (LShiftVS dst shift));
  1.3913 +  format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
  1.3914 +  ins_encode %{
  1.3915 +    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
  1.3916 +  %}
  1.3917 +  ins_pipe( pipe_slow );
  1.3918 +%}
  1.3919 +
  1.3920 +instruct vsll2S_imm(vecS dst, immI8 shift) %{
  1.3921 +  predicate(n->as_Vector()->length() == 2);
  1.3922 +  match(Set dst (LShiftVS dst shift));
  1.3923 +  format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
  1.3924 +  ins_encode %{
  1.3925 +    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
  1.3926 +  %}
  1.3927 +  ins_pipe( pipe_slow );
  1.3928 +%}
  1.3929 +
  1.3930 +instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
  1.3931 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3932 +  match(Set dst (LShiftVS src shift));
  1.3933 +  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
  1.3934 +  ins_encode %{
  1.3935 +    bool vector256 = false;
  1.3936 +    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.3937 +  %}
  1.3938 +  ins_pipe( pipe_slow );
  1.3939 +%}
  1.3940 +
  1.3941 +instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
  1.3942 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.3943 +  match(Set dst (LShiftVS src shift));
  1.3944 +  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
  1.3945 +  ins_encode %{
  1.3946 +    bool vector256 = false;
  1.3947 +    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.3948 +  %}
  1.3949 +  ins_pipe( pipe_slow );
  1.3950 +%}
  1.3951 +
  1.3952 +instruct vsll4S(vecD dst, vecS shift) %{
  1.3953 +  predicate(n->as_Vector()->length() == 4);
  1.3954 +  match(Set dst (LShiftVS dst shift));
  1.3955 +  format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
  1.3956 +  ins_encode %{
  1.3957 +    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
  1.3958 +  %}
  1.3959 +  ins_pipe( pipe_slow );
  1.3960 +%}
  1.3961 +
  1.3962 +instruct vsll4S_imm(vecD dst, immI8 shift) %{
  1.3963 +  predicate(n->as_Vector()->length() == 4);
  1.3964 +  match(Set dst (LShiftVS dst shift));
  1.3965 +  format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
  1.3966 +  ins_encode %{
  1.3967 +    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
  1.3968 +  %}
  1.3969 +  ins_pipe( pipe_slow );
  1.3970 +%}
  1.3971 +
  1.3972 +instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
  1.3973 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3974 +  match(Set dst (LShiftVS src shift));
  1.3975 +  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
  1.3976 +  ins_encode %{
  1.3977 +    bool vector256 = false;
  1.3978 +    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.3979 +  %}
  1.3980 +  ins_pipe( pipe_slow );
  1.3981 +%}
  1.3982 +
  1.3983 +instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
  1.3984 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.3985 +  match(Set dst (LShiftVS src shift));
  1.3986 +  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
  1.3987 +  ins_encode %{
  1.3988 +    bool vector256 = false;
  1.3989 +    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.3990 +  %}
  1.3991 +  ins_pipe( pipe_slow );
  1.3992 +%}
  1.3993 +
  1.3994 +instruct vsll8S(vecX dst, vecS shift) %{
  1.3995 +  predicate(n->as_Vector()->length() == 8);
  1.3996 +  match(Set dst (LShiftVS dst shift));
  1.3997 +  format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
  1.3998 +  ins_encode %{
  1.3999 +    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
  1.4000 +  %}
  1.4001 +  ins_pipe( pipe_slow );
  1.4002 +%}
  1.4003 +
  1.4004 +instruct vsll8S_imm(vecX dst, immI8 shift) %{
  1.4005 +  predicate(n->as_Vector()->length() == 8);
  1.4006 +  match(Set dst (LShiftVS dst shift));
  1.4007 +  format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
  1.4008 +  ins_encode %{
  1.4009 +    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
  1.4010 +  %}
  1.4011 +  ins_pipe( pipe_slow );
  1.4012 +%}
  1.4013 +
  1.4014 +instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
  1.4015 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.4016 +  match(Set dst (LShiftVS src shift));
  1.4017 +  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
  1.4018 +  ins_encode %{
  1.4019 +    bool vector256 = false;
  1.4020 +    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4021 +  %}
  1.4022 +  ins_pipe( pipe_slow );
  1.4023 +%}
  1.4024 +
  1.4025 +instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
  1.4026 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.4027 +  match(Set dst (LShiftVS src shift));
  1.4028 +  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
  1.4029 +  ins_encode %{
  1.4030 +    bool vector256 = false;
  1.4031 +    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4032 +  %}
  1.4033 +  ins_pipe( pipe_slow );
  1.4034 +%}
  1.4035 +
  1.4036 +instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
  1.4037 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.4038 +  match(Set dst (LShiftVS src shift));
  1.4039 +  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  1.4040 +  ins_encode %{
  1.4041 +    bool vector256 = true;
  1.4042 +    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4043 +  %}
  1.4044 +  ins_pipe( pipe_slow );
  1.4045 +%}
  1.4046 +
  1.4047 +instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
  1.4048 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.4049 +  match(Set dst (LShiftVS src shift));
  1.4050 +  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  1.4051 +  ins_encode %{
  1.4052 +    bool vector256 = true;
  1.4053 +    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4054 +  %}
  1.4055 +  ins_pipe( pipe_slow );
  1.4056 +%}
  1.4057 +
  1.4058 +// Integers vector left shift
  1.4059 +instruct vsll2I(vecD dst, vecS shift) %{
  1.4060 +  predicate(n->as_Vector()->length() == 2);
  1.4061 +  match(Set dst (LShiftVI dst shift));
  1.4062 +  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
  1.4063 +  ins_encode %{
  1.4064 +    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
  1.4065 +  %}
  1.4066 +  ins_pipe( pipe_slow );
  1.4067 +%}
  1.4068 +
  1.4069 +instruct vsll2I_imm(vecD dst, immI8 shift) %{
  1.4070 +  predicate(n->as_Vector()->length() == 2);
  1.4071 +  match(Set dst (LShiftVI dst shift));
  1.4072 +  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
  1.4073 +  ins_encode %{
  1.4074 +    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
  1.4075 +  %}
  1.4076 +  ins_pipe( pipe_slow );
  1.4077 +%}
  1.4078 +
  1.4079 +instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
  1.4080 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4081 +  match(Set dst (LShiftVI src shift));
  1.4082 +  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
  1.4083 +  ins_encode %{
  1.4084 +    bool vector256 = false;
  1.4085 +    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4086 +  %}
  1.4087 +  ins_pipe( pipe_slow );
  1.4088 +%}
  1.4089 +
  1.4090 +instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  1.4091 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4092 +  match(Set dst (LShiftVI src shift));
  1.4093 +  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
  1.4094 +  ins_encode %{
  1.4095 +    bool vector256 = false;
  1.4096 +    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4097 +  %}
  1.4098 +  ins_pipe( pipe_slow );
  1.4099 +%}
  1.4100 +
  1.4101 +instruct vsll4I(vecX dst, vecS shift) %{
  1.4102 +  predicate(n->as_Vector()->length() == 4);
  1.4103 +  match(Set dst (LShiftVI dst shift));
  1.4104 +  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
  1.4105 +  ins_encode %{
  1.4106 +    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
  1.4107 +  %}
  1.4108 +  ins_pipe( pipe_slow );
  1.4109 +%}
  1.4110 +
  1.4111 +instruct vsll4I_imm(vecX dst, immI8 shift) %{
  1.4112 +  predicate(n->as_Vector()->length() == 4);
  1.4113 +  match(Set dst (LShiftVI dst shift));
  1.4114 +  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
  1.4115 +  ins_encode %{
  1.4116 +    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
  1.4117 +  %}
  1.4118 +  ins_pipe( pipe_slow );
  1.4119 +%}
  1.4120 +
  1.4121 +instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
  1.4122 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.4123 +  match(Set dst (LShiftVI src shift));
  1.4124 +  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
  1.4125 +  ins_encode %{
  1.4126 +    bool vector256 = false;
  1.4127 +    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4128 +  %}
  1.4129 +  ins_pipe( pipe_slow );
  1.4130 +%}
  1.4131 +
  1.4132 +instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  1.4133 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.4134 +  match(Set dst (LShiftVI src shift));
  1.4135 +  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
  1.4136 +  ins_encode %{
  1.4137 +    bool vector256 = false;
  1.4138 +    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4139 +  %}
  1.4140 +  ins_pipe( pipe_slow );
  1.4141 +%}
  1.4142 +
  1.4143 +instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
  1.4144 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.4145 +  match(Set dst (LShiftVI src shift));
  1.4146 +  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
  1.4147 +  ins_encode %{
  1.4148 +    bool vector256 = true;
  1.4149 +    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4150 +  %}
  1.4151 +  ins_pipe( pipe_slow );
  1.4152 +%}
  1.4153 +
  1.4154 +instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  1.4155 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.4156 +  match(Set dst (LShiftVI src shift));
  1.4157 +  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
  1.4158 +  ins_encode %{
  1.4159 +    bool vector256 = true;
  1.4160 +    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4161 +  %}
  1.4162 +  ins_pipe( pipe_slow );
  1.4163 +%}
  1.4164 +
  1.4165 +// Longs vector left shift
  1.4166 +instruct vsll2L(vecX dst, vecS shift) %{
  1.4167 +  predicate(n->as_Vector()->length() == 2);
  1.4168 +  match(Set dst (LShiftVL dst shift));
  1.4169 +  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
  1.4170 +  ins_encode %{
  1.4171 +    __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
  1.4172 +  %}
  1.4173 +  ins_pipe( pipe_slow );
  1.4174 +%}
  1.4175 +
  1.4176 +instruct vsll2L_imm(vecX dst, immI8 shift) %{
  1.4177 +  predicate(n->as_Vector()->length() == 2);
  1.4178 +  match(Set dst (LShiftVL dst shift));
  1.4179 +  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
  1.4180 +  ins_encode %{
  1.4181 +    __ psllq($dst$$XMMRegister, (int)$shift$$constant);
  1.4182 +  %}
  1.4183 +  ins_pipe( pipe_slow );
  1.4184 +%}
  1.4185 +
  1.4186 +instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
  1.4187 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4188 +  match(Set dst (LShiftVL src shift));
  1.4189 +  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
  1.4190 +  ins_encode %{
  1.4191 +    bool vector256 = false;
  1.4192 +    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4193 +  %}
  1.4194 +  ins_pipe( pipe_slow );
  1.4195 +%}
  1.4196 +
  1.4197 +instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
  1.4198 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4199 +  match(Set dst (LShiftVL src shift));
  1.4200 +  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
  1.4201 +  ins_encode %{
  1.4202 +    bool vector256 = false;
  1.4203 +    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4204 +  %}
  1.4205 +  ins_pipe( pipe_slow );
  1.4206 +%}
  1.4207 +
  1.4208 +instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
  1.4209 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  1.4210 +  match(Set dst (LShiftVL src shift));
  1.4211 +  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
  1.4212 +  ins_encode %{
  1.4213 +    bool vector256 = true;
  1.4214 +    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4215 +  %}
  1.4216 +  ins_pipe( pipe_slow );
  1.4217 +%}
  1.4218 +
  1.4219 +instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
  1.4220 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  1.4221 +  match(Set dst (LShiftVL src shift));
  1.4222 +  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
  1.4223 +  ins_encode %{
  1.4224 +    bool vector256 = true;
  1.4225 +    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4226 +  %}
  1.4227 +  ins_pipe( pipe_slow );
  1.4228 +%}
  1.4229 +
  1.4230 +// ----------------------- LogicalRightShift -----------------------------------
  1.4231 +
  1.4232 +// A logical right shift of a short vector would produce an incorrect Java
  1.4233 +// result for negative values, because Java code converts a short to an int
  1.4234 +// with sign extension before shifting. Char vectors are fine, though, since
  1.4235 +// char values are unsigned.
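// A small Java illustration (a sketch, not part of the original file) of why
// only the char case is safe:
//
//   short s = -1;                  // bit pattern 0xFFFF
//   short r = (short)(s >>> 3);    // int path: 0xFFFFFFFF >>> 3 = 0x1FFFFFFF,
//                                  // truncated to 0xFFFF, i.e. still -1
//   // a packed 16-bit logical shift (psrlw by 3) would give 0x1FFF instead
//
//   char c = 0xFFFF;
//   char q = (char)(c >>> 3);      // 0x0000FFFF >>> 3 = 0x1FFF, which matches
//                                  // the packed 16-bit shift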
  1.4236 +
  1.4237 +instruct vsrl2S(vecS dst, vecS shift) %{
  1.4238 +  predicate(n->as_Vector()->length() == 2);
  1.4239 +  match(Set dst (URShiftVS dst shift));
  1.4240 +  format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
  1.4241 +  ins_encode %{
  1.4242 +    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  1.4243 +  %}
  1.4244 +  ins_pipe( pipe_slow );
  1.4245 +%}
  1.4246 +
  1.4247 +instruct vsrl2S_imm(vecS dst, immI8 shift) %{
  1.4248 +  predicate(n->as_Vector()->length() == 2);
  1.4249 +  match(Set dst (URShiftVS dst shift));
  1.4250 +  format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
  1.4251 +  ins_encode %{
  1.4252 +    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  1.4253 +  %}
  1.4254 +  ins_pipe( pipe_slow );
  1.4255 +%}
  1.4256 +
  1.4257 +instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
  1.4258 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4259 +  match(Set dst (URShiftVS src shift));
  1.4260 +  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  1.4261 +  ins_encode %{
  1.4262 +    bool vector256 = false;
  1.4263 +    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4264 +  %}
  1.4265 +  ins_pipe( pipe_slow );
  1.4266 +%}
  1.4267 +
  1.4268 +instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
  1.4269 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4270 +  match(Set dst (URShiftVS src shift));
  1.4271 +  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  1.4272 +  ins_encode %{
  1.4273 +    bool vector256 = false;
  1.4274 +    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4275 +  %}
  1.4276 +  ins_pipe( pipe_slow );
  1.4277 +%}
  1.4278 +
  1.4279 +instruct vsrl4S(vecD dst, vecS shift) %{
  1.4280 +  predicate(n->as_Vector()->length() == 4);
  1.4281 +  match(Set dst (URShiftVS dst shift));
  1.4282 +  format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
  1.4283 +  ins_encode %{
  1.4284 +    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  1.4285 +  %}
  1.4286 +  ins_pipe( pipe_slow );
  1.4287 +%}
  1.4288 +
  1.4289 +instruct vsrl4S_imm(vecD dst, immI8 shift) %{
  1.4290 +  predicate(n->as_Vector()->length() == 4);
  1.4291 +  match(Set dst (URShiftVS dst shift));
  1.4292 +  format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
  1.4293 +  ins_encode %{
  1.4294 +    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  1.4295 +  %}
  1.4296 +  ins_pipe( pipe_slow );
  1.4297 +%}
  1.4298 +
  1.4299 +instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
  1.4300 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.4301 +  match(Set dst (URShiftVS src shift));
  1.4302 +  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  1.4303 +  ins_encode %{
  1.4304 +    bool vector256 = false;
  1.4305 +    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4306 +  %}
  1.4307 +  ins_pipe( pipe_slow );
  1.4308 +%}
  1.4309 +
  1.4310 +instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
  1.4311 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.4312 +  match(Set dst (URShiftVS src shift));
  1.4313 +  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  1.4314 +  ins_encode %{
  1.4315 +    bool vector256 = false;
  1.4316 +    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4317 +  %}
  1.4318 +  ins_pipe( pipe_slow );
  1.4319 +%}
  1.4320 +
  1.4321 +instruct vsrl8S(vecX dst, vecS shift) %{
  1.4322 +  predicate(n->as_Vector()->length() == 8);
  1.4323 +  match(Set dst (URShiftVS dst shift));
  1.4324 +  format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
  1.4325 +  ins_encode %{
  1.4326 +    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  1.4327 +  %}
  1.4328 +  ins_pipe( pipe_slow );
  1.4329 +%}
  1.4330 +
  1.4331 +instruct vsrl8S_imm(vecX dst, immI8 shift) %{
  1.4332 +  predicate(n->as_Vector()->length() == 8);
  1.4333 +  match(Set dst (URShiftVS dst shift));
  1.4334 +  format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
  1.4335 +  ins_encode %{
  1.4336 +    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  1.4337 +  %}
  1.4338 +  ins_pipe( pipe_slow );
  1.4339 +%}
  1.4340 +
  1.4341 +instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
  1.4342 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.4343 +  match(Set dst (URShiftVS src shift));
  1.4344 +  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  1.4345 +  ins_encode %{
  1.4346 +    bool vector256 = false;
  1.4347 +    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4348 +  %}
  1.4349 +  ins_pipe( pipe_slow );
  1.4350 +%}
  1.4351 +
  1.4352 +instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
  1.4353 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.4354 +  match(Set dst (URShiftVS src shift));
  1.4355 +  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  1.4356 +  ins_encode %{
  1.4357 +    bool vector256 = false;
  1.4358 +    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4359 +  %}
  1.4360 +  ins_pipe( pipe_slow );
  1.4361 +%}
  1.4362 +
  1.4363 +instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
  1.4364 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.4365 +  match(Set dst (URShiftVS src shift));
  1.4366 +  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  1.4367 +  ins_encode %{
  1.4368 +    bool vector256 = true;
  1.4369 +    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4370 +  %}
  1.4371 +  ins_pipe( pipe_slow );
  1.4372 +%}
  1.4373 +
  1.4374 +instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
  1.4375 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.4376 +  match(Set dst (URShiftVS src shift));
  1.4377 +  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  1.4378 +  ins_encode %{
  1.4379 +    bool vector256 = true;
  1.4380 +    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4381 +  %}
  1.4382 +  ins_pipe( pipe_slow );
  1.4383 +%}
  1.4384 +
  1.4385 +// Integers vector logical right shift
  1.4386 +instruct vsrl2I(vecD dst, vecS shift) %{
  1.4387 +  predicate(n->as_Vector()->length() == 2);
  1.4388 +  match(Set dst (URShiftVI dst shift));
  1.4389 +  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
  1.4390 +  ins_encode %{
  1.4391 +    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
  1.4392 +  %}
  1.4393 +  ins_pipe( pipe_slow );
  1.4394 +%}
  1.4395 +
  1.4396 +instruct vsrl2I_imm(vecD dst, immI8 shift) %{
  1.4397 +  predicate(n->as_Vector()->length() == 2);
  1.4398 +  match(Set dst (URShiftVI dst shift));
  1.4399 +  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
  1.4400 +  ins_encode %{
  1.4401 +    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
  1.4402 +  %}
  1.4403 +  ins_pipe( pipe_slow );
  1.4404 +%}
  1.4405 +
  1.4406 +instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
  1.4407 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4408 +  match(Set dst (URShiftVI src shift));
  1.4409 +  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
  1.4410 +  ins_encode %{
  1.4411 +    bool vector256 = false;
  1.4412 +    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4413 +  %}
  1.4414 +  ins_pipe( pipe_slow );
  1.4415 +%}
  1.4416 +
  1.4417 +instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  1.4418 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4419 +  match(Set dst (URShiftVI src shift));
  1.4420 +  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
  1.4421 +  ins_encode %{
  1.4422 +    bool vector256 = false;
  1.4423 +    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4424 +  %}
  1.4425 +  ins_pipe( pipe_slow );
  1.4426 +%}
  1.4427 +
  1.4428 +instruct vsrl4I(vecX dst, vecS shift) %{
  1.4429 +  predicate(n->as_Vector()->length() == 4);
  1.4430 +  match(Set dst (URShiftVI dst shift));
  1.4431 +  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
  1.4432 +  ins_encode %{
  1.4433 +    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
  1.4434 +  %}
  1.4435 +  ins_pipe( pipe_slow );
  1.4436 +%}
  1.4437 +
  1.4438 +instruct vsrl4I_imm(vecX dst, immI8 shift) %{
  1.4439 +  predicate(n->as_Vector()->length() == 4);
  1.4440 +  match(Set dst (URShiftVI dst shift));
  1.4441 +  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
  1.4442 +  ins_encode %{
  1.4443 +    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
  1.4444 +  %}
  1.4445 +  ins_pipe( pipe_slow );
  1.4446 +%}
  1.4447 +
  1.4448 +instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
  1.4449 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.4450 +  match(Set dst (URShiftVI src shift));
  1.4451 +  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
  1.4452 +  ins_encode %{
  1.4453 +    bool vector256 = false;
  1.4454 +    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4455 +  %}
  1.4456 +  ins_pipe( pipe_slow );
  1.4457 +%}
  1.4458 +
  1.4459 +instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  1.4460 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.4461 +  match(Set dst (URShiftVI src shift));
  1.4462 +  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
  1.4463 +  ins_encode %{
  1.4464 +    bool vector256 = false;
  1.4465 +    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4466 +  %}
  1.4467 +  ins_pipe( pipe_slow );
  1.4468 +%}
  1.4469 +
  1.4470 +instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
  1.4471 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.4472 +  match(Set dst (URShiftVI src shift));
  1.4473 +  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
  1.4474 +  ins_encode %{
  1.4475 +    bool vector256 = true;
  1.4476 +    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4477 +  %}
  1.4478 +  ins_pipe( pipe_slow );
  1.4479 +%}
  1.4480 +
  1.4481 +instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  1.4482 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.4483 +  match(Set dst (URShiftVI src shift));
  1.4484 +  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
  1.4485 +  ins_encode %{
  1.4486 +    bool vector256 = true;
  1.4487 +    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4488 +  %}
  1.4489 +  ins_pipe( pipe_slow );
  1.4490 +%}
  1.4491 +
  1.4492 +// Longs vector logical right shift
  1.4493 +instruct vsrl2L(vecX dst, vecS shift) %{
  1.4494 +  predicate(n->as_Vector()->length() == 2);
  1.4495 +  match(Set dst (URShiftVL dst shift));
  1.4496 +  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
  1.4497 +  ins_encode %{
  1.4498 +    __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
  1.4499 +  %}
  1.4500 +  ins_pipe( pipe_slow );
  1.4501 +%}
  1.4502 +
  1.4503 +instruct vsrl2L_imm(vecX dst, immI8 shift) %{
  1.4504 +  predicate(n->as_Vector()->length() == 2);
  1.4505 +  match(Set dst (URShiftVL dst shift));
  1.4506 +  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
  1.4507 +  ins_encode %{
  1.4508 +    __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
  1.4509 +  %}
  1.4510 +  ins_pipe( pipe_slow );
  1.4511 +%}
  1.4512 +
  1.4513 +instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
  1.4514 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4515 +  match(Set dst (URShiftVL src shift));
  1.4516 +  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
  1.4517 +  ins_encode %{
  1.4518 +    bool vector256 = false;
  1.4519 +    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4520 +  %}
  1.4521 +  ins_pipe( pipe_slow );
  1.4522 +%}
  1.4523 +
  1.4524 +instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
  1.4525 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4526 +  match(Set dst (URShiftVL src shift));
  1.4527 +  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
  1.4528 +  ins_encode %{
  1.4529 +    bool vector256 = false;
  1.4530 +    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4531 +  %}
  1.4532 +  ins_pipe( pipe_slow );
  1.4533 +%}
  1.4534 +
  1.4535 +instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
  1.4536 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  1.4537 +  match(Set dst (URShiftVL src shift));
  1.4538 +  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
  1.4539 +  ins_encode %{
  1.4540 +    bool vector256 = true;
  1.4541 +    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4542 +  %}
  1.4543 +  ins_pipe( pipe_slow );
  1.4544 +%}
  1.4545 +
  1.4546 +instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
  1.4547 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  1.4548 +  match(Set dst (URShiftVL src shift));
  1.4549 +  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
  1.4550 +  ins_encode %{
  1.4551 +    bool vector256 = true;
  1.4552 +    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4553 +  %}
  1.4554 +  ins_pipe( pipe_slow );
  1.4555 +%}
  1.4556 +
  1.4557 +// ------------------- ArithmeticRightShift -----------------------------------
  1.4558 +
  1.4559 +// Shorts/Chars vector arithmetic right shift
  1.4560 +instruct vsra2S(vecS dst, vecS shift) %{
  1.4561 +  predicate(n->as_Vector()->length() == 2);
  1.4562 +  match(Set dst (RShiftVS dst shift));
  1.4563 +  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
  1.4564 +  ins_encode %{
  1.4565 +    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  1.4566 +  %}
  1.4567 +  ins_pipe( pipe_slow );
  1.4568 +%}
  1.4569 +
  1.4570 +instruct vsra2S_imm(vecS dst, immI8 shift) %{
  1.4571 +  predicate(n->as_Vector()->length() == 2);
  1.4572 +  match(Set dst (RShiftVS dst shift));
  1.4573 +  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
  1.4574 +  ins_encode %{
  1.4575 +    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  1.4576 +  %}
  1.4577 +  ins_pipe( pipe_slow );
  1.4578 +%}
  1.4579 +
  1.4580 +instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
  1.4581 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4582 +  match(Set dst (RShiftVS src shift));
  1.4583 +  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  1.4584 +  ins_encode %{
  1.4585 +    bool vector256 = false;
  1.4586 +    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4587 +  %}
  1.4588 +  ins_pipe( pipe_slow );
  1.4589 +%}
  1.4590 +
  1.4591 +instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
  1.4592 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4593 +  match(Set dst (RShiftVS src shift));
  1.4594 +  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  1.4595 +  ins_encode %{
  1.4596 +    bool vector256 = false;
  1.4597 +    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4598 +  %}
  1.4599 +  ins_pipe( pipe_slow );
  1.4600 +%}
  1.4601 +
  1.4602 +instruct vsra4S(vecD dst, vecS shift) %{
  1.4603 +  predicate(n->as_Vector()->length() == 4);
  1.4604 +  match(Set dst (RShiftVS dst shift));
  1.4605 +  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
  1.4606 +  ins_encode %{
  1.4607 +    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  1.4608 +  %}
  1.4609 +  ins_pipe( pipe_slow );
  1.4610 +%}
  1.4611 +
  1.4612 +instruct vsra4S_imm(vecD dst, immI8 shift) %{
  1.4613 +  predicate(n->as_Vector()->length() == 4);
  1.4614 +  match(Set dst (RShiftVS dst shift));
  1.4615 +  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
  1.4616 +  ins_encode %{
  1.4617 +    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  1.4618 +  %}
  1.4619 +  ins_pipe( pipe_slow );
  1.4620 +%}
  1.4621 +
  1.4622 +instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
  1.4623 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.4624 +  match(Set dst (RShiftVS src shift));
  1.4625 +  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  1.4626 +  ins_encode %{
  1.4627 +    bool vector256 = false;
  1.4628 +    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4629 +  %}
  1.4630 +  ins_pipe( pipe_slow );
  1.4631 +%}
  1.4632 +
  1.4633 +instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
  1.4634 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.4635 +  match(Set dst (RShiftVS src shift));
  1.4636 +  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  1.4637 +  ins_encode %{
  1.4638 +    bool vector256 = false;
  1.4639 +    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4640 +  %}
  1.4641 +  ins_pipe( pipe_slow );
  1.4642 +%}
  1.4643 +
  1.4644 +instruct vsra8S(vecX dst, vecS shift) %{
  1.4645 +  predicate(n->as_Vector()->length() == 8);
  1.4646 +  match(Set dst (RShiftVS dst shift));
  1.4647 +  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
  1.4648 +  ins_encode %{
  1.4649 +    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  1.4650 +  %}
  1.4651 +  ins_pipe( pipe_slow );
  1.4652 +%}
  1.4653 +
  1.4654 +instruct vsra8S_imm(vecX dst, immI8 shift) %{
  1.4655 +  predicate(n->as_Vector()->length() == 8);
  1.4656 +  match(Set dst (RShiftVS dst shift));
  1.4657 +  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
  1.4658 +  ins_encode %{
  1.4659 +    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  1.4660 +  %}
  1.4661 +  ins_pipe( pipe_slow );
  1.4662 +%}
  1.4663 +
  1.4664 +instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
  1.4665 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.4666 +  match(Set dst (RShiftVS src shift));
  1.4667 +  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  1.4668 +  ins_encode %{
  1.4669 +    bool vector256 = false;
  1.4670 +    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4671 +  %}
  1.4672 +  ins_pipe( pipe_slow );
  1.4673 +%}
  1.4674 +
  1.4675 +instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
  1.4676 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  1.4677 +  match(Set dst (RShiftVS src shift));
  1.4678 +  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  1.4679 +  ins_encode %{
  1.4680 +    bool vector256 = false;
  1.4681 +    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4682 +  %}
  1.4683 +  ins_pipe( pipe_slow );
  1.4684 +%}
  1.4685 +
  1.4686 +instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
  1.4687 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.4688 +  match(Set dst (RShiftVS src shift));
  1.4689 +  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  1.4690 +  ins_encode %{
  1.4691 +    bool vector256 = true;
  1.4692 +    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4693 +  %}
  1.4694 +  ins_pipe( pipe_slow );
  1.4695 +%}
  1.4696 +
  1.4697 +instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
  1.4698 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  1.4699 +  match(Set dst (RShiftVS src shift));
  1.4700 +  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  1.4701 +  ins_encode %{
  1.4702 +    bool vector256 = true;
  1.4703 +    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4704 +  %}
  1.4705 +  ins_pipe( pipe_slow );
  1.4706 +%}
  1.4707 +
  1.4708 +// Integers vector arithmetic right shift
  1.4709 +instruct vsra2I(vecD dst, vecS shift) %{
  1.4710 +  predicate(n->as_Vector()->length() == 2);
  1.4711 +  match(Set dst (RShiftVI dst shift));
  1.4712 +  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
  1.4713 +  ins_encode %{
  1.4714 +    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
  1.4715 +  %}
  1.4716 +  ins_pipe( pipe_slow );
  1.4717 +%}
  1.4718 +
  1.4719 +instruct vsra2I_imm(vecD dst, immI8 shift) %{
  1.4720 +  predicate(n->as_Vector()->length() == 2);
  1.4721 +  match(Set dst (RShiftVI dst shift));
  1.4722 +  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
  1.4723 +  ins_encode %{
  1.4724 +    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
  1.4725 +  %}
  1.4726 +  ins_pipe( pipe_slow );
  1.4727 +%}
  1.4728 +
  1.4729 +instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
  1.4730 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4731 +  match(Set dst (RShiftVI src shift));
  1.4732 +  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
  1.4733 +  ins_encode %{
  1.4734 +    bool vector256 = false;
  1.4735 +    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4736 +  %}
  1.4737 +  ins_pipe( pipe_slow );
  1.4738 +%}
  1.4739 +
  1.4740 +instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  1.4741 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  1.4742 +  match(Set dst (RShiftVI src shift));
  1.4743 +  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
  1.4744 +  ins_encode %{
  1.4745 +    bool vector256 = false;
  1.4746 +    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4747 +  %}
  1.4748 +  ins_pipe( pipe_slow );
  1.4749 +%}
  1.4750 +
  1.4751 +instruct vsra4I(vecX dst, vecS shift) %{
  1.4752 +  predicate(n->as_Vector()->length() == 4);
  1.4753 +  match(Set dst (RShiftVI dst shift));
  1.4754 +  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
  1.4755 +  ins_encode %{
  1.4756 +    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
  1.4757 +  %}
  1.4758 +  ins_pipe( pipe_slow );
  1.4759 +%}
  1.4760 +
  1.4761 +instruct vsra4I_imm(vecX dst, immI8 shift) %{
  1.4762 +  predicate(n->as_Vector()->length() == 4);
  1.4763 +  match(Set dst (RShiftVI dst shift));
  1.4764 +  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
  1.4765 +  ins_encode %{
  1.4766 +    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
  1.4767 +  %}
  1.4768 +  ins_pipe( pipe_slow );
  1.4769 +%}
  1.4770 +
  1.4771 +instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
  1.4772 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.4773 +  match(Set dst (RShiftVI src shift));
  1.4774 +  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
  1.4775 +  ins_encode %{
  1.4776 +    bool vector256 = false;
  1.4777 +    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4778 +  %}
  1.4779 +  ins_pipe( pipe_slow );
  1.4780 +%}
  1.4781 +
  1.4782 +instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  1.4783 +  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  1.4784 +  match(Set dst (RShiftVI src shift));
  1.4785 +  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
  1.4786 +  ins_encode %{
  1.4787 +    bool vector256 = false;
  1.4788 +    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4789 +  %}
  1.4790 +  ins_pipe( pipe_slow );
  1.4791 +%}
  1.4792 +
  1.4793 +instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
  1.4794 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.4795 +  match(Set dst (RShiftVI src shift));
  1.4796 +  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
  1.4797 +  ins_encode %{
  1.4798 +    bool vector256 = true;
  1.4799 +    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  1.4800 +  %}
  1.4801 +  ins_pipe( pipe_slow );
  1.4802 +%}
  1.4803 +
  1.4804 +instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  1.4805 +  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  1.4806 +  match(Set dst (RShiftVI src shift));
  1.4807 +  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
  1.4808 +  ins_encode %{
  1.4809 +    bool vector256 = true;
  1.4810 +    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  1.4811 +  %}
  1.4812 +  ins_pipe( pipe_slow );
  1.4813 +%}
  1.4814 +
  1.4815 +// There are no vector arithmetic right shift instructions for longs.
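// (Hedged note, not part of the original file: SSE2/AVX2 provide psraw and
// psrad but no 64-bit arithmetic right shift, so a long `x >> n` cannot be
// matched by a packed rule here; a 64-bit vpsraq appears only in later
// AVX-512 instruction sets.)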
  1.4816 +
  1.4817 +
  1.4818 +// --------------------------------- AND --------------------------------------
  1.4819 +
  1.4820 +instruct vand4B(vecS dst, vecS src) %{
  1.4821 +  predicate(n->as_Vector()->length_in_bytes() == 4);
  1.4822 +  match(Set dst (AndV dst src));
  1.4823 +  format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
  1.4824 +  ins_encode %{
  1.4825 +    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  1.4826 +  %}
  1.4827 +  ins_pipe( pipe_slow );
  1.4828 +%}
  1.4829 +
  1.4830 +instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
  1.4831 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  1.4832 +  match(Set dst (AndV src1 src2));
  1.4833 +  format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
  1.4834 +  ins_encode %{
  1.4835 +    bool vector256 = false;
  1.4836 +    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.4837 +  %}
  1.4838 +  ins_pipe( pipe_slow );
  1.4839 +%}
  1.4840 +
  1.4841 +instruct vand8B(vecD dst, vecD src) %{
  1.4842 +  predicate(n->as_Vector()->length_in_bytes() == 8);
  1.4843 +  match(Set dst (AndV dst src));
  1.4844 +  format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
  1.4845 +  ins_encode %{
  1.4846 +    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  1.4847 +  %}
  1.4848 +  ins_pipe( pipe_slow );
  1.4849 +%}
  1.4850 +
  1.4851 +instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
  1.4852 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  1.4853 +  match(Set dst (AndV src1 src2));
  1.4854 +  format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
  1.4855 +  ins_encode %{
  1.4856 +    bool vector256 = false;
  1.4857 +    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.4858 +  %}
  1.4859 +  ins_pipe( pipe_slow );
  1.4860 +%}
  1.4861 +
  1.4862 +instruct vand16B(vecX dst, vecX src) %{
  1.4863 +  predicate(n->as_Vector()->length_in_bytes() == 16);
  1.4864 +  match(Set dst (AndV dst src));
  1.4865 +  format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
  1.4866 +  ins_encode %{
  1.4867 +    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  1.4868 +  %}
  1.4869 +  ins_pipe( pipe_slow );
  1.4870 +%}
  1.4871 +
  1.4872 +instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
  1.4873 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  1.4874 +  match(Set dst (AndV src1 src2));
  1.4875 +  format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
  1.4876 +  ins_encode %{
  1.4877 +    bool vector256 = false;
  1.4878 +    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.4879 +  %}
  1.4880 +  ins_pipe( pipe_slow );
  1.4881 +%}
  1.4882 +
  1.4883 +instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
  1.4884 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  1.4885 +  match(Set dst (AndV src (LoadVector mem)));
  1.4886 +  format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
  1.4887 +  ins_encode %{
  1.4888 +    bool vector256 = false;
  1.4889 +    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.4890 +  %}
  1.4891 +  ins_pipe( pipe_slow );
  1.4892 +%}
  1.4893 +
  1.4894 +instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
  1.4895 +  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  1.4896 +  match(Set dst (AndV src1 src2));
  1.4897 +  format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
  1.4898 +  ins_encode %{
  1.4899 +    bool vector256 = true;
  1.4900 +    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.4901 +  %}
  1.4902 +  ins_pipe( pipe_slow );
  1.4903 +%}
  1.4904 +
  1.4905 +instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
  1.4906 +  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  1.4907 +  match(Set dst (AndV src (LoadVector mem)));
  1.4908 +  format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
  1.4909 +  ins_encode %{
  1.4910 +    bool vector256 = true;
  1.4911 +    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.4912 +  %}
  1.4913 +  ins_pipe( pipe_slow );
  1.4914 +%}
  1.4915 +
  1.4916 +// --------------------------------- OR ---------------------------------------
  1.4917 +
  1.4918 +instruct vor4B(vecS dst, vecS src) %{
  1.4919 +  predicate(n->as_Vector()->length_in_bytes() == 4);
  1.4920 +  match(Set dst (OrV dst src));
  1.4921 +  format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
  1.4922 +  ins_encode %{
  1.4923 +    __ por($dst$$XMMRegister, $src$$XMMRegister);
  1.4924 +  %}
  1.4925 +  ins_pipe( pipe_slow );
  1.4926 +%}
  1.4927 +
  1.4928 +instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
  1.4929 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  1.4930 +  match(Set dst (OrV src1 src2));
  1.4931 +  format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
  1.4932 +  ins_encode %{
  1.4933 +    bool vector256 = false;
  1.4934 +    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.4935 +  %}
  1.4936 +  ins_pipe( pipe_slow );
  1.4937 +%}
  1.4938 +
  1.4939 +instruct vor8B(vecD dst, vecD src) %{
  1.4940 +  predicate(n->as_Vector()->length_in_bytes() == 8);
  1.4941 +  match(Set dst (OrV dst src));
  1.4942 +  format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
  1.4943 +  ins_encode %{
  1.4944 +    __ por($dst$$XMMRegister, $src$$XMMRegister);
  1.4945 +  %}
  1.4946 +  ins_pipe( pipe_slow );
  1.4947 +%}
  1.4948 +
  1.4949 +instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
  1.4950 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  1.4951 +  match(Set dst (OrV src1 src2));
  1.4952 +  format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
  1.4953 +  ins_encode %{
  1.4954 +    bool vector256 = false;
  1.4955 +    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.4956 +  %}
  1.4957 +  ins_pipe( pipe_slow );
  1.4958 +%}
  1.4959 +
  1.4960 +instruct vor16B(vecX dst, vecX src) %{
  1.4961 +  predicate(n->as_Vector()->length_in_bytes() == 16);
  1.4962 +  match(Set dst (OrV dst src));
  1.4963 +  format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
  1.4964 +  ins_encode %{
  1.4965 +    __ por($dst$$XMMRegister, $src$$XMMRegister);
  1.4966 +  %}
  1.4967 +  ins_pipe( pipe_slow );
  1.4968 +%}
  1.4969 +
  1.4970 +instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
  1.4971 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  1.4972 +  match(Set dst (OrV src1 src2));
  1.4973 +  format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
  1.4974 +  ins_encode %{
  1.4975 +    bool vector256 = false;
  1.4976 +    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.4977 +  %}
  1.4978 +  ins_pipe( pipe_slow );
  1.4979 +%}
  1.4980 +
  1.4981 +instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
  1.4982 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  1.4983 +  match(Set dst (OrV src (LoadVector mem)));
  1.4984 +  format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
  1.4985 +  ins_encode %{
  1.4986 +    bool vector256 = false;
  1.4987 +    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.4988 +  %}
  1.4989 +  ins_pipe( pipe_slow );
  1.4990 +%}
  1.4991 +
  1.4992 +instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
  1.4993 +  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  1.4994 +  match(Set dst (OrV src1 src2));
  1.4995 +  format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
  1.4996 +  ins_encode %{
  1.4997 +    bool vector256 = true;
  1.4998 +    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.4999 +  %}
  1.5000 +  ins_pipe( pipe_slow );
  1.5001 +%}
  1.5002 +
  1.5003 +instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
  1.5004 +  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  1.5005 +  match(Set dst (OrV src (LoadVector mem)));
  1.5006 +  format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
  1.5007 +  ins_encode %{
  1.5008 +    bool vector256 = true;
  1.5009 +    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.5010 +  %}
  1.5011 +  ins_pipe( pipe_slow );
  1.5012 +%}
  1.5013 +
  1.5014 +// --------------------------------- XOR --------------------------------------
  1.5015 +
  1.5016 +instruct vxor4B(vecS dst, vecS src) %{
  1.5017 +  predicate(n->as_Vector()->length_in_bytes() == 4);
  1.5018 +  match(Set dst (XorV dst src));
  1.5019 +  format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
  1.5020 +  ins_encode %{
  1.5021 +    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  1.5022 +  %}
  1.5023 +  ins_pipe( pipe_slow );
  1.5024 +%}
  1.5025 +
  1.5026 +instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
  1.5027 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  1.5028 +  match(Set dst (XorV src1 src2));
  1.5029 +  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
  1.5030 +  ins_encode %{
  1.5031 +    bool vector256 = false;
  1.5032 +    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.5033 +  %}
  1.5034 +  ins_pipe( pipe_slow );
  1.5035 +%}
  1.5036 +
  1.5037 +instruct vxor8B(vecD dst, vecD src) %{
  1.5038 +  predicate(n->as_Vector()->length_in_bytes() == 8);
  1.5039 +  match(Set dst (XorV dst src));
  1.5040 +  format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
  1.5041 +  ins_encode %{
  1.5042 +    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  1.5043 +  %}
  1.5044 +  ins_pipe( pipe_slow );
  1.5045 +%}
  1.5046 +
  1.5047 +instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
  1.5048 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  1.5049 +  match(Set dst (XorV src1 src2));
  1.5050 +  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
  1.5051 +  ins_encode %{
  1.5052 +    bool vector256 = false;
  1.5053 +    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.5054 +  %}
  1.5055 +  ins_pipe( pipe_slow );
  1.5056 +%}
  1.5057 +
  1.5058 +instruct vxor16B(vecX dst, vecX src) %{
  1.5059 +  predicate(n->as_Vector()->length_in_bytes() == 16);
  1.5060 +  match(Set dst (XorV dst src));
  1.5061 +  format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
  1.5062 +  ins_encode %{
  1.5063 +    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  1.5064 +  %}
  1.5065 +  ins_pipe( pipe_slow );
  1.5066 +%}
  1.5067 +
  1.5068 +instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
  1.5069 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  1.5070 +  match(Set dst (XorV src1 src2));
  1.5071 +  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
  1.5072 +  ins_encode %{
  1.5073 +    bool vector256 = false;
  1.5074 +    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.5075 +  %}
  1.5076 +  ins_pipe( pipe_slow );
  1.5077 +%}
  1.5078 +
  1.5079 +instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
  1.5080 +  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  1.5081 +  match(Set dst (XorV src (LoadVector mem)));
  1.5082 +  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
  1.5083 +  ins_encode %{
  1.5084 +    bool vector256 = false;
  1.5085 +    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.5086 +  %}
  1.5087 +  ins_pipe( pipe_slow );
  1.5088 +%}
  1.5089 +
  1.5090 +instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
  1.5091 +  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  1.5092 +  match(Set dst (XorV src1 src2));
  1.5093 +  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
  1.5094 +  ins_encode %{
  1.5095 +    bool vector256 = true;
  1.5096 +    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  1.5097 +  %}
  1.5098 +  ins_pipe( pipe_slow );
  1.5099 +%}
  1.5100 +
  1.5101 +instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
  1.5102 +  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  1.5103 +  match(Set dst (XorV src (LoadVector mem)));
  1.5104 +  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
  1.5105 +  ins_encode %{
  1.5106 +    bool vector256 = true;
  1.5107 +    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  1.5108 +  %}
  1.5109 +  ins_pipe( pipe_slow );
  1.5110 +%}
  1.5111 +
