--- a/src/cpu/x86/vm/x86.ad	Thu Jun 14 14:59:52 2012 -0700
+++ b/src/cpu/x86/vm/x86.ad	Fri Jun 15 01:25:19 2012 -0700
@@ -24,6 +24,456 @@
 
 // X86 Common Architecture Description File
 
+//----------REGISTER DEFINITION BLOCK------------------------------------------
+// This information is used by the matcher and the register allocator to
+// describe individual registers and classes of registers within the target
+// architecture.
+
+register %{
+//----------Architecture Description Register Definitions----------------------
+// General Registers
+// "reg_def"  name ( register save type, C convention save type,
+//                   ideal register type, encoding );
+// Register Save Types:
+//
+// NS  = No-Save:       The register allocator assumes that these registers
+//                      can be used without saving upon entry to the method, &
+//                      that they do not need to be saved at call sites.
+//
+// SOC = Save-On-Call:  The register allocator assumes that these registers
+//                      can be used without saving upon entry to the method,
+//                      but that they must be saved at call sites.
+//
+// SOE = Save-On-Entry: The register allocator assumes that these registers
+//                      must be saved before using them upon entry to the
+//                      method, but they do not need to be saved at call
+//                      sites.
+//
+// AS  = Always-Save:   The register allocator assumes that these registers
+//                      must be saved before using them upon entry to the
+//                      method, & that they must be saved at call sites.
+//
+// Ideal Register Type is used to determine how to save & restore a
+// register. Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
+// spilled with LoadP/StoreP. If the register supports both, use Op_RegI.
+//
+// The encoding number is the actual bit-pattern placed into the opcodes.
+
+// XMM registers. 256-bit registers of 8 words each, labeled (a)-(h).
+// Word a in each register holds a Float, words ab hold a Double.
+// The whole registers are used in SSE4.2 version intrinsics,
+// array copy stubs and superword operations (see UseSSE42Intrinsics,
+// UseXMMForArrayCopy and UseSuperword flags).
+// XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
+// Linux ABI:   No registers preserved across function calls
+//              XMM0-XMM7 might hold parameters
+// Windows ABI: XMM6-XMM15 preserved across function calls
+//              XMM0-XMM3 might hold parameters
+
+reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
+reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next());
+reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next());
+reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next());
+reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next());
+reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
+reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next());
+reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next());
+reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next());
+reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next());
+reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
+reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next());
+reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next());
+reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next());
+reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next());
+reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
+reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next());
+reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next());
+reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next());
+reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next());
+reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
+reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next());
+reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next());
+reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next());
+reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next());
+reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
+reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next());
+reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next());
+reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next());
+reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next());
+reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#ifdef _WIN64
+
+reg_def XMM6 ( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg());
+reg_def XMM6b( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next());
+reg_def XMM6c( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next());
+reg_def XMM6d( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next());
+reg_def XMM6e( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next());
+reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
+reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next());
+reg_def XMM7c( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next());
+reg_def XMM7d( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next());
+reg_def XMM7e( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next());
+reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
+reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next());
+reg_def XMM8c( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next());
+reg_def XMM8d( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next());
+reg_def XMM8e( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next());
+reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
+reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next());
+reg_def XMM9c( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next());
+reg_def XMM9d( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next());
+reg_def XMM9e( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next());
+reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
+reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next());
+reg_def XMM10c( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next());
+reg_def XMM10d( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next());
+reg_def XMM10e( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next());
+reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
+reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next());
+reg_def XMM11c( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next());
+reg_def XMM11d( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next());
+reg_def XMM11e( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next());
+reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
+reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next());
+reg_def XMM12c( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next());
+reg_def XMM12d( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next());
+reg_def XMM12e( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next());
+reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
+reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next());
+reg_def XMM13c( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next());
+reg_def XMM13d( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next());
+reg_def XMM13e( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next());
+reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
+reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next());
+reg_def XMM14c( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next());
+reg_def XMM14d( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next());
+reg_def XMM14e( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next());
+reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
+reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next());
+reg_def XMM15c( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next());
+reg_def XMM15d( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next());
+reg_def XMM15e( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next());
+reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#else // _WIN64
+
+reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
+reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next());
+reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next());
+reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next());
+reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next());
+reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
+reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next());
+reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next());
+reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next());
+reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next());
+reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#ifdef _LP64
+
+reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
+reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next());
+reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next());
+reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next());
+reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next());
+reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
+reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next());
+reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next());
+reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next());
+reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next());
+reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
+reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next());
+reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next());
+reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next());
+reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next());
+reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
+reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next());
+reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next());
+reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next());
+reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next());
+reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
+reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next());
+reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next());
+reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next());
+reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next());
+reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
+reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next());
+reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next());
+reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next());
+reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next());
+reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
+reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next());
+reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next());
+reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next());
+reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next());
+reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
+reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next());
+reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next());
+reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next());
+reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next());
+reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#endif // _LP64
+
+#endif // _WIN64
+
+#ifdef _LP64
+reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
+#else
+reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
+#endif // _LP64
+
+alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
+                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
+                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
+                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
+                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
+                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
+                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
+                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
+#ifdef _LP64
+                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
+                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
+                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
+                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
+                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
+                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
+                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
+                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
+#endif
+                   );
+
+// flags allocation class should be last.
+alloc_class chunk2(RFLAGS);
+
+// Singleton class for condition codes
+reg_class int_flags(RFLAGS);
+
+// Class for all float registers
+reg_class float_reg(XMM0,
+                    XMM1,
+                    XMM2,
+                    XMM3,
+                    XMM4,
+                    XMM5,
+                    XMM6,
+                    XMM7
+#ifdef _LP64
+                   ,XMM8,
+                    XMM9,
+                    XMM10,
+                    XMM11,
+                    XMM12,
+                    XMM13,
+                    XMM14,
+                    XMM15
+#endif
+                    );
+
+// Class for all double registers
+reg_class double_reg(XMM0,  XMM0b,
+                     XMM1,  XMM1b,
+                     XMM2,  XMM2b,
+                     XMM3,  XMM3b,
+                     XMM4,  XMM4b,
+                     XMM5,  XMM5b,
+                     XMM6,  XMM6b,
+                     XMM7,  XMM7b
+#ifdef _LP64
+                    ,XMM8,  XMM8b,
+                     XMM9,  XMM9b,
+                     XMM10, XMM10b,
+                     XMM11, XMM11b,
+                     XMM12, XMM12b,
+                     XMM13, XMM13b,
+                     XMM14, XMM14b,
+                     XMM15, XMM15b
+#endif
+                     );
+
+// Class for all 32bit vector registers
+reg_class vectors_reg(XMM0,
+                      XMM1,
+                      XMM2,
+                      XMM3,
+                      XMM4,
+                      XMM5,
+                      XMM6,
+                      XMM7
+#ifdef _LP64
+                     ,XMM8,
+                      XMM9,
+                      XMM10,
+                      XMM11,
+                      XMM12,
+                      XMM13,
+                      XMM14,
+                      XMM15
+#endif
+                      );
+
+// Class for all 64bit vector registers
+reg_class vectord_reg(XMM0,  XMM0b,
+                      XMM1,  XMM1b,
+                      XMM2,  XMM2b,
+                      XMM3,  XMM3b,
+                      XMM4,  XMM4b,
+                      XMM5,  XMM5b,
+                      XMM6,  XMM6b,
+                      XMM7,  XMM7b
+#ifdef _LP64
+                     ,XMM8,  XMM8b,
+                      XMM9,  XMM9b,
+                      XMM10, XMM10b,
+                      XMM11, XMM11b,
+                      XMM12, XMM12b,
+                      XMM13, XMM13b,
+                      XMM14, XMM14b,
+                      XMM15, XMM15b
+#endif
+                      );
+
+// Class for all 128bit vector registers
+reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,
+                      XMM10, XMM10b, XMM10c, XMM10d,
+                      XMM11, XMM11b, XMM11c, XMM11d,
+                      XMM12, XMM12b, XMM12c, XMM12d,
+                      XMM13, XMM13b, XMM13c, XMM13d,
+                      XMM14, XMM14b, XMM14c, XMM14d,
+                      XMM15, XMM15b, XMM15c, XMM15d
+#endif
+                      );
+
+// Class for all 256bit vector registers
+reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
+                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
+                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
+                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
+                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
+                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
+                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
+#endif
+                      );
+
+%}
+
 source %{
 // Float masks come from different places depending on platform.
 #ifdef _LP64
@@ -38,6 +488,252 @@
 static address double_signflip() { return (address)double_signflip_pool; }
 #endif
 
+// Map Types to machine register types
+const int Matcher::base2reg[Type::lastype] = {
+  Node::NotAMachineReg,0,0, Op_RegI, Op_RegL, 0, Op_RegN,
+  Node::NotAMachineReg, Node::NotAMachineReg, /* tuple, array */
+  Op_VecS, Op_VecD, Op_VecX, Op_VecY, /* Vectors */
+  Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, /* the pointers */
+  0, 0/*abio*/,
+  Op_RegP /* Return address */, 0, /* the memories */
+  Op_RegF, Op_RegF, Op_RegF, Op_RegD, Op_RegD, Op_RegD,
+  0  /*bottom*/
+};
+
+// Max vector size in bytes. 0 if not supported.
+const int Matcher::vector_width_in_bytes(BasicType bt) {
+  assert(is_java_primitive(bt), "only primitive type vectors");
+  if (UseSSE < 2) return 0;
+  // SSE2 supports 128bit vectors for all types.
+  // AVX2 supports 256bit vectors for all types.
+  int size = (UseAVX > 1) ? 32 : 16;
+  // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
+  if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
+    size = 32;
+  // Use flag to limit vector size.
+  size = MIN2(size,(int)MaxVectorSize);
+  // Minimum 2 values in vector (or 4 for bytes).
+  switch (bt) {
+  case T_DOUBLE:
+  case T_LONG:
+    if (size < 16) return 0;
+    // fall through
+  case T_FLOAT:
+  case T_INT:
+    if (size < 8) return 0;
+    // fall through
+  case T_BOOLEAN:
+  case T_BYTE:
+  case T_CHAR:
+  case T_SHORT:
+    if (size < 4) return 0;
+    break;
+  default:
+    ShouldNotReachHere();
+  }
+  return size;
+}
+
+// Limits on vector size (number of elements) loaded into vector.
+const int Matcher::max_vector_size(const BasicType bt) {
+  return vector_width_in_bytes(bt)/type2aelembytes(bt);
+}
+const int Matcher::min_vector_size(const BasicType bt) {
+  int max_size = max_vector_size(bt);
+  // Min size which can be loaded into vector is 4 bytes.
+  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
+  return MIN2(size,max_size);
+}
+
+// Vector ideal reg corresponding to specified size in bytes
+const int Matcher::vector_ideal_reg(int size) {
+  assert(MaxVectorSize >= size, "");
+  switch(size) {
+    case  4: return Op_VecS;
+    case  8: return Op_VecD;
+    case 16: return Op_VecX;
+    case 32: return Op_VecY;
+  }
+  ShouldNotReachHere();
+  return 0;
+}
+
+// x86 supports misaligned vectors store/load.
+const bool Matcher::misaligned_vectors_ok() {
+  return !AlignVector; // can be changed by flag
+}
+
+// Helper methods for MachSpillCopyNode::implementation().
+static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
+                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
+  // In 64-bit VM size calculation is very complex. Emitting instructions
+  // into scratch buffer is used to get size in 64-bit VM.
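+  // On 32-bit, where do_size can be true, each emitted sequence has a fixed
+  // size, which is returned directly and cross-checked by the debug asserts.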
+  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
+  assert(ireg == Op_VecS || // 32bit vector
+         (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
+         (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
+         "no non-adjacent vector moves" );
+  if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    int offset = __ offset();
+    switch (ireg) {
+    case Op_VecS: // copy whole register
+    case Op_VecD:
+    case Op_VecX:
+      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+      break;
+    case Op_VecY:
+      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+    int size = __ offset() - offset;
+#ifdef ASSERT
+    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
+    assert(!do_size || size == 4, "incorrect size calculation");
+#endif
+    return size;
+#ifndef PRODUCT
+  } else if (!do_size) {
+    switch (ireg) {
+    case Op_VecS:
+    case Op_VecD:
+    case Op_VecX:
+      st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
+      break;
+    case Op_VecY:
+      st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+#endif
+  }
+  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
+  return 4;
+}
+
+static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
+                            int stack_offset, int reg, uint ireg, outputStream* st) {
+  // In 64-bit VM size calculation is very complex. Emitting instructions
+  // into scratch buffer is used to get size in 64-bit VM.
+  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
+  if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    int offset = __ offset();
+    if (is_load) {
+      switch (ireg) {
+      case Op_VecS:
+        __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      case Op_VecD:
+        __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      case Op_VecX:
+        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      case Op_VecY:
+        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    } else { // store
+      switch (ireg) {
+      case Op_VecS:
+        __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      case Op_VecD:
+        __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      case Op_VecX:
+        __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      case Op_VecY:
+        __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    }
+    int size = __ offset() - offset;
+#ifdef ASSERT
+    int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
+    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
+    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
+#endif
+    return size;
+#ifndef PRODUCT
+  } else if (!do_size) {
+    if (is_load) {
+      switch (ireg) {
+      case Op_VecS:
+        st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      case Op_VecD:
+        st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      case Op_VecX:
+        st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      case Op_VecY:
+        st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    } else { // store
+      switch (ireg) {
+      case Op_VecS:
+        st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      case Op_VecD:
+        st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      case Op_VecX:
+        st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      case Op_VecY:
+        st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    }
+#endif
+  }
+  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
+  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
+  return 5+offset_size;
+}
+
+static inline jfloat replicate4_imm(int con, int width) {
+  // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
+  assert(width == 1 || width == 2, "only byte or short types here");
+  int bit_width = width * 8;
+  jint val = con;
+  val &= (1 << bit_width) - 1;  // mask off sign bits
+  while(bit_width < 32) {
+    val |= (val << bit_width);
+    bit_width <<= 1;
+  }
+  jfloat fval = *((jfloat*) &val);  // coerce to float type
+  return fval;
+}
+
+static inline jdouble replicate8_imm(int con, int width) {
+  // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
+  assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
+  int bit_width = width * 8;
+  jlong val = con;
+  val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
+  while(bit_width < 64) {
+    val |= (val << bit_width);
+    bit_width <<= 1;
+  }
+  jdouble dval = *((jdouble*) &val);  // coerce to double type
+  return dval;
+}
+
 #ifndef PRODUCT
 void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
   st->print("nop \t# %d bytes pad for loops and calls", _count);
@@ -103,6 +799,46 @@
 
 %}
 
+
+//----------OPERANDS-----------------------------------------------------------
+// Operand definitions must precede instruction definitions for correct parsing
+// in the ADLC because operands constitute user-defined types which are used in
+// instruction definitions.
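+// Each vector operand below pairs one of the ideal vector types (VecS, VecD,
+// VecX, VecY) with the register class of the same width defined in the
+// register block above, and is matched in a register (REG_INTER).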
+
+// Vectors
+operand vecS() %{
+  constraint(ALLOC_IN_RC(vectors_reg));
+  match(VecS);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg));
+  match(VecD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecY() %{
+  constraint(ALLOC_IN_RC(vectory_reg));
+  match(VecY);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+
 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 
 // ============================================================================
@@ -852,3 +1588,797 @@
   ins_pipe(pipe_slow);
 %}
 
+
+// ====================VECTOR INSTRUCTIONS=====================================
+
+// Load vectors (4 bytes long)
+instruct loadV4(vecS dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 4);
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Load vectors (8 bytes long)
+instruct loadV8(vecD dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 8);
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Load vectors (16 bytes long)
+instruct loadV16(vecX dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 16);
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
+  ins_encode %{
+    __ movdqu($dst$$XMMRegister, $mem$$Address);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Load vectors (32 bytes long)
+instruct loadV32(vecY dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 32);
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
+  ins_encode %{
+    __ vmovdqu($dst$$XMMRegister, $mem$$Address);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Store vectors
+instruct storeV4(memory mem, vecS src) %{
+  predicate(n->as_StoreVector()->memory_size() == 4);
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
+  ins_encode %{
+    __ movdl($mem$$Address, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct storeV8(memory mem, vecD src) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct storeV16(memory mem, vecX src) %{
+  predicate(n->as_StoreVector()->memory_size() == 16);
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
+  ins_encode %{
+    __ movdqu($mem$$Address, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct storeV32(memory mem, vecY src) %{
+  predicate(n->as_StoreVector()->memory_size() == 32);
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
+  ins_encode %{
+    __ vmovdqu($mem$$Address, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate byte scalar to be vector
+instruct Repl4B(vecS dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\t! replicate16B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32);
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate32B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate byte scalar immediate to be vector by loading from const table.
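+// The replicate4_imm()/replicate8_imm() helpers above widen the immediate to
+// 32/64 bits; e.g. replicate8_imm(0x41, 1) yields 0x4141414141414141.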
+instruct Repl4B_imm(vecS dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateB con));
+  format %{ "movss   $dst,[$constantaddress]\t! replicate4B($con)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate8B($con)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateB con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate16B($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32);
+  match(Set dst (ReplicateB con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate32B($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate byte scalar zero to be vector
+instruct Repl4B_zero(vecS dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8B_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl16B_zero(vecX dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl32B_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 32);
+  match(Set dst (ReplicateB zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate32B zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate char/short (2 byte) scalar to be vector
+instruct Repl2S(vecS dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate2S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8S(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\t! replicate8S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate16S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2S_imm(vecS dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateS con));
+  format %{ "movss   $dst,[$constantaddress]\t! replicate2S($con)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate4S($con)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8S_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateS con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate8S($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateS con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate16S($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate char/short (2 byte) scalar zero to be vector
+instruct Repl2S_zero(vecS dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8S_zero(vecX dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl16S_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateS zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate16S zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate integer (4 byte) scalar to be vector
+instruct Repl2I(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2I_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate2I($con)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate4I($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateI con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integer could be loaded into xmm register directly from memory.
+instruct Repl2I_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI mem));
+  format %{ "movd    $dst,$mem\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI mem));
+  format %{ "movd    $dst,$mem\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateI mem));
+  format %{ "movd    $dst,$mem\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate integer (4 byte) scalar zero to be vector
+instruct Repl2I_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI zero));
+  format %{ "pxor    $dst,$dst\t! replicate2I zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_zero(vecX dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI zero));
+  format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8I_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateI zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate8I zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl2L(vecX dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "movlhps $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L(vecY dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "movlhps $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2L_imm(vecX dst, immL con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate2L($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress($con));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_imm(vecY dst, immL con) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate4L($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress($con));
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Long could be loaded into xmm register directly from memory.
+instruct Repl2L_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL mem));
+  format %{ "movq    $dst,$mem\n\t"
+            "movlhps $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL mem));
+  format %{ "movq    $dst,$mem\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t!
replicate4L" %} 1.1417 + ins_encode %{ 1.1418 + __ movq($dst$$XMMRegister, $mem$$Address); 1.1419 + __ movlhps($dst$$XMMRegister, $dst$$XMMRegister); 1.1420 + __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); 1.1421 + %} 1.1422 + ins_pipe( pipe_slow ); 1.1423 +%} 1.1424 + 1.1425 +// Replicate long (8 byte) scalar zero to be vector 1.1426 +instruct Repl2L_zero(vecX dst, immL0 zero) %{ 1.1427 + predicate(n->as_Vector()->length() == 2); 1.1428 + match(Set dst (ReplicateL zero)); 1.1429 + format %{ "pxor $dst,$dst\t! replicate2L zero" %} 1.1430 + ins_encode %{ 1.1431 + __ pxor($dst$$XMMRegister, $dst$$XMMRegister); 1.1432 + %} 1.1433 + ins_pipe( fpu_reg_reg ); 1.1434 +%} 1.1435 + 1.1436 +instruct Repl4L_zero(vecY dst, immL0 zero) %{ 1.1437 + predicate(n->as_Vector()->length() == 4); 1.1438 + match(Set dst (ReplicateL zero)); 1.1439 + format %{ "vxorpd $dst,$dst,$dst\t! replicate4L zero" %} 1.1440 + ins_encode %{ 1.1441 + // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it). 1.1442 + bool vector256 = true; 1.1443 + __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256); 1.1444 + %} 1.1445 + ins_pipe( fpu_reg_reg ); 1.1446 +%} 1.1447 + 1.1448 +// Replicate float (4 byte) scalar to be vector 1.1449 +instruct Repl2F(vecD dst, regF src) %{ 1.1450 + predicate(n->as_Vector()->length() == 2); 1.1451 + match(Set dst (ReplicateF src)); 1.1452 + format %{ "pshufd $dst,$dst,0x00\t! replicate2F" %} 1.1453 + ins_encode %{ 1.1454 + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); 1.1455 + %} 1.1456 + ins_pipe( fpu_reg_reg ); 1.1457 +%} 1.1458 + 1.1459 +instruct Repl4F(vecX dst, regF src) %{ 1.1460 + predicate(n->as_Vector()->length() == 4); 1.1461 + match(Set dst (ReplicateF src)); 1.1462 + format %{ "pshufd $dst,$dst,0x00\t! replicate4F" %} 1.1463 + ins_encode %{ 1.1464 + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); 1.1465 + %} 1.1466 + ins_pipe( pipe_slow ); 1.1467 +%} 1.1468 + 1.1469 +instruct Repl8F(vecY dst, regF src) %{ 1.1470 + predicate(n->as_Vector()->length() == 8); 1.1471 + match(Set dst (ReplicateF src)); 1.1472 + format %{ "pshufd $dst,$src,0x00\n\t" 1.1473 + "vinsertf128h $dst,$dst,$dst\t! replicate8F" %} 1.1474 + ins_encode %{ 1.1475 + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); 1.1476 + __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); 1.1477 + %} 1.1478 + ins_pipe( pipe_slow ); 1.1479 +%} 1.1480 + 1.1481 +// Replicate float (4 byte) scalar zero to be vector 1.1482 +instruct Repl2F_zero(vecD dst, immF0 zero) %{ 1.1483 + predicate(n->as_Vector()->length() == 2); 1.1484 + match(Set dst (ReplicateF zero)); 1.1485 + format %{ "xorps $dst,$dst\t! replicate2F zero" %} 1.1486 + ins_encode %{ 1.1487 + __ xorps($dst$$XMMRegister, $dst$$XMMRegister); 1.1488 + %} 1.1489 + ins_pipe( fpu_reg_reg ); 1.1490 +%} 1.1491 + 1.1492 +instruct Repl4F_zero(vecX dst, immF0 zero) %{ 1.1493 + predicate(n->as_Vector()->length() == 4); 1.1494 + match(Set dst (ReplicateF zero)); 1.1495 + format %{ "xorps $dst,$dst\t! replicate4F zero" %} 1.1496 + ins_encode %{ 1.1497 + __ xorps($dst$$XMMRegister, $dst$$XMMRegister); 1.1498 + %} 1.1499 + ins_pipe( fpu_reg_reg ); 1.1500 +%} 1.1501 + 1.1502 +instruct Repl8F_zero(vecY dst, immF0 zero) %{ 1.1503 + predicate(n->as_Vector()->length() == 8); 1.1504 + match(Set dst (ReplicateF zero)); 1.1505 + format %{ "vxorps $dst,$dst,$dst\t! 
replicate8F zero" %} 1.1506 + ins_encode %{ 1.1507 + bool vector256 = true; 1.1508 + __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256); 1.1509 + %} 1.1510 + ins_pipe( fpu_reg_reg ); 1.1511 +%} 1.1512 + 1.1513 +// Replicate double (8 bytes) scalar to be vector 1.1514 +instruct Repl2D(vecX dst, regD src) %{ 1.1515 + predicate(n->as_Vector()->length() == 2); 1.1516 + match(Set dst (ReplicateD src)); 1.1517 + format %{ "pshufd $dst,$src,0x44\t! replicate2D" %} 1.1518 + ins_encode %{ 1.1519 + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); 1.1520 + %} 1.1521 + ins_pipe( pipe_slow ); 1.1522 +%} 1.1523 + 1.1524 +instruct Repl4D(vecY dst, regD src) %{ 1.1525 + predicate(n->as_Vector()->length() == 4); 1.1526 + match(Set dst (ReplicateD src)); 1.1527 + format %{ "pshufd $dst,$src,0x44\n\t" 1.1528 + "vinsertf128h $dst,$dst,$dst\t! replicate4D" %} 1.1529 + ins_encode %{ 1.1530 + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); 1.1531 + __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); 1.1532 + %} 1.1533 + ins_pipe( pipe_slow ); 1.1534 +%} 1.1535 + 1.1536 +// Replicate double (8 byte) scalar zero to be vector 1.1537 +instruct Repl2D_zero(vecX dst, immD0 zero) %{ 1.1538 + predicate(n->as_Vector()->length() == 2); 1.1539 + match(Set dst (ReplicateD zero)); 1.1540 + format %{ "xorpd $dst,$dst\t! replicate2D zero" %} 1.1541 + ins_encode %{ 1.1542 + __ xorpd($dst$$XMMRegister, $dst$$XMMRegister); 1.1543 + %} 1.1544 + ins_pipe( fpu_reg_reg ); 1.1545 +%} 1.1546 + 1.1547 +instruct Repl4D_zero(vecY dst, immD0 zero) %{ 1.1548 + predicate(n->as_Vector()->length() == 4); 1.1549 + match(Set dst (ReplicateD zero)); 1.1550 + format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %} 1.1551 + ins_encode %{ 1.1552 + bool vector256 = true; 1.1553 + __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256); 1.1554 + %} 1.1555 + ins_pipe( fpu_reg_reg ); 1.1556 +%} 1.1557 +