--- a/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 17 17:02:10 2012 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 17 19:39:07 2012 -0700
@@ -3496,6 +3496,33 @@
   emit_byte(0x01);
 }
 
+void Assembler::vinsertf128h(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x18);
+  emit_operand(dst, src);
+  // 0x01 - insert into upper 128 bits
+  emit_byte(0x01);
+}
+
+void Assembler::vextractf128h(Address dst, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(src != xnoreg, "sanity");
+  int src_enc = src->encoding();
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x19);
+  emit_operand(src, dst);
+  // 0x01 - extract from upper 128 bits
+  emit_byte(0x01);
+}
+
 void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
   bool vector256 = true;
@@ -3507,6 +3534,33 @@
   emit_byte(0x01);
 }
 
+void Assembler::vinserti128h(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx2(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x38);
+  emit_operand(dst, src);
+  // 0x01 - insert into upper 128 bits
+  emit_byte(0x01);
+}
+
+void Assembler::vextracti128h(Address dst, XMMRegister src) {
+  assert(VM_Version::supports_avx2(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(src != xnoreg, "sanity");
+  int src_enc = src->encoding();
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x39);
+  emit_operand(src, dst);
+  // 0x01 - extract from upper 128 bits
+  emit_byte(0x01);
+}
+
 void Assembler::vzeroupper() {
   assert(VM_Version::supports_avx(), "");
   (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
@@ -8907,11 +8961,9 @@
   pusha();
 
   // if we are coming from c1, xmm registers may be live
-  if (UseSSE >= 1) {
-    subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
-  }
   int off = 0;
   if (UseSSE == 1) {
+    subptr(rsp, sizeof(jdouble)*8);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
@@ -8921,23 +8973,50 @@
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
   } else if (UseSSE >= 2) {
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
+#ifdef COMPILER2
+    if (MaxVectorSize > 16) {
+      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+      // Save upper half of YMM registers
+      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+      vextractf128h(Address(rsp,  0),xmm0);
+      vextractf128h(Address(rsp, 16),xmm1);
+      vextractf128h(Address(rsp, 32),xmm2);
+      vextractf128h(Address(rsp, 48),xmm3);
+      vextractf128h(Address(rsp, 64),xmm4);
+      vextractf128h(Address(rsp, 80),xmm5);
+      vextractf128h(Address(rsp, 96),xmm6);
+      vextractf128h(Address(rsp,112),xmm7);
 #ifdef _LP64
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
+      vextractf128h(Address(rsp,128),xmm8);
+      vextractf128h(Address(rsp,144),xmm9);
+      vextractf128h(Address(rsp,160),xmm10);
+      vextractf128h(Address(rsp,176),xmm11);
+      vextractf128h(Address(rsp,192),xmm12);
+      vextractf128h(Address(rsp,208),xmm13);
+      vextractf128h(Address(rsp,224),xmm14);
+      vextractf128h(Address(rsp,240),xmm15);
+#endif
+    }
+#endif
+    // Save whole 128bit (16 bytes) XMM registers
+    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+    movdqu(Address(rsp,off++*16),xmm0);
+    movdqu(Address(rsp,off++*16),xmm1);
+    movdqu(Address(rsp,off++*16),xmm2);
+    movdqu(Address(rsp,off++*16),xmm3);
+    movdqu(Address(rsp,off++*16),xmm4);
+    movdqu(Address(rsp,off++*16),xmm5);
+    movdqu(Address(rsp,off++*16),xmm6);
+    movdqu(Address(rsp,off++*16),xmm7);
+#ifdef _LP64
+    movdqu(Address(rsp,off++*16),xmm8);
+    movdqu(Address(rsp,off++*16),xmm9);
+    movdqu(Address(rsp,off++*16),xmm10);
+    movdqu(Address(rsp,off++*16),xmm11);
+    movdqu(Address(rsp,off++*16),xmm12);
+    movdqu(Address(rsp,off++*16),xmm13);
+    movdqu(Address(rsp,off++*16),xmm14);
+    movdqu(Address(rsp,off++*16),xmm15);
 #endif
   }
 
@@ -9015,28 +9094,52 @@
     movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
     movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
+    addptr(rsp, sizeof(jdouble)*8);
   } else if (UseSSE >= 2) {
-    movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
+    // Restore whole 128bit (16 bytes) XMM registers
+    movdqu(xmm0, Address(rsp,off++*16));
+    movdqu(xmm1, Address(rsp,off++*16));
+    movdqu(xmm2, Address(rsp,off++*16));
+    movdqu(xmm3, Address(rsp,off++*16));
+    movdqu(xmm4, Address(rsp,off++*16));
+    movdqu(xmm5, Address(rsp,off++*16));
+    movdqu(xmm6, Address(rsp,off++*16));
+    movdqu(xmm7, Address(rsp,off++*16));
 #ifdef _LP64
-    movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
+    movdqu(xmm8, Address(rsp,off++*16));
+    movdqu(xmm9, Address(rsp,off++*16));
+    movdqu(xmm10, Address(rsp,off++*16));
+    movdqu(xmm11, Address(rsp,off++*16));
+    movdqu(xmm12, Address(rsp,off++*16));
+    movdqu(xmm13, Address(rsp,off++*16));
+    movdqu(xmm14, Address(rsp,off++*16));
+    movdqu(xmm15, Address(rsp,off++*16));
 #endif
-  }
-  if (UseSSE >= 1) {
-    addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+#ifdef COMPILER2
+    if (MaxVectorSize > 16) {
+      // Restore upper half of YMM registers.
+      vinsertf128h(xmm0, Address(rsp,  0));
+      vinsertf128h(xmm1, Address(rsp, 16));
+      vinsertf128h(xmm2, Address(rsp, 32));
+      vinsertf128h(xmm3, Address(rsp, 48));
+      vinsertf128h(xmm4, Address(rsp, 64));
+      vinsertf128h(xmm5, Address(rsp, 80));
+      vinsertf128h(xmm6, Address(rsp, 96));
+      vinsertf128h(xmm7, Address(rsp,112));
+#ifdef _LP64
+      vinsertf128h(xmm8, Address(rsp,128));
+      vinsertf128h(xmm9, Address(rsp,144));
+      vinsertf128h(xmm10, Address(rsp,160));
+      vinsertf128h(xmm11, Address(rsp,176));
+      vinsertf128h(xmm12, Address(rsp,192));
+      vinsertf128h(xmm13, Address(rsp,208));
+      vinsertf128h(xmm14, Address(rsp,224));
+      vinsertf128h(xmm15, Address(rsp,240));
+#endif
+      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+    }
+#endif
   }
   popa();
 }
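For context: the new vextractf128h/vinsertf128h emitters encode vextractf128/vinsertf128 with a hardcoded imm8 of 0x01, so they move only bits 255:128 of a YMM register to or from a 16-byte memory slot (imm8 0x00 would select the low half, hence the "h" suffix). That is what lets the stub above spill the upper halves into their own area, 16 bytes per register (16 * 16 = 256 bytes on LP64, 16 * 8 = 128 bytes on 32-bit), before movdqu saves the low 128 bits. The sketch below shows the same upper-half save/restore semantics using AVX compiler intrinsics rather than the HotSpot assembler; the file name, build line, and the spill variable are illustrative assumptions, not part of the patch.

// avx_halves.cpp - standalone sketch of the upper-half spill done above.
// Build (assumption): g++ -mavx avx_halves.cpp && ./a.out on an AVX CPU.
#include <immintrin.h>
#include <cstdio>

int main() {
  float spill[4];                                    // one 16-byte slot, like Address(rsp, 0)
  __m256 ymm = _mm256_set_ps(8, 7, 6, 5, 4, 3, 2, 1);

  // vextractf128 m128, ymm, 1: store only bits 255:128, as vextractf128h does.
  _mm_storeu_ps(spill, _mm256_extractf128_ps(ymm, 1));

  // Simulate a call clobbering the upper half: the cast pair keeps bits 127:0
  // and leaves bits 255:128 undefined.
  ymm = _mm256_castps128_ps256(_mm256_castps256_ps128(ymm));

  // vinsertf128 ymm, ymm, m128, 1: reload bits 255:128, as vinsertf128h does.
  ymm = _mm256_insertf128_ps(ymm, _mm_loadu_ps(spill), 1);

  float v[8];
  _mm256_storeu_ps(v, ymm);
  std::printf("%g %g %g %g | %g %g %g %g\n",
              v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
  return 0;
}

Note the ordering the stub relies on: the upper halves are extracted while the full YMM state is still live, and on the restore side the low 128 bits come back via movdqu first, then vinsertf128h merges the saved upper halves on top, mirroring the save sequence in reverse.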