src/cpu/x86/vm/assembler_x86.cpp

changeset:   4103:137868b7aa6f
parent:      4037:da91efe96a93
child:       4142:d8ce2825b193
child:       4159:8e47bac5643a
     1.1 --- a/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 17 17:02:10 2012 -0700
     1.2 +++ b/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 17 19:39:07 2012 -0700
     1.3 @@ -3496,6 +3496,33 @@
     1.4    emit_byte(0x01);
     1.5  }
     1.6  
     1.7 +void Assembler::vinsertf128h(XMMRegister dst, Address src) {
     1.8 +  assert(VM_Version::supports_avx(), "");
     1.9 +  InstructionMark im(this);
    1.10 +  bool vector256 = true;
    1.11 +  assert(dst != xnoreg, "sanity");
    1.12 +  int dst_enc = dst->encoding();
    1.13 +  // swap src<->dst for encoding
    1.14 +  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
    1.15 +  emit_byte(0x18);
    1.16 +  emit_operand(dst, src);
    1.17 +  // 0x01 - insert into upper 128 bits
    1.18 +  emit_byte(0x01);
    1.19 +}
    1.20 +
    1.21 +void Assembler::vextractf128h(Address dst, XMMRegister src) {
    1.22 +  assert(VM_Version::supports_avx(), "");
    1.23 +  InstructionMark im(this);
    1.24 +  bool vector256 = true;
    1.25 +  assert(src != xnoreg, "sanity");
    1.26 +  int src_enc = src->encoding();
    1.27 +  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
    1.28 +  emit_byte(0x19);
    1.29 +  emit_operand(src, dst);
    1.30 +  // 0x01 - extract from upper 128 bits
    1.31 +  emit_byte(0x01);
    1.32 +}
    1.33 +
    1.34  void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    1.35    assert(VM_Version::supports_avx2(), "");
    1.36    bool vector256 = true;
    1.37 @@ -3507,6 +3534,33 @@
    1.38    emit_byte(0x01);
    1.39  }
    1.40  
    1.41 +void Assembler::vinserti128h(XMMRegister dst, Address src) {
    1.42 +  assert(VM_Version::supports_avx2(), "");
    1.43 +  InstructionMark im(this);
    1.44 +  bool vector256 = true;
    1.45 +  assert(dst != xnoreg, "sanity");
    1.46 +  int dst_enc = dst->encoding();
    1.47 +  // swap src<->dst for encoding
    1.48 +  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
    1.49 +  emit_byte(0x38);
    1.50 +  emit_operand(dst, src);
    1.51 +  // 0x01 - insert into upper 128 bits
    1.52 +  emit_byte(0x01);
    1.53 +}
    1.54 +
    1.55 +void Assembler::vextracti128h(Address dst, XMMRegister src) {
    1.56 +  assert(VM_Version::supports_avx2(), "");
    1.57 +  InstructionMark im(this);
    1.58 +  bool vector256 = true;
    1.59 +  assert(src != xnoreg, "sanity");
    1.60 +  int src_enc = src->encoding();
    1.61 +  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
    1.62 +  emit_byte(0x39);
    1.63 +  emit_operand(src, dst);
    1.64 +  // 0x01 - extract from upper 128 bits
    1.65 +  emit_byte(0x01);
    1.66 +}
    1.67 +
    1.68  void Assembler::vzeroupper() {
    1.69    assert(VM_Version::supports_avx(), "");
    1.70    (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
    1.71 @@ -8907,11 +8961,9 @@
    1.72    pusha();
    1.73  
    1.74    // if we are coming from c1, xmm registers may be live
    1.75 -  if (UseSSE >= 1) {
    1.76 -    subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
    1.77 -  }
    1.78    int off = 0;
    1.79    if (UseSSE == 1)  {
    1.80 +    subptr(rsp, sizeof(jdouble)*8);
    1.81      movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
    1.82      movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
    1.83      movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
    1.84 @@ -8921,23 +8973,50 @@
    1.85      movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
    1.86      movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
    1.87    } else if (UseSSE >= 2)  {
    1.88 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
    1.89 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
    1.90 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
    1.91 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
    1.92 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
    1.93 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
    1.94 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
    1.95 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
    1.96 +#ifdef COMPILER2
    1.97 +    if (MaxVectorSize > 16) {
    1.98 +      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
     1.99 +      // Save upper half of YMM registers
   1.100 +      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
   1.101 +      vextractf128h(Address(rsp,  0),xmm0);
   1.102 +      vextractf128h(Address(rsp, 16),xmm1);
   1.103 +      vextractf128h(Address(rsp, 32),xmm2);
   1.104 +      vextractf128h(Address(rsp, 48),xmm3);
   1.105 +      vextractf128h(Address(rsp, 64),xmm4);
   1.106 +      vextractf128h(Address(rsp, 80),xmm5);
   1.107 +      vextractf128h(Address(rsp, 96),xmm6);
   1.108 +      vextractf128h(Address(rsp,112),xmm7);
   1.109  #ifdef _LP64
   1.110 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
   1.111 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
   1.112 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
   1.113 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
   1.114 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
   1.115 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
   1.116 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
   1.117 -    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
   1.118 +      vextractf128h(Address(rsp,128),xmm8);
   1.119 +      vextractf128h(Address(rsp,144),xmm9);
   1.120 +      vextractf128h(Address(rsp,160),xmm10);
   1.121 +      vextractf128h(Address(rsp,176),xmm11);
   1.122 +      vextractf128h(Address(rsp,192),xmm12);
   1.123 +      vextractf128h(Address(rsp,208),xmm13);
   1.124 +      vextractf128h(Address(rsp,224),xmm14);
   1.125 +      vextractf128h(Address(rsp,240),xmm15);
   1.126 +#endif
   1.127 +    }
   1.128 +#endif
    1.129 +    // Save whole 128-bit (16 bytes) XMM registers
   1.130 +    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
   1.131 +    movdqu(Address(rsp,off++*16),xmm0);
   1.132 +    movdqu(Address(rsp,off++*16),xmm1);
   1.133 +    movdqu(Address(rsp,off++*16),xmm2);
   1.134 +    movdqu(Address(rsp,off++*16),xmm3);
   1.135 +    movdqu(Address(rsp,off++*16),xmm4);
   1.136 +    movdqu(Address(rsp,off++*16),xmm5);
   1.137 +    movdqu(Address(rsp,off++*16),xmm6);
   1.138 +    movdqu(Address(rsp,off++*16),xmm7);
   1.139 +#ifdef _LP64
   1.140 +    movdqu(Address(rsp,off++*16),xmm8);
   1.141 +    movdqu(Address(rsp,off++*16),xmm9);
   1.142 +    movdqu(Address(rsp,off++*16),xmm10);
   1.143 +    movdqu(Address(rsp,off++*16),xmm11);
   1.144 +    movdqu(Address(rsp,off++*16),xmm12);
   1.145 +    movdqu(Address(rsp,off++*16),xmm13);
   1.146 +    movdqu(Address(rsp,off++*16),xmm14);
   1.147 +    movdqu(Address(rsp,off++*16),xmm15);
   1.148  #endif
   1.149    }
   1.150  
   1.151 @@ -9015,28 +9094,52 @@
   1.152      movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
   1.153      movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
   1.154      movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
   1.155 +    addptr(rsp, sizeof(jdouble)*8);
   1.156    } else if (UseSSE >= 2)  {
   1.157 -    movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
   1.158 -    movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
   1.159 -    movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
   1.160 -    movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
   1.161 -    movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
   1.162 -    movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
   1.163 -    movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
   1.164 -    movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
    1.165 +    // Restore whole 128-bit (16 bytes) XMM registers
   1.166 +    movdqu(xmm0, Address(rsp,off++*16));
   1.167 +    movdqu(xmm1, Address(rsp,off++*16));
   1.168 +    movdqu(xmm2, Address(rsp,off++*16));
   1.169 +    movdqu(xmm3, Address(rsp,off++*16));
   1.170 +    movdqu(xmm4, Address(rsp,off++*16));
   1.171 +    movdqu(xmm5, Address(rsp,off++*16));
   1.172 +    movdqu(xmm6, Address(rsp,off++*16));
   1.173 +    movdqu(xmm7, Address(rsp,off++*16));
   1.174  #ifdef _LP64
   1.175 -    movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
   1.176 -    movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
   1.177 -    movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
   1.178 -    movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
   1.179 -    movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
   1.180 -    movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
   1.181 -    movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
   1.182 -    movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
   1.183 +    movdqu(xmm8, Address(rsp,off++*16));
   1.184 +    movdqu(xmm9, Address(rsp,off++*16));
   1.185 +    movdqu(xmm10, Address(rsp,off++*16));
   1.186 +    movdqu(xmm11, Address(rsp,off++*16));
   1.187 +    movdqu(xmm12, Address(rsp,off++*16));
   1.188 +    movdqu(xmm13, Address(rsp,off++*16));
   1.189 +    movdqu(xmm14, Address(rsp,off++*16));
   1.190 +    movdqu(xmm15, Address(rsp,off++*16));
   1.191  #endif
   1.192 -  }
   1.193 -  if (UseSSE >= 1) {
   1.194 -    addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
   1.195 +    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
   1.196 +#ifdef COMPILER2
   1.197 +    if (MaxVectorSize > 16) {
    1.198 +      // Restore upper half of YMM registers.
   1.199 +      vinsertf128h(xmm0, Address(rsp,  0));
   1.200 +      vinsertf128h(xmm1, Address(rsp, 16));
   1.201 +      vinsertf128h(xmm2, Address(rsp, 32));
   1.202 +      vinsertf128h(xmm3, Address(rsp, 48));
   1.203 +      vinsertf128h(xmm4, Address(rsp, 64));
   1.204 +      vinsertf128h(xmm5, Address(rsp, 80));
   1.205 +      vinsertf128h(xmm6, Address(rsp, 96));
   1.206 +      vinsertf128h(xmm7, Address(rsp,112));
   1.207 +#ifdef _LP64
   1.208 +      vinsertf128h(xmm8, Address(rsp,128));
   1.209 +      vinsertf128h(xmm9, Address(rsp,144));
   1.210 +      vinsertf128h(xmm10, Address(rsp,160));
   1.211 +      vinsertf128h(xmm11, Address(rsp,176));
   1.212 +      vinsertf128h(xmm12, Address(rsp,192));
   1.213 +      vinsertf128h(xmm13, Address(rsp,208));
   1.214 +      vinsertf128h(xmm14, Address(rsp,224));
   1.215 +      vinsertf128h(xmm15, Address(rsp,240));
   1.216 +#endif
   1.217 +      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
   1.218 +    }
   1.219 +#endif
   1.220    }
   1.221    popa();
   1.222  }
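
Usage note (editorial, not part of the changeset): the new Address-based forms added in the first two hunks let the macro assembler spill and reload only the upper 128-bit lane of a YMM register, which is how the register save/restore sequences in the later hunks preserve full 256-bit state when MaxVectorSize > 16. A minimal sketch, assuming a MacroAssembler context; the choice of xmm0 and a single 16-byte stack slot is illustrative only:

    subptr(rsp, 16);                       // reserve a 16-byte stack slot
    vextractf128h(Address(rsp, 0), xmm0);  // store ymm0[255:128] into the slot
    // ... code that preserves only the low half, xmm0[127:0] ...
    vinsertf128h(xmm0, Address(rsp, 0));   // merge the saved upper bits back into ymm0
    addptr(rsp, 16);                       // release the slot

As the emitters above show, the new forms use a 66-prefixed VEX 0F 3A encoding with the 256-bit vector length bit set (opcodes 0x18/0x19 for the f128 variants, 0x38/0x39 for the i128 variants) and an immediate of 0x01 selecting the upper 128-bit lane.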
