src/cpu/x86/vm/x86_32.ad

changeset 4873
e961c11b85fe
parent 4479
b30b3c2a0cf2
child 4944
886d1fd67dc3
child 5000
a6e09d6dd8e5
     1.1 --- a/src/cpu/x86/vm/x86_32.ad	Tue Apr 02 09:30:07 2013 +0200
     1.2 +++ b/src/cpu/x86/vm/x86_32.ad	Wed Apr 03 11:12:57 2013 -0700
     1.3 @@ -228,10 +228,16 @@
     1.4  static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000));
     1.5  
     1.6  // Offset hacking within calls.
     1.7 -static int pre_call_FPU_size() {
     1.8 -  if (Compile::current()->in_24_bit_fp_mode())
     1.9 -    return 6; // fldcw
    1.10 -  return 0;
    1.11 +static int pre_call_resets_size() {
    1.12 +  int size = 0;
    1.13 +  Compile* C = Compile::current();
    1.14 +  if (C->in_24_bit_fp_mode()) {
    1.15 +    size += 6; // fldcw
    1.16 +  }
    1.17 +  if (C->max_vector_size() > 16) {
    1.18 +    size += 3; // vzeroupper
    1.19 +  }
    1.20 +  return size;
    1.21  }
    1.22  
    1.23  static int preserve_SP_size() {
    1.24 @@ -242,21 +248,21 @@
    1.25  //       from the start of the call to the point where the return address
    1.26  //       will point.
    1.27  int MachCallStaticJavaNode::ret_addr_offset() {
    1.28 -  int offset = 5 + pre_call_FPU_size();  // 5 bytes from start of call to where return address points
    1.29 +  int offset = 5 + pre_call_resets_size();  // 5 bytes from start of call to where return address points
    1.30    if (_method_handle_invoke)
    1.31      offset += preserve_SP_size();
    1.32    return offset;
    1.33  }
    1.34  
    1.35  int MachCallDynamicJavaNode::ret_addr_offset() {
    1.36 -  return 10 + pre_call_FPU_size();  // 10 bytes from start of call to where return address points
    1.37 +  return 10 + pre_call_resets_size();  // 10 bytes from start of call to where return address points
    1.38  }
    1.39  
    1.40  static int sizeof_FFree_Float_Stack_All = -1;
    1.41  
    1.42  int MachCallRuntimeNode::ret_addr_offset() {
    1.43    assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already");
    1.44 -  return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size();
    1.45 +  return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size();
    1.46  }
    1.47  
    1.48  // Indicate if the safepoint node needs the polling page as an input.
    1.49 @@ -272,7 +278,7 @@
    1.50  // The address of the call instruction needs to be 4-byte aligned to
    1.51  // ensure that it does not span a cache line so that it can be patched.
    1.52  int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
    1.53 -  current_offset += pre_call_FPU_size();  // skip fldcw, if any
    1.54 +  current_offset += pre_call_resets_size();  // skip fldcw and vzeroupper, if any
    1.55    current_offset += 1;      // skip call opcode byte
    1.56    return round_to(current_offset, alignment_required()) - current_offset;
    1.57  }
    1.58 @@ -280,7 +286,7 @@
    1.59  // The address of the call instruction needs to be 4-byte aligned to
    1.60  // ensure that it does not span a cache line so that it can be patched.
    1.61  int CallStaticJavaHandleNode::compute_padding(int current_offset) const {
    1.62 -  current_offset += pre_call_FPU_size();  // skip fldcw, if any
    1.63 +  current_offset += pre_call_resets_size();  // skip fldcw and vzeroupper, if any
    1.64    current_offset += preserve_SP_size();   // skip mov rbp, rsp
    1.65    current_offset += 1;      // skip call opcode byte
    1.66    return round_to(current_offset, alignment_required()) - current_offset;
    1.67 @@ -289,7 +295,7 @@
    1.68  // The address of the call instruction needs to be 4-byte aligned to
    1.69  // ensure that it does not span a cache line so that it can be patched.
    1.70  int CallDynamicJavaDirectNode::compute_padding(int current_offset) const {
    1.71 -  current_offset += pre_call_FPU_size();  // skip fldcw, if any
    1.72 +  current_offset += pre_call_resets_size();  // skip fldcw and vzeroupper, if any
    1.73    current_offset += 5;      // skip MOV instruction
    1.74    current_offset += 1;      // skip call opcode byte
    1.75    return round_to(current_offset, alignment_required()) - current_offset;
    1.76 @@ -583,16 +589,20 @@
    1.77    // Remove two words for return addr and rbp,
    1.78    framesize -= 2*wordSize;
    1.79  
    1.80 -  if( C->in_24_bit_fp_mode() ) {
    1.81 +  if (C->max_vector_size() > 16) {
    1.82 +    st->print("VZEROUPPER");
    1.83 +    st->cr(); st->print("\t");
    1.84 +  }
    1.85 +  if (C->in_24_bit_fp_mode()) {
    1.86      st->print("FLDCW  standard control word");
    1.87      st->cr(); st->print("\t");
    1.88    }
    1.89 -  if( framesize ) {
    1.90 +  if (framesize) {
    1.91      st->print("ADD    ESP,%d\t# Destroy frame",framesize);
    1.92      st->cr(); st->print("\t");
    1.93    }
    1.94    st->print_cr("POPL   EBP"); st->print("\t");
    1.95 -  if( do_polling() && C->is_method_compilation() ) {
    1.96 +  if (do_polling() && C->is_method_compilation()) {
    1.97      st->print("TEST   PollPage,EAX\t! Poll Safepoint");
    1.98      st->cr(); st->print("\t");
    1.99    }
   1.100 @@ -602,8 +612,14 @@
   1.101  void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   1.102    Compile *C = ra_->C;
   1.103  
   1.104 +  if (C->max_vector_size() > 16) {
   1.105 +    // Clear upper bits of YMM registers when current compiled code uses
   1.106 +    // wide vectors to avoid AVX <-> SSE transition penalty during call.
   1.107 +    MacroAssembler masm(&cbuf);
   1.108 +    masm.vzeroupper();
   1.109 +  }
   1.110    // If method set FPU control word, restore to standard control word
   1.111 -  if( C->in_24_bit_fp_mode() ) {
   1.112 +  if (C->in_24_bit_fp_mode()) {
   1.113      MacroAssembler masm(&cbuf);
   1.114      masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
   1.115    }
   1.116 @@ -615,12 +631,11 @@
   1.117  
   1.118    // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
   1.119  
   1.120 -  if( framesize >= 128 ) {
   1.121 +  if (framesize >= 128) {
   1.122      emit_opcode(cbuf, 0x81); // add  SP, #framesize
   1.123      emit_rm(cbuf, 0x3, 0x00, ESP_enc);
   1.124      emit_d32(cbuf, framesize);
   1.125 -  }
   1.126 -  else if( framesize ) {
   1.127 +  } else if (framesize) {
   1.128      emit_opcode(cbuf, 0x83); // add  SP, #framesize
   1.129      emit_rm(cbuf, 0x3, 0x00, ESP_enc);
   1.130      emit_d8(cbuf, framesize);
   1.131 @@ -628,7 +643,7 @@
   1.132  
   1.133    emit_opcode(cbuf, 0x58 | EBP_enc);
   1.134  
   1.135 -  if( do_polling() && C->is_method_compilation() ) {
   1.136 +  if (do_polling() && C->is_method_compilation()) {
   1.137      cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0);
   1.138      emit_opcode(cbuf,0x85);
   1.139      emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX
   1.140 @@ -640,7 +655,8 @@
   1.141    Compile *C = ra_->C;
   1.142    // If method set FPU control word, restore to standard control word
   1.143    int size = C->in_24_bit_fp_mode() ? 6 : 0;
   1.144 -  if( do_polling() && C->is_method_compilation() ) size += 6;
   1.145 +  if (C->max_vector_size() > 16) size += 3; // vzeroupper
   1.146 +  if (do_polling() && C->is_method_compilation()) size += 6;
   1.147  
   1.148    int framesize = C->frame_slots() << LogBytesPerInt;
   1.149    assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
   1.150 @@ -649,7 +665,7 @@
   1.151  
   1.152    size++; // popl rbp,
   1.153  
   1.154 -  if( framesize >= 128 ) {
   1.155 +  if (framesize >= 128) {
   1.156      size += 6;
   1.157    } else {
   1.158      size += framesize ? 3 : 0;
   1.159 @@ -1853,20 +1869,26 @@
   1.160    %}
   1.161  
   1.162  
   1.163 -  enc_class pre_call_FPU %{
   1.164 +  enc_class pre_call_resets %{
   1.165      // If method sets FPU control word restore it here
   1.166      debug_only(int off0 = cbuf.insts_size());
   1.167 -    if( Compile::current()->in_24_bit_fp_mode() ) {
   1.168 -      MacroAssembler masm(&cbuf);
   1.169 -      masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
   1.170 +    if (ra_->C->in_24_bit_fp_mode()) {
   1.171 +      MacroAssembler _masm(&cbuf);
   1.172 +      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
   1.173 +    }
   1.174 +    if (ra_->C->max_vector_size() > 16) {
   1.175 +      // Clear upper bits of YMM registers when current compiled code uses
   1.176 +      // wide vectors to avoid AVX <-> SSE transition penalty during call.
   1.177 +      MacroAssembler _masm(&cbuf);
   1.178 +      __ vzeroupper();
   1.179      }
   1.180      debug_only(int off1 = cbuf.insts_size());
   1.181 -    assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction");
   1.182 +    assert(off1 - off0 == pre_call_resets_size(), "correct size prediction");
   1.183    %}
   1.184  
   1.185    enc_class post_call_FPU %{
   1.186      // If method sets FPU control word do it here also
   1.187 -    if( Compile::current()->in_24_bit_fp_mode() ) {
   1.188 +    if (Compile::current()->in_24_bit_fp_mode()) {
   1.189        MacroAssembler masm(&cbuf);
   1.190        masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
   1.191      }
   1.192 @@ -1877,17 +1899,17 @@
   1.193      // who we intended to call.
   1.194      cbuf.set_insts_mark();
   1.195      $$$emit8$primary;
   1.196 -    if ( !_method ) {
   1.197 +    if (!_method) {
   1.198        emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
   1.199                       runtime_call_Relocation::spec(), RELOC_IMM32 );
   1.200 -    } else if(_optimized_virtual) {
   1.201 +    } else if (_optimized_virtual) {
   1.202        emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
   1.203                       opt_virtual_call_Relocation::spec(), RELOC_IMM32 );
   1.204      } else {
   1.205        emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
   1.206                       static_call_Relocation::spec(), RELOC_IMM32 );
   1.207      }
   1.208 -    if( _method ) {  // Emit stub for static call
   1.209 +    if (_method) {  // Emit stub for static call
   1.210        emit_java_to_interp(cbuf);
   1.211      }
   1.212    %}
   1.213 @@ -12828,7 +12850,7 @@
   1.214    ins_cost(300);
   1.215    format %{ "CALL,static " %}
   1.216    opcode(0xE8); /* E8 cd */
   1.217 -  ins_encode( pre_call_FPU,
   1.218 +  ins_encode( pre_call_resets,
   1.219                Java_Static_Call( meth ),
   1.220                call_epilog,
   1.221                post_call_FPU );
   1.222 @@ -12849,7 +12871,7 @@
   1.223    ins_cost(300);
   1.224    format %{ "CALL,static/MethodHandle " %}
   1.225    opcode(0xE8); /* E8 cd */
   1.226 -  ins_encode( pre_call_FPU,
   1.227 +  ins_encode( pre_call_resets,
   1.228                preserve_SP,
   1.229                Java_Static_Call( meth ),
   1.230                restore_SP,
   1.231 @@ -12870,7 +12892,7 @@
   1.232    format %{ "MOV    EAX,(oop)-1\n\t"
   1.233              "CALL,dynamic" %}
   1.234    opcode(0xE8); /* E8 cd */
   1.235 -  ins_encode( pre_call_FPU,
   1.236 +  ins_encode( pre_call_resets,
   1.237                Java_Dynamic_Call( meth ),
   1.238                call_epilog,
   1.239                post_call_FPU );
   1.240 @@ -12887,7 +12909,7 @@
   1.241    format %{ "CALL,runtime " %}
   1.242    opcode(0xE8); /* E8 cd */
   1.243    // Use FFREEs to clear entries in float stack
   1.244 -  ins_encode( pre_call_FPU,
   1.245 +  ins_encode( pre_call_resets,
   1.246                FFree_Float_Stack_All,
   1.247                Java_To_Runtime( meth ),
   1.248                post_call_FPU );
   1.249 @@ -12902,7 +12924,7 @@
   1.250    ins_cost(300);
   1.251    format %{ "CALL_LEAF,runtime " %}
   1.252    opcode(0xE8); /* E8 cd */
   1.253 -  ins_encode( pre_call_FPU,
   1.254 +  ins_encode( pre_call_resets,
   1.255                FFree_Float_Stack_All,
   1.256                Java_To_Runtime( meth ),
   1.257                Verify_FPU_For_Leaf, post_call_FPU );

mercurial