1.1 --- a/src/cpu/x86/vm/x86_32.ad Tue Apr 02 09:30:07 2013 +0200 1.2 +++ b/src/cpu/x86/vm/x86_32.ad Wed Apr 03 11:12:57 2013 -0700 1.3 @@ -228,10 +228,16 @@ 1.4 static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000)); 1.5 1.6 // Offset hacking within calls. 1.7 -static int pre_call_FPU_size() { 1.8 - if (Compile::current()->in_24_bit_fp_mode()) 1.9 - return 6; // fldcw 1.10 - return 0; 1.11 +static int pre_call_resets_size() { 1.12 + int size = 0; 1.13 + Compile* C = Compile::current(); 1.14 + if (C->in_24_bit_fp_mode()) { 1.15 + size += 6; // fldcw 1.16 + } 1.17 + if (C->max_vector_size() > 16) { 1.18 + size += 3; // vzeroupper 1.19 + } 1.20 + return size; 1.21 } 1.22 1.23 static int preserve_SP_size() { 1.24 @@ -242,21 +248,21 @@ 1.25 // from the start of the call to the point where the return address 1.26 // will point. 1.27 int MachCallStaticJavaNode::ret_addr_offset() { 1.28 - int offset = 5 + pre_call_FPU_size(); // 5 bytes from start of call to where return address points 1.29 + int offset = 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points 1.30 if (_method_handle_invoke) 1.31 offset += preserve_SP_size(); 1.32 return offset; 1.33 } 1.34 1.35 int MachCallDynamicJavaNode::ret_addr_offset() { 1.36 - return 10 + pre_call_FPU_size(); // 10 bytes from start of call to where return address points 1.37 + return 10 + pre_call_resets_size(); // 10 bytes from start of call to where return address points 1.38 } 1.39 1.40 static int sizeof_FFree_Float_Stack_All = -1; 1.41 1.42 int MachCallRuntimeNode::ret_addr_offset() { 1.43 assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already"); 1.44 - return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size(); 1.45 + return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size(); 1.46 } 1.47 1.48 // Indicate if the safepoint node needs the polling page as an input. 1.49 @@ -272,7 +278,7 @@ 1.50 // The address of the call instruction needs to be 4-byte aligned to 1.51 // ensure that it does not span a cache line so that it can be patched. 1.52 int CallStaticJavaDirectNode::compute_padding(int current_offset) const { 1.53 - current_offset += pre_call_FPU_size(); // skip fldcw, if any 1.54 + current_offset += pre_call_resets_size(); // skip fldcw, if any 1.55 current_offset += 1; // skip call opcode byte 1.56 return round_to(current_offset, alignment_required()) - current_offset; 1.57 } 1.58 @@ -280,7 +286,7 @@ 1.59 // The address of the call instruction needs to be 4-byte aligned to 1.60 // ensure that it does not span a cache line so that it can be patched. 1.61 int CallStaticJavaHandleNode::compute_padding(int current_offset) const { 1.62 - current_offset += pre_call_FPU_size(); // skip fldcw, if any 1.63 + current_offset += pre_call_resets_size(); // skip fldcw, if any 1.64 current_offset += preserve_SP_size(); // skip mov rbp, rsp 1.65 current_offset += 1; // skip call opcode byte 1.66 return round_to(current_offset, alignment_required()) - current_offset; 1.67 @@ -289,7 +295,7 @@ 1.68 // The address of the call instruction needs to be 4-byte aligned to 1.69 // ensure that it does not span a cache line so that it can be patched. 1.70 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const { 1.71 - current_offset += pre_call_FPU_size(); // skip fldcw, if any 1.72 + current_offset += pre_call_resets_size(); // skip fldcw, if any 1.73 current_offset += 5; // skip MOV instruction 1.74 current_offset += 1; // skip call opcode byte 1.75 return round_to(current_offset, alignment_required()) - current_offset; 1.76 @@ -583,16 +589,20 @@ 1.77 // Remove two words for return addr and rbp, 1.78 framesize -= 2*wordSize; 1.79 1.80 - if( C->in_24_bit_fp_mode() ) { 1.81 + if (C->max_vector_size() > 16) { 1.82 + st->print("VZEROUPPER"); 1.83 + st->cr(); st->print("\t"); 1.84 + } 1.85 + if (C->in_24_bit_fp_mode()) { 1.86 st->print("FLDCW standard control word"); 1.87 st->cr(); st->print("\t"); 1.88 } 1.89 - if( framesize ) { 1.90 + if (framesize) { 1.91 st->print("ADD ESP,%d\t# Destroy frame",framesize); 1.92 st->cr(); st->print("\t"); 1.93 } 1.94 st->print_cr("POPL EBP"); st->print("\t"); 1.95 - if( do_polling() && C->is_method_compilation() ) { 1.96 + if (do_polling() && C->is_method_compilation()) { 1.97 st->print("TEST PollPage,EAX\t! Poll Safepoint"); 1.98 st->cr(); st->print("\t"); 1.99 } 1.100 @@ -602,8 +612,14 @@ 1.101 void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { 1.102 Compile *C = ra_->C; 1.103 1.104 + if (C->max_vector_size() > 16) { 1.105 + // Clear upper bits of YMM registers when current compiled code uses 1.106 + // wide vectors to avoid AVX <-> SSE transition penalty during call. 1.107 + MacroAssembler masm(&cbuf); 1.108 + masm.vzeroupper(); 1.109 + } 1.110 // If method set FPU control word, restore to standard control word 1.111 - if( C->in_24_bit_fp_mode() ) { 1.112 + if (C->in_24_bit_fp_mode()) { 1.113 MacroAssembler masm(&cbuf); 1.114 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); 1.115 } 1.116 @@ -615,12 +631,11 @@ 1.117 1.118 // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here 1.119 1.120 - if( framesize >= 128 ) { 1.121 + if (framesize >= 128) { 1.122 emit_opcode(cbuf, 0x81); // add SP, #framesize 1.123 emit_rm(cbuf, 0x3, 0x00, ESP_enc); 1.124 emit_d32(cbuf, framesize); 1.125 - } 1.126 - else if( framesize ) { 1.127 + } else if (framesize) { 1.128 emit_opcode(cbuf, 0x83); // add SP, #framesize 1.129 emit_rm(cbuf, 0x3, 0x00, ESP_enc); 1.130 emit_d8(cbuf, framesize); 1.131 @@ -628,7 +643,7 @@ 1.132 1.133 emit_opcode(cbuf, 0x58 | EBP_enc); 1.134 1.135 - if( do_polling() && C->is_method_compilation() ) { 1.136 + if (do_polling() && C->is_method_compilation()) { 1.137 cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0); 1.138 emit_opcode(cbuf,0x85); 1.139 emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX 1.140 @@ -640,7 +655,8 @@ 1.141 Compile *C = ra_->C; 1.142 // If method set FPU control word, restore to standard control word 1.143 int size = C->in_24_bit_fp_mode() ? 6 : 0; 1.144 - if( do_polling() && C->is_method_compilation() ) size += 6; 1.145 + if (C->max_vector_size() > 16) size += 3; // vzeroupper 1.146 + if (do_polling() && C->is_method_compilation()) size += 6; 1.147 1.148 int framesize = C->frame_slots() << LogBytesPerInt; 1.149 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 1.150 @@ -649,7 +665,7 @@ 1.151 1.152 size++; // popl rbp, 1.153 1.154 - if( framesize >= 128 ) { 1.155 + if (framesize >= 128) { 1.156 size += 6; 1.157 } else { 1.158 size += framesize ? 3 : 0; 1.159 @@ -1853,20 +1869,26 @@ 1.160 %} 1.161 1.162 1.163 - enc_class pre_call_FPU %{ 1.164 + enc_class pre_call_resets %{ 1.165 // If method sets FPU control word restore it here 1.166 debug_only(int off0 = cbuf.insts_size()); 1.167 - if( Compile::current()->in_24_bit_fp_mode() ) { 1.168 - MacroAssembler masm(&cbuf); 1.169 - masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); 1.170 + if (ra_->C->in_24_bit_fp_mode()) { 1.171 + MacroAssembler _masm(&cbuf); 1.172 + __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); 1.173 + } 1.174 + if (ra_->C->max_vector_size() > 16) { 1.175 + // Clear upper bits of YMM registers when current compiled code uses 1.176 + // wide vectors to avoid AVX <-> SSE transition penalty during call. 1.177 + MacroAssembler _masm(&cbuf); 1.178 + __ vzeroupper(); 1.179 } 1.180 debug_only(int off1 = cbuf.insts_size()); 1.181 - assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction"); 1.182 + assert(off1 - off0 == pre_call_resets_size(), "correct size prediction"); 1.183 %} 1.184 1.185 enc_class post_call_FPU %{ 1.186 // If method sets FPU control word do it here also 1.187 - if( Compile::current()->in_24_bit_fp_mode() ) { 1.188 + if (Compile::current()->in_24_bit_fp_mode()) { 1.189 MacroAssembler masm(&cbuf); 1.190 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24())); 1.191 } 1.192 @@ -1877,17 +1899,17 @@ 1.193 // who we intended to call. 1.194 cbuf.set_insts_mark(); 1.195 $$$emit8$primary; 1.196 - if ( !_method ) { 1.197 + if (!_method) { 1.198 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), 1.199 runtime_call_Relocation::spec(), RELOC_IMM32 ); 1.200 - } else if(_optimized_virtual) { 1.201 + } else if (_optimized_virtual) { 1.202 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), 1.203 opt_virtual_call_Relocation::spec(), RELOC_IMM32 ); 1.204 } else { 1.205 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), 1.206 static_call_Relocation::spec(), RELOC_IMM32 ); 1.207 } 1.208 - if( _method ) { // Emit stub for static call 1.209 + if (_method) { // Emit stub for static call 1.210 emit_java_to_interp(cbuf); 1.211 } 1.212 %} 1.213 @@ -12828,7 +12850,7 @@ 1.214 ins_cost(300); 1.215 format %{ "CALL,static " %} 1.216 opcode(0xE8); /* E8 cd */ 1.217 - ins_encode( pre_call_FPU, 1.218 + ins_encode( pre_call_resets, 1.219 Java_Static_Call( meth ), 1.220 call_epilog, 1.221 post_call_FPU ); 1.222 @@ -12849,7 +12871,7 @@ 1.223 ins_cost(300); 1.224 format %{ "CALL,static/MethodHandle " %} 1.225 opcode(0xE8); /* E8 cd */ 1.226 - ins_encode( pre_call_FPU, 1.227 + ins_encode( pre_call_resets, 1.228 preserve_SP, 1.229 Java_Static_Call( meth ), 1.230 restore_SP, 1.231 @@ -12870,7 +12892,7 @@ 1.232 format %{ "MOV EAX,(oop)-1\n\t" 1.233 "CALL,dynamic" %} 1.234 opcode(0xE8); /* E8 cd */ 1.235 - ins_encode( pre_call_FPU, 1.236 + ins_encode( pre_call_resets, 1.237 Java_Dynamic_Call( meth ), 1.238 call_epilog, 1.239 post_call_FPU ); 1.240 @@ -12887,7 +12909,7 @@ 1.241 format %{ "CALL,runtime " %} 1.242 opcode(0xE8); /* E8 cd */ 1.243 // Use FFREEs to clear entries in float stack 1.244 - ins_encode( pre_call_FPU, 1.245 + ins_encode( pre_call_resets, 1.246 FFree_Float_Stack_All, 1.247 Java_To_Runtime( meth ), 1.248 post_call_FPU ); 1.249 @@ -12902,7 +12924,7 @@ 1.250 ins_cost(300); 1.251 format %{ "CALL_LEAF,runtime " %} 1.252 opcode(0xE8); /* E8 cd */ 1.253 - ins_encode( pre_call_FPU, 1.254 + ins_encode( pre_call_resets, 1.255 FFree_Float_Stack_All, 1.256 Java_To_Runtime( meth ), 1.257 Verify_FPU_For_Leaf, post_call_FPU );