Sat, 03 Sep 2011 09:56:57 -0700
Merge
1.1 --- a/agent/src/share/classes/sun/jvm/hotspot/interpreter/BytecodeLoadConstant.java Fri Sep 02 20:58:21 2011 -0700 1.2 +++ b/agent/src/share/classes/sun/jvm/hotspot/interpreter/BytecodeLoadConstant.java Sat Sep 03 09:56:57 2011 -0700 1.3 @@ -90,7 +90,7 @@ 1.4 jcode == Bytecodes._ldc2_w; 1.5 if (! codeOk) return false; 1.6 1.7 - ConstantTag ctag = method().getConstants().getTagAt(rawIndex()); 1.8 + ConstantTag ctag = method().getConstants().getTagAt(poolIndex()); 1.9 if (jcode == Bytecodes._ldc2_w) { 1.10 // has to be double or long 1.11 return (ctag.isDouble() || ctag.isLong()) ? true: false;
2.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Fri Sep 02 20:58:21 2011 -0700 2.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Sat Sep 03 09:56:57 2011 -0700 2.3 @@ -1124,6 +1124,126 @@ 2.4 } 2.5 } 2.6 2.7 + // 2.8 + // Generate main code for disjoint arraycopy 2.9 + // 2.10 + typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec, 2.11 + Label& L_loop, bool use_prefetch, bool use_bis); 2.12 + 2.13 + void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size, 2.14 + int iter_size, CopyLoopFunc copy_loop_func) { 2.15 + Label L_copy; 2.16 + 2.17 + assert(log2_elem_size <= 3, "the following code should be changed"); 2.18 + int count_dec = 16>>log2_elem_size; 2.19 + 2.20 + int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance); 2.21 + assert(prefetch_dist < 4096, "invalid value"); 2.22 + prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size 2.23 + int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count 2.24 + 2.25 + if (UseBlockCopy) { 2.26 + Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy; 2.27 + 2.28 + // 64 bytes tail + bytes copied in one loop iteration 2.29 + int tail_size = 64 + iter_size; 2.30 + int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size; 2.31 + // Use BIS copy only for big arrays since it requires membar. 2.32 + __ set(block_copy_count, O4); 2.33 + __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy); 2.34 + // This code is for disjoint source and destination: 2.35 + // to <= from || to >= from+count 2.36 + // but BIS will stomp over 'from' if (to > from-tail_size && to <= from) 2.37 + __ sub(from, to, O4); 2.38 + __ srax(O4, 4, O4); // divide by 16 since following short branch have only 5 bits for imm. 
2.39 + __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy); 2.40 + 2.41 + __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY); 2.42 + // BIS should not be used to copy tail (64 bytes+iter_size) 2.43 + // to avoid zeroing of following values. 2.44 + __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0 2.45 + 2.46 + if (prefetch_count > 0) { // rounded up to one iteration count 2.47 + // Do prefetching only if copy size is bigger 2.48 + // than prefetch distance. 2.49 + __ set(prefetch_count, O4); 2.50 + __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy); 2.51 + __ sub(count, prefetch_count, count); 2.52 + 2.53 + (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true); 2.54 + __ add(count, prefetch_count, count); // restore count 2.55 + 2.56 + } // prefetch_count > 0 2.57 + 2.58 + (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true); 2.59 + __ add(count, (tail_size>>log2_elem_size), count); // restore count 2.60 + 2.61 + __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT); 2.62 + // BIS needs membar. 2.63 + __ membar(Assembler::StoreLoad); 2.64 + // Copy tail 2.65 + __ ba_short(L_copy); 2.66 + 2.67 + __ BIND(L_skip_block_copy); 2.68 + } // UseBlockCopy 2.69 + 2.70 + if (prefetch_count > 0) { // rounded up to one iteration count 2.71 + // Do prefetching only if copy size is bigger 2.72 + // than prefetch distance. 
2.73 + __ set(prefetch_count, O4); 2.74 + __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy); 2.75 + __ sub(count, prefetch_count, count); 2.76 + 2.77 + Label L_copy_prefetch; 2.78 + (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false); 2.79 + __ add(count, prefetch_count, count); // restore count 2.80 + 2.81 + } // prefetch_count > 0 2.82 + 2.83 + (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false); 2.84 + } 2.85 + 2.86 + 2.87 + 2.88 + // 2.89 + // Helper methods for copy_16_bytes_forward_with_shift() 2.90 + // 2.91 + void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec, 2.92 + Label& L_loop, bool use_prefetch, bool use_bis) { 2.93 + 2.94 + const Register left_shift = G1; // left shift bit counter 2.95 + const Register right_shift = G5; // right shift bit counter 2.96 + 2.97 + __ align(OptoLoopAlignment); 2.98 + __ BIND(L_loop); 2.99 + if (use_prefetch) { 2.100 + if (ArraycopySrcPrefetchDistance > 0) { 2.101 + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 2.102 + } 2.103 + if (ArraycopyDstPrefetchDistance > 0) { 2.104 + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 2.105 + } 2.106 + } 2.107 + __ ldx(from, 0, O4); 2.108 + __ ldx(from, 8, G4); 2.109 + __ inc(to, 16); 2.110 + __ inc(from, 16); 2.111 + __ deccc(count, count_dec); // Can we do next iteration after this one? 
2.112 + __ srlx(O4, right_shift, G3); 2.113 + __ bset(G3, O3); 2.114 + __ sllx(O4, left_shift, O4); 2.115 + __ srlx(G4, right_shift, G3); 2.116 + __ bset(G3, O4); 2.117 + if (use_bis) { 2.118 + __ stxa(O3, to, -16); 2.119 + __ stxa(O4, to, -8); 2.120 + } else { 2.121 + __ stx(O3, to, -16); 2.122 + __ stx(O4, to, -8); 2.123 + } 2.124 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2.125 + __ delayed()->sllx(G4, left_shift, O3); 2.126 + } 2.127 2.128 // Copy big chunks forward with shift 2.129 // 2.130 @@ -1135,64 +1255,51 @@ 2.131 // L_copy_bytes - copy exit label 2.132 // 2.133 void copy_16_bytes_forward_with_shift(Register from, Register to, 2.134 - Register count, int count_dec, Label& L_copy_bytes) { 2.135 - Label L_loop, L_aligned_copy, L_copy_last_bytes; 2.136 + Register count, int log2_elem_size, Label& L_copy_bytes) { 2.137 + Label L_aligned_copy, L_copy_last_bytes; 2.138 + assert(log2_elem_size <= 3, "the following code should be changed"); 2.139 + int count_dec = 16>>log2_elem_size; 2.140 2.141 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy 2.142 - __ andcc(from, 7, G1); // misaligned bytes 2.143 - __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2.144 - __ delayed()->nop(); 2.145 + __ andcc(from, 7, G1); // misaligned bytes 2.146 + __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2.147 + __ delayed()->nop(); 2.148 2.149 const Register left_shift = G1; // left shift bit counter 2.150 const Register right_shift = G5; // right shift bit counter 2.151 2.152 - __ sll(G1, LogBitsPerByte, left_shift); 2.153 - __ mov(64, right_shift); 2.154 - __ sub(right_shift, left_shift, right_shift); 2.155 + __ sll(G1, LogBitsPerByte, left_shift); 2.156 + __ mov(64, right_shift); 2.157 + __ sub(right_shift, left_shift, right_shift); 2.158 2.159 // 2.160 // Load 2 aligned 8-bytes chunks and use one from previous iteration 2.161 // to form 2 aligned 8-bytes chunks to store. 
2.162 // 2.163 - __ deccc(count, count_dec); // Pre-decrement 'count' 2.164 - __ andn(from, 7, from); // Align address 2.165 - __ ldx(from, 0, O3); 2.166 - __ inc(from, 8); 2.167 - __ align(OptoLoopAlignment); 2.168 - __ BIND(L_loop); 2.169 - __ ldx(from, 0, O4); 2.170 - __ deccc(count, count_dec); // Can we do next iteration after this one? 2.171 - __ ldx(from, 8, G4); 2.172 - __ inc(to, 16); 2.173 - __ inc(from, 16); 2.174 - __ sllx(O3, left_shift, O3); 2.175 - __ srlx(O4, right_shift, G3); 2.176 - __ bset(G3, O3); 2.177 - __ stx(O3, to, -16); 2.178 - __ sllx(O4, left_shift, O4); 2.179 - __ srlx(G4, right_shift, G3); 2.180 - __ bset(G3, O4); 2.181 - __ stx(O4, to, -8); 2.182 - __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2.183 - __ delayed()->mov(G4, O3); 2.184 - 2.185 - __ inccc(count, count_dec>>1 ); // + 8 bytes 2.186 - __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); 2.187 - __ delayed()->inc(count, count_dec>>1); // restore 'count' 2.188 - 2.189 - // copy 8 bytes, part of them already loaded in O3 2.190 - __ ldx(from, 0, O4); 2.191 - __ inc(to, 8); 2.192 - __ inc(from, 8); 2.193 - __ sllx(O3, left_shift, O3); 2.194 - __ srlx(O4, right_shift, G3); 2.195 - __ bset(O3, G3); 2.196 - __ stx(G3, to, -8); 2.197 + __ dec(count, count_dec); // Pre-decrement 'count' 2.198 + __ andn(from, 7, from); // Align address 2.199 + __ ldx(from, 0, O3); 2.200 + __ inc(from, 8); 2.201 + __ sllx(O3, left_shift, O3); 2.202 + 2.203 + disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop); 2.204 + 2.205 + __ inccc(count, count_dec>>1 ); // + 8 bytes 2.206 + __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); 2.207 + __ delayed()->inc(count, count_dec>>1); // restore 'count' 2.208 + 2.209 + // copy 8 bytes, part of them already loaded in O3 2.210 + __ ldx(from, 0, O4); 2.211 + __ inc(to, 8); 2.212 + __ inc(from, 8); 2.213 + __ srlx(O4, right_shift, G3); 2.214 + __ bset(O3, G3); 2.215 + __ stx(G3, to, 
-8); 2.216 2.217 __ BIND(L_copy_last_bytes); 2.218 - __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes 2.219 - __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); 2.220 - __ delayed()->sub(from, right_shift, from); // restore address 2.221 + __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes 2.222 + __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); 2.223 + __ delayed()->sub(from, right_shift, from); // restore address 2.224 2.225 __ BIND(L_aligned_copy); 2.226 } 2.227 @@ -1348,7 +1455,7 @@ 2.228 // The compare above (count >= 23) guarantees 'count' >= 16 bytes. 2.229 // Also jump over aligned copy after the copy with shift completed. 2.230 2.231 - copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte); 2.232 + copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte); 2.233 } 2.234 2.235 // Both array are 8 bytes aligned, copy 16 bytes at a time 2.236 @@ -1576,7 +1683,7 @@ 2.237 // The compare above (count >= 11) guarantees 'count' >= 16 bytes. 2.238 // Also jump over aligned copy after the copy with shift completed. 
2.239 2.240 - copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes); 2.241 + copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes); 2.242 } 2.243 2.244 // Both array are 8 bytes aligned, copy 16 bytes at a time 2.245 @@ -1950,6 +2057,45 @@ 2.246 } 2.247 2.248 // 2.249 + // Helper methods for generate_disjoint_int_copy_core() 2.250 + // 2.251 + void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec, 2.252 + Label& L_loop, bool use_prefetch, bool use_bis) { 2.253 + 2.254 + __ align(OptoLoopAlignment); 2.255 + __ BIND(L_loop); 2.256 + if (use_prefetch) { 2.257 + if (ArraycopySrcPrefetchDistance > 0) { 2.258 + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 2.259 + } 2.260 + if (ArraycopyDstPrefetchDistance > 0) { 2.261 + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 2.262 + } 2.263 + } 2.264 + __ ldx(from, 4, O4); 2.265 + __ ldx(from, 12, G4); 2.266 + __ inc(to, 16); 2.267 + __ inc(from, 16); 2.268 + __ deccc(count, 4); // Can we do next iteration after this one? 2.269 + 2.270 + __ srlx(O4, 32, G3); 2.271 + __ bset(G3, O3); 2.272 + __ sllx(O4, 32, O4); 2.273 + __ srlx(G4, 32, G3); 2.274 + __ bset(G3, O4); 2.275 + if (use_bis) { 2.276 + __ stxa(O3, to, -16); 2.277 + __ stxa(O4, to, -8); 2.278 + } else { 2.279 + __ stx(O3, to, -16); 2.280 + __ stx(O4, to, -8); 2.281 + } 2.282 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2.283 + __ delayed()->sllx(G4, 32, O3); 2.284 + 2.285 + } 2.286 + 2.287 + // 2.288 // Generate core code for disjoint int copy (and oop copy on 32-bit). 2.289 // If "aligned" is true, the "from" and "to" addresses are assumed 2.290 // to be heapword aligned. 
2.291 @@ -1962,7 +2108,7 @@ 2.292 void generate_disjoint_int_copy_core(bool aligned) { 2.293 2.294 Label L_skip_alignment, L_aligned_copy; 2.295 - Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 2.296 + Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 2.297 2.298 const Register from = O0; // source array address 2.299 const Register to = O1; // destination array address 2.300 @@ -2013,30 +2159,16 @@ 2.301 2.302 // copy with shift 4 elements (16 bytes) at a time 2.303 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 2.304 - 2.305 - __ align(OptoLoopAlignment); 2.306 - __ BIND(L_copy_16_bytes); 2.307 - __ ldx(from, 4, O4); 2.308 - __ deccc(count, 4); // Can we do next iteration after this one? 2.309 - __ ldx(from, 12, G4); 2.310 - __ inc(to, 16); 2.311 - __ inc(from, 16); 2.312 - __ sllx(O3, 32, O3); 2.313 - __ srlx(O4, 32, G3); 2.314 - __ bset(G3, O3); 2.315 - __ stx(O3, to, -16); 2.316 - __ sllx(O4, 32, O4); 2.317 - __ srlx(G4, 32, G3); 2.318 - __ bset(G3, O4); 2.319 - __ stx(O4, to, -8); 2.320 - __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2.321 - __ delayed()->mov(G4, O3); 2.322 + __ sllx(O3, 32, O3); 2.323 + 2.324 + disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop); 2.325 2.326 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2.327 __ delayed()->inc(count, 4); // restore 'count' 2.328 2.329 __ BIND(L_aligned_copy); 2.330 - } 2.331 + } // !aligned 2.332 + 2.333 // copy 4 elements (16 bytes) at a time 2.334 __ and3(count, 1, G4); // Save 2.335 __ srl(count, 1, count); 2.336 @@ -2223,6 +2355,38 @@ 2.337 } 2.338 2.339 // 2.340 + // Helper methods for generate_disjoint_long_copy_core() 2.341 + // 2.342 + void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec, 2.343 + Label& L_loop, bool use_prefetch, bool use_bis) { 2.344 + __ align(OptoLoopAlignment); 2.345 + __ BIND(L_loop); 2.346 + for (int off = 0; off < 64; off += 16) { 2.347 + if 
(use_prefetch && (off & 31) == 0) { 2.348 + if (ArraycopySrcPrefetchDistance > 0) { 2.349 + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 2.350 + } 2.351 + if (ArraycopyDstPrefetchDistance > 0) { 2.352 + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 2.353 + } 2.354 + } 2.355 + __ ldx(from, off+0, O4); 2.356 + __ ldx(from, off+8, O5); 2.357 + if (use_bis) { 2.358 + __ stxa(O4, to, off+0); 2.359 + __ stxa(O5, to, off+8); 2.360 + } else { 2.361 + __ stx(O4, to, off+0); 2.362 + __ stx(O5, to, off+8); 2.363 + } 2.364 + } 2.365 + __ deccc(count, 8); 2.366 + __ inc(from, 64); 2.367 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2.368 + __ delayed()->inc(to, 64); 2.369 + } 2.370 + 2.371 + // 2.372 // Generate core code for disjoint long copy (and oop copy on 64-bit). 2.373 // "aligned" is ignored, because we must make the stronger 2.374 // assumption that both addresses are always 64-bit aligned. 2.375 @@ -2261,38 +2425,28 @@ 2.376 const Register offset0 = O4; // element offset 2.377 const Register offset8 = O5; // next element offset 2.378 2.379 - __ deccc(count, 2); 2.380 - __ mov(G0, offset0); // offset from start of arrays (0) 2.381 - __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2.382 - __ delayed()->add(offset0, 8, offset8); 2.383 + __ deccc(count, 2); 2.384 + __ mov(G0, offset0); // offset from start of arrays (0) 2.385 + __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2.386 + __ delayed()->add(offset0, 8, offset8); 2.387 2.388 // Copy by 64 bytes chunks 2.389 - Label L_copy_64_bytes; 2.390 + 2.391 const Register from64 = O3; // source address 2.392 const Register to64 = G3; // destination address 2.393 - __ subcc(count, 6, O3); 2.394 - __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 2.395 - __ delayed()->mov(to, to64); 2.396 - // Now we can use O4(offset0), O5(offset8) as temps 2.397 - __ mov(O3, count); 2.398 - __ 
mov(from, from64); 2.399 - 2.400 - __ align(OptoLoopAlignment); 2.401 - __ BIND(L_copy_64_bytes); 2.402 - for( int off = 0; off < 64; off += 16 ) { 2.403 - __ ldx(from64, off+0, O4); 2.404 - __ ldx(from64, off+8, O5); 2.405 - __ stx(O4, to64, off+0); 2.406 - __ stx(O5, to64, off+8); 2.407 - } 2.408 - __ deccc(count, 8); 2.409 - __ inc(from64, 64); 2.410 - __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes); 2.411 - __ delayed()->inc(to64, 64); 2.412 + __ subcc(count, 6, O3); 2.413 + __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 2.414 + __ delayed()->mov(to, to64); 2.415 + // Now we can use O4(offset0), O5(offset8) as temps 2.416 + __ mov(O3, count); 2.417 + // count >= 0 (original count - 8) 2.418 + __ mov(from, from64); 2.419 + 2.420 + disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop); 2.421 2.422 // Restore O4(offset0), O5(offset8) 2.423 __ sub(from64, from, offset0); 2.424 - __ inccc(count, 6); 2.425 + __ inccc(count, 6); // restore count 2.426 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2.427 __ delayed()->add(offset0, 8, offset8); 2.428
3.1 --- a/src/cpu/sparc/vm/vm_version_sparc.cpp Fri Sep 02 20:58:21 2011 -0700 3.2 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp Sat Sep 03 09:56:57 2011 -0700 3.3 @@ -75,6 +75,24 @@ 3.4 FLAG_SET_DEFAULT(AllocatePrefetchStyle, 1); 3.5 } 3.6 3.7 + if (has_v9()) { 3.8 + assert(ArraycopySrcPrefetchDistance < 4096, "invalid value"); 3.9 + if (ArraycopySrcPrefetchDistance >= 4096) 3.10 + ArraycopySrcPrefetchDistance = 4064; 3.11 + assert(ArraycopyDstPrefetchDistance < 4096, "invalid value"); 3.12 + if (ArraycopyDstPrefetchDistance >= 4096) 3.13 + ArraycopyDstPrefetchDistance = 4064; 3.14 + } else { 3.15 + if (ArraycopySrcPrefetchDistance > 0) { 3.16 + warning("prefetch instructions are not available on this CPU"); 3.17 + FLAG_SET_DEFAULT(ArraycopySrcPrefetchDistance, 0); 3.18 + } 3.19 + if (ArraycopyDstPrefetchDistance > 0) { 3.20 + warning("prefetch instructions are not available on this CPU"); 3.21 + FLAG_SET_DEFAULT(ArraycopyDstPrefetchDistance, 0); 3.22 + } 3.23 + } 3.24 + 3.25 UseSSE = 0; // Only on x86 and x64 3.26 3.27 _supports_cx8 = has_v9(); 3.28 @@ -180,6 +198,16 @@ 3.29 FLAG_SET_DEFAULT(UseBlockZeroing, false); 3.30 } 3.31 3.32 + assert(BlockCopyLowLimit > 0, "invalid value"); 3.33 + if (has_block_zeroing()) { // has_blk_init() && is_T4(): core's local L2 cache 3.34 + if (FLAG_IS_DEFAULT(UseBlockCopy)) { 3.35 + FLAG_SET_DEFAULT(UseBlockCopy, true); 3.36 + } 3.37 + } else if (UseBlockCopy) { 3.38 + warning("BIS instructions are not available or expensive on this CPU"); 3.39 + FLAG_SET_DEFAULT(UseBlockCopy, false); 3.40 + } 3.41 + 3.42 #ifdef COMPILER2 3.43 // T4 and newer Sparc cpus have fast RDPC. 3.44 if (has_fast_rdpc() && FLAG_IS_DEFAULT(UseRDPCForConstantTableBase)) {
4.1 --- a/src/share/vm/runtime/globals.hpp Fri Sep 02 20:58:21 2011 -0700 4.2 +++ b/src/share/vm/runtime/globals.hpp Sat Sep 03 09:56:57 2011 -0700 4.3 @@ -1985,6 +1985,12 @@ 4.4 product(intx, BlockZeroingLowLimit, 2048, \ 4.5 "Minimum size in bytes when block zeroing will be used") \ 4.6 \ 4.7 + product(bool, UseBlockCopy, false, \ 4.8 + "Use special cpu instructions for block copy") \ 4.9 + \ 4.10 + product(intx, BlockCopyLowLimit, 2048, \ 4.11 + "Minimum size in bytes when block copy will be used") \ 4.12 + \ 4.13 product(bool, PrintRevisitStats, false, \ 4.14 "Print revisit (klass and MDO) stack related information") \ 4.15 \ 4.16 @@ -2918,6 +2924,12 @@ 4.17 product(intx, ReadPrefetchInstr, 0, \ 4.18 "Prefetch instruction to prefetch ahead") \ 4.19 \ 4.20 + product(uintx, ArraycopySrcPrefetchDistance, 0, \ 4.21 + "Distance to prefetch source array in arraycopy") \ 4.22 + \ 4.23 + product(uintx, ArraycopyDstPrefetchDistance, 0, \ 4.24 + "Distance to prefetch destination array in arraycopy") \ 4.25 + \ 4.26 /* deoptimization */ \ 4.27 develop(bool, TraceDeoptimization, false, \ 4.28 "Trace deoptimization") \