Fri, 02 Sep 2011 12:13:33 -0700
7039731: arraycopy could use prefetch on SPARC
Summary: Use BIS and prefetch in arraycopy stubs for SPARC (BIS for T4 only).
Reviewed-by: never, iveresov
1.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Fri Sep 02 04:28:59 2011 -0700 1.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Fri Sep 02 12:13:33 2011 -0700 1.3 @@ -1124,6 +1124,126 @@ 1.4 } 1.5 } 1.6 1.7 + // 1.8 + // Generate main code for disjoint arraycopy 1.9 + // 1.10 + typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec, 1.11 + Label& L_loop, bool use_prefetch, bool use_bis); 1.12 + 1.13 + void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size, 1.14 + int iter_size, CopyLoopFunc copy_loop_func) { 1.15 + Label L_copy; 1.16 + 1.17 + assert(log2_elem_size <= 3, "the following code should be changed"); 1.18 + int count_dec = 16>>log2_elem_size; 1.19 + 1.20 + int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance); 1.21 + assert(prefetch_dist < 4096, "invalid value"); 1.22 + prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size 1.23 + int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count 1.24 + 1.25 + if (UseBlockCopy) { 1.26 + Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy; 1.27 + 1.28 + // 64 bytes tail + bytes copied in one loop iteration 1.29 + int tail_size = 64 + iter_size; 1.30 + int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size; 1.31 + // Use BIS copy only for big arrays since it requires membar. 1.32 + __ set(block_copy_count, O4); 1.33 + __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy); 1.34 + // This code is for disjoint source and destination: 1.35 + // to <= from || to >= from+count 1.36 + // but BIS will stomp over 'from' if (to > from-tail_size && to <= from) 1.37 + __ sub(from, to, O4); 1.38 + __ srax(O4, 4, O4); // divide by 16 since following short branch have only 5 bits for imm. 
1.39 + __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy); 1.40 + 1.41 + __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY); 1.42 + // BIS should not be used to copy tail (64 bytes+iter_size) 1.43 + // to avoid zeroing of following values. 1.44 + __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0 1.45 + 1.46 + if (prefetch_count > 0) { // rounded up to one iteration count 1.47 + // Do prefetching only if copy size is bigger 1.48 + // than prefetch distance. 1.49 + __ set(prefetch_count, O4); 1.50 + __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy); 1.51 + __ sub(count, prefetch_count, count); 1.52 + 1.53 + (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true); 1.54 + __ add(count, prefetch_count, count); // restore count 1.55 + 1.56 + } // prefetch_count > 0 1.57 + 1.58 + (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true); 1.59 + __ add(count, (tail_size>>log2_elem_size), count); // restore count 1.60 + 1.61 + __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT); 1.62 + // BIS needs membar. 1.63 + __ membar(Assembler::StoreLoad); 1.64 + // Copy tail 1.65 + __ ba_short(L_copy); 1.66 + 1.67 + __ BIND(L_skip_block_copy); 1.68 + } // UseBlockCopy 1.69 + 1.70 + if (prefetch_count > 0) { // rounded up to one iteration count 1.71 + // Do prefetching only if copy size is bigger 1.72 + // than prefetch distance. 
1.73 + __ set(prefetch_count, O4); 1.74 + __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy); 1.75 + __ sub(count, prefetch_count, count); 1.76 + 1.77 + Label L_copy_prefetch; 1.78 + (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false); 1.79 + __ add(count, prefetch_count, count); // restore count 1.80 + 1.81 + } // prefetch_count > 0 1.82 + 1.83 + (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false); 1.84 + } 1.85 + 1.86 + 1.87 + 1.88 + // 1.89 + // Helper methods for copy_16_bytes_forward_with_shift() 1.90 + // 1.91 + void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec, 1.92 + Label& L_loop, bool use_prefetch, bool use_bis) { 1.93 + 1.94 + const Register left_shift = G1; // left shift bit counter 1.95 + const Register right_shift = G5; // right shift bit counter 1.96 + 1.97 + __ align(OptoLoopAlignment); 1.98 + __ BIND(L_loop); 1.99 + if (use_prefetch) { 1.100 + if (ArraycopySrcPrefetchDistance > 0) { 1.101 + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 1.102 + } 1.103 + if (ArraycopyDstPrefetchDistance > 0) { 1.104 + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 1.105 + } 1.106 + } 1.107 + __ ldx(from, 0, O4); 1.108 + __ ldx(from, 8, G4); 1.109 + __ inc(to, 16); 1.110 + __ inc(from, 16); 1.111 + __ deccc(count, count_dec); // Can we do next iteration after this one? 
1.112 + __ srlx(O4, right_shift, G3); 1.113 + __ bset(G3, O3); 1.114 + __ sllx(O4, left_shift, O4); 1.115 + __ srlx(G4, right_shift, G3); 1.116 + __ bset(G3, O4); 1.117 + if (use_bis) { 1.118 + __ stxa(O3, to, -16); 1.119 + __ stxa(O4, to, -8); 1.120 + } else { 1.121 + __ stx(O3, to, -16); 1.122 + __ stx(O4, to, -8); 1.123 + } 1.124 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1.125 + __ delayed()->sllx(G4, left_shift, O3); 1.126 + } 1.127 1.128 // Copy big chunks forward with shift 1.129 // 1.130 @@ -1135,64 +1255,51 @@ 1.131 // L_copy_bytes - copy exit label 1.132 // 1.133 void copy_16_bytes_forward_with_shift(Register from, Register to, 1.134 - Register count, int count_dec, Label& L_copy_bytes) { 1.135 - Label L_loop, L_aligned_copy, L_copy_last_bytes; 1.136 + Register count, int log2_elem_size, Label& L_copy_bytes) { 1.137 + Label L_aligned_copy, L_copy_last_bytes; 1.138 + assert(log2_elem_size <= 3, "the following code should be changed"); 1.139 + int count_dec = 16>>log2_elem_size; 1.140 1.141 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy 1.142 - __ andcc(from, 7, G1); // misaligned bytes 1.143 - __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1.144 - __ delayed()->nop(); 1.145 + __ andcc(from, 7, G1); // misaligned bytes 1.146 + __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1.147 + __ delayed()->nop(); 1.148 1.149 const Register left_shift = G1; // left shift bit counter 1.150 const Register right_shift = G5; // right shift bit counter 1.151 1.152 - __ sll(G1, LogBitsPerByte, left_shift); 1.153 - __ mov(64, right_shift); 1.154 - __ sub(right_shift, left_shift, right_shift); 1.155 + __ sll(G1, LogBitsPerByte, left_shift); 1.156 + __ mov(64, right_shift); 1.157 + __ sub(right_shift, left_shift, right_shift); 1.158 1.159 // 1.160 // Load 2 aligned 8-bytes chunks and use one from previous iteration 1.161 // to form 2 aligned 8-bytes chunks to store. 
1.162 // 1.163 - __ deccc(count, count_dec); // Pre-decrement 'count' 1.164 - __ andn(from, 7, from); // Align address 1.165 - __ ldx(from, 0, O3); 1.166 - __ inc(from, 8); 1.167 - __ align(OptoLoopAlignment); 1.168 - __ BIND(L_loop); 1.169 - __ ldx(from, 0, O4); 1.170 - __ deccc(count, count_dec); // Can we do next iteration after this one? 1.171 - __ ldx(from, 8, G4); 1.172 - __ inc(to, 16); 1.173 - __ inc(from, 16); 1.174 - __ sllx(O3, left_shift, O3); 1.175 - __ srlx(O4, right_shift, G3); 1.176 - __ bset(G3, O3); 1.177 - __ stx(O3, to, -16); 1.178 - __ sllx(O4, left_shift, O4); 1.179 - __ srlx(G4, right_shift, G3); 1.180 - __ bset(G3, O4); 1.181 - __ stx(O4, to, -8); 1.182 - __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1.183 - __ delayed()->mov(G4, O3); 1.184 - 1.185 - __ inccc(count, count_dec>>1 ); // + 8 bytes 1.186 - __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); 1.187 - __ delayed()->inc(count, count_dec>>1); // restore 'count' 1.188 - 1.189 - // copy 8 bytes, part of them already loaded in O3 1.190 - __ ldx(from, 0, O4); 1.191 - __ inc(to, 8); 1.192 - __ inc(from, 8); 1.193 - __ sllx(O3, left_shift, O3); 1.194 - __ srlx(O4, right_shift, G3); 1.195 - __ bset(O3, G3); 1.196 - __ stx(G3, to, -8); 1.197 + __ dec(count, count_dec); // Pre-decrement 'count' 1.198 + __ andn(from, 7, from); // Align address 1.199 + __ ldx(from, 0, O3); 1.200 + __ inc(from, 8); 1.201 + __ sllx(O3, left_shift, O3); 1.202 + 1.203 + disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop); 1.204 + 1.205 + __ inccc(count, count_dec>>1 ); // + 8 bytes 1.206 + __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); 1.207 + __ delayed()->inc(count, count_dec>>1); // restore 'count' 1.208 + 1.209 + // copy 8 bytes, part of them already loaded in O3 1.210 + __ ldx(from, 0, O4); 1.211 + __ inc(to, 8); 1.212 + __ inc(from, 8); 1.213 + __ srlx(O4, right_shift, G3); 1.214 + __ bset(O3, G3); 1.215 + __ stx(G3, to, 
-8); 1.216 1.217 __ BIND(L_copy_last_bytes); 1.218 - __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes 1.219 - __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); 1.220 - __ delayed()->sub(from, right_shift, from); // restore address 1.221 + __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes 1.222 + __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); 1.223 + __ delayed()->sub(from, right_shift, from); // restore address 1.224 1.225 __ BIND(L_aligned_copy); 1.226 } 1.227 @@ -1348,7 +1455,7 @@ 1.228 // The compare above (count >= 23) guarantes 'count' >= 16 bytes. 1.229 // Also jump over aligned copy after the copy with shift completed. 1.230 1.231 - copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte); 1.232 + copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte); 1.233 } 1.234 1.235 // Both array are 8 bytes aligned, copy 16 bytes at a time 1.236 @@ -1576,7 +1683,7 @@ 1.237 // The compare above (count >= 11) guarantes 'count' >= 16 bytes. 1.238 // Also jump over aligned copy after the copy with shift completed. 
1.239 1.240 - copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes); 1.241 + copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes); 1.242 } 1.243 1.244 // Both array are 8 bytes aligned, copy 16 bytes at a time 1.245 @@ -1950,6 +2057,45 @@ 1.246 } 1.247 1.248 // 1.249 + // Helper methods for generate_disjoint_int_copy_core() 1.250 + // 1.251 + void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec, 1.252 + Label& L_loop, bool use_prefetch, bool use_bis) { 1.253 + 1.254 + __ align(OptoLoopAlignment); 1.255 + __ BIND(L_loop); 1.256 + if (use_prefetch) { 1.257 + if (ArraycopySrcPrefetchDistance > 0) { 1.258 + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 1.259 + } 1.260 + if (ArraycopyDstPrefetchDistance > 0) { 1.261 + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 1.262 + } 1.263 + } 1.264 + __ ldx(from, 4, O4); 1.265 + __ ldx(from, 12, G4); 1.266 + __ inc(to, 16); 1.267 + __ inc(from, 16); 1.268 + __ deccc(count, 4); // Can we do next iteration after this one? 1.269 + 1.270 + __ srlx(O4, 32, G3); 1.271 + __ bset(G3, O3); 1.272 + __ sllx(O4, 32, O4); 1.273 + __ srlx(G4, 32, G3); 1.274 + __ bset(G3, O4); 1.275 + if (use_bis) { 1.276 + __ stxa(O3, to, -16); 1.277 + __ stxa(O4, to, -8); 1.278 + } else { 1.279 + __ stx(O3, to, -16); 1.280 + __ stx(O4, to, -8); 1.281 + } 1.282 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1.283 + __ delayed()->sllx(G4, 32, O3); 1.284 + 1.285 + } 1.286 + 1.287 + // 1.288 // Generate core code for disjoint int copy (and oop copy on 32-bit). 1.289 // If "aligned" is true, the "from" and "to" addresses are assumed 1.290 // to be heapword aligned. 
1.291 @@ -1962,7 +2108,7 @@ 1.292 void generate_disjoint_int_copy_core(bool aligned) { 1.293 1.294 Label L_skip_alignment, L_aligned_copy; 1.295 - Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 1.296 + Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 1.297 1.298 const Register from = O0; // source array address 1.299 const Register to = O1; // destination array address 1.300 @@ -2013,30 +2159,16 @@ 1.301 1.302 // copy with shift 4 elements (16 bytes) at a time 1.303 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 1.304 - 1.305 - __ align(OptoLoopAlignment); 1.306 - __ BIND(L_copy_16_bytes); 1.307 - __ ldx(from, 4, O4); 1.308 - __ deccc(count, 4); // Can we do next iteration after this one? 1.309 - __ ldx(from, 12, G4); 1.310 - __ inc(to, 16); 1.311 - __ inc(from, 16); 1.312 - __ sllx(O3, 32, O3); 1.313 - __ srlx(O4, 32, G3); 1.314 - __ bset(G3, O3); 1.315 - __ stx(O3, to, -16); 1.316 - __ sllx(O4, 32, O4); 1.317 - __ srlx(G4, 32, G3); 1.318 - __ bset(G3, O4); 1.319 - __ stx(O4, to, -8); 1.320 - __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 1.321 - __ delayed()->mov(G4, O3); 1.322 + __ sllx(O3, 32, O3); 1.323 + 1.324 + disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop); 1.325 1.326 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 1.327 __ delayed()->inc(count, 4); // restore 'count' 1.328 1.329 __ BIND(L_aligned_copy); 1.330 - } 1.331 + } // !aligned 1.332 + 1.333 // copy 4 elements (16 bytes) at a time 1.334 __ and3(count, 1, G4); // Save 1.335 __ srl(count, 1, count); 1.336 @@ -2223,6 +2355,38 @@ 1.337 } 1.338 1.339 // 1.340 + // Helper methods for generate_disjoint_long_copy_core() 1.341 + // 1.342 + void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec, 1.343 + Label& L_loop, bool use_prefetch, bool use_bis) { 1.344 + __ align(OptoLoopAlignment); 1.345 + __ BIND(L_loop); 1.346 + for (int off = 0; off < 64; off += 16) { 1.347 + if 
(use_prefetch && (off & 31) == 0) { 1.348 + if (ArraycopySrcPrefetchDistance > 0) { 1.349 + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 1.350 + } 1.351 + if (ArraycopyDstPrefetchDistance > 0) { 1.352 + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 1.353 + } 1.354 + } 1.355 + __ ldx(from, off+0, O4); 1.356 + __ ldx(from, off+8, O5); 1.357 + if (use_bis) { 1.358 + __ stxa(O4, to, off+0); 1.359 + __ stxa(O5, to, off+8); 1.360 + } else { 1.361 + __ stx(O4, to, off+0); 1.362 + __ stx(O5, to, off+8); 1.363 + } 1.364 + } 1.365 + __ deccc(count, 8); 1.366 + __ inc(from, 64); 1.367 + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1.368 + __ delayed()->inc(to, 64); 1.369 + } 1.370 + 1.371 + // 1.372 // Generate core code for disjoint long copy (and oop copy on 64-bit). 1.373 // "aligned" is ignored, because we must make the stronger 1.374 // assumption that both addresses are always 64-bit aligned. 1.375 @@ -2261,38 +2425,28 @@ 1.376 const Register offset0 = O4; // element offset 1.377 const Register offset8 = O5; // next element offset 1.378 1.379 - __ deccc(count, 2); 1.380 - __ mov(G0, offset0); // offset from start of arrays (0) 1.381 - __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 1.382 - __ delayed()->add(offset0, 8, offset8); 1.383 + __ deccc(count, 2); 1.384 + __ mov(G0, offset0); // offset from start of arrays (0) 1.385 + __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 1.386 + __ delayed()->add(offset0, 8, offset8); 1.387 1.388 // Copy by 64 bytes chunks 1.389 - Label L_copy_64_bytes; 1.390 + 1.391 const Register from64 = O3; // source address 1.392 const Register to64 = G3; // destination address 1.393 - __ subcc(count, 6, O3); 1.394 - __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 1.395 - __ delayed()->mov(to, to64); 1.396 - // Now we can use O4(offset0), O5(offset8) as temps 1.397 - __ mov(O3, count); 1.398 - __ 
mov(from, from64); 1.399 - 1.400 - __ align(OptoLoopAlignment); 1.401 - __ BIND(L_copy_64_bytes); 1.402 - for( int off = 0; off < 64; off += 16 ) { 1.403 - __ ldx(from64, off+0, O4); 1.404 - __ ldx(from64, off+8, O5); 1.405 - __ stx(O4, to64, off+0); 1.406 - __ stx(O5, to64, off+8); 1.407 - } 1.408 - __ deccc(count, 8); 1.409 - __ inc(from64, 64); 1.410 - __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes); 1.411 - __ delayed()->inc(to64, 64); 1.412 + __ subcc(count, 6, O3); 1.413 + __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 1.414 + __ delayed()->mov(to, to64); 1.415 + // Now we can use O4(offset0), O5(offset8) as temps 1.416 + __ mov(O3, count); 1.417 + // count >= 0 (original count - 8) 1.418 + __ mov(from, from64); 1.419 + 1.420 + disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop); 1.421 1.422 // Restore O4(offset0), O5(offset8) 1.423 __ sub(from64, from, offset0); 1.424 - __ inccc(count, 6); 1.425 + __ inccc(count, 6); // restore count 1.426 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 1.427 __ delayed()->add(offset0, 8, offset8); 1.428
2.1 --- a/src/cpu/sparc/vm/vm_version_sparc.cpp Fri Sep 02 04:28:59 2011 -0700 2.2 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp Fri Sep 02 12:13:33 2011 -0700 2.3 @@ -75,6 +75,24 @@ 2.4 FLAG_SET_DEFAULT(AllocatePrefetchStyle, 1); 2.5 } 2.6 2.7 + if (has_v9()) { 2.8 + assert(ArraycopySrcPrefetchDistance < 4096, "invalid value"); 2.9 + if (ArraycopySrcPrefetchDistance >= 4096) 2.10 + ArraycopySrcPrefetchDistance = 4064; 2.11 + assert(ArraycopyDstPrefetchDistance < 4096, "invalid value"); 2.12 + if (ArraycopyDstPrefetchDistance >= 4096) 2.13 + ArraycopyDstPrefetchDistance = 4064; 2.14 + } else { 2.15 + if (ArraycopySrcPrefetchDistance > 0) { 2.16 + warning("prefetch instructions are not available on this CPU"); 2.17 + FLAG_SET_DEFAULT(ArraycopySrcPrefetchDistance, 0); 2.18 + } 2.19 + if (ArraycopyDstPrefetchDistance > 0) { 2.20 + warning("prefetch instructions are not available on this CPU"); 2.21 + FLAG_SET_DEFAULT(ArraycopyDstPrefetchDistance, 0); 2.22 + } 2.23 + } 2.24 + 2.25 UseSSE = 0; // Only on x86 and x64 2.26 2.27 _supports_cx8 = has_v9(); 2.28 @@ -180,6 +198,16 @@ 2.29 FLAG_SET_DEFAULT(UseBlockZeroing, false); 2.30 } 2.31 2.32 + assert(BlockCopyLowLimit > 0, "invalid value"); 2.33 + if (has_block_zeroing()) { // has_blk_init() && is_T4(): core's local L2 cache 2.34 + if (FLAG_IS_DEFAULT(UseBlockCopy)) { 2.35 + FLAG_SET_DEFAULT(UseBlockCopy, true); 2.36 + } 2.37 + } else if (UseBlockCopy) { 2.38 + warning("BIS instructions are not available or expensive on this CPU"); 2.39 + FLAG_SET_DEFAULT(UseBlockCopy, false); 2.40 + } 2.41 + 2.42 #ifdef COMPILER2 2.43 // T4 and newer Sparc cpus have fast RDPC. 2.44 if (has_fast_rdpc() && FLAG_IS_DEFAULT(UseRDPCForConstantTableBase)) {
3.1 --- a/src/share/vm/runtime/globals.hpp Fri Sep 02 04:28:59 2011 -0700 3.2 +++ b/src/share/vm/runtime/globals.hpp Fri Sep 02 12:13:33 2011 -0700 3.3 @@ -1985,6 +1985,12 @@ 3.4 product(intx, BlockZeroingLowLimit, 2048, \ 3.5 "Minimum size in bytes when block zeroing will be used") \ 3.6 \ 3.7 + product(bool, UseBlockCopy, false, \ 3.8 + "Use special cpu instructions for block copy") \ 3.9 + \ 3.10 + product(intx, BlockCopyLowLimit, 2048, \ 3.11 + "Minimum size in bytes when block copy will be used") \ 3.12 + \ 3.13 product(bool, PrintRevisitStats, false, \ 3.14 "Print revisit (klass and MDO) stack related information") \ 3.15 \ 3.16 @@ -2918,6 +2924,12 @@ 3.17 product(intx, ReadPrefetchInstr, 0, \ 3.18 "Prefetch instruction to prefetch ahead") \ 3.19 \ 3.20 + product(uintx, ArraycopySrcPrefetchDistance, 0, \ 3.21 + "Distance to prefetch source array in arraycopy") \ 3.22 + \ 3.23 + product(uintx, ArraycopyDstPrefetchDistance, 0, \ 3.24 + "Distance to prefetch destination array in arraycopy") \ 3.25 + \ 3.26 /* deoptimization */ \ 3.27 develop(bool, TraceDeoptimization, false, \ 3.28 "Trace deoptimization") \