Tue, 04 Aug 2009 16:00:17 -0700
6819077: G1: first GC thread coming late into the GC.
Summary: The first worker thread is delayed when entering the GC because it clears the card count table that is used in identifying hot cards. Replace the card count table with a dynamically sized evicting hash table that includes an epoch-based counter.
Reviewed-by: iveresov, tonyp
1.1 --- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp Mon Aug 03 12:59:30 2009 -0700 1.2 +++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp Tue Aug 04 16:00:17 2009 -0700 1.3 @@ -25,11 +25,21 @@ 1.4 #include "incls/_precompiled.incl" 1.5 #include "incls/_concurrentG1Refine.cpp.incl" 1.6 1.7 +// Possible sizes for the card counts cache: odd primes that roughly double in size. 1.8 +// (See jvmtiTagMap.cpp). 1.9 +int ConcurrentG1Refine::_cc_cache_sizes[] = { 1.10 + 16381, 32771, 76831, 150001, 307261, 1.11 + 614563, 1228891, 2457733, 4915219, 9830479, 1.12 + 19660831, 39321619, 78643219, 157286461, -1 1.13 + }; 1.14 + 1.15 ConcurrentG1Refine::ConcurrentG1Refine() : 1.16 - _card_counts(NULL), _cur_card_count_histo(NULL), _cum_card_count_histo(NULL), 1.17 + _card_counts(NULL), _card_epochs(NULL), 1.18 + _n_card_counts(0), _max_n_card_counts(0), 1.19 + _cache_size_index(0), _expand_card_counts(false), 1.20 _hot_cache(NULL), 1.21 _def_use_cache(false), _use_cache(false), 1.22 - _n_periods(0), _total_cards(0), _total_travs(0), 1.23 + _n_periods(0), 1.24 _threads(NULL), _n_threads(0) 1.25 { 1.26 if (G1ConcRefine) { 1.27 @@ -57,26 +67,39 @@ 1.28 } 1.29 1.30 void ConcurrentG1Refine::init() { 1.31 - G1CollectedHeap* g1h = G1CollectedHeap::heap(); 1.32 - if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) { 1.33 - _n_card_counts = 1.34 - (unsigned) (g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift); 1.35 - _card_counts = NEW_C_HEAP_ARRAY(unsigned char, _n_card_counts); 1.36 - for (size_t i = 0; i < _n_card_counts; i++) _card_counts[i] = 0; 1.37 - ModRefBarrierSet* bs = g1h->mr_bs(); 1.38 + if (G1ConcRSLogCacheSize > 0) { 1.39 + _g1h = G1CollectedHeap::heap(); 1.40 + _max_n_card_counts = 1.41 + (unsigned) (_g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift); 1.42 + 1.43 + size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1; 1.44 + guarantee(_max_n_card_counts < max_card_num, "card_num representation"); 
1.45 + 1.46 + int desired = _max_n_card_counts / InitialCacheFraction; 1.47 + for (_cache_size_index = 0; 1.48 + _cc_cache_sizes[_cache_size_index] >= 0; _cache_size_index++) { 1.49 + if (_cc_cache_sizes[_cache_size_index] >= desired) break; 1.50 + } 1.51 + _cache_size_index = MAX2(0, (_cache_size_index - 1)); 1.52 + 1.53 + int initial_size = _cc_cache_sizes[_cache_size_index]; 1.54 + if (initial_size < 0) initial_size = _max_n_card_counts; 1.55 + 1.56 + // Make sure we don't go bigger than we will ever need 1.57 + _n_card_counts = MIN2((unsigned) initial_size, _max_n_card_counts); 1.58 + 1.59 + _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts); 1.60 + _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts); 1.61 + 1.62 + Copy::fill_to_bytes(&_card_counts[0], 1.63 + _n_card_counts * sizeof(CardCountCacheEntry)); 1.64 + Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry)); 1.65 + 1.66 + ModRefBarrierSet* bs = _g1h->mr_bs(); 1.67 guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition"); 1.68 - CardTableModRefBS* ctbs = (CardTableModRefBS*)bs; 1.69 - _ct_bot = ctbs->byte_for_const(g1h->reserved_region().start()); 1.70 - if (G1ConcRSCountTraversals) { 1.71 - _cur_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256); 1.72 - _cum_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256); 1.73 - for (int i = 0; i < 256; i++) { 1.74 - _cur_card_count_histo[i] = 0; 1.75 - _cum_card_count_histo[i] = 0; 1.76 - } 1.77 - } 1.78 - } 1.79 - if (G1ConcRSLogCacheSize > 0) { 1.80 + _ct_bs = (CardTableModRefBS*)bs; 1.81 + _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start()); 1.82 + 1.83 _def_use_cache = true; 1.84 _use_cache = true; 1.85 _hot_cache_size = (1 << G1ConcRSLogCacheSize); 1.86 @@ -86,7 +109,7 @@ 1.87 1.88 // For refining the cards in the hot cache in parallel 1.89 int n_workers = (ParallelGCThreads > 0 ? 
1.90 - g1h->workers()->total_workers() : 1); 1.91 + _g1h->workers()->total_workers() : 1); 1.92 _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers); 1.93 _hot_cache_par_claimed_idx = 0; 1.94 } 1.95 @@ -101,15 +124,11 @@ 1.96 } 1.97 1.98 ConcurrentG1Refine::~ConcurrentG1Refine() { 1.99 - if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) { 1.100 + if (G1ConcRSLogCacheSize > 0) { 1.101 assert(_card_counts != NULL, "Logic"); 1.102 - FREE_C_HEAP_ARRAY(unsigned char, _card_counts); 1.103 - assert(_cur_card_count_histo != NULL, "Logic"); 1.104 - FREE_C_HEAP_ARRAY(unsigned, _cur_card_count_histo); 1.105 - assert(_cum_card_count_histo != NULL, "Logic"); 1.106 - FREE_C_HEAP_ARRAY(unsigned, _cum_card_count_histo); 1.107 - } 1.108 - if (G1ConcRSLogCacheSize > 0) { 1.109 + FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts); 1.110 + assert(_card_epochs != NULL, "Logic"); 1.111 + FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs); 1.112 assert(_hot_cache != NULL, "Logic"); 1.113 FREE_C_HEAP_ARRAY(jbyte*, _hot_cache); 1.114 } 1.115 @@ -129,42 +148,165 @@ 1.116 } 1.117 } 1.118 1.119 - 1.120 -int ConcurrentG1Refine::add_card_count(jbyte* card_ptr) { 1.121 - size_t card_num = (card_ptr - _ct_bot); 1.122 - guarantee(0 <= card_num && card_num < _n_card_counts, "Bounds"); 1.123 - unsigned char cnt = _card_counts[card_num]; 1.124 - if (cnt < 255) _card_counts[card_num]++; 1.125 - return cnt; 1.126 - _total_travs++; 1.127 +bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) { 1.128 + HeapWord* start = _ct_bs->addr_for(card_ptr); 1.129 + HeapRegion* r = _g1h->heap_region_containing(start); 1.130 + if (r != NULL && r->is_young()) { 1.131 + return true; 1.132 + } 1.133 + // This card is not associated with a heap region 1.134 + // so can't be young. 1.135 + return false; 1.136 } 1.137 1.138 -jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr) { 1.139 - int count = add_card_count(card_ptr); 1.140 - // Count previously unvisited cards. 
1.141 - if (count == 0) _total_cards++; 1.142 - // We'll assume a traversal unless we store it in the cache. 1.143 +jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) { 1.144 + unsigned new_card_num = ptr_2_card_num(card_ptr); 1.145 + unsigned bucket = hash(new_card_num); 1.146 + assert(0 <= bucket && bucket < _n_card_counts, "Bounds"); 1.147 + 1.148 + CardCountCacheEntry* count_ptr = &_card_counts[bucket]; 1.149 + CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket]; 1.150 + 1.151 + // We have to construct a new entry if we haven't updated the counts 1.152 + // during the current period, or if the count was updated for a 1.153 + // different card number. 1.154 + unsigned int new_epoch = (unsigned int) _n_periods; 1.155 + julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch); 1.156 + 1.157 + while (true) { 1.158 + // Fetch the previous epoch value 1.159 + julong prev_epoch_entry = epoch_ptr->_value; 1.160 + julong cas_res; 1.161 + 1.162 + if (extract_epoch(prev_epoch_entry) != new_epoch) { 1.163 + // This entry has not yet been updated during this period. 1.164 + // Note: we update the epoch value atomically to ensure 1.165 + // that there is only one winner that updates the cached 1.166 + // card_ptr value even though all the refine threads share 1.167 + // the same epoch value. 1.168 + 1.169 + cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry, 1.170 + (volatile jlong*)&epoch_ptr->_value, 1.171 + (jlong) prev_epoch_entry); 1.172 + 1.173 + if (cas_res == prev_epoch_entry) { 1.174 + // We have successfully won the race to update the 1.175 + // epoch and card_num value. Make it look like the 1.176 + // count and eviction count were previously cleared. 
1.177 + count_ptr->_count = 1; 1.178 + count_ptr->_evict_count = 0; 1.179 + *count = 0; 1.180 + // We can defer the processing of card_ptr 1.181 + *defer = true; 1.182 + return card_ptr; 1.183 + } 1.184 + // We did not win the race to update the epoch field, so some other 1.185 + // thread must have done it. The value that gets returned by CAS 1.186 + // should be the new epoch value. 1.187 + assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch"); 1.188 + // We could 'continue' here or just re-read the previous epoch value 1.189 + prev_epoch_entry = epoch_ptr->_value; 1.190 + } 1.191 + 1.192 + // The epoch entry for card_ptr has been updated during this period. 1.193 + unsigned old_card_num = extract_card_num(prev_epoch_entry); 1.194 + 1.195 + // The card count that will be returned to caller 1.196 + *count = count_ptr->_count; 1.197 + 1.198 + // Are we updating the count for the same card? 1.199 + if (new_card_num == old_card_num) { 1.200 + // Same card - just update the count. We could have more than one 1.201 + // thread racing to update count for the current card. It should be 1.202 + // OK not to use a CAS as the only penalty should be some missed 1.203 + // increments of the count which delays identifying the card as "hot". 
1.204 + 1.205 + if (*count < max_jubyte) count_ptr->_count++; 1.206 + // We can defer the processing of card_ptr 1.207 + *defer = true; 1.208 + return card_ptr; 1.209 + } 1.210 + 1.211 + // Different card - evict old card info 1.212 + if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++; 1.213 + if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) { 1.214 + // Trigger a resize the next time we clear 1.215 + _expand_card_counts = true; 1.216 + } 1.217 + 1.218 + cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry, 1.219 + (volatile jlong*)&epoch_ptr->_value, 1.220 + (jlong) prev_epoch_entry); 1.221 + 1.222 + if (cas_res == prev_epoch_entry) { 1.223 + // We successfully updated the card num value in the epoch entry 1.224 + count_ptr->_count = 0; // initialize counter for new card num 1.225 + 1.226 + // Even though the region containing the card at old_card_num was not 1.227 + // in the young list when old_card_num was recorded in the epoch 1.228 + // cache it could have been added to the free list and subsequently 1.229 + // added to the young list in the intervening time. If the evicted 1.230 + // card is in a young region just return the card_ptr and the evicted 1.231 + // card will not be cleaned. See CR 6817995. 1.232 + 1.233 + jbyte* old_card_ptr = card_num_2_ptr(old_card_num); 1.234 + if (is_young_card(old_card_ptr)) { 1.235 + *count = 0; 1.236 + // We can defer the processing of card_ptr 1.237 + *defer = true; 1.238 + return card_ptr; 1.239 + } 1.240 + 1.241 + // We do not want to defer processing of card_ptr in this case 1.242 + // (we need to refine old_card_ptr and card_ptr) 1.243 + *defer = false; 1.244 + return old_card_ptr; 1.245 + } 1.246 + // Someone else beat us - try again. 
1.247 + } 1.248 +} 1.249 + 1.250 +jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) { 1.251 + int count; 1.252 + jbyte* cached_ptr = add_card_count(card_ptr, &count, defer); 1.253 + assert(cached_ptr != NULL, "bad cached card ptr"); 1.254 + assert(!is_young_card(cached_ptr), "shouldn't get a card in young region"); 1.255 + 1.256 + // The card pointer we obtained from card count cache is not hot 1.257 + // so do not store it in the cache; return it for immediate 1.258 + // refining. 1.259 if (count < G1ConcRSHotCardLimit) { 1.260 - _total_travs++; 1.261 - return card_ptr; 1.262 + return cached_ptr; 1.263 } 1.264 - // Otherwise, it's hot. 1.265 + 1.266 + // Otherwise, the pointer we got from the _card_counts is hot. 1.267 jbyte* res = NULL; 1.268 MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag); 1.269 if (_n_hot == _hot_cache_size) { 1.270 - _total_travs++; 1.271 res = _hot_cache[_hot_cache_idx]; 1.272 _n_hot--; 1.273 } 1.274 // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx. 1.275 - _hot_cache[_hot_cache_idx] = card_ptr; 1.276 + _hot_cache[_hot_cache_idx] = cached_ptr; 1.277 _hot_cache_idx++; 1.278 if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0; 1.279 _n_hot++; 1.280 + 1.281 + if (res != NULL) { 1.282 + // Even though the region containing res was not in the young list 1.283 + // when it was recorded in the hot cache it could have been added 1.284 + // to the free list and subsequently added to the young list in 1.285 + // the intervening time. If res is in a young region, return NULL 1.286 + // so that res is not cleaned. See CR 6817995. 
1.287 + 1.288 + if (is_young_card(res)) { 1.289 + res = NULL; 1.290 + } 1.291 + } 1.292 + 1.293 return res; 1.294 } 1.295 1.296 - 1.297 void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) { 1.298 assert(!use_cache(), "cache should be disabled"); 1.299 int start_idx; 1.300 @@ -186,114 +328,52 @@ 1.301 } 1.302 } 1.303 1.304 -void ConcurrentG1Refine::clear_and_record_card_counts() { 1.305 - if (G1ConcRSLogCacheSize == 0 && !G1ConcRSCountTraversals) return; 1.306 - _n_periods++; 1.307 - if (G1ConcRSCountTraversals) { 1.308 - for (size_t i = 0; i < _n_card_counts; i++) { 1.309 - unsigned char bucket = _card_counts[i]; 1.310 - _cur_card_count_histo[bucket]++; 1.311 - _card_counts[i] = 0; 1.312 +void ConcurrentG1Refine::expand_card_count_cache() { 1.313 + if (_n_card_counts < _max_n_card_counts) { 1.314 + int new_idx = _cache_size_index+1; 1.315 + int new_size = _cc_cache_sizes[new_idx]; 1.316 + if (new_size < 0) new_size = _max_n_card_counts; 1.317 + 1.318 + // Make sure we don't go bigger than we will ever need 1.319 + new_size = MIN2((unsigned) new_size, _max_n_card_counts); 1.320 + 1.321 + // Expand the card count and card epoch tables 1.322 + if (new_size > (int)_n_card_counts) { 1.323 + // We can just free and allocate a new array as we're 1.324 + // not interested in preserving the contents 1.325 + assert(_card_counts != NULL, "Logic!"); 1.326 + assert(_card_epochs != NULL, "Logic!"); 1.327 + FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts); 1.328 + FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs); 1.329 + _n_card_counts = new_size; 1.330 + _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts); 1.331 + _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts); 1.332 + _cache_size_index = new_idx; 1.333 } 1.334 - gclog_or_tty->print_cr("Card counts:"); 1.335 - for (int i = 0; i < 256; i++) { 1.336 - if (_cur_card_count_histo[i] > 0) { 1.337 - gclog_or_tty->print_cr(" %3d: %9d", i, _cur_card_count_histo[i]); 1.338 - 
_cum_card_count_histo[i] += _cur_card_count_histo[i]; 1.339 - _cur_card_count_histo[i] = 0; 1.340 - } 1.341 - } 1.342 - } else { 1.343 - assert(G1ConcRSLogCacheSize > 0, "Logic"); 1.344 - Copy::fill_to_words((HeapWord*)(&_card_counts[0]), 1.345 - _n_card_counts / HeapWordSize); 1.346 } 1.347 } 1.348 1.349 -void 1.350 -ConcurrentG1Refine:: 1.351 -print_card_count_histo_range(unsigned* histo, int from, int to, 1.352 - float& cum_card_pct, 1.353 - float& cum_travs_pct) { 1.354 - unsigned cards = 0; 1.355 - unsigned travs = 0; 1.356 - guarantee(to <= 256, "Precondition"); 1.357 - for (int i = from; i < to-1; i++) { 1.358 - cards += histo[i]; 1.359 - travs += histo[i] * i; 1.360 +void ConcurrentG1Refine::clear_and_record_card_counts() { 1.361 + if (G1ConcRSLogCacheSize == 0) return; 1.362 + 1.363 +#ifndef PRODUCT 1.364 + double start = os::elapsedTime(); 1.365 +#endif 1.366 + 1.367 + if (_expand_card_counts) { 1.368 + expand_card_count_cache(); 1.369 + _expand_card_counts = false; 1.370 + // Only need to clear the epochs. 1.371 + Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry)); 1.372 } 1.373 - if (to == 256) { 1.374 - unsigned histo_card_sum = 0; 1.375 - unsigned histo_trav_sum = 0; 1.376 - for (int i = 1; i < 255; i++) { 1.377 - histo_trav_sum += histo[i] * i; 1.378 - } 1.379 - cards += histo[255]; 1.380 - // correct traversals for the last one. 
1.381 - unsigned travs_255 = (unsigned) (_total_travs - histo_trav_sum); 1.382 - travs += travs_255; 1.383 1.384 - } else { 1.385 - cards += histo[to-1]; 1.386 - travs += histo[to-1] * (to-1); 1.387 - } 1.388 - float fperiods = (float)_n_periods; 1.389 - float f_tot_cards = (float)_total_cards/fperiods; 1.390 - float f_tot_travs = (float)_total_travs/fperiods; 1.391 - if (cards > 0) { 1.392 - float fcards = (float)cards/fperiods; 1.393 - float ftravs = (float)travs/fperiods; 1.394 - if (to == 256) { 1.395 - gclog_or_tty->print(" %4d- %10.2f%10.2f", from, fcards, ftravs); 1.396 - } else { 1.397 - gclog_or_tty->print(" %4d-%4d %10.2f%10.2f", from, to-1, fcards, ftravs); 1.398 - } 1.399 - float pct_cards = fcards*100.0/f_tot_cards; 1.400 - cum_card_pct += pct_cards; 1.401 - float pct_travs = ftravs*100.0/f_tot_travs; 1.402 - cum_travs_pct += pct_travs; 1.403 - gclog_or_tty->print_cr("%10.2f%10.2f%10.2f%10.2f", 1.404 - pct_cards, cum_card_pct, 1.405 - pct_travs, cum_travs_pct); 1.406 - } 1.407 + int this_epoch = (int) _n_periods; 1.408 + assert((this_epoch+1) <= max_jint, "too many periods"); 1.409 + // Update epoch 1.410 + _n_periods++; 1.411 + 1.412 +#ifndef PRODUCT 1.413 + double elapsed = os::elapsedTime() - start; 1.414 + _g1h->g1_policy()->record_cc_clear_time(elapsed * 1000.0); 1.415 +#endif 1.416 } 1.417 - 1.418 -void ConcurrentG1Refine::print_final_card_counts() { 1.419 - if (!G1ConcRSCountTraversals) return; 1.420 - 1.421 - gclog_or_tty->print_cr("Did %d total traversals of %d distinct cards.", 1.422 - _total_travs, _total_cards); 1.423 - float fperiods = (float)_n_periods; 1.424 - gclog_or_tty->print_cr(" This is an average of %8.2f traversals, %8.2f cards, " 1.425 - "per collection.", (float)_total_travs/fperiods, 1.426 - (float)_total_cards/fperiods); 1.427 - gclog_or_tty->print_cr(" This is an average of %8.2f traversals/distinct " 1.428 - "dirty card.\n", 1.429 - _total_cards > 0 ? 
1.430 - (float)_total_travs/(float)_total_cards : 0.0); 1.431 - 1.432 - 1.433 - gclog_or_tty->print_cr("Histogram:\n\n%10s %10s%10s%10s%10s%10s%10s", 1.434 - "range", "# cards", "# travs", "% cards", "(cum)", 1.435 - "% travs", "(cum)"); 1.436 - gclog_or_tty->print_cr("------------------------------------------------------------" 1.437 - "-------------"); 1.438 - float cum_cards_pct = 0.0; 1.439 - float cum_travs_pct = 0.0; 1.440 - for (int i = 1; i < 10; i++) { 1.441 - print_card_count_histo_range(_cum_card_count_histo, i, i+1, 1.442 - cum_cards_pct, cum_travs_pct); 1.443 - } 1.444 - for (int i = 10; i < 100; i += 10) { 1.445 - print_card_count_histo_range(_cum_card_count_histo, i, i+10, 1.446 - cum_cards_pct, cum_travs_pct); 1.447 - } 1.448 - print_card_count_histo_range(_cum_card_count_histo, 100, 150, 1.449 - cum_cards_pct, cum_travs_pct); 1.450 - print_card_count_histo_range(_cum_card_count_histo, 150, 200, 1.451 - cum_cards_pct, cum_travs_pct); 1.452 - print_card_count_histo_range(_cum_card_count_histo, 150, 255, 1.453 - cum_cards_pct, cum_travs_pct); 1.454 - print_card_count_histo_range(_cum_card_count_histo, 255, 256, 1.455 - cum_cards_pct, cum_travs_pct); 1.456 -}
2.1 --- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp Mon Aug 03 12:59:30 2009 -0700 2.2 +++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp Tue Aug 04 16:00:17 2009 -0700 2.3 @@ -29,18 +29,77 @@ 2.4 class ConcurrentG1Refine: public CHeapObj { 2.5 ConcurrentG1RefineThread** _threads; 2.6 int _n_threads; 2.7 + 2.8 // The cache for card refinement. 2.9 - bool _use_cache; 2.10 - bool _def_use_cache; 2.11 - size_t _n_periods; 2.12 - size_t _total_cards; 2.13 - size_t _total_travs; 2.14 + bool _use_cache; 2.15 + bool _def_use_cache; 2.16 2.17 - unsigned char* _card_counts; 2.18 - unsigned _n_card_counts; 2.19 - const jbyte* _ct_bot; 2.20 - unsigned* _cur_card_count_histo; 2.21 - unsigned* _cum_card_count_histo; 2.22 + size_t _n_periods; // Used as clearing epoch 2.23 + 2.24 + // An evicting cache of the number of times each card 2.25 + // is accessed. Reduces, but does not eliminate, the amount 2.26 + // of duplicated processing of dirty cards. 2.27 + 2.28 + enum SomePrivateConstants { 2.29 + epoch_bits = 32, 2.30 + card_num_shift = epoch_bits, 2.31 + epoch_mask = AllBits, 2.32 + card_num_mask = AllBits, 2.33 + 2.34 + // The initial cache size is approximately this fraction 2.35 + // of a maximal cache (i.e. 
the size needed for all cards 2.36 + // in the heap) 2.37 + InitialCacheFraction = 512 2.38 + }; 2.39 + 2.40 + const static julong card_num_mask_in_place = 2.41 + (julong) card_num_mask << card_num_shift; 2.42 + 2.43 + typedef struct { 2.44 + julong _value; // | card_num | epoch | 2.45 + } CardEpochCacheEntry; 2.46 + 2.47 + julong make_epoch_entry(unsigned int card_num, unsigned int epoch) { 2.48 + assert(0 <= card_num && card_num < _max_n_card_counts, "Bounds"); 2.49 + assert(0 <= epoch && epoch <= _n_periods, "must be"); 2.50 + 2.51 + return ((julong) card_num << card_num_shift) | epoch; 2.52 + } 2.53 + 2.54 + unsigned int extract_epoch(julong v) { 2.55 + return (v & epoch_mask); 2.56 + } 2.57 + 2.58 + unsigned int extract_card_num(julong v) { 2.59 + return (v & card_num_mask_in_place) >> card_num_shift; 2.60 + } 2.61 + 2.62 + typedef struct { 2.63 + unsigned char _count; 2.64 + unsigned char _evict_count; 2.65 + } CardCountCacheEntry; 2.66 + 2.67 + CardCountCacheEntry* _card_counts; 2.68 + CardEpochCacheEntry* _card_epochs; 2.69 + 2.70 + // The current number of buckets in the card count cache 2.71 + unsigned _n_card_counts; 2.72 + 2.73 + // The max number of buckets required for the number of 2.74 + // cards for the entire reserved heap 2.75 + unsigned _max_n_card_counts; 2.76 + 2.77 + // Possible sizes of the cache: odd primes that roughly double in size. 2.78 + // (See jvmtiTagMap.cpp). 2.79 + static int _cc_cache_sizes[]; 2.80 + 2.81 + // The index in _cc_cache_sizes corresponding to the size of 2.82 + // _card_counts. 
2.83 + int _cache_size_index; 2.84 + 2.85 + bool _expand_card_counts; 2.86 + 2.87 + const jbyte* _ct_bot; 2.88 2.89 jbyte** _hot_cache; 2.90 int _hot_cache_size; 2.91 @@ -50,12 +109,37 @@ 2.92 int _hot_cache_par_chunk_size; 2.93 volatile int _hot_cache_par_claimed_idx; 2.94 2.95 + // Needed to workaround 6817995 2.96 + CardTableModRefBS* _ct_bs; 2.97 + G1CollectedHeap* _g1h; 2.98 + 2.99 + // Expands the array that holds the card counts to the next size up 2.100 + void expand_card_count_cache(); 2.101 + 2.102 + // hash a given key (index of card_ptr) with the specified size 2.103 + static unsigned int hash(size_t key, int size) { 2.104 + return (unsigned int) key % size; 2.105 + } 2.106 + 2.107 + // hash a given key (index of card_ptr) 2.108 + unsigned int hash(size_t key) { 2.109 + return hash(key, _n_card_counts); 2.110 + } 2.111 + 2.112 + unsigned ptr_2_card_num(jbyte* card_ptr) { 2.113 + return (unsigned) (card_ptr - _ct_bot); 2.114 + } 2.115 + 2.116 + jbyte* card_num_2_ptr(unsigned card_num) { 2.117 + return (jbyte*) (_ct_bot + card_num); 2.118 + } 2.119 + 2.120 // Returns the count of this card after incrementing it. 2.121 - int add_card_count(jbyte* card_ptr); 2.122 + jbyte* add_card_count(jbyte* card_ptr, int* count, bool* defer); 2.123 2.124 - void print_card_count_histo_range(unsigned* histo, int from, int to, 2.125 - float& cum_card_pct, 2.126 - float& cum_travs_pct); 2.127 + // Returns true if this card is in a young region 2.128 + bool is_young_card(jbyte* card_ptr); 2.129 + 2.130 public: 2.131 ConcurrentG1Refine(); 2.132 ~ConcurrentG1Refine(); 2.133 @@ -69,7 +153,7 @@ 2.134 // If this is the first entry for the slot, writes into the cache and 2.135 // returns NULL. If it causes an eviction, returns the evicted pointer. 2.136 // Otherwise, its a cache hit, and returns NULL. 2.137 - jbyte* cache_insert(jbyte* card_ptr); 2.138 + jbyte* cache_insert(jbyte* card_ptr, bool* defer); 2.139 2.140 // Process the cached entries. 
2.141 void clean_up_cache(int worker_i, G1RemSet* g1rs); 2.142 @@ -93,7 +177,6 @@ 2.143 } 2.144 2.145 void clear_and_record_card_counts(); 2.146 - void print_final_card_counts(); 2.147 2.148 static size_t thread_num(); 2.149 };
3.1 --- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Mon Aug 03 12:59:30 2009 -0700 3.2 +++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Tue Aug 04 16:00:17 2009 -0700 3.3 @@ -2414,8 +2414,6 @@ 3.4 } 3.5 3.6 void G1CollectedHeap::print_tracing_info() const { 3.7 - concurrent_g1_refine()->print_final_card_counts(); 3.8 - 3.9 // We'll overload this to mean "trace GC pause statistics." 3.10 if (TraceGen0Time || TraceGen1Time) { 3.11 // The "G1CollectorPolicy" is keeping track of these stats, so delegate
4.1 --- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp Mon Aug 03 12:59:30 2009 -0700 4.2 +++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp Tue Aug 04 16:00:17 2009 -0700 4.3 @@ -94,7 +94,14 @@ 4.4 _summary(new Summary()), 4.5 _abandoned_summary(new AbandonedSummary()), 4.6 4.7 +#ifndef PRODUCT 4.8 _cur_clear_ct_time_ms(0.0), 4.9 + _min_clear_cc_time_ms(-1.0), 4.10 + _max_clear_cc_time_ms(-1.0), 4.11 + _cur_clear_cc_time_ms(0.0), 4.12 + _cum_clear_cc_time_ms(0.0), 4.13 + _num_cc_clears(0L), 4.14 +#endif 4.15 4.16 _region_num_young(0), 4.17 _region_num_tenured(0), 4.18 @@ -1648,6 +1655,15 @@ 4.19 print_stats(1, "Object Copying", obj_copy_time); 4.20 } 4.21 } 4.22 +#ifndef PRODUCT 4.23 + print_stats(1, "Cur Clear CC", _cur_clear_cc_time_ms); 4.24 + print_stats(1, "Cum Clear CC", _cum_clear_cc_time_ms); 4.25 + print_stats(1, "Min Clear CC", _min_clear_cc_time_ms); 4.26 + print_stats(1, "Max Clear CC", _max_clear_cc_time_ms); 4.27 + if (_num_cc_clears > 0) { 4.28 + print_stats(1, "Avg Clear CC", _cum_clear_cc_time_ms / ((double)_num_cc_clears)); 4.29 + } 4.30 +#endif 4.31 print_stats(1, "Other", other_time_ms); 4.32 for (int i = 0; i < _aux_num; ++i) { 4.33 if (_cur_aux_times_set[i]) {
5.1 --- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp Mon Aug 03 12:59:30 2009 -0700 5.2 +++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp Tue Aug 04 16:00:17 2009 -0700 5.3 @@ -112,7 +112,6 @@ 5.4 return 8*M; 5.5 } 5.6 5.7 - 5.8 double _cur_collection_start_sec; 5.9 size_t _cur_collection_pause_used_at_start_bytes; 5.10 size_t _cur_collection_pause_used_regions_at_start; 5.11 @@ -122,6 +121,15 @@ 5.12 double _cur_clear_ct_time_ms; 5.13 bool _satb_drain_time_set; 5.14 5.15 +#ifndef PRODUCT 5.16 + // Card Table Count Cache stats 5.17 + double _min_clear_cc_time_ms; // min 5.18 + double _max_clear_cc_time_ms; // max 5.19 + double _cur_clear_cc_time_ms; // clearing time during current pause 5.20 + double _cum_clear_cc_time_ms; // cumulative clearing time 5.21 + jlong _num_cc_clears; // number of times the card count cache has been cleared 5.22 +#endif 5.23 + 5.24 double _cur_CH_strong_roots_end_sec; 5.25 double _cur_CH_strong_roots_dur_ms; 5.26 double _cur_G1_strong_roots_end_sec; 5.27 @@ -931,6 +939,18 @@ 5.28 _cur_aux_times_ms[i] += ms; 5.29 } 5.30 5.31 +#ifndef PRODUCT 5.32 + void record_cc_clear_time(double ms) { 5.33 + if (_min_clear_cc_time_ms < 0.0 || ms <= _min_clear_cc_time_ms) 5.34 + _min_clear_cc_time_ms = ms; 5.35 + if (_max_clear_cc_time_ms < 0.0 || ms >= _max_clear_cc_time_ms) 5.36 + _max_clear_cc_time_ms = ms; 5.37 + _cur_clear_cc_time_ms = ms; 5.38 + _cum_clear_cc_time_ms += ms; 5.39 + _num_cc_clears++; 5.40 + } 5.41 +#endif 5.42 + 5.43 // Record the fact that "bytes" bytes allocated in a region. 5.44 void record_before_bytes(size_t bytes); 5.45 void record_after_bytes(size_t bytes);
6.1 --- a/src/share/vm/gc_implementation/g1/g1RemSet.cpp Mon Aug 03 12:59:30 2009 -0700 6.2 +++ b/src/share/vm/gc_implementation/g1/g1RemSet.cpp Tue Aug 04 16:00:17 2009 -0700 6.3 @@ -676,6 +676,55 @@ 6.4 6.5 static IntHistogram out_of_histo(50, 50); 6.6 6.7 +void HRInto_G1RemSet::concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i) { 6.8 + // Construct the region representing the card. 6.9 + HeapWord* start = _ct_bs->addr_for(card_ptr); 6.10 + // And find the region containing it. 6.11 + HeapRegion* r = _g1->heap_region_containing(start); 6.12 + assert(r != NULL, "unexpected null"); 6.13 + 6.14 + HeapWord* end = _ct_bs->addr_for(card_ptr + 1); 6.15 + MemRegion dirtyRegion(start, end); 6.16 + 6.17 +#if CARD_REPEAT_HISTO 6.18 + init_ct_freq_table(_g1->g1_reserved_obj_bytes()); 6.19 + ct_freq_note_card(_ct_bs->index_for(start)); 6.20 +#endif 6.21 + 6.22 + UpdateRSOopClosure update_rs_oop_cl(this, worker_i); 6.23 + update_rs_oop_cl.set_from(r); 6.24 + FilterOutOfRegionClosure filter_then_update_rs_oop_cl(r, &update_rs_oop_cl); 6.25 + 6.26 + // Undirty the card. 6.27 + *card_ptr = CardTableModRefBS::clean_card_val(); 6.28 + // We must complete this write before we do any of the reads below. 6.29 + OrderAccess::storeload(); 6.30 + // And process it, being careful of unallocated portions of TLAB's. 6.31 + HeapWord* stop_point = 6.32 + r->oops_on_card_seq_iterate_careful(dirtyRegion, 6.33 + &filter_then_update_rs_oop_cl); 6.34 + // If stop_point is non-null, then we encountered an unallocated region 6.35 + // (perhaps the unfilled portion of a TLAB.) For now, we'll dirty the 6.36 + // card and re-enqueue: if we put off the card until a GC pause, then the 6.37 + // unallocated portion will be filled in. Alternatively, we might try 6.38 + // the full complexity of the technique used in "regular" precleaning. 6.39 + if (stop_point != NULL) { 6.40 + // The card might have gotten re-dirtied and re-enqueued while we 6.41 + // worked. (In fact, it's pretty likely.) 
6.42 + if (*card_ptr != CardTableModRefBS::dirty_card_val()) { 6.43 + *card_ptr = CardTableModRefBS::dirty_card_val(); 6.44 + MutexLockerEx x(Shared_DirtyCardQ_lock, 6.45 + Mutex::_no_safepoint_check_flag); 6.46 + DirtyCardQueue* sdcq = 6.47 + JavaThread::dirty_card_queue_set().shared_dirty_card_queue(); 6.48 + sdcq->enqueue(card_ptr); 6.49 + } 6.50 + } else { 6.51 + out_of_histo.add_entry(filter_then_update_rs_oop_cl.out_of_region()); 6.52 + _conc_refine_cards++; 6.53 + } 6.54 +} 6.55 + 6.56 void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) { 6.57 // If the card is no longer dirty, nothing to do. 6.58 if (*card_ptr != CardTableModRefBS::dirty_card_val()) return; 6.59 @@ -716,61 +765,63 @@ 6.60 return; 6.61 } 6.62 6.63 - // Should we defer it? 6.64 + // Should we defer processing the card? 6.65 + // 6.66 + // Previously the result from the cache_insert call would be 6.67 + // either card_ptr (implying that card_ptr was currently "cold"), 6.68 + // null (meaning we had inserted the card ptr into the "hot" 6.69 + // cache, which had some headroom), or a "hot" card ptr 6.70 + // extracted from the "hot" cache. 6.71 + // 6.72 + // Now that the _card_counts cache in the ConcurrentG1Refine 6.73 + // instance is an evicting hash table, the result we get back 6.74 + // could be from evicting the card ptr in an already occupied 6.75 + // bucket (in which case we have replaced the card ptr in the 6.76 + // bucket with card_ptr and "defer" is set to false). To avoid 6.77 + // having a data structure (updates to which would need a lock) 6.78 + // to hold these unprocessed dirty cards, we need to immediately 6.79 + // process card_ptr. 
The actions needed to be taken on return 6.80 + // from cache_insert are summarized in the following table: 6.81 + // 6.82 + // res defer action 6.83 + // -------------------------------------------------------------- 6.84 + // null false card evicted from _card_counts & replaced with 6.85 + // card_ptr; evicted ptr added to hot cache. 6.86 + // No need to process res; immediately process card_ptr 6.87 + // 6.88 + // null true card not evicted from _card_counts; card_ptr added 6.89 + // to hot cache. 6.90 + // Nothing to do. 6.91 + // 6.92 + // non-null false card evicted from _card_counts & replaced with 6.93 + // card_ptr; evicted ptr is currently "cold" or 6.94 + // caused an eviction from the hot cache. 6.95 + // Immediately process res; process card_ptr. 6.96 + // 6.97 + // non-null true card not evicted from _card_counts; card_ptr is 6.98 + // currently cold, or caused an eviction from hot 6.99 + // cache. 6.100 + // Immediately process res; no need to process card_ptr. 6.101 + 6.102 + jbyte* res = card_ptr; 6.103 + bool defer = false; 6.104 if (_cg1r->use_cache()) { 6.105 - card_ptr = _cg1r->cache_insert(card_ptr); 6.106 - // If it was not an eviction, nothing to do. 6.107 - if (card_ptr == NULL) return; 6.108 - 6.109 - // OK, we have to reset the card start, region, etc. 6.110 - start = _ct_bs->addr_for(card_ptr); 6.111 - r = _g1->heap_region_containing(start); 6.112 - if (r == NULL) { 6.113 - guarantee(_g1->is_in_permanent(start), "Or else where?"); 6.114 - return; // Not in the G1 heap (might be in perm, for example.) 
6.115 + jbyte* res = _cg1r->cache_insert(card_ptr, &defer); 6.116 + if (res != NULL && (res != card_ptr || defer)) { 6.117 + start = _ct_bs->addr_for(res); 6.118 + r = _g1->heap_region_containing(start); 6.119 + if (r == NULL) { 6.120 + assert(_g1->is_in_permanent(start), "Or else where?"); 6.121 + } else { 6.122 + guarantee(!r->is_young(), "It was evicted in the current minor cycle."); 6.123 + // Process card pointer we get back from the hot card cache 6.124 + concurrentRefineOneCard_impl(res, worker_i); 6.125 + } 6.126 } 6.127 - guarantee(!r->is_young(), "It was evicted in the current minor cycle."); 6.128 } 6.129 6.130 - HeapWord* end = _ct_bs->addr_for(card_ptr + 1); 6.131 - MemRegion dirtyRegion(start, end); 6.132 - 6.133 -#if CARD_REPEAT_HISTO 6.134 - init_ct_freq_table(_g1->g1_reserved_obj_bytes()); 6.135 - ct_freq_note_card(_ct_bs->index_for(start)); 6.136 -#endif 6.137 - 6.138 - UpdateRSOopClosure update_rs_oop_cl(this, worker_i); 6.139 - update_rs_oop_cl.set_from(r); 6.140 - FilterOutOfRegionClosure filter_then_update_rs_oop_cl(r, &update_rs_oop_cl); 6.141 - 6.142 - // Undirty the card. 6.143 - *card_ptr = CardTableModRefBS::clean_card_val(); 6.144 - // We must complete this write before we do any of the reads below. 6.145 - OrderAccess::storeload(); 6.146 - // And process it, being careful of unallocated portions of TLAB's. 6.147 - HeapWord* stop_point = 6.148 - r->oops_on_card_seq_iterate_careful(dirtyRegion, 6.149 - &filter_then_update_rs_oop_cl); 6.150 - // If stop_point is non-null, then we encountered an unallocated region 6.151 - // (perhaps the unfilled portion of a TLAB.) For now, we'll dirty the 6.152 - // card and re-enqueue: if we put off the card until a GC pause, then the 6.153 - // unallocated portion will be filled in. Alternatively, we might try 6.154 - // the full complexity of the technique used in "regular" precleaning. 
6.155 - if (stop_point != NULL) { 6.156 - // The card might have gotten re-dirtied and re-enqueued while we 6.157 - // worked. (In fact, it's pretty likely.) 6.158 - if (*card_ptr != CardTableModRefBS::dirty_card_val()) { 6.159 - *card_ptr = CardTableModRefBS::dirty_card_val(); 6.160 - MutexLockerEx x(Shared_DirtyCardQ_lock, 6.161 - Mutex::_no_safepoint_check_flag); 6.162 - DirtyCardQueue* sdcq = 6.163 - JavaThread::dirty_card_queue_set().shared_dirty_card_queue(); 6.164 - sdcq->enqueue(card_ptr); 6.165 - } 6.166 - } else { 6.167 - out_of_histo.add_entry(filter_then_update_rs_oop_cl.out_of_region()); 6.168 - _conc_refine_cards++; 6.169 + if (!defer) { 6.170 + concurrentRefineOneCard_impl(card_ptr, worker_i); 6.171 } 6.172 } 6.173
7.1 --- a/src/share/vm/gc_implementation/g1/g1RemSet.hpp Mon Aug 03 12:59:30 2009 -0700 7.2 +++ b/src/share/vm/gc_implementation/g1/g1RemSet.hpp Tue Aug 04 16:00:17 2009 -0700 7.3 @@ -157,6 +157,10 @@ 7.4 } 7.5 } 7.6 7.7 + // The routine that performs the actual work of refining a dirty 7.8 + // card. 7.9 + void concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i); 7.10 + 7.11 protected: 7.12 template <class T> void write_ref_nv(HeapRegion* from, T* p); 7.13 template <class T> void par_write_ref_nv(HeapRegion* from, T* p, int tid);
8.1 --- a/src/share/vm/gc_implementation/g1/g1_globals.hpp Mon Aug 03 12:59:30 2009 -0700 8.2 +++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp Tue Aug 04 16:00:17 2009 -0700 8.3 @@ -187,10 +187,6 @@ 8.4 develop(intx, G1ConcRSLogCacheSize, 10, \ 8.5 "Log base 2 of the length of conc RS hot-card cache.") \ 8.6 \ 8.7 - develop(bool, G1ConcRSCountTraversals, false, \ 8.8 - "If true, gather data about the number of times CR traverses " \ 8.9 - "cards ") \ 8.10 - \ 8.11 develop(intx, G1ConcRSHotCardLimit, 4, \ 8.12 "The threshold that defines (>=) a hot card.") \ 8.13 \ 8.14 @@ -264,6 +260,10 @@ 8.15 \ 8.16 product(uintx, G1ParallelRSetThreads, 0, \ 8.17 "If non-0 is the number of parallel rem set update threads, " \ 8.18 - "otherwise the value is determined ergonomically.") 8.19 + "otherwise the value is determined ergonomically.") \ 8.20 + \ 8.21 + develop(intx, G1CardCountCacheExpandThreshold, 16, \ 8.22 + "Expand the card count cache if the number of collisions for " \ 8.23 + "a particular entry exceeds this value.") 8.24 8.25 G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)
9.1 --- a/src/share/vm/gc_implementation/includeDB_gc_g1 Mon Aug 03 12:59:30 2009 -0700 9.2 +++ b/src/share/vm/gc_implementation/includeDB_gc_g1 Tue Aug 04 16:00:17 2009 -0700 9.3 @@ -45,11 +45,14 @@ 9.4 concurrentG1Refine.cpp concurrentG1RefineThread.hpp 9.5 concurrentG1Refine.cpp copy.hpp 9.6 concurrentG1Refine.cpp g1CollectedHeap.inline.hpp 9.7 +concurrentG1Refine.cpp g1CollectorPolicy.hpp 9.8 concurrentG1Refine.cpp g1RemSet.hpp 9.9 concurrentG1Refine.cpp space.inline.hpp 9.10 +concurrentG1Refine.cpp heapRegionSeq.inline.hpp 9.11 9.12 concurrentG1Refine.hpp globalDefinitions.hpp 9.13 concurrentG1Refine.hpp allocation.hpp 9.14 +concurrentG1Refine.hpp cardTableModRefBS.hpp 9.15 concurrentG1Refine.hpp thread.hpp 9.16 9.17 concurrentG1RefineThread.cpp concurrentG1Refine.hpp