Tue, 21 Aug 2012 14:10:39 -0700
7185699: G1: Prediction model discrepancies
Summary: Correct the result value of G1CollectedHeap::pending_card_num(). Change the code that calculates the GC efficiency of a non-young heap region to use historical data from mixed GCs and the actual number of live bytes when predicting how long it would take to collect the region. Changes were also reviewed by Thomas Schatzl.
Reviewed-by: azeemj, brutisso
1 /*
2 * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
25 #include "precompiled.hpp"
26 #include "gc_implementation/g1/concurrentG1Refine.hpp"
27 #include "gc_implementation/g1/concurrentG1RefineThread.hpp"
28 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
29 #include "gc_implementation/g1/g1CollectorPolicy.hpp"
30 #include "gc_implementation/g1/g1GCPhaseTimes.hpp"
31 #include "gc_implementation/g1/g1RemSet.hpp"
32 #include "gc_implementation/g1/heapRegionSeq.inline.hpp"
33 #include "memory/space.inline.hpp"
34 #include "runtime/atomic.hpp"
35 #include "runtime/java.hpp"
36 #include "utilities/copy.hpp"
38 // Possible sizes for the card counts cache: odd primes that roughly double in size.
39 // (See jvmtiTagMap.cpp).
41 #define MAX_SIZE ((size_t) -1)
// Candidate sizes for the card count cache hash table: odd primes that
// roughly double from one entry to the next (same scheme as
// jvmtiTagMap.cpp). The trailing MAX_SIZE sentinel marks the point
// beyond which the cache can grow no further.
size_t ConcurrentG1Refine::_cc_cache_sizes[] = {
    16381,    32771,    76831,    150001,     307261,
   614563,  1228891,  2457733,   4915219,    9830479,
 19660831, 39321619, 78643219, 157286461, MAX_SIZE
  };
// Construct the refinement manager: establish the green/yellow/red
// work-level zones (ergonomically when the flags are left at their
// defaults) and create the refinement threads. The card count / hot
// card caches are allocated later, in init().
ConcurrentG1Refine::ConcurrentG1Refine() :
  _card_counts(NULL), _card_epochs(NULL),
  _n_card_counts(0), _max_cards(0), _max_n_card_counts(0),
  _cache_size_index(0), _expand_card_counts(false),
  _hot_cache(NULL),
  _def_use_cache(false), _use_cache(false),
  // We initialize the epochs of the array to 0. By initializing
  // _n_periods to 1 and not 0 we automatically invalidate all the
  // entries on the array. Otherwise we might accidentally think that
  // we claimed a card that was in fact never set (see CR7033292).
  _n_periods(1),
  _threads(NULL), _n_threads(0)
{
  // Ergonomically select initial concurrent refinement parameters.
  if (FLAG_IS_DEFAULT(G1ConcRefinementGreenZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementGreenZone, MAX2<int>(ParallelGCThreads, 1));
  }
  set_green_zone(G1ConcRefinementGreenZone);

  if (FLAG_IS_DEFAULT(G1ConcRefinementYellowZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementYellowZone, green_zone() * 3);
  }
  // The yellow zone must never drop below the green zone.
  set_yellow_zone(MAX2<int>(G1ConcRefinementYellowZone, green_zone()));

  if (FLAG_IS_DEFAULT(G1ConcRefinementRedZone)) {
    FLAG_SET_DEFAULT(G1ConcRefinementRedZone, yellow_zone() * 2);
  }
  // Likewise the red zone must never drop below the yellow zone.
  set_red_zone(MAX2<int>(G1ConcRefinementRedZone, yellow_zone()));
  _n_worker_threads = thread_num();
  // We need one extra thread to do the young gen rset size sampling.
  _n_threads = _n_worker_threads + 1;
  reset_threshold_step();

  _threads = NEW_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _n_threads, mtGC);
  int worker_id_offset = (int)DirtyCardQueueSet::num_par_ids();
  ConcurrentG1RefineThread *next = NULL;
  // Create the threads last-to-first so each one can be handed a link
  // to its successor in the chain.
  for (int i = _n_threads - 1; i >= 0; i--) {
    ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i);
    assert(t != NULL, "Conc refine should have been created");
    assert(t->cg1r() == this, "Conc refine thread should refer to this");
    _threads[i] = t;
    next = t;
  }
}
95 void ConcurrentG1Refine::reset_threshold_step() {
96 if (FLAG_IS_DEFAULT(G1ConcRefinementThresholdStep)) {
97 _thread_threshold_step = (yellow_zone() - green_zone()) / (worker_thread_num() + 1);
98 } else {
99 _thread_threshold_step = G1ConcRefinementThresholdStep;
100 }
101 }
103 int ConcurrentG1Refine::thread_num() {
104 return MAX2<int>((G1ConcRefinementThreads > 0) ? G1ConcRefinementThreads : ParallelGCThreads, 1);
105 }
// One-time set-up of the card count cache and hot card cache. Only
// active when G1ConcRSLogCacheSize > 0; otherwise refinement runs
// without either cache.
void ConcurrentG1Refine::init() {
  if (G1ConcRSLogCacheSize > 0) {
    _g1h = G1CollectedHeap::heap();

    // Upper bounds: one card per card-sized chunk of the maximum heap,
    // and a cache capped at G1MaxHotCardCountSizePercent of that.
    _max_cards = _g1h->max_capacity() >> CardTableModRefBS::card_shift;
    _max_n_card_counts = _max_cards * G1MaxHotCardCountSizePercent / 100;

    // Card numbers must fit in an unsigned minus one bit (they are
    // packed into the epoch entry word) - verify the representation.
    size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1;
    guarantee(_max_cards < max_card_num, "card_num representation");

    // We need _n_card_counts to be less than _max_n_card_counts here
    // so that the expansion call (below) actually allocates the
    // _counts and _epochs arrays.
    assert(_n_card_counts == 0, "pre-condition");
    assert(_max_n_card_counts > 0, "pre-condition");

    // Find the index into cache size array that is of a size that's
    // large enough to hold desired_sz.
    size_t desired_sz = _max_cards / InitialCacheFraction;
    int desired_sz_index = 0;
    while (_cc_cache_sizes[desired_sz_index] < desired_sz) {
      desired_sz_index += 1;
      assert(desired_sz_index < MAX_CC_CACHE_INDEX, "invariant");
    }
    assert(desired_sz_index < MAX_CC_CACHE_INDEX, "invariant");

    // If the desired_sz value is between two sizes then
    // _cc_cache_sizes[desired_sz_index-1] < desired_sz <= _cc_cache_sizes[desired_sz_index]
    // we will start with the lower size in the optimistic expectation that
    // we will not need to expand up. Note desired_sz_index could also be 0.
    if (desired_sz_index > 0 &&
        _cc_cache_sizes[desired_sz_index] > desired_sz) {
      desired_sz_index -= 1;
    }

    if (!expand_card_count_cache(desired_sz_index)) {
      // Allocation was unsuccessful - exit
      vm_exit_during_initialization("Could not reserve enough space for card count cache");
    }
    assert(_n_card_counts > 0, "post-condition");
    assert(_cache_size_index == desired_sz_index, "post-condition");

    // Zero-fill the freshly allocated arrays (the fill value defaults
    // to 0 in Copy::fill_to_bytes).
    Copy::fill_to_bytes(&_card_counts[0],
                        _n_card_counts * sizeof(CardCountCacheEntry));
    Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));

    ModRefBarrierSet* bs = _g1h->mr_bs();
    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
    _ct_bs = (CardTableModRefBS*)bs;
    // Card-table byte for the bottom of the reserved heap - used to
    // translate between card pointers and card numbers.
    _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start());

    _def_use_cache = true;
    _use_cache = true;
    _hot_cache_size = (1 << G1ConcRSLogCacheSize);
    _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size, mtGC);
    _n_hot = 0;
    _hot_cache_idx = 0;

    // For refining the cards in the hot cache in parallel
    int n_workers = (ParallelGCThreads > 0 ?
                        _g1h->workers()->total_workers() : 1);
    _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
    _hot_cache_par_claimed_idx = 0;
  }
}
173 void ConcurrentG1Refine::stop() {
174 if (_threads != NULL) {
175 for (int i = 0; i < _n_threads; i++) {
176 _threads[i]->stop();
177 }
178 }
179 }
181 void ConcurrentG1Refine::reinitialize_threads() {
182 reset_threshold_step();
183 if (_threads != NULL) {
184 for (int i = 0; i < _n_threads; i++) {
185 _threads[i]->initialize();
186 }
187 }
188 }
190 ConcurrentG1Refine::~ConcurrentG1Refine() {
191 if (G1ConcRSLogCacheSize > 0) {
192 // Please see the comment in allocate_card_count_cache
193 // for why we call os::malloc() and os::free() directly.
194 assert(_card_counts != NULL, "Logic");
195 os::free(_card_counts, mtGC);
196 assert(_card_epochs != NULL, "Logic");
197 os::free(_card_epochs, mtGC);
199 assert(_hot_cache != NULL, "Logic");
200 FREE_C_HEAP_ARRAY(jbyte*, _hot_cache, mtGC);
201 }
202 if (_threads != NULL) {
203 for (int i = 0; i < _n_threads; i++) {
204 delete _threads[i];
205 }
206 FREE_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _threads, mtGC);
207 }
208 }
210 void ConcurrentG1Refine::threads_do(ThreadClosure *tc) {
211 if (_threads != NULL) {
212 for (int i = 0; i < _n_threads; i++) {
213 tc->do_thread(_threads[i]);
214 }
215 }
216 }
218 bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) {
219 HeapWord* start = _ct_bs->addr_for(card_ptr);
220 HeapRegion* r = _g1h->heap_region_containing(start);
221 if (r != NULL && r->is_young()) {
222 return true;
223 }
224 // This card is not associated with a heap region
225 // so can't be young.
226 return false;
227 }
// Record card_ptr in the card count cache and return the card the
// caller should deal with, together with its observed count. Three
// outcomes are possible:
//  - the bucket was stale (epoch mismatch): claim it with a CAS,
//    reset the counts, and return card_ptr with *defer = true;
//  - the bucket already tracks card_ptr: bump its count (racily, on
//    purpose) and return card_ptr with *defer = true;
//  - the bucket tracks a different card: evict it - the *evicted*
//    card is returned with *defer = false, meaning both it and
//    card_ptr need immediate refinement.
// Lock-free: the epoch word is the only field updated via CAS; count
// updates tolerate races (see comments below).
jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) {
  unsigned new_card_num = ptr_2_card_num(card_ptr);
  unsigned bucket = hash(new_card_num);
  assert(0 <= bucket && bucket < _n_card_counts, "Bounds");

  CardCountCacheEntry* count_ptr = &_card_counts[bucket];
  CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket];

  // We have to construct a new entry if we haven't updated the counts
  // during the current period, or if the count was updated for a
  // different card number.
  unsigned int new_epoch = (unsigned int) _n_periods;
  julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch);

  while (true) {
    // Fetch the previous epoch value
    julong prev_epoch_entry = epoch_ptr->_value;
    julong cas_res;

    if (extract_epoch(prev_epoch_entry) != new_epoch) {
      // This entry has not yet been updated during this period.
      // Note: we update the epoch value atomically to ensure
      // that there is only one winner that updates the cached
      // card_ptr value even though all the refine threads share
      // the same epoch value.

      cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
                                         (volatile jlong*)&epoch_ptr->_value,
                                         (jlong) prev_epoch_entry);

      if (cas_res == prev_epoch_entry) {
        // We have successfully won the race to update the
        // epoch and card_num value. Make it look like the
        // count and eviction count were previously cleared.
        count_ptr->_count = 1;
        count_ptr->_evict_count = 0;
        *count = 0;
        // We can defer the processing of card_ptr
        *defer = true;
        return card_ptr;
      }
      // We did not win the race to update the epoch field, so some other
      // thread must have done it. The value that gets returned by CAS
      // should be the new epoch value.
      assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch");
      // We could 'continue' here or just re-read the previous epoch value
      prev_epoch_entry = epoch_ptr->_value;
    }

    // The epoch entry for card_ptr has been updated during this period.
    unsigned old_card_num = extract_card_num(prev_epoch_entry);

    // The card count that will be returned to caller
    *count = count_ptr->_count;

    // Are we updating the count for the same card?
    if (new_card_num == old_card_num) {
      // Same card - just update the count. We could have more than one
      // thread racing to update count for the current card. It should be
      // OK not to use a CAS as the only penalty should be some missed
      // increments of the count which delays identifying the card as "hot".

      if (*count < max_jubyte) count_ptr->_count++;
      // We can defer the processing of card_ptr
      *defer = true;
      return card_ptr;
    }

    // Different card - evict old card info
    if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++;
    if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) {
      // Trigger a resize the next time we clear
      _expand_card_counts = true;
    }

    cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
                                       (volatile jlong*)&epoch_ptr->_value,
                                       (jlong) prev_epoch_entry);

    if (cas_res == prev_epoch_entry) {
      // We successfully updated the card num value in the epoch entry
      count_ptr->_count = 0; // initialize counter for new card num
      jbyte* old_card_ptr = card_num_2_ptr(old_card_num);

      // Even though the region containing the card at old_card_num was not
      // in the young list when old_card_num was recorded in the epoch
      // cache it could have been added to the free list and subsequently
      // added to the young list in the intervening time. See CR 6817995.
      // We do not deal with this case here - it will be handled in
      // HeapRegion::oops_on_card_seq_iterate_careful after it has been
      // determined that the region containing the card has been allocated
      // to, and it's safe to check the young type of the region.

      // We do not want to defer processing of card_ptr in this case
      // (we need to refine old_card_ptr and card_ptr)
      *defer = false;
      return old_card_ptr;
    }
    // Someone else beat us - try again.
  }
}
// Insert card_ptr into the two-level cache. Returns the card the
// caller should refine immediately, or NULL if nothing needs it:
//  - a "cold" card (count below G1ConcRSHotCardLimit) is returned
//    directly for refining;
//  - a "hot" card is stashed in the hot card cache; if that cache was
//    full, the displaced entry is returned instead.
// *defer is set by add_card_count (see its contract).
jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
  int count;
  jbyte* cached_ptr = add_card_count(card_ptr, &count, defer);
  assert(cached_ptr != NULL, "bad cached card ptr");

  // We've just inserted a card pointer into the card count cache
  // and got back the card that we just inserted or (evicted) the
  // previous contents of that count slot.

  // The card we got back could be in a young region. When the
  // returned card (if evicted) was originally inserted, we had
  // determined that its containing region was not young. However
  // it is possible for the region to be freed during a cleanup
  // pause, then reallocated and tagged as young which will result
  // in the returned card residing in a young region.
  //
  // We do not deal with this case here - the change from non-young
  // to young could be observed at any time - it will be handled in
  // HeapRegion::oops_on_card_seq_iterate_careful after it has been
  // determined that the region containing the card has been allocated
  // to.

  // The card pointer we obtained from card count cache is not hot
  // so do not store it in the cache; return it for immediate
  // refining.
  if (count < G1ConcRSHotCardLimit) {
    return cached_ptr;
  }

  // Otherwise, the pointer we got from the _card_counts cache is hot.
  jbyte* res = NULL;
  MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
  if (_n_hot == _hot_cache_size) {
    // Cache is full - evict the entry at the insertion point and hand
    // it back to the caller for immediate refinement.
    res = _hot_cache[_hot_cache_idx];
    _n_hot--;
  }
  // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
  _hot_cache[_hot_cache_idx] = cached_ptr;
  _hot_cache_idx++;
  if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0; // wrap around

  _n_hot++;

  // The card obtained from the hot card cache could be in a young
  // region. See above on how this can happen.

  return res;
}
// Drain the hot card cache (the cache must already be disabled; this
// runs at a safepoint). Workers claim fixed-size chunks of the cache
// via a CAS on _hot_cache_par_claimed_idx and refine every card in
// their claimed chunk. Cards found to reference the current
// collection set are enqueued on into_cset_dcq for later handling.
void ConcurrentG1Refine::clean_up_cache(int worker_i,
                                        G1RemSet* g1rs,
                                        DirtyCardQueue* into_cset_dcq) {
  assert(!use_cache(), "cache should be disabled");
  int start_idx;

  while ((start_idx = _hot_cache_par_claimed_idx) < _n_hot) { // read once
    int end_idx = start_idx + _hot_cache_par_chunk_size;

    if (start_idx ==
        Atomic::cmpxchg(end_idx, &_hot_cache_par_claimed_idx, start_idx)) {
      // The current worker has successfully claimed the chunk [start_idx..end_idx)
      end_idx = MIN2(end_idx, _n_hot);
      for (int i = start_idx; i < end_idx; i++) {
        jbyte* entry = _hot_cache[i];
        if (entry != NULL) {
          if (g1rs->concurrentRefineOneCard(entry, worker_i, true)) {
            // 'entry' contains references that point into the current
            // collection set. We need to record 'entry' in the DCQS
            // that's used for that purpose.
            //
            // The only time we care about recording cards that contain
            // references that point into the collection set is during
            // RSet updating while within an evacuation pause.
            // In this case worker_i should be the id of a GC worker thread
            assert(SafepointSynchronize::is_at_safepoint(), "not during an evacuation pause");
            assert(worker_i < (int) (ParallelGCThreads == 0 ? 1 : ParallelGCThreads), "incorrect worker id");
            into_cset_dcq->enqueue(entry);
          }
        }
      }
    }
  }
}
414 // The arrays used to hold the card counts and the epochs must have
415 // a 1:1 correspondence. Hence they are allocated and freed together
416 // Returns true if the allocations of both the counts and epochs
417 // were successful; false otherwise.
418 bool ConcurrentG1Refine::allocate_card_count_cache(size_t n,
419 CardCountCacheEntry** counts,
420 CardEpochCacheEntry** epochs) {
421 // We call the allocation/free routines directly for the counts
422 // and epochs arrays. The NEW_C_HEAP_ARRAY/FREE_C_HEAP_ARRAY
423 // macros call AllocateHeap and FreeHeap respectively.
424 // AllocateHeap will call vm_exit_out_of_memory in the event
425 // of an allocation failure and abort the JVM. With the
426 // _counts/epochs arrays we only need to abort the JVM if the
427 // initial allocation of these arrays fails.
428 //
429 // Additionally AllocateHeap/FreeHeap do some tracing of
430 // allocate/free calls so calling one without calling the
431 // other can cause inconsistencies in the tracing. So we
432 // call neither.
434 assert(*counts == NULL, "out param");
435 assert(*epochs == NULL, "out param");
437 size_t counts_size = n * sizeof(CardCountCacheEntry);
438 size_t epochs_size = n * sizeof(CardEpochCacheEntry);
440 *counts = (CardCountCacheEntry*) os::malloc(counts_size, mtGC);
441 if (*counts == NULL) {
442 // allocation was unsuccessful
443 return false;
444 }
446 *epochs = (CardEpochCacheEntry*) os::malloc(epochs_size, mtGC);
447 if (*epochs == NULL) {
448 // allocation was unsuccessful - free counts array
449 assert(*counts != NULL, "must be");
450 os::free(*counts, mtGC);
451 *counts = NULL;
452 return false;
453 }
455 // We successfully allocated both counts and epochs
456 return true;
457 }
459 // Returns true if the card counts/epochs cache was
460 // successfully expanded; false otherwise.
461 bool ConcurrentG1Refine::expand_card_count_cache(int cache_size_idx) {
462 // Can we expand the card count and epoch tables?
463 if (_n_card_counts < _max_n_card_counts) {
464 assert(cache_size_idx >= 0 && cache_size_idx < MAX_CC_CACHE_INDEX, "oob");
466 size_t cache_size = _cc_cache_sizes[cache_size_idx];
467 // Make sure we don't go bigger than we will ever need
468 cache_size = MIN2(cache_size, _max_n_card_counts);
470 // Should we expand the card count and card epoch tables?
471 if (cache_size > _n_card_counts) {
472 // We have been asked to allocate new, larger, arrays for
473 // the card counts and the epochs. Attempt the allocation
474 // of both before we free the existing arrays in case
475 // the allocation is unsuccessful...
476 CardCountCacheEntry* counts = NULL;
477 CardEpochCacheEntry* epochs = NULL;
479 if (allocate_card_count_cache(cache_size, &counts, &epochs)) {
480 // Allocation was successful.
481 // We can just free the old arrays; we're
482 // not interested in preserving the contents
483 if (_card_counts != NULL) os::free(_card_counts, mtGC);
484 if (_card_epochs != NULL) os::free(_card_epochs, mtGC);
486 // Cache the size of the arrays and the index that got us there.
487 _n_card_counts = cache_size;
488 _cache_size_index = cache_size_idx;
490 _card_counts = counts;
491 _card_epochs = epochs;
493 // We successfully allocated/expanded the caches.
494 return true;
495 }
496 }
497 }
499 // We did not successfully expand the caches.
500 return false;
501 }
503 void ConcurrentG1Refine::clear_and_record_card_counts() {
504 if (G1ConcRSLogCacheSize == 0) {
505 return;
506 }
508 double start = os::elapsedTime();
510 if (_expand_card_counts) {
511 int new_idx = _cache_size_index + 1;
513 if (expand_card_count_cache(new_idx)) {
514 // Allocation was successful and _n_card_counts has
515 // been updated to the new size. We only need to clear
516 // the epochs so we don't read a bogus epoch value
517 // when inserting a card into the hot card cache.
518 Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
519 }
520 _expand_card_counts = false;
521 }
523 int this_epoch = (int) _n_periods;
524 assert((this_epoch+1) <= max_jint, "to many periods");
525 // Update epoch
526 _n_periods++;
527 double cc_clear_time_ms = (os::elapsedTime() - start) * 1000;
528 _g1h->g1_policy()->phase_times()->record_cc_clear_time_ms(cc_clear_time_ms);
529 }
531 void ConcurrentG1Refine::print_worker_threads_on(outputStream* st) const {
532 for (int i = 0; i < _n_threads; ++i) {
533 _threads[i]->print_on(st);
534 st->cr();
535 }
536 }