Mon, 26 Jan 2009 12:47:21 -0800
6786503: Overflow list performance can be improved
Summary: Avoid overflow list walk in CMS & ParNew when it is unnecessary. Fix a couple of correctness issues, including a C-heap leak, in ParNew at the intersection of promotion failure, work queue overflow and object array chunking. Add stress testing option and related assertion checking.
Reviewed-by: jmasa
1.1 --- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp Wed Jan 21 13:40:10 2009 -0800 1.2 +++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp Mon Jan 26 12:47:21 2009 -0800 1.3 @@ -8508,7 +8508,7 @@ 1.4 size_t i = num; 1.5 oop cur = _overflow_list; 1.6 const markOop proto = markOopDesc::prototype(); 1.7 - NOT_PRODUCT(size_t n = 0;) 1.8 + NOT_PRODUCT(ssize_t n = 0;) 1.9 for (oop next; i > 0 && cur != NULL; cur = next, i--) { 1.10 next = oop(cur->mark()); 1.11 cur->set_mark(proto); // until proven otherwise 1.12 @@ -8525,45 +8525,131 @@ 1.13 return !stack->isEmpty(); 1.14 } 1.15 1.16 -// Multi-threaded; use CAS to break off a prefix 1.17 +#define BUSY (oop(0x1aff1aff)) 1.18 +// (MT-safe) Get a prefix of at most "num" from the list. 1.19 +// The overflow list is chained through the mark word of 1.20 +// each object in the list. We fetch the entire list, 1.21 +// break off a prefix of the right size and return the 1.22 +// remainder. If other threads try to take objects from 1.23 +// the overflow list at that time, they will wait for 1.24 +// some time to see if data becomes available. If (and 1.25 +// only if) another thread places one or more object(s) 1.26 +// on the global list before we have returned the suffix 1.27 +// to the global list, we will walk down our local list 1.28 +// to find its end and append the global list to 1.29 +// our suffix before returning it. This suffix walk can 1.30 +// prove to be expensive (quadratic in the amount of traffic) 1.31 +// when there are many objects in the overflow list and 1.32 +// there is much producer-consumer contention on the list. 1.33 +// *NOTE*: The overflow list manipulation code here and 1.34 +// in ParNewGeneration:: are very similar in shape, 1.35 +// except that in the ParNew case we use the old (from/eden) 1.36 +// copy of the object to thread the list via its klass word. 
1.37 +// Because of the common code, if you make any changes in 1.38 +// the code below, please check the ParNew version to see if 1.39 +// similar changes might be needed. 1.40 +// CR 6797058 has been filed to consolidate the common code. 1.41 bool CMSCollector::par_take_from_overflow_list(size_t num, 1.42 OopTaskQueue* work_q) { 1.43 - assert(work_q->size() == 0, "That's the current policy"); 1.44 + assert(work_q->size() == 0, "First empty local work queue"); 1.45 assert(num < work_q->max_elems(), "Can't bite more than we can chew"); 1.46 if (_overflow_list == NULL) { 1.47 return false; 1.48 } 1.49 // Grab the entire list; we'll put back a suffix 1.50 - oop prefix = (oop)Atomic::xchg_ptr(NULL, &_overflow_list); 1.51 - if (prefix == NULL) { // someone grabbed it before we did ... 1.52 - // ... we could spin for a short while, but for now we don't 1.53 - return false; 1.54 - } 1.55 + oop prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list); 1.56 + Thread* tid = Thread::current(); 1.57 + size_t CMSOverflowSpinCount = (size_t)ParallelGCThreads; 1.58 + size_t sleep_time_millis = MAX2((size_t)1, num/100); 1.59 + // If the list is busy, we spin for a short while, 1.60 + // sleeping between attempts to get the list. 1.61 + for (size_t spin = 0; prefix == BUSY && spin < CMSOverflowSpinCount; spin++) { 1.62 + os::sleep(tid, sleep_time_millis, false); 1.63 + if (_overflow_list == NULL) { 1.64 + // Nothing left to take 1.65 + return false; 1.66 + } else if (_overflow_list != BUSY) { 1.67 + // Try and grab the prefix 1.68 + prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list); 1.69 + } 1.70 + } 1.71 + // If the list was found to be empty, or we spun long 1.72 + // enough, we give up and return empty-handed. If we leave 1.73 + // the list in the BUSY state below, it must be the case that 1.74 + // some other thread holds the overflow list and will set it 1.75 + // to a non-BUSY state in the future. 
1.76 + if (prefix == NULL || prefix == BUSY) { 1.77 + // Nothing to take or waited long enough 1.78 + if (prefix == NULL) { 1.79 + // Write back the NULL in case we overwrote it with BUSY above 1.80 + // and it is still the same value. 1.81 + (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY); 1.82 + } 1.83 + return false; 1.84 + } 1.85 + assert(prefix != NULL && prefix != BUSY, "Error"); 1.86 size_t i = num; 1.87 oop cur = prefix; 1.88 + // Walk down the first "num" objects, unless we reach the end. 1.89 for (; i > 1 && cur->mark() != NULL; cur = oop(cur->mark()), i--); 1.90 - if (cur->mark() != NULL) { 1.91 + if (cur->mark() == NULL) { 1.92 + // We have "num" or fewer elements in the list, so there 1.93 + // is nothing to return to the global list. 1.94 + // Write back the NULL in lieu of the BUSY we wrote 1.95 + // above, if it is still the same value. 1.96 + if (_overflow_list == BUSY) { 1.97 + (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY); 1.98 + } 1.99 + } else { 1.100 + // Chop off the suffix and return it to the global list. 1.101 + assert(cur->mark() != BUSY, "Error"); 1.102 oop suffix_head = cur->mark(); // suffix will be put back on global list 1.103 cur->set_mark(NULL); // break off suffix 1.104 - // Find tail of suffix so we can prepend suffix to global list 1.105 - for (cur = suffix_head; cur->mark() != NULL; cur = (oop)(cur->mark())); 1.106 - oop suffix_tail = cur; 1.107 - assert(suffix_tail != NULL && suffix_tail->mark() == NULL, 1.108 - "Tautology"); 1.109 + // It's possible that the list is still in the empty(busy) state 1.110 + // we left it in a short while ago; in that case we may be 1.111 + // able to place back the suffix without incurring the cost 1.112 + // of a walk down the list. 
1.113 oop observed_overflow_list = _overflow_list; 1.114 - do { 1.115 - cur = observed_overflow_list; 1.116 - suffix_tail->set_mark(markOop(cur)); 1.117 + oop cur_overflow_list = observed_overflow_list; 1.118 + bool attached = false; 1.119 + while (observed_overflow_list == BUSY || observed_overflow_list == NULL) { 1.120 observed_overflow_list = 1.121 - (oop) Atomic::cmpxchg_ptr(suffix_head, &_overflow_list, cur); 1.122 - } while (cur != observed_overflow_list); 1.123 + (oop) Atomic::cmpxchg_ptr(suffix_head, &_overflow_list, cur_overflow_list); 1.124 + if (cur_overflow_list == observed_overflow_list) { 1.125 + attached = true; 1.126 + break; 1.127 + } else cur_overflow_list = observed_overflow_list; 1.128 + } 1.129 + if (!attached) { 1.130 + // Too bad, someone else sneaked in (at least) an element; we'll need 1.131 + // to do a splice. Find tail of suffix so we can prepend suffix to global 1.132 + // list. 1.133 + for (cur = suffix_head; cur->mark() != NULL; cur = (oop)(cur->mark())); 1.134 + oop suffix_tail = cur; 1.135 + assert(suffix_tail != NULL && suffix_tail->mark() == NULL, 1.136 + "Tautology"); 1.137 + observed_overflow_list = _overflow_list; 1.138 + do { 1.139 + cur_overflow_list = observed_overflow_list; 1.140 + if (cur_overflow_list != BUSY) { 1.141 + // Do the splice ... 1.142 + suffix_tail->set_mark(markOop(cur_overflow_list)); 1.143 + } else { // cur_overflow_list == BUSY 1.144 + suffix_tail->set_mark(NULL); 1.145 + } 1.146 + // ... and try to place spliced list back on overflow_list ... 1.147 + observed_overflow_list = 1.148 + (oop) Atomic::cmpxchg_ptr(suffix_head, &_overflow_list, cur_overflow_list); 1.149 + } while (cur_overflow_list != observed_overflow_list); 1.150 + // ... until we have succeeded in doing so. 
1.151 + } 1.152 } 1.153 1.154 // Push the prefix elements on work_q 1.155 assert(prefix != NULL, "control point invariant"); 1.156 const markOop proto = markOopDesc::prototype(); 1.157 oop next; 1.158 - NOT_PRODUCT(size_t n = 0;) 1.159 + NOT_PRODUCT(ssize_t n = 0;) 1.160 for (cur = prefix; cur != NULL; cur = next) { 1.161 next = oop(cur->mark()); 1.162 cur->set_mark(proto); // until proven otherwise 1.163 @@ -8597,11 +8683,16 @@ 1.164 oop cur_overflow_list; 1.165 do { 1.166 cur_overflow_list = observed_overflow_list; 1.167 - p->set_mark(markOop(cur_overflow_list)); 1.168 + if (cur_overflow_list != BUSY) { 1.169 + p->set_mark(markOop(cur_overflow_list)); 1.170 + } else { 1.171 + p->set_mark(NULL); 1.172 + } 1.173 observed_overflow_list = 1.174 (oop) Atomic::cmpxchg_ptr(p, &_overflow_list, cur_overflow_list); 1.175 } while (cur_overflow_list != observed_overflow_list); 1.176 } 1.177 +#undef BUSY 1.178 1.179 // Single threaded 1.180 // General Note on GrowableArray: pushes may silently fail 1.181 @@ -8610,7 +8701,7 @@ 1.182 // a lot of code in the JVM. The prudent thing for GrowableArray 1.183 // to do (for now) is to exit with an error. However, that may 1.184 // be too draconian in some cases because the caller may be 1.185 -// able to recover without much harm. For suych cases, we 1.186 +// able to recover without much harm. For such cases, we 1.187 // should probably introduce a "soft_push" method which returns 1.188 // an indication of success or failure with the assumption that 1.189 // the caller may be able to recover from a failure; code in 1.190 @@ -8618,8 +8709,6 @@ 1.191 // failures where possible, thus, incrementally hardening the VM 1.192 // in such low resource situations. 1.193 void CMSCollector::preserve_mark_work(oop p, markOop m) { 1.194 - int PreserveMarkStackSize = 128; 1.195 - 1.196 if (_preserved_oop_stack == NULL) { 1.197 assert(_preserved_mark_stack == NULL, 1.198 "bijection with preserved_oop_stack");
2.1 --- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp Wed Jan 21 13:40:10 2009 -0800 2.2 +++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp Mon Jan 26 12:47:21 2009 -0800 2.3 @@ -595,7 +595,7 @@ 2.4 size_t _ser_kac_preclean_ovflw; 2.5 size_t _ser_kac_ovflw; 2.6 size_t _par_kac_ovflw; 2.7 - NOT_PRODUCT(size_t _num_par_pushes;) 2.8 + NOT_PRODUCT(ssize_t _num_par_pushes;) 2.9 2.10 // ("Weak") Reference processing support 2.11 ReferenceProcessor* _ref_processor;
3.1 --- a/src/share/vm/gc_implementation/includeDB_gc_parNew Wed Jan 21 13:40:10 2009 -0800 3.2 +++ b/src/share/vm/gc_implementation/includeDB_gc_parNew Mon Jan 26 12:47:21 2009 -0800 3.3 @@ -77,6 +77,7 @@ 3.4 parNewGeneration.cpp sharedHeap.hpp 3.5 parNewGeneration.cpp space.hpp 3.6 parNewGeneration.cpp spaceDecorator.hpp 3.7 +parNewGeneration.cpp thread.hpp 3.8 parNewGeneration.cpp workgroup.hpp 3.9 3.10 parNewGeneration.hpp defNewGeneration.hpp
4.1 --- a/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp Wed Jan 21 13:40:10 2009 -0800 4.2 +++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp Mon Jan 26 12:47:21 2009 -0800 4.3 @@ -404,6 +404,8 @@ 4.4 if (terminator()->offer_termination()) break; 4.5 par_scan_state()->end_term_time(); 4.6 } 4.7 + assert(par_gen()->_overflow_list == NULL && par_gen()->_num_par_pushes == 0, 4.8 + "Broken overflow list?"); 4.9 // Finish the last termination pause. 4.10 par_scan_state()->end_term_time(); 4.11 } 4.12 @@ -456,6 +458,8 @@ 4.13 _is_alive_closure(this), 4.14 _plab_stats(YoungPLABSize, PLABWeight) 4.15 { 4.16 + NOT_PRODUCT(_overflow_counter = ParGCWorkQueueOverflowInterval;) 4.17 + NOT_PRODUCT(_num_par_pushes = 0;) 4.18 _task_queues = new ObjToScanQueueSet(ParallelGCThreads); 4.19 guarantee(_task_queues != NULL, "task_queues allocation failure."); 4.20 4.21 @@ -993,12 +997,19 @@ 4.22 "push forwarded object"); 4.23 } 4.24 // Push it on one of the queues of to-be-scanned objects. 4.25 - if (!par_scan_state->work_queue()->push(obj_to_push)) { 4.26 + bool simulate_overflow = false; 4.27 + NOT_PRODUCT( 4.28 + if (ParGCWorkQueueOverflowALot && should_simulate_overflow()) { 4.29 + // simulate a stack overflow 4.30 + simulate_overflow = true; 4.31 + } 4.32 + ) 4.33 + if (simulate_overflow || !par_scan_state->work_queue()->push(obj_to_push)) { 4.34 // Add stats for overflow pushes. 4.35 if (Verbose && PrintGCDetails) { 4.36 gclog_or_tty->print("queue overflow!\n"); 4.37 } 4.38 - push_on_overflow_list(old); 4.39 + push_on_overflow_list(old, par_scan_state); 4.40 par_scan_state->note_overflow_push(); 4.41 } 4.42 par_scan_state->note_push(); 4.43 @@ -1110,9 +1121,16 @@ 4.44 "push forwarded object"); 4.45 } 4.46 // Push it on one of the queues of to-be-scanned objects. 
4.47 - if (!par_scan_state->work_queue()->push(obj_to_push)) { 4.48 + bool simulate_overflow = false; 4.49 + NOT_PRODUCT( 4.50 + if (ParGCWorkQueueOverflowALot && should_simulate_overflow()) { 4.51 + // simulate a stack overflow 4.52 + simulate_overflow = true; 4.53 + } 4.54 + ) 4.55 + if (simulate_overflow || !par_scan_state->work_queue()->push(obj_to_push)) { 4.56 // Add stats for overflow pushes. 4.57 - push_on_overflow_list(old); 4.58 + push_on_overflow_list(old, par_scan_state); 4.59 par_scan_state->note_overflow_push(); 4.60 } 4.61 par_scan_state->note_push(); 4.62 @@ -1135,89 +1153,190 @@ 4.63 return forward_ptr; 4.64 } 4.65 4.66 -void ParNewGeneration::push_on_overflow_list(oop from_space_obj) { 4.67 - oop cur_overflow_list = _overflow_list; 4.68 +#ifndef PRODUCT 4.69 +// It's OK to call this multi-threaded; the worst thing 4.70 +// that can happen is that we'll get a bunch of closely 4.71 +// spaced simulated overflows, but that's OK, in fact 4.72 +// probably good as it would exercise the overflow code 4.73 +// under contention. 4.74 +bool ParNewGeneration::should_simulate_overflow() { 4.75 + if (_overflow_counter-- <= 0) { // just being defensive 4.76 + _overflow_counter = ParGCWorkQueueOverflowInterval; 4.77 + return true; 4.78 + } else { 4.79 + return false; 4.80 + } 4.81 +} 4.82 +#endif 4.83 + 4.84 +#define BUSY (oop(0x1aff1aff)) 4.85 +void ParNewGeneration::push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state) { 4.86 // if the object has been forwarded to itself, then we cannot 4.87 // use the klass pointer for the linked list. Instead we have 4.88 // to allocate an oopDesc in the C-Heap and use that for the linked list. 4.89 + // XXX This is horribly inefficient when a promotion failure occurs 4.90 + // and should be fixed. XXX FIX ME !!! 
4.91 +#ifndef PRODUCT 4.92 + Atomic::inc_ptr(&_num_par_pushes); 4.93 + assert(_num_par_pushes > 0, "Tautology"); 4.94 +#endif 4.95 if (from_space_obj->forwardee() == from_space_obj) { 4.96 oopDesc* listhead = NEW_C_HEAP_ARRAY(oopDesc, 1); 4.97 listhead->forward_to(from_space_obj); 4.98 from_space_obj = listhead; 4.99 } 4.100 - while (true) { 4.101 - from_space_obj->set_klass_to_list_ptr(cur_overflow_list); 4.102 - oop observed_overflow_list = 4.103 + oop observed_overflow_list = _overflow_list; 4.104 + oop cur_overflow_list; 4.105 + do { 4.106 + cur_overflow_list = observed_overflow_list; 4.107 + if (cur_overflow_list != BUSY) { 4.108 + from_space_obj->set_klass_to_list_ptr(cur_overflow_list); 4.109 + } else { 4.110 + from_space_obj->set_klass_to_list_ptr(NULL); 4.111 + } 4.112 + observed_overflow_list = 4.113 (oop)Atomic::cmpxchg_ptr(from_space_obj, &_overflow_list, cur_overflow_list); 4.114 - if (observed_overflow_list == cur_overflow_list) break; 4.115 - // Otherwise... 4.116 - cur_overflow_list = observed_overflow_list; 4.117 - } 4.118 + } while (cur_overflow_list != observed_overflow_list); 4.119 } 4.120 4.121 +// *NOTE*: The overflow list manipulation code here and 4.122 +// in CMSCollector:: are very similar in shape, 4.123 +// except that in the CMS case we thread the objects 4.124 +// directly into the list via their mark word, and do 4.125 +// not need to deal with special cases below related 4.126 +// to chunking of object arrays and promotion failure 4.127 +// handling. 4.128 +// CR 6797058 has been filed to attempt consolidation of 4.129 +// the common code. 4.130 +// Because of the common code, if you make any changes in 4.131 +// the code below, please check the CMS version to see if 4.132 +// similar changes might be needed. 4.133 +// See CMSCollector::par_take_from_overflow_list() for 4.134 +// more extensive documentation comments. 
4.135 bool 4.136 ParNewGeneration::take_from_overflow_list(ParScanThreadState* par_scan_state) { 4.137 ObjToScanQueue* work_q = par_scan_state->work_queue(); 4.138 + assert(work_q->size() == 0, "Should first empty local work queue"); 4.139 // How many to take? 4.140 - int objsFromOverflow = MIN2(work_q->max_elems()/4, 4.141 - (juint)ParGCDesiredObjsFromOverflowList); 4.142 + size_t objsFromOverflow = MIN2((size_t)work_q->max_elems()/4, 4.143 + (size_t)ParGCDesiredObjsFromOverflowList); 4.144 4.145 if (_overflow_list == NULL) return false; 4.146 4.147 // Otherwise, there was something there; try claiming the list. 4.148 - oop prefix = (oop)Atomic::xchg_ptr(NULL, &_overflow_list); 4.149 - 4.150 - if (prefix == NULL) { 4.151 - return false; 4.152 + oop prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list); 4.153 + // Trim off a prefix of at most objsFromOverflow items 4.154 + Thread* tid = Thread::current(); 4.155 + size_t spin_count = (size_t)ParallelGCThreads; 4.156 + size_t sleep_time_millis = MAX2((size_t)1, objsFromOverflow/100); 4.157 + for (size_t spin = 0; prefix == BUSY && spin < spin_count; spin++) { 4.158 + // someone grabbed it before we did ... 4.159 + // ... we spin for a short while... 4.160 + os::sleep(tid, sleep_time_millis, false); 4.161 + if (_overflow_list == NULL) { 4.162 + // nothing left to take 4.163 + return false; 4.164 + } else if (_overflow_list != BUSY) { 4.165 + // try and grab the prefix 4.166 + prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list); 4.167 + } 4.168 } 4.169 - // Trim off a prefix of at most objsFromOverflow items 4.170 - int i = 1; 4.171 + if (prefix == NULL || prefix == BUSY) { 4.172 + // Nothing to take or waited long enough 4.173 + if (prefix == NULL) { 4.174 + // Write back the NULL in case we overwrote it with BUSY above 4.175 + // and it is still the same value. 
4.176 + (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY); 4.177 + } 4.178 + return false; 4.179 + } 4.180 + assert(prefix != NULL && prefix != BUSY, "Error"); 4.181 + size_t i = 1; 4.182 oop cur = prefix; 4.183 while (i < objsFromOverflow && cur->klass_or_null() != NULL) { 4.184 i++; cur = oop(cur->klass()); 4.185 } 4.186 4.187 // Reattach remaining (suffix) to overflow list 4.188 - if (cur->klass_or_null() != NULL) { 4.189 - oop suffix = oop(cur->klass()); 4.190 - cur->set_klass_to_list_ptr(NULL); 4.191 - 4.192 - // Find last item of suffix list 4.193 - oop last = suffix; 4.194 - while (last->klass_or_null() != NULL) { 4.195 - last = oop(last->klass()); 4.196 + if (cur->klass_or_null() == NULL) { 4.197 + // Write back the NULL in lieu of the BUSY we wrote 4.198 + // above, if it is still the same value. 4.199 + if (_overflow_list == BUSY) { 4.200 + (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY); 4.201 } 4.202 - // Atomically prepend suffix to current overflow list 4.203 - oop cur_overflow_list = _overflow_list; 4.204 - while (true) { 4.205 - last->set_klass_to_list_ptr(cur_overflow_list); 4.206 - oop observed_overflow_list = 4.207 - (oop)Atomic::cmpxchg_ptr(suffix, &_overflow_list, cur_overflow_list); 4.208 - if (observed_overflow_list == cur_overflow_list) break; 4.209 - // Otherwise... 4.210 - cur_overflow_list = observed_overflow_list; 4.211 + } else { 4.212 + assert(cur->klass_or_null() != BUSY, "Error"); 4.213 + oop suffix = oop(cur->klass()); // suffix will be put back on global list 4.214 + cur->set_klass_to_list_ptr(NULL); // break off suffix 4.215 + // It's possible that the list is still in the empty(busy) state 4.216 + // we left it in a short while ago; in that case we may be 4.217 + // able to place back the suffix. 
4.218 + oop observed_overflow_list = _overflow_list; 4.219 + oop cur_overflow_list = observed_overflow_list; 4.220 + bool attached = false; 4.221 + while (observed_overflow_list == BUSY || observed_overflow_list == NULL) { 4.222 + observed_overflow_list = 4.223 + (oop) Atomic::cmpxchg_ptr(suffix, &_overflow_list, cur_overflow_list); 4.224 + if (cur_overflow_list == observed_overflow_list) { 4.225 + attached = true; 4.226 + break; 4.227 + } else cur_overflow_list = observed_overflow_list; 4.228 + } 4.229 + if (!attached) { 4.230 + // Too bad, someone else got in in between; we'll need to do a splice. 4.231 + // Find the last item of suffix list 4.232 + oop last = suffix; 4.233 + while (last->klass_or_null() != NULL) { 4.234 + last = oop(last->klass()); 4.235 + } 4.236 + // Atomically prepend suffix to current overflow list 4.237 + observed_overflow_list = _overflow_list; 4.238 + do { 4.239 + cur_overflow_list = observed_overflow_list; 4.240 + if (cur_overflow_list != BUSY) { 4.241 + // Do the splice ... 4.242 + last->set_klass_to_list_ptr(cur_overflow_list); 4.243 + } else { // cur_overflow_list == BUSY 4.244 + last->set_klass_to_list_ptr(NULL); 4.245 + } 4.246 + observed_overflow_list = 4.247 + (oop)Atomic::cmpxchg_ptr(suffix, &_overflow_list, cur_overflow_list); 4.248 + } while (cur_overflow_list != observed_overflow_list); 4.249 } 4.250 } 4.251 4.252 // Push objects on prefix list onto this thread's work queue 4.253 - assert(cur != NULL, "program logic"); 4.254 + assert(prefix != NULL && prefix != BUSY, "program logic"); 4.255 cur = prefix; 4.256 - int n = 0; 4.257 + ssize_t n = 0; 4.258 while (cur != NULL) { 4.259 oop obj_to_push = cur->forwardee(); 4.260 oop next = oop(cur->klass_or_null()); 4.261 cur->set_klass(obj_to_push->klass()); 4.262 - if (par_scan_state->should_be_partially_scanned(obj_to_push, cur)) { 4.263 + // This may be an array object that is self-forwarded. 
In that case, the list pointer 4.264 + // space, cur, is not in the Java heap, but rather in the C-heap and should be freed. 4.265 + if (!is_in_reserved(cur)) { 4.266 + // This can become a scaling bottleneck when there is work queue overflow coincident 4.267 + // with promotion failure. 4.268 + oopDesc* f = cur; 4.269 + FREE_C_HEAP_ARRAY(oopDesc, f); 4.270 + } else if (par_scan_state->should_be_partially_scanned(obj_to_push, cur)) { 4.271 + assert(arrayOop(cur)->length() == 0, "entire array remaining to be scanned"); 4.272 obj_to_push = cur; 4.273 - assert(arrayOop(cur)->length() == 0, "entire array remaining to be scanned"); 4.274 } 4.275 - work_q->push(obj_to_push); 4.276 + bool ok = work_q->push(obj_to_push); 4.277 + assert(ok, "Should have succeeded"); 4.278 cur = next; 4.279 n++; 4.280 } 4.281 par_scan_state->note_overflow_refill(n); 4.282 +#ifndef PRODUCT 4.283 + assert(_num_par_pushes >= n, "Too many pops?"); 4.284 + Atomic::add_ptr(-(intptr_t)n, &_num_par_pushes); 4.285 +#endif 4.286 return true; 4.287 } 4.288 +#undef BUSY 4.289 4.290 void ParNewGeneration::ref_processor_init() 4.291 {
5.1 --- a/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp Wed Jan 21 13:40:10 2009 -0800 5.2 +++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp Mon Jan 26 12:47:21 2009 -0800 5.3 @@ -278,6 +278,7 @@ 5.4 friend class ParNewRefProcTask; 5.5 friend class ParNewRefProcTaskExecutor; 5.6 friend class ParScanThreadStateSet; 5.7 + friend class ParEvacuateFollowersClosure; 5.8 5.9 private: 5.10 // XXX use a global constant instead of 64! 5.11 @@ -296,6 +297,7 @@ 5.12 // klass-pointers (klass information already copied to the forwarded 5.13 // image.) Manipulated with CAS. 5.14 oop _overflow_list; 5.15 + NOT_PRODUCT(ssize_t _num_par_pushes;) 5.16 5.17 // If true, older generation does not support promotion undo, so avoid. 5.18 static bool _avoid_promotion_undo; 5.19 @@ -372,8 +374,12 @@ 5.20 oop copy_to_survivor_space_with_undo(ParScanThreadState* par_scan_state, 5.21 oop obj, size_t obj_sz, markOop m); 5.22 5.23 + // in support of testing overflow code 5.24 + NOT_PRODUCT(int _overflow_counter;) 5.25 + NOT_PRODUCT(bool should_simulate_overflow();) 5.26 + 5.27 // Push the given (from-space) object on the global overflow list. 5.28 - void push_on_overflow_list(oop from_space_obj); 5.29 + void push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state); 5.30 5.31 // If the global overflow list is non-empty, move some tasks from it 5.32 // onto "work_q" (which must be empty). No more than 1/4 of the
6.1 --- a/src/share/vm/memory/referenceProcessor.cpp Wed Jan 21 13:40:10 2009 -0800 6.2 +++ b/src/share/vm/memory/referenceProcessor.cpp Mon Jan 26 12:47:21 2009 -0800 6.3 @@ -721,12 +721,6 @@ 6.4 iter.obj(), iter.obj()->blueprint()->internal_name()); 6.5 } 6.6 assert(iter.obj()->is_oop(UseConcMarkSweepGC), "Adding a bad reference"); 6.7 - // If discovery is concurrent, we may have objects with null referents, 6.8 - // being those that were concurrently cleared after they were discovered 6.9 - // (and not subsequently precleaned). 6.10 - assert( (discovery_is_atomic() && iter.referent()->is_oop()) 6.11 - || (!discovery_is_atomic() && iter.referent()->is_oop_or_null(UseConcMarkSweepGC)), 6.12 - "Adding a bad referent"); 6.13 iter.next(); 6.14 } 6.15 // Remember to keep sentinel pointer around
7.1 --- a/src/share/vm/runtime/globals.hpp Wed Jan 21 13:40:10 2009 -0800 7.2 +++ b/src/share/vm/runtime/globals.hpp Mon Jan 26 12:47:21 2009 -0800 7.3 @@ -1307,7 +1307,14 @@ 7.4 product(intx, ParGCArrayScanChunk, 50, \ 7.5 "Scan a subset and push remainder, if array is bigger than this") \ 7.6 \ 7.7 - product(intx, ParGCDesiredObjsFromOverflowList, 20, \ 7.8 + notproduct(bool, ParGCWorkQueueOverflowALot, false, \ 7.9 + "Whether we should simulate work queue overflow in ParNew") \ 7.10 + \ 7.11 + notproduct(uintx, ParGCWorkQueueOverflowInterval, 1000, \ 7.12 + "An `interval' counter that determines how frequently" \ 7.13 + " we simulate overflow; a smaller number increases frequency") \ 7.14 + \ 7.15 + product(uintx, ParGCDesiredObjsFromOverflowList, 20, \ 7.16 "The desired number of objects to claim from the overflow list") \ 7.17 \ 7.18 product(uintx, CMSParPromoteBlocksToClaim, 50, \ 7.19 @@ -1429,8 +1436,8 @@ 7.20 "Whether we should simulate frequent marking stack / work queue" \ 7.21 " overflow") \ 7.22 \ 7.23 - notproduct(intx, CMSMarkStackOverflowInterval, 1000, \ 7.24 - "A per-thread `interval' counter that determines how frequently" \ 7.25 + notproduct(uintx, CMSMarkStackOverflowInterval, 1000, \ 7.26 + "An `interval' counter that determines how frequently" \ 7.27 " we simulate overflow; a smaller number increases frequency") \ 7.28 \ 7.29 product(uintx, CMSMaxAbortablePrecleanLoops, 0, \ 7.30 @@ -1648,7 +1655,7 @@ 7.31 develop(uintx, WorkStealingYieldsBeforeSleep, 1000, \ 7.32 "Number of yields before a sleep is done during workstealing") \ 7.33 \ 7.34 - product(uintx, PreserveMarkStackSize, 40, \ 7.35 + product(uintx, PreserveMarkStackSize, 1024, \ 7.36 "Size for stack used in promotion failure handling") \ 7.37 \ 7.38 product_pd(bool, UseTLAB, "Use thread-local object allocation") \