Sat, 28 Mar 2009 15:47:29 -0700
6819891: ParNew: Fix work queue overflow code to deal correctly with +UseCompressedOops
Summary: When using compressed oops, rather than chaining the overflowed grey objects' pre-images through their klass words, we use GC-worker thread-local overflow stacks.
Reviewed-by: jcoomes, jmasa
1.1 --- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp Thu Mar 26 08:51:32 2009 -0700 1.2 +++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp Sat Mar 28 15:47:29 2009 -0700 1.3 @@ -3847,7 +3847,7 @@ 1.4 MutexLockerEx ml(ovflw_stk->par_lock(), 1.5 Mutex::_no_safepoint_check_flag); 1.6 // Grab up to 1/4 the size of the work queue 1.7 - size_t num = MIN2((size_t)work_q->max_elems()/4, 1.8 + size_t num = MIN2((size_t)(work_q->max_elems() - work_q->size())/4, 1.9 (size_t)ParGCDesiredObjsFromOverflowList); 1.10 num = MIN2(num, ovflw_stk->length()); 1.11 for (int i = (int) num; i > 0; i--) { 1.12 @@ -5204,13 +5204,12 @@ 1.13 NOT_PRODUCT(int num_steals = 0;) 1.14 oop obj_to_scan; 1.15 CMSBitMap* bm = &(_collector->_markBitMap); 1.16 - size_t num_from_overflow_list = 1.17 - MIN2((size_t)work_q->max_elems()/4, 1.18 - (size_t)ParGCDesiredObjsFromOverflowList); 1.19 1.20 while (true) { 1.21 // Completely finish any left over work from (an) earlier round(s) 1.22 cl->trim_queue(0); 1.23 + size_t num_from_overflow_list = MIN2((size_t)(work_q->max_elems() - work_q->size())/4, 1.24 + (size_t)ParGCDesiredObjsFromOverflowList); 1.25 // Now check if there's any work in the overflow list 1.26 if (_collector->par_take_from_overflow_list(num_from_overflow_list, 1.27 work_q)) { 1.28 @@ -5622,13 +5621,12 @@ 1.29 OopTaskQueue* work_q = work_queue(i); 1.30 NOT_PRODUCT(int num_steals = 0;) 1.31 oop obj_to_scan; 1.32 - size_t num_from_overflow_list = 1.33 - MIN2((size_t)work_q->max_elems()/4, 1.34 - (size_t)ParGCDesiredObjsFromOverflowList); 1.35 1.36 while (true) { 1.37 // Completely finish any left over work from (an) earlier round(s) 1.38 drain->trim_queue(0); 1.39 + size_t num_from_overflow_list = MIN2((size_t)(work_q->max_elems() - work_q->size())/4, 1.40 + (size_t)ParGCDesiredObjsFromOverflowList); 1.41 // Now check if there's any work in the overflow list 1.42 if 
(_collector->par_take_from_overflow_list(num_from_overflow_list, 1.43 work_q)) { 1.44 @@ -9021,7 +9019,7 @@ 1.45 // Transfer some number of overflown objects to usual marking 1.46 // stack. Return true if some objects were transferred. 1.47 bool MarkRefsIntoAndScanClosure::take_from_overflow_list() { 1.48 - size_t num = MIN2((size_t)_mark_stack->capacity()/4, 1.49 + size_t num = MIN2((size_t)(_mark_stack->capacity() - _mark_stack->length())/4, 1.50 (size_t)ParGCDesiredObjsFromOverflowList); 1.51 1.52 bool res = _collector->take_from_overflow_list(num, _mark_stack);
2.1 --- a/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp Thu Mar 26 08:51:32 2009 -0700 2.2 +++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp Sat Mar 28 15:47:29 2009 -0700 2.3 @@ -36,7 +36,7 @@ 2.4 ObjToScanQueueSet* work_queue_set_, 2.5 size_t desired_plab_sz_, 2.6 ParallelTaskTerminator& term_) : 2.7 - _to_space(to_space_), _old_gen(old_gen_), _thread_num(thread_num_), 2.8 + _to_space(to_space_), _old_gen(old_gen_), _young_gen(gen_), _thread_num(thread_num_), 2.9 _work_queue(work_queue_set_->queue(thread_num_)), _to_space_full(false), 2.10 _ageTable(false), // false ==> not the global age table, no perf data. 2.11 _to_space_alloc_buffer(desired_plab_sz_), 2.12 @@ -57,6 +57,11 @@ 2.13 _start = os::elapsedTime(); 2.14 _old_gen_closure.set_generation(old_gen_); 2.15 _old_gen_root_closure.set_generation(old_gen_); 2.16 + if (UseCompressedOops) { 2.17 + _overflow_stack = new (ResourceObj::C_HEAP) GrowableArray<oop>(512, true); 2.18 + } else { 2.19 + _overflow_stack = NULL; 2.20 + } 2.21 } 2.22 #ifdef _MSC_VER 2.23 #pragma warning( pop ) 2.24 @@ -81,7 +86,7 @@ 2.25 assert(old->is_objArray(), "must be obj array"); 2.26 assert(old->is_forwarded(), "must be forwarded"); 2.27 assert(Universe::heap()->is_in_reserved(old), "must be in heap."); 2.28 - assert(!_old_gen->is_in(old), "must be in young generation."); 2.29 + assert(!old_gen()->is_in(old), "must be in young generation."); 2.30 2.31 objArrayOop obj = objArrayOop(old->forwardee()); 2.32 // Process ParGCArrayScanChunk elements now 2.33 @@ -119,26 +124,68 @@ 2.34 2.35 void ParScanThreadState::trim_queues(int max_size) { 2.36 ObjToScanQueue* queue = work_queue(); 2.37 - while (queue->size() > (juint)max_size) { 2.38 - oop obj_to_scan; 2.39 - if (queue->pop_local(obj_to_scan)) { 2.40 - note_pop(); 2.41 - 2.42 - if ((HeapWord *)obj_to_scan < young_old_boundary()) { 2.43 - if (obj_to_scan->is_objArray() && 2.44 - obj_to_scan->is_forwarded() && 2.45 - obj_to_scan->forwardee() != obj_to_scan) { 
2.46 - scan_partial_array_and_push_remainder(obj_to_scan); 2.47 + do { 2.48 + while (queue->size() > (juint)max_size) { 2.49 + oop obj_to_scan; 2.50 + if (queue->pop_local(obj_to_scan)) { 2.51 + note_pop(); 2.52 + if ((HeapWord *)obj_to_scan < young_old_boundary()) { 2.53 + if (obj_to_scan->is_objArray() && 2.54 + obj_to_scan->is_forwarded() && 2.55 + obj_to_scan->forwardee() != obj_to_scan) { 2.56 + scan_partial_array_and_push_remainder(obj_to_scan); 2.57 + } else { 2.58 + // object is in to_space 2.59 + obj_to_scan->oop_iterate(&_to_space_closure); 2.60 + } 2.61 } else { 2.62 - // object is in to_space 2.63 - obj_to_scan->oop_iterate(&_to_space_closure); 2.64 + // object is in old generation 2.65 + obj_to_scan->oop_iterate(&_old_gen_closure); 2.66 } 2.67 - } else { 2.68 - // object is in old generation 2.69 - obj_to_scan->oop_iterate(&_old_gen_closure); 2.70 } 2.71 } 2.72 + // For the case of compressed oops, we have a private, non-shared 2.73 + // overflow stack, so we eagerly drain it so as to more evenly 2.74 + // distribute load early. Note: this may be good to do in 2.75 + // general rather than delay for the final stealing phase. 2.76 + // If applicable, we'll transfer a set of objects over to our 2.77 + // work queue, allowing them to be stolen and draining our 2.78 + // private overflow stack. 
2.79 + } while (ParGCTrimOverflow && young_gen()->take_from_overflow_list(this)); 2.80 +} 2.81 + 2.82 +bool ParScanThreadState::take_from_overflow_stack() { 2.83 + assert(UseCompressedOops, "Else should not call"); 2.84 + assert(young_gen()->overflow_list() == NULL, "Error"); 2.85 + ObjToScanQueue* queue = work_queue(); 2.86 + GrowableArray<oop>* of_stack = overflow_stack(); 2.87 + uint num_overflow_elems = of_stack->length(); 2.88 + uint num_take_elems = MIN2(MIN2((queue->max_elems() - queue->size())/4, 2.89 + (juint)ParGCDesiredObjsFromOverflowList), 2.90 + num_overflow_elems); 2.91 + // Transfer the most recent num_take_elems from the overflow 2.92 + // stack to our work queue. 2.93 + for (size_t i = 0; i != num_take_elems; i++) { 2.94 + oop cur = of_stack->pop(); 2.95 + oop obj_to_push = cur->forwardee(); 2.96 + assert(Universe::heap()->is_in_reserved(cur), "Should be in heap"); 2.97 + assert(!old_gen()->is_in_reserved(cur), "Should be in young gen"); 2.98 + assert(Universe::heap()->is_in_reserved(obj_to_push), "Should be in heap"); 2.99 + if (should_be_partially_scanned(obj_to_push, cur)) { 2.100 + assert(arrayOop(cur)->length() == 0, "entire array remaining to be scanned"); 2.101 + obj_to_push = cur; 2.102 + } 2.103 + bool ok = queue->push(obj_to_push); 2.104 + assert(ok, "Should have succeeded"); 2.105 } 2.106 + assert(young_gen()->overflow_list() == NULL, "Error"); 2.107 + return num_take_elems > 0; // was something transferred? 2.108 +} 2.109 + 2.110 +void ParScanThreadState::push_on_overflow_stack(oop p) { 2.111 + assert(UseCompressedOops, "Else should not call"); 2.112 + overflow_stack()->push(p); 2.113 + assert(young_gen()->overflow_list() == NULL, "Error"); 2.114 } 2.115 2.116 HeapWord* ParScanThreadState::alloc_in_to_space_slow(size_t word_sz) { 2.117 @@ -425,8 +472,7 @@ 2.118 ResourceMark rm; 2.119 HandleMark hm; 2.120 // We would need multiple old-gen queues otherwise. 
2.121 - guarantee(gch->n_gens() == 2, 2.122 - "Par young collection currently only works with one older gen."); 2.123 + assert(gch->n_gens() == 2, "Par young collection currently only works with one older gen."); 2.124 2.125 Generation* old_gen = gch->next_gen(_gen); 2.126 2.127 @@ -1169,36 +1215,75 @@ 2.128 } 2.129 #endif 2.130 2.131 +// In case we are using compressed oops, we need to be careful. 2.132 +// If the object being pushed is an object array, then its length 2.133 +// field keeps track of the "grey boundary" at which the next 2.134 +// incremental scan will be done (see ParGCArrayScanChunk). 2.135 +// When using compressed oops, this length field is kept in the 2.136 +// lower 32 bits of the erstwhile klass word and cannot be used 2.137 +// for the overflow chaining pointer (OCP below). As such the OCP 2.138 +// would itself need to be compressed into the top 32-bits in this 2.139 +// case. Unfortunately, see below, in the event that we have a 2.140 +// promotion failure, the node to be pushed on the list can be 2.141 +// outside of the Java heap, so the heap-based pointer compression 2.142 +// would not work (we would have potential aliasing between C-heap 2.143 +// and Java-heap pointers). For this reason, when using compressed 2.144 +// oops, we simply use a worker-thread-local, non-shared overflow 2.145 +// list in the form of a growable array, with a slightly different 2.146 +// overflow stack draining strategy. If/when we start using fat 2.147 +// stacks here, we can go back to using (fat) pointer chains 2.148 +// (although some performance comparisons would be useful since 2.149 +// single global lists have their own performance disadvantages 2.150 +// as we were made painfully aware not long ago, see 6786503). 
2.151 #define BUSY (oop(0x1aff1aff)) 2.152 void ParNewGeneration::push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state) { 2.153 - // if the object has been forwarded to itself, then we cannot 2.154 - // use the klass pointer for the linked list. Instead we have 2.155 - // to allocate an oopDesc in the C-Heap and use that for the linked list. 2.156 - // XXX This is horribly inefficient when a promotion failure occurs 2.157 - // and should be fixed. XXX FIX ME !!! 2.158 + assert(is_in_reserved(from_space_obj), "Should be from this generation"); 2.159 + if (UseCompressedOops) { 2.160 + // In the case of compressed oops, we use a private, not-shared 2.161 + // overflow stack. 2.162 + par_scan_state->push_on_overflow_stack(from_space_obj); 2.163 + } else { 2.164 + // if the object has been forwarded to itself, then we cannot 2.165 + // use the klass pointer for the linked list. Instead we have 2.166 + // to allocate an oopDesc in the C-Heap and use that for the linked list. 2.167 + // XXX This is horribly inefficient when a promotion failure occurs 2.168 + // and should be fixed. XXX FIX ME !!! 
2.169 #ifndef PRODUCT 2.170 - Atomic::inc_ptr(&_num_par_pushes); 2.171 - assert(_num_par_pushes > 0, "Tautology"); 2.172 + Atomic::inc_ptr(&_num_par_pushes); 2.173 + assert(_num_par_pushes > 0, "Tautology"); 2.174 #endif 2.175 - if (from_space_obj->forwardee() == from_space_obj) { 2.176 - oopDesc* listhead = NEW_C_HEAP_ARRAY(oopDesc, 1); 2.177 - listhead->forward_to(from_space_obj); 2.178 - from_space_obj = listhead; 2.179 + if (from_space_obj->forwardee() == from_space_obj) { 2.180 + oopDesc* listhead = NEW_C_HEAP_ARRAY(oopDesc, 1); 2.181 + listhead->forward_to(from_space_obj); 2.182 + from_space_obj = listhead; 2.183 + } 2.184 + oop observed_overflow_list = _overflow_list; 2.185 + oop cur_overflow_list; 2.186 + do { 2.187 + cur_overflow_list = observed_overflow_list; 2.188 + if (cur_overflow_list != BUSY) { 2.189 + from_space_obj->set_klass_to_list_ptr(cur_overflow_list); 2.190 + } else { 2.191 + from_space_obj->set_klass_to_list_ptr(NULL); 2.192 + } 2.193 + observed_overflow_list = 2.194 + (oop)Atomic::cmpxchg_ptr(from_space_obj, &_overflow_list, cur_overflow_list); 2.195 + } while (cur_overflow_list != observed_overflow_list); 2.196 } 2.197 - oop observed_overflow_list = _overflow_list; 2.198 - oop cur_overflow_list; 2.199 - do { 2.200 - cur_overflow_list = observed_overflow_list; 2.201 - if (cur_overflow_list != BUSY) { 2.202 - from_space_obj->set_klass_to_list_ptr(cur_overflow_list); 2.203 - } else { 2.204 - from_space_obj->set_klass_to_list_ptr(NULL); 2.205 - } 2.206 - observed_overflow_list = 2.207 - (oop)Atomic::cmpxchg_ptr(from_space_obj, &_overflow_list, cur_overflow_list); 2.208 - } while (cur_overflow_list != observed_overflow_list); 2.209 } 2.210 2.211 +bool ParNewGeneration::take_from_overflow_list(ParScanThreadState* par_scan_state) { 2.212 + bool res; 2.213 + 2.214 + if (UseCompressedOops) { 2.215 + res = par_scan_state->take_from_overflow_stack(); 2.216 + } else { 2.217 + res = take_from_overflow_list_work(par_scan_state); 2.218 + } 2.219 + return 
res; 2.220 +} 2.221 + 2.222 + 2.223 // *NOTE*: The overflow list manipulation code here and 2.224 // in CMSCollector:: are very similar in shape, 2.225 // except that in the CMS case we thread the objects 2.226 @@ -1213,14 +1298,13 @@ 2.227 // similar changes might be needed. 2.228 // See CMSCollector::par_take_from_overflow_list() for 2.229 // more extensive documentation comments. 2.230 -bool 2.231 -ParNewGeneration::take_from_overflow_list(ParScanThreadState* par_scan_state) { 2.232 +bool ParNewGeneration::take_from_overflow_list_work(ParScanThreadState* par_scan_state) { 2.233 ObjToScanQueue* work_q = par_scan_state->work_queue(); 2.234 - assert(work_q->size() == 0, "Should first empty local work queue"); 2.235 // How many to take? 2.236 - size_t objsFromOverflow = MIN2((size_t)work_q->max_elems()/4, 2.237 + size_t objsFromOverflow = MIN2((size_t)(work_q->max_elems() - work_q->size())/4, 2.238 (size_t)ParGCDesiredObjsFromOverflowList); 2.239 2.240 + assert(par_scan_state->overflow_stack() == NULL, "Error"); 2.241 if (_overflow_list == NULL) return false; 2.242 2.243 // Otherwise, there was something there; try claiming the list.
3.1 --- a/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp Thu Mar 26 08:51:32 2009 -0700 3.2 +++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp Sat Mar 28 15:47:29 2009 -0700 3.3 @@ -55,6 +55,7 @@ 3.4 friend class ParScanThreadStateSet; 3.5 private: 3.6 ObjToScanQueue *_work_queue; 3.7 + GrowableArray<oop>* _overflow_stack; 3.8 3.9 ParGCAllocBuffer _to_space_alloc_buffer; 3.10 3.11 @@ -79,6 +80,9 @@ 3.12 Space* _to_space; 3.13 Space* to_space() { return _to_space; } 3.14 3.15 + ParNewGeneration* _young_gen; 3.16 + ParNewGeneration* young_gen() const { return _young_gen; } 3.17 + 3.18 Generation* _old_gen; 3.19 Generation* old_gen() { return _old_gen; } 3.20 3.21 @@ -134,6 +138,11 @@ 3.22 // Decrease queue size below "max_size". 3.23 void trim_queues(int max_size); 3.24 3.25 + // Private overflow stack usage 3.26 + GrowableArray<oop>* overflow_stack() { return _overflow_stack; } 3.27 + bool take_from_overflow_stack(); 3.28 + void push_on_overflow_stack(oop p); 3.29 + 3.30 // Is new_obj a candidate for scan_partial_array_and_push_remainder method. 3.31 inline bool should_be_partially_scanned(oop new_obj, oop old_obj) const; 3.32 3.33 @@ -378,13 +387,17 @@ 3.34 NOT_PRODUCT(int _overflow_counter;) 3.35 NOT_PRODUCT(bool should_simulate_overflow();) 3.36 3.37 + // Accessor for overflow list 3.38 + oop overflow_list() { return _overflow_list; } 3.39 + 3.40 // Push the given (from-space) object on the global overflow list. 3.41 void push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state); 3.42 3.43 // If the global overflow list is non-empty, move some tasks from it 3.44 - // onto "work_q" (which must be empty). No more than 1/4 of the 3.45 - // max_elems of "work_q" are moved. 3.46 + // onto "work_q" (which need not be empty). No more than 1/4 of the 3.47 + // available space on "work_q" is used. 
3.48 bool take_from_overflow_list(ParScanThreadState* par_scan_state); 3.49 + bool take_from_overflow_list_work(ParScanThreadState* par_scan_state); 3.50 3.51 // The task queues to be used by parallel GC threads. 3.52 ObjToScanQueueSet* task_queues() {
4.1 --- a/src/share/vm/runtime/globals.hpp Thu Mar 26 08:51:32 2009 -0700 4.2 +++ b/src/share/vm/runtime/globals.hpp Sat Mar 28 15:47:29 2009 -0700 4.3 @@ -1316,6 +1316,9 @@ 4.4 product(intx, ParGCArrayScanChunk, 50, \ 4.5 "Scan a subset and push remainder, if array is bigger than this") \ 4.6 \ 4.7 + product(bool, ParGCTrimOverflow, true, \ 4.8 + "Eagerly trim the overflow lists (useful for UseCompressedOops)") \ 4.9 + \ 4.10 notproduct(bool, ParGCWorkQueueOverflowALot, false, \ 4.11 "Whether we should simulate work queue overflow in ParNew") \ 4.12 \