Tue, 13 Mar 2012 11:05:32 -0700
7143511: G1: Another instance of high GC Worker Other time (50ms)
Summary: Tiered compilation has increased the number of nmethods in the code cache. This has, in turn, significantly increased the number of marked nmethods processed during the StrongRootsScope destructor. Create a specialized version of CodeBlobToOopClosure for G1 which places only those nmethods that contain pointers into the collection set on to the marked nmethods list.
Reviewed-by: iveresov, tonyp
1.1 --- a/src/share/vm/code/nmethod.hpp Tue Mar 27 10:29:59 2012 +0200 1.2 +++ b/src/share/vm/code/nmethod.hpp Tue Mar 13 11:05:32 2012 -0700 1.3 @@ -553,7 +553,7 @@ 1.4 static void oops_do_marking_prologue(); 1.5 static void oops_do_marking_epilogue(); 1.6 static bool oops_do_marking_is_active() { return _oops_do_mark_nmethods != NULL; } 1.7 - DEBUG_ONLY(bool test_oops_do_mark() { return _oops_do_mark_link != NULL; }) 1.8 + bool test_oops_do_mark() { return _oops_do_mark_link != NULL; } 1.9 1.10 // ScopeDesc for an instruction 1.11 ScopeDesc* scope_desc_at(address pc);
2.1 --- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Tue Mar 27 10:29:59 2012 +0200 2.2 +++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Tue Mar 13 11:05:32 2012 -0700 2.3 @@ -4675,67 +4675,74 @@ 2.4 double start_time_ms = os::elapsedTime() * 1000.0; 2.5 _g1h->g1_policy()->record_gc_worker_start_time(worker_id, start_time_ms); 2.6 2.7 - ResourceMark rm; 2.8 - HandleMark hm; 2.9 - 2.10 - ReferenceProcessor* rp = _g1h->ref_processor_stw(); 2.11 - 2.12 - G1ParScanThreadState pss(_g1h, worker_id); 2.13 - G1ParScanHeapEvacClosure scan_evac_cl(_g1h, &pss, rp); 2.14 - G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, rp); 2.15 - G1ParScanPartialArrayClosure partial_scan_cl(_g1h, &pss, rp); 2.16 - 2.17 - pss.set_evac_closure(&scan_evac_cl); 2.18 - pss.set_evac_failure_closure(&evac_failure_cl); 2.19 - pss.set_partial_scan_closure(&partial_scan_cl); 2.20 - 2.21 - G1ParScanExtRootClosure only_scan_root_cl(_g1h, &pss, rp); 2.22 - G1ParScanPermClosure only_scan_perm_cl(_g1h, &pss, rp); 2.23 - 2.24 - G1ParScanAndMarkExtRootClosure scan_mark_root_cl(_g1h, &pss, rp); 2.25 - G1ParScanAndMarkPermClosure scan_mark_perm_cl(_g1h, &pss, rp); 2.26 - 2.27 - OopClosure* scan_root_cl = &only_scan_root_cl; 2.28 - OopsInHeapRegionClosure* scan_perm_cl = &only_scan_perm_cl; 2.29 - 2.30 - if (_g1h->g1_policy()->during_initial_mark_pause()) { 2.31 - // We also need to mark copied objects. 2.32 - scan_root_cl = &scan_mark_root_cl; 2.33 - scan_perm_cl = &scan_mark_perm_cl; 2.34 + { 2.35 + ResourceMark rm; 2.36 + HandleMark hm; 2.37 + 2.38 + ReferenceProcessor* rp = _g1h->ref_processor_stw(); 2.39 + 2.40 + G1ParScanThreadState pss(_g1h, worker_id); 2.41 + G1ParScanHeapEvacClosure scan_evac_cl(_g1h, &pss, rp); 2.42 + G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, rp); 2.43 + G1ParScanPartialArrayClosure partial_scan_cl(_g1h, &pss, rp); 2.44 + 2.45 + pss.set_evac_closure(&scan_evac_cl); 2.46 + pss.set_evac_failure_closure(&evac_failure_cl); 2.47 + pss.set_partial_scan_closure(&partial_scan_cl); 2.48 + 2.49 + G1ParScanExtRootClosure only_scan_root_cl(_g1h, &pss, rp); 2.50 + G1ParScanPermClosure only_scan_perm_cl(_g1h, &pss, rp); 2.51 + 2.52 + G1ParScanAndMarkExtRootClosure scan_mark_root_cl(_g1h, &pss, rp); 2.53 + G1ParScanAndMarkPermClosure scan_mark_perm_cl(_g1h, &pss, rp); 2.54 + 2.55 + OopClosure* scan_root_cl = &only_scan_root_cl; 2.56 + OopsInHeapRegionClosure* scan_perm_cl = &only_scan_perm_cl; 2.57 + 2.58 + if (_g1h->g1_policy()->during_initial_mark_pause()) { 2.59 + // We also need to mark copied objects. 2.60 + scan_root_cl = &scan_mark_root_cl; 2.61 + scan_perm_cl = &scan_mark_perm_cl; 2.62 + } 2.63 + 2.64 + G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, &pss); 2.65 + 2.66 + pss.start_strong_roots(); 2.67 + _g1h->g1_process_strong_roots(/* not collecting perm */ false, 2.68 + SharedHeap::SO_AllClasses, 2.69 + scan_root_cl, 2.70 + &push_heap_rs_cl, 2.71 + scan_perm_cl, 2.72 + worker_id); 2.73 + pss.end_strong_roots(); 2.74 + 2.75 + { 2.76 + double start = os::elapsedTime(); 2.77 + G1ParEvacuateFollowersClosure evac(_g1h, &pss, _queues, &_terminator); 2.78 + evac.do_void(); 2.79 + double elapsed_ms = (os::elapsedTime()-start)*1000.0; 2.80 + double term_ms = pss.term_time()*1000.0; 2.81 + _g1h->g1_policy()->record_obj_copy_time(worker_id, elapsed_ms-term_ms); 2.82 + _g1h->g1_policy()->record_termination(worker_id, term_ms, pss.term_attempts()); 2.83 + } 2.84 + _g1h->g1_policy()->record_thread_age_table(pss.age_table()); 2.85 + _g1h->update_surviving_young_words(pss.surviving_young_words()+1); 2.86 + 2.87 + // Clean up any par-expanded rem sets. 2.88 + HeapRegionRemSet::par_cleanup(); 2.89 + 2.90 + if (ParallelGCVerbose) { 2.91 + MutexLocker x(stats_lock()); 2.92 + pss.print_termination_stats(worker_id); 2.93 + } 2.94 + 2.95 + assert(pss.refs()->is_empty(), "should be empty"); 2.96 + 2.97 + // Close the inner scope so that the ResourceMark and HandleMark 2.98 + // destructors are executed here and are included as part of the 2.99 + // "GC Worker Time". 2.100 } 2.101 2.102 - G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, &pss); 2.103 - 2.104 - pss.start_strong_roots(); 2.105 - _g1h->g1_process_strong_roots(/* not collecting perm */ false, 2.106 - SharedHeap::SO_AllClasses, 2.107 - scan_root_cl, 2.108 - &push_heap_rs_cl, 2.109 - scan_perm_cl, 2.110 - worker_id); 2.111 - pss.end_strong_roots(); 2.112 - 2.113 - { 2.114 - double start = os::elapsedTime(); 2.115 - G1ParEvacuateFollowersClosure evac(_g1h, &pss, _queues, &_terminator); 2.116 - evac.do_void(); 2.117 - double elapsed_ms = (os::elapsedTime()-start)*1000.0; 2.118 - double term_ms = pss.term_time()*1000.0; 2.119 - _g1h->g1_policy()->record_obj_copy_time(worker_id, elapsed_ms-term_ms); 2.120 - _g1h->g1_policy()->record_termination(worker_id, term_ms, pss.term_attempts()); 2.121 - } 2.122 - _g1h->g1_policy()->record_thread_age_table(pss.age_table()); 2.123 - _g1h->update_surviving_young_words(pss.surviving_young_words()+1); 2.124 - 2.125 - // Clean up any par-expanded rem sets. 2.126 - HeapRegionRemSet::par_cleanup(); 2.127 - 2.128 - if (ParallelGCVerbose) { 2.129 - MutexLocker x(stats_lock()); 2.130 - pss.print_termination_stats(worker_id); 2.131 - } 2.132 - 2.133 - assert(pss.refs()->is_empty(), "should be empty"); 2.134 double end_time_ms = os::elapsedTime() * 1000.0; 2.135 _g1h->g1_policy()->record_gc_worker_end_time(worker_id, end_time_ms); 2.136 } 2.137 @@ -4743,6 +4750,67 @@ 2.138 2.139 // *** Common G1 Evacuation Stuff 2.140 2.141 +// Closures that support the filtering of CodeBlobs scanned during 2.142 +// external root scanning. 2.143 + 2.144 +// Closure applied to reference fields in code blobs (specifically nmethods) 2.145 +// to determine whether an nmethod contains references that point into 2.146 +// the collection set. Used as a predicate when walking code roots so 2.147 +// that only nmethods that point into the collection set are added to the 2.148 +// 'marked' list. 2.149 + 2.150 +class G1FilteredCodeBlobToOopClosure : public CodeBlobToOopClosure { 2.151 + 2.152 + class G1PointsIntoCSOopClosure : public OopClosure { 2.153 + G1CollectedHeap* _g1; 2.154 + bool _points_into_cs; 2.155 + public: 2.156 + G1PointsIntoCSOopClosure(G1CollectedHeap* g1) : 2.157 + _g1(g1), _points_into_cs(false) { } 2.158 + 2.159 + bool points_into_cs() const { return _points_into_cs; } 2.160 + 2.161 + template <class T> 2.162 + void do_oop_nv(T* p) { 2.163 + if (!_points_into_cs) { 2.164 + T heap_oop = oopDesc::load_heap_oop(p); 2.165 + if (!oopDesc::is_null(heap_oop) && 2.166 + _g1->in_cset_fast_test(oopDesc::decode_heap_oop_not_null(heap_oop))) { 2.167 + _points_into_cs = true; 2.168 + } 2.169 + } 2.170 + } 2.171 + 2.172 + virtual void do_oop(oop* p) { do_oop_nv(p); } 2.173 + virtual void do_oop(narrowOop* p) { do_oop_nv(p); } 2.174 + }; 2.175 + 2.176 + G1CollectedHeap* _g1; 2.177 + 2.178 +public: 2.179 + G1FilteredCodeBlobToOopClosure(G1CollectedHeap* g1, OopClosure* cl) : 2.180 + CodeBlobToOopClosure(cl, true), _g1(g1) { } 2.181 + 2.182 + virtual void do_code_blob(CodeBlob* cb) { 2.183 + nmethod* nm = cb->as_nmethod_or_null(); 2.184 + if (nm != NULL && !(nm->test_oops_do_mark())) { 2.185 + G1PointsIntoCSOopClosure predicate_cl(_g1); 2.186 + nm->oops_do(&predicate_cl); 2.187 + 2.188 + if (predicate_cl.points_into_cs()) { 2.189 + // At least one of the reference fields or the oop relocations 2.190 + // in the nmethod points into the collection set. We have to 2.191 + // 'mark' this nmethod. 2.192 + // Note: Revisit the following if CodeBlobToOopClosure::do_code_blob() 2.193 + // or MarkingCodeBlobClosure::do_code_blob() change. 2.194 + if (!nm->test_set_oops_do_mark()) { 2.195 + do_newly_marked_nmethod(nm); 2.196 + } 2.197 + } 2.198 + } 2.199 + } 2.200 +}; 2.201 + 2.202 // This method is run in a GC worker. 2.203 2.204 void 2.205 @@ -4764,7 +4832,7 @@ 2.206 2.207 // Walk the code cache w/o buffering, because StarTask cannot handle 2.208 // unaligned oop locations. 2.209 - CodeBlobToOopClosure eager_scan_code_roots(scan_non_heap_roots, /*do_marking=*/ true); 2.210 + G1FilteredCodeBlobToOopClosure eager_scan_code_roots(this, scan_non_heap_roots); 2.211 2.212 process_strong_roots(false, // no scoping; this is parallel code 2.213 collecting_perm_gen, so, 2.214 @@ -5378,25 +5446,39 @@ 2.215 rem_set()->prepare_for_younger_refs_iterate(true); 2.216 2.217 assert(dirty_card_queue_set().completed_buffers_num() == 0, "Should be empty"); 2.218 - double start_par = os::elapsedTime(); 2.219 - 2.220 - if (G1CollectedHeap::use_parallel_gc_threads()) { 2.221 - // The individual threads will set their evac-failure closures. 2.222 + double start_par_time_sec = os::elapsedTime(); 2.223 + double end_par_time_sec; 2.224 + 2.225 + { 2.226 StrongRootsScope srs(this); 2.227 - if (ParallelGCVerbose) G1ParScanThreadState::print_termination_stats_hdr(); 2.228 - // These tasks use ShareHeap::_process_strong_tasks 2.229 - assert(UseDynamicNumberOfGCThreads || 2.230 - workers()->active_workers() == workers()->total_workers(), 2.231 - "If not dynamic should be using all the workers"); 2.232 - workers()->run_task(&g1_par_task); 2.233 - } else { 2.234 - StrongRootsScope srs(this); 2.235 - g1_par_task.set_for_termination(n_workers); 2.236 - g1_par_task.work(0); 2.237 - } 2.238 - 2.239 - double par_time = (os::elapsedTime() - start_par) * 1000.0; 2.240 - g1_policy()->record_par_time(par_time); 2.241 + 2.242 + if (G1CollectedHeap::use_parallel_gc_threads()) { 2.243 + // The individual threads will set their evac-failure closures. 2.244 + if (ParallelGCVerbose) G1ParScanThreadState::print_termination_stats_hdr(); 2.245 + // These tasks use ShareHeap::_process_strong_tasks 2.246 + assert(UseDynamicNumberOfGCThreads || 2.247 + workers()->active_workers() == workers()->total_workers(), 2.248 + "If not dynamic should be using all the workers"); 2.249 + workers()->run_task(&g1_par_task); 2.250 + } else { 2.251 + g1_par_task.set_for_termination(n_workers); 2.252 + g1_par_task.work(0); 2.253 + } 2.254 + end_par_time_sec = os::elapsedTime(); 2.255 + 2.256 + // Closing the inner scope will execute the destructor 2.257 + // for the StrongRootsScope object. We record the current 2.258 + // elapsed time before closing the scope so that time 2.259 + // taken for the SRS destructor is NOT included in the 2.260 + // reported parallel time. 2.261 + } 2.262 + 2.263 + double par_time_ms = (end_par_time_sec - start_par_time_sec) * 1000.0; 2.264 + g1_policy()->record_par_time(par_time_ms); 2.265 + 2.266 + double code_root_fixup_time_ms = 2.267 + (os::elapsedTime() - end_par_time_sec) * 1000.0; 2.268 + g1_policy()->record_code_root_fixup_time(code_root_fixup_time_ms); 2.269 2.270 set_par_threads(0); 2.271
3.1 --- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp Tue Mar 27 10:29:59 2012 +0200 3.2 +++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp Tue Mar 13 11:05:32 2012 -0700 3.3 @@ -1261,6 +1261,9 @@ 3.4 other_time_ms -= known_time; 3.5 } 3.6 3.7 + // Now subtract the time taken to fix up roots in generated code 3.8 + other_time_ms -= _cur_collection_code_root_fixup_time_ms; 3.9 + 3.10 // Subtract the time taken to clean the card table from the 3.11 // current value of "other time" 3.12 other_time_ms -= _cur_clear_ct_time_ms; 3.13 @@ -1401,10 +1404,10 @@ 3.14 print_par_stats(2, "Object Copy", _par_last_obj_copy_times_ms); 3.15 print_par_stats(2, "Termination", _par_last_termination_times_ms); 3.16 print_par_sizes(3, "Termination Attempts", _par_last_termination_attempts); 3.17 - print_par_stats(2, "GC Worker End", _par_last_gc_worker_end_times_ms); 3.18 3.19 for (int i = 0; i < _parallel_gc_threads; i++) { 3.20 - _par_last_gc_worker_times_ms[i] = _par_last_gc_worker_end_times_ms[i] - _par_last_gc_worker_start_times_ms[i]; 3.21 + _par_last_gc_worker_times_ms[i] = _par_last_gc_worker_end_times_ms[i] - 3.22 + _par_last_gc_worker_start_times_ms[i]; 3.23 3.24 double worker_known_time = _par_last_ext_root_scan_times_ms[i] + 3.25 _par_last_satb_filtering_times_ms[i] + 3.26 @@ -1413,10 +1416,13 @@ 3.27 _par_last_obj_copy_times_ms[i] + 3.28 _par_last_termination_times_ms[i]; 3.29 3.30 - _par_last_gc_worker_other_times_ms[i] = _cur_collection_par_time_ms - worker_known_time; 3.31 + _par_last_gc_worker_other_times_ms[i] = _par_last_gc_worker_times_ms[i] - 3.32 + worker_known_time; 3.33 } 3.34 - print_par_stats(2, "GC Worker", _par_last_gc_worker_times_ms); 3.35 + 3.36 print_par_stats(2, "GC Worker Other", _par_last_gc_worker_other_times_ms); 3.37 + print_par_stats(2, "GC Worker Total", _par_last_gc_worker_times_ms); 3.38 + print_par_stats(2, "GC Worker End", _par_last_gc_worker_end_times_ms); 3.39 } else { 3.40 print_stats(1, "Ext Root Scanning", ext_root_scan_time); 3.41 if (print_marking_info) { 3.42 @@ -1427,6 +1433,7 @@ 3.43 print_stats(1, "Scan RS", scan_rs_time); 3.44 print_stats(1, "Object Copying", obj_copy_time); 3.45 } 3.46 + print_stats(1, "Code Root Fixup", _cur_collection_code_root_fixup_time_ms); 3.47 if (print_marking_info) { 3.48 print_stats(1, "Complete CSet Marking", _mark_closure_time_ms); 3.49 }
4.1 --- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp Tue Mar 27 10:29:59 2012 +0200 4.2 +++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp Tue Mar 13 11:05:32 2012 -0700 4.3 @@ -179,6 +179,9 @@ 4.4 size_t _cur_collection_pause_used_at_start_bytes; 4.5 size_t _cur_collection_pause_used_regions_at_start; 4.6 double _cur_collection_par_time_ms; 4.7 + 4.8 + double _cur_collection_code_root_fixup_time_ms; 4.9 + 4.10 double _cur_satb_drain_time_ms; 4.11 double _cur_clear_ct_time_ms; 4.12 double _cur_ref_proc_time_ms; 4.13 @@ -226,8 +229,8 @@ 4.14 double* _par_last_gc_worker_times_ms; 4.15 4.16 // Each workers 'other' time i.e. the elapsed time of the parallel 4.17 - // phase of the pause minus the sum of the individual sub-phase 4.18 - // times for a given worker thread. 4.19 + // code executed by a worker minus the sum of the individual sub-phase 4.20 + // times for that worker thread. 4.21 double* _par_last_gc_worker_other_times_ms; 4.22 4.23 // indicates whether we are in young or mixed GC mode 4.24 @@ -897,6 +900,10 @@ 4.25 _cur_collection_par_time_ms = ms; 4.26 } 4.27 4.28 + void record_code_root_fixup_time(double ms) { 4.29 + _cur_collection_code_root_fixup_time_ms = ms; 4.30 + } 4.31 + 4.32 void record_aux_start_time(int i) { 4.33 guarantee(i < _aux_num, "should be within range"); 4.34 _cur_aux_start_times_ms[i] = os::elapsedTime() * 1000.0;