Fri, 29 Apr 2011 12:40:49 -0400
7034139: G1: assert(Thread::current()->is_ConcurrentGC_thread()) failed: only a conc GC thread can call this.
Summary: We were calling STS join and leave during a STW pause and we are not supposed to. I now only call those during concurrent phase. I also added stress code in the non-product builds to force an overflow (the condition that was uncovering the bug) to make sure it does not happen again.
Reviewed-by: johnc, brutisso
1.1 --- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp Thu Apr 28 15:29:18 2011 -0700 1.2 +++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp Fri Apr 29 12:40:49 2011 -0400 1.3 @@ -826,6 +826,14 @@ 1.4 void ConcurrentMark::checkpointRootsInitialPost() { 1.5 G1CollectedHeap* g1h = G1CollectedHeap::heap(); 1.6 1.7 + // If we force an overflow during remark, the remark operation will 1.8 + // actually abort and we'll restart concurrent marking. If we always 1.9 + // force an overflow during remark we'll never actually complete the 1.10 + // marking phase. So, we initialize this here, at the start of the 1.11 + // cycle, so that the remaining overflow number will decrease at 1.12 + // every remark and we'll eventually not need to cause one. 1.13 + force_overflow_stw()->init(); 1.14 + 1.15 // For each region note start of marking. 1.16 NoteStartOfMarkHRClosure startcl; 1.17 g1h->heap_region_iterate(&startcl); 1.18 @@ -893,27 +901,37 @@ 1.19 } 1.20 1.21 /* 1.22 - Notice that in the next two methods, we actually leave the STS 1.23 - during the barrier sync and join it immediately afterwards. If we 1.24 - do not do this, this then the following deadlock can occur: one 1.25 - thread could be in the barrier sync code, waiting for the other 1.26 - thread to also sync up, whereas another one could be trying to 1.27 - yield, while also waiting for the other threads to sync up too. 1.28 - 1.29 - Because the thread that does the sync barrier has left the STS, it 1.30 - is possible to be suspended for a Full GC or an evacuation pause 1.31 - could occur. This is actually safe, since the entering the sync 1.32 - barrier is one of the last things do_marking_step() does, and it 1.33 - doesn't manipulate any data structures afterwards. 1.34 -*/ 1.35 + * Notice that in the next two methods, we actually leave the STS 1.36 + * during the barrier sync and join it immediately afterwards. 
If we 1.37 + * do not do this, the following deadlock can occur: one thread could 1.38 + * be in the barrier sync code, waiting for the other thread to also 1.39 + * sync up, whereas another one could be trying to yield, while also 1.40 + * waiting for the other threads to sync up too. 1.41 + * 1.42 + * Note, however, that this code is also used during remark and in 1.43 + * this case we should not attempt to leave / enter the STS, otherwise 1.44 + * we'll either hit an assert (debug / fastdebug) or deadlock 1.45 + * (product). So we should only leave / enter the STS if we are 1.46 + * operating concurrently. 1.47 + * 1.48 + * Because the thread that does the sync barrier has left the STS, it 1.49 + * is possible to be suspended for a Full GC or an evacuation pause 1.50 + * could occur. This is actually safe, since the entering the sync 1.51 + * barrier is one of the last things do_marking_step() does, and it 1.52 + * doesn't manipulate any data structures afterwards. 1.53 + */ 1.54 1.55 void ConcurrentMark::enter_first_sync_barrier(int task_num) { 1.56 if (verbose_low()) 1.57 gclog_or_tty->print_cr("[%d] entering first barrier", task_num); 1.58 1.59 - ConcurrentGCThread::stsLeave(); 1.60 + if (concurrent()) { 1.61 + ConcurrentGCThread::stsLeave(); 1.62 + } 1.63 _first_overflow_barrier_sync.enter(); 1.64 - ConcurrentGCThread::stsJoin(); 1.65 + if (concurrent()) { 1.66 + ConcurrentGCThread::stsJoin(); 1.67 + } 1.68 // at this point everyone should have synced up and not be doing any 1.69 // more work 1.70 1.71 @@ -923,7 +941,12 @@ 1.72 // let task 0 do this 1.73 if (task_num == 0) { 1.74 // task 0 is responsible for clearing the global data structures 1.75 - clear_marking_state(); 1.76 + // We should be here because of an overflow. During STW we should 1.77 + // not clear the overflow flag since we rely on it being true when 1.78 + // we exit this method to abort the pause and restart concurrent 1.79 + // marking. 
1.80 + clear_marking_state(concurrent() /* clear_overflow */); 1.81 + force_overflow()->update(); 1.82 1.83 if (PrintGC) { 1.84 gclog_or_tty->date_stamp(PrintGCDateStamps); 1.85 @@ -940,15 +963,45 @@ 1.86 if (verbose_low()) 1.87 gclog_or_tty->print_cr("[%d] entering second barrier", task_num); 1.88 1.89 - ConcurrentGCThread::stsLeave(); 1.90 + if (concurrent()) { 1.91 + ConcurrentGCThread::stsLeave(); 1.92 + } 1.93 _second_overflow_barrier_sync.enter(); 1.94 - ConcurrentGCThread::stsJoin(); 1.95 + if (concurrent()) { 1.96 + ConcurrentGCThread::stsJoin(); 1.97 + } 1.98 // at this point everything should be re-initialised and ready to go 1.99 1.100 if (verbose_low()) 1.101 gclog_or_tty->print_cr("[%d] leaving second barrier", task_num); 1.102 } 1.103 1.104 +#ifndef PRODUCT 1.105 +void ForceOverflowSettings::init() { 1.106 + _num_remaining = G1ConcMarkForceOverflow; 1.107 + _force = false; 1.108 + update(); 1.109 +} 1.110 + 1.111 +void ForceOverflowSettings::update() { 1.112 + if (_num_remaining > 0) { 1.113 + _num_remaining -= 1; 1.114 + _force = true; 1.115 + } else { 1.116 + _force = false; 1.117 + } 1.118 +} 1.119 + 1.120 +bool ForceOverflowSettings::should_force() { 1.121 + if (_force) { 1.122 + _force = false; 1.123 + return true; 1.124 + } else { 1.125 + return false; 1.126 + } 1.127 +} 1.128 +#endif // !PRODUCT 1.129 + 1.130 void ConcurrentMark::grayRoot(oop p) { 1.131 HeapWord* addr = (HeapWord*) p; 1.132 // We can't really check against _heap_start and _heap_end, since it 1.133 @@ -1117,6 +1170,7 @@ 1.134 _restart_for_overflow = false; 1.135 1.136 size_t active_workers = MAX2((size_t) 1, parallel_marking_threads()); 1.137 + force_overflow_conc()->init(); 1.138 set_phase(active_workers, true /* concurrent */); 1.139 1.140 CMConcurrentMarkingTask markingTask(this, cmThread()); 1.141 @@ -2703,12 +2757,16 @@ 1.142 1.143 } 1.144 1.145 -void ConcurrentMark::clear_marking_state() { 1.146 +void ConcurrentMark::clear_marking_state(bool clear_overflow) { 1.147 
_markStack.setEmpty(); 1.148 _markStack.clear_overflow(); 1.149 _regionStack.setEmpty(); 1.150 _regionStack.clear_overflow(); 1.151 - clear_has_overflown(); 1.152 + if (clear_overflow) { 1.153 + clear_has_overflown(); 1.154 + } else { 1.155 + assert(has_overflown(), "pre-condition"); 1.156 + } 1.157 _finger = _heap_start; 1.158 1.159 for (int i = 0; i < (int)_max_task_num; ++i) { 1.160 @@ -4279,6 +4337,15 @@ 1.161 } 1.162 } 1.163 1.164 + // If we are about to wrap up and go into termination, check if we 1.165 + // should raise the overflow flag. 1.166 + if (do_termination && !has_aborted()) { 1.167 + if (_cm->force_overflow()->should_force()) { 1.168 + _cm->set_has_overflown(); 1.169 + regular_clock_call(); 1.170 + } 1.171 + } 1.172 + 1.173 // We still haven't aborted. Now, let's try to get into the 1.174 // termination protocol. 1.175 if (do_termination && !has_aborted()) {
2.1 --- a/src/share/vm/gc_implementation/g1/concurrentMark.hpp Thu Apr 28 15:29:18 2011 -0700 2.2 +++ b/src/share/vm/gc_implementation/g1/concurrentMark.hpp Fri Apr 29 12:40:49 2011 -0400 2.3 @@ -316,6 +316,19 @@ 2.4 void setEmpty() { _index = 0; clear_overflow(); } 2.5 }; 2.6 2.7 +class ForceOverflowSettings VALUE_OBJ_CLASS_SPEC { 2.8 +private: 2.9 +#ifndef PRODUCT 2.10 + uintx _num_remaining; 2.11 + bool _force; 2.12 +#endif // !defined(PRODUCT) 2.13 + 2.14 +public: 2.15 + void init() PRODUCT_RETURN; 2.16 + void update() PRODUCT_RETURN; 2.17 + bool should_force() PRODUCT_RETURN_( return false; ); 2.18 +}; 2.19 + 2.20 // this will enable a variety of different statistics per GC task 2.21 #define _MARKING_STATS_ 0 2.22 // this will enable the higher verbose levels 2.23 @@ -462,6 +475,9 @@ 2.24 2.25 WorkGang* _parallel_workers; 2.26 2.27 + ForceOverflowSettings _force_overflow_conc; 2.28 + ForceOverflowSettings _force_overflow_stw; 2.29 + 2.30 void weakRefsWork(bool clear_all_soft_refs); 2.31 2.32 void swapMarkBitMaps(); 2.33 @@ -470,7 +486,7 @@ 2.34 // task local ones; should be called during initial mark. 2.35 void reset(); 2.36 // It resets all the marking data structures. 2.37 - void clear_marking_state(); 2.38 + void clear_marking_state(bool clear_overflow = true); 2.39 2.40 // It should be called to indicate which phase we're in (concurrent 2.41 // mark or remark) and how many threads are currently active. 
2.42 @@ -547,6 +563,22 @@ 2.43 void enter_first_sync_barrier(int task_num); 2.44 void enter_second_sync_barrier(int task_num); 2.45 2.46 + ForceOverflowSettings* force_overflow_conc() { 2.47 + return &_force_overflow_conc; 2.48 + } 2.49 + 2.50 + ForceOverflowSettings* force_overflow_stw() { 2.51 + return &_force_overflow_stw; 2.52 + } 2.53 + 2.54 + ForceOverflowSettings* force_overflow() { 2.55 + if (concurrent()) { 2.56 + return force_overflow_conc(); 2.57 + } else { 2.58 + return force_overflow_stw(); 2.59 + } 2.60 + } 2.61 + 2.62 public: 2.63 // Manipulation of the global mark stack. 2.64 // Notice that the first mark_stack_push is CAS-based, whereas the
3.1 --- a/src/share/vm/gc_implementation/g1/g1_globals.hpp Thu Apr 28 15:29:18 2011 -0700 3.2 +++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp Fri Apr 29 12:40:49 2011 -0400 3.3 @@ -311,7 +311,11 @@ 3.4 \ 3.5 develop(bool, G1ExitOnExpansionFailure, false, \ 3.6 "Raise a fatal VM exit out of memory failure in the event " \ 3.7 - " that heap expansion fails due to running out of swap.") 3.8 + " that heap expansion fails due to running out of swap.") \ 3.9 + \ 3.10 + develop(uintx, G1ConcMarkForceOverflow, 0, \ 3.11 + "The number of times we'll force an overflow during " \ 3.12 + "concurrent marking") 3.13 3.14 G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG) 3.15