7034139: G1: assert(Thread::current()->is_ConcurrentGC_thread()) failed: only a conc GC thread can call this.

Fri, 29 Apr 2011 12:40:49 -0400

author
tonyp
date
Fri, 29 Apr 2011 12:40:49 -0400
changeset 2848
cd8e33b2a8ad
parent 2847
da0fffdcc453
child 2849
063382f9b575

7034139: G1: assert(Thread::current()->is_ConcurrentGC_thread()) failed: only a conc GC thread can call this.
Summary: We were calling STS join and leave during a STW pause and we are not suppoesed to. I now only call those during concurrent phase. I also added stress code in the non-product builds to force an overflows (the condition that ws uncovering the bug) to make sure it does not happen again.
Reviewed-by: johnc, brutisso

src/share/vm/gc_implementation/g1/concurrentMark.cpp file | annotate | diff | comparison | revisions
src/share/vm/gc_implementation/g1/concurrentMark.hpp file | annotate | diff | comparison | revisions
src/share/vm/gc_implementation/g1/g1_globals.hpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Thu Apr 28 15:29:18 2011 -0700
     1.2 +++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Fri Apr 29 12:40:49 2011 -0400
     1.3 @@ -826,6 +826,14 @@
     1.4  void ConcurrentMark::checkpointRootsInitialPost() {
     1.5    G1CollectedHeap*   g1h = G1CollectedHeap::heap();
     1.6  
     1.7 +  // If we force an overflow during remark, the remark operation will
     1.8 +  // actually abort and we'll restart concurrent marking. If we always
     1.9 +  // force an oveflow during remark we'll never actually complete the
    1.10 +  // marking phase. So, we initilize this here, at the start of the
    1.11 +  // cycle, so that at the remaining overflow number will decrease at
    1.12 +  // every remark and we'll eventually not need to cause one.
    1.13 +  force_overflow_stw()->init();
    1.14 +
    1.15    // For each region note start of marking.
    1.16    NoteStartOfMarkHRClosure startcl;
    1.17    g1h->heap_region_iterate(&startcl);
    1.18 @@ -893,27 +901,37 @@
    1.19  }
    1.20  
    1.21  /*
    1.22 -   Notice that in the next two methods, we actually leave the STS
    1.23 -   during the barrier sync and join it immediately afterwards. If we
    1.24 -   do not do this, this then the following deadlock can occur: one
    1.25 -   thread could be in the barrier sync code, waiting for the other
    1.26 -   thread to also sync up, whereas another one could be trying to
    1.27 -   yield, while also waiting for the other threads to sync up too.
    1.28 -
    1.29 -   Because the thread that does the sync barrier has left the STS, it
    1.30 -   is possible to be suspended for a Full GC or an evacuation pause
    1.31 -   could occur. This is actually safe, since the entering the sync
    1.32 -   barrier is one of the last things do_marking_step() does, and it
    1.33 -   doesn't manipulate any data structures afterwards.
    1.34 -*/
    1.35 + * Notice that in the next two methods, we actually leave the STS
    1.36 + * during the barrier sync and join it immediately afterwards. If we
    1.37 + * do not do this, the following deadlock can occur: one thread could
    1.38 + * be in the barrier sync code, waiting for the other thread to also
    1.39 + * sync up, whereas another one could be trying to yield, while also
    1.40 + * waiting for the other threads to sync up too.
    1.41 + *
    1.42 + * Note, however, that this code is also used during remark and in
    1.43 + * this case we should not attempt to leave / enter the STS, otherwise
    1.44 + * we'll either hit an asseert (debug / fastdebug) or deadlock
    1.45 + * (product). So we should only leave / enter the STS if we are
    1.46 + * operating concurrently.
    1.47 + *
    1.48 + * Because the thread that does the sync barrier has left the STS, it
    1.49 + * is possible to be suspended for a Full GC or an evacuation pause
    1.50 + * could occur. This is actually safe, since the entering the sync
    1.51 + * barrier is one of the last things do_marking_step() does, and it
    1.52 + * doesn't manipulate any data structures afterwards.
    1.53 + */
    1.54  
    1.55  void ConcurrentMark::enter_first_sync_barrier(int task_num) {
    1.56    if (verbose_low())
    1.57      gclog_or_tty->print_cr("[%d] entering first barrier", task_num);
    1.58  
    1.59 -  ConcurrentGCThread::stsLeave();
    1.60 +  if (concurrent()) {
    1.61 +    ConcurrentGCThread::stsLeave();
    1.62 +  }
    1.63    _first_overflow_barrier_sync.enter();
    1.64 -  ConcurrentGCThread::stsJoin();
    1.65 +  if (concurrent()) {
    1.66 +    ConcurrentGCThread::stsJoin();
    1.67 +  }
    1.68    // at this point everyone should have synced up and not be doing any
    1.69    // more work
    1.70  
    1.71 @@ -923,7 +941,12 @@
    1.72    // let task 0 do this
    1.73    if (task_num == 0) {
    1.74      // task 0 is responsible for clearing the global data structures
    1.75 -    clear_marking_state();
    1.76 +    // We should be here because of an overflow. During STW we should
    1.77 +    // not clear the overflow flag since we rely on it being true when
    1.78 +    // we exit this method to abort the pause and restart concurent
    1.79 +    // marking.
    1.80 +    clear_marking_state(concurrent() /* clear_overflow */);
    1.81 +    force_overflow()->update();
    1.82  
    1.83      if (PrintGC) {
    1.84        gclog_or_tty->date_stamp(PrintGCDateStamps);
    1.85 @@ -940,15 +963,45 @@
    1.86    if (verbose_low())
    1.87      gclog_or_tty->print_cr("[%d] entering second barrier", task_num);
    1.88  
    1.89 -  ConcurrentGCThread::stsLeave();
    1.90 +  if (concurrent()) {
    1.91 +    ConcurrentGCThread::stsLeave();
    1.92 +  }
    1.93    _second_overflow_barrier_sync.enter();
    1.94 -  ConcurrentGCThread::stsJoin();
    1.95 +  if (concurrent()) {
    1.96 +    ConcurrentGCThread::stsJoin();
    1.97 +  }
    1.98    // at this point everything should be re-initialised and ready to go
    1.99  
   1.100    if (verbose_low())
   1.101      gclog_or_tty->print_cr("[%d] leaving second barrier", task_num);
   1.102  }
   1.103  
   1.104 +#ifndef PRODUCT
   1.105 +void ForceOverflowSettings::init() {
   1.106 +  _num_remaining = G1ConcMarkForceOverflow;
   1.107 +  _force = false;
   1.108 +  update();
   1.109 +}
   1.110 +
   1.111 +void ForceOverflowSettings::update() {
   1.112 +  if (_num_remaining > 0) {
   1.113 +    _num_remaining -= 1;
   1.114 +    _force = true;
   1.115 +  } else {
   1.116 +    _force = false;
   1.117 +  }
   1.118 +}
   1.119 +
   1.120 +bool ForceOverflowSettings::should_force() {
   1.121 +  if (_force) {
   1.122 +    _force = false;
   1.123 +    return true;
   1.124 +  } else {
   1.125 +    return false;
   1.126 +  }
   1.127 +}
   1.128 +#endif // !PRODUCT
   1.129 +
   1.130  void ConcurrentMark::grayRoot(oop p) {
   1.131    HeapWord* addr = (HeapWord*) p;
   1.132    // We can't really check against _heap_start and _heap_end, since it
   1.133 @@ -1117,6 +1170,7 @@
   1.134    _restart_for_overflow = false;
   1.135  
   1.136    size_t active_workers = MAX2((size_t) 1, parallel_marking_threads());
   1.137 +  force_overflow_conc()->init();
   1.138    set_phase(active_workers, true /* concurrent */);
   1.139  
   1.140    CMConcurrentMarkingTask markingTask(this, cmThread());
   1.141 @@ -2703,12 +2757,16 @@
   1.142  
   1.143  }
   1.144  
   1.145 -void ConcurrentMark::clear_marking_state() {
   1.146 +void ConcurrentMark::clear_marking_state(bool clear_overflow) {
   1.147    _markStack.setEmpty();
   1.148    _markStack.clear_overflow();
   1.149    _regionStack.setEmpty();
   1.150    _regionStack.clear_overflow();
   1.151 -  clear_has_overflown();
   1.152 +  if (clear_overflow) {
   1.153 +    clear_has_overflown();
   1.154 +  } else {
   1.155 +    assert(has_overflown(), "pre-condition");
   1.156 +  }
   1.157    _finger = _heap_start;
   1.158  
   1.159    for (int i = 0; i < (int)_max_task_num; ++i) {
   1.160 @@ -4279,6 +4337,15 @@
   1.161      }
   1.162    }
   1.163  
   1.164 +  // If we are about to wrap up and go into termination, check if we
   1.165 +  // should raise the overflow flag.
   1.166 +  if (do_termination && !has_aborted()) {
   1.167 +    if (_cm->force_overflow()->should_force()) {
   1.168 +      _cm->set_has_overflown();
   1.169 +      regular_clock_call();
   1.170 +    }
   1.171 +  }
   1.172 +
   1.173    // We still haven't aborted. Now, let's try to get into the
   1.174    // termination protocol.
   1.175    if (do_termination && !has_aborted()) {
     2.1 --- a/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Thu Apr 28 15:29:18 2011 -0700
     2.2 +++ b/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Fri Apr 29 12:40:49 2011 -0400
     2.3 @@ -316,6 +316,19 @@
     2.4    void setEmpty()   { _index = 0; clear_overflow(); }
     2.5  };
     2.6  
     2.7 +class ForceOverflowSettings VALUE_OBJ_CLASS_SPEC {
     2.8 +private:
     2.9 +#ifndef PRODUCT
    2.10 +  uintx _num_remaining;
    2.11 +  bool _force;
    2.12 +#endif // !defined(PRODUCT)
    2.13 +
    2.14 +public:
    2.15 +  void init() PRODUCT_RETURN;
    2.16 +  void update() PRODUCT_RETURN;
    2.17 +  bool should_force() PRODUCT_RETURN_( return false; );
    2.18 +};
    2.19 +
    2.20  // this will enable a variety of different statistics per GC task
    2.21  #define _MARKING_STATS_       0
    2.22  // this will enable the higher verbose levels
    2.23 @@ -462,6 +475,9 @@
    2.24  
    2.25    WorkGang* _parallel_workers;
    2.26  
    2.27 +  ForceOverflowSettings _force_overflow_conc;
    2.28 +  ForceOverflowSettings _force_overflow_stw;
    2.29 +
    2.30    void weakRefsWork(bool clear_all_soft_refs);
    2.31  
    2.32    void swapMarkBitMaps();
    2.33 @@ -470,7 +486,7 @@
    2.34    // task local ones; should be called during initial mark.
    2.35    void reset();
    2.36    // It resets all the marking data structures.
    2.37 -  void clear_marking_state();
    2.38 +  void clear_marking_state(bool clear_overflow = true);
    2.39  
    2.40    // It should be called to indicate which phase we're in (concurrent
    2.41    // mark or remark) and how many threads are currently active.
    2.42 @@ -547,6 +563,22 @@
    2.43    void enter_first_sync_barrier(int task_num);
    2.44    void enter_second_sync_barrier(int task_num);
    2.45  
    2.46 +  ForceOverflowSettings* force_overflow_conc() {
    2.47 +    return &_force_overflow_conc;
    2.48 +  }
    2.49 +
    2.50 +  ForceOverflowSettings* force_overflow_stw() {
    2.51 +    return &_force_overflow_stw;
    2.52 +  }
    2.53 +
    2.54 +  ForceOverflowSettings* force_overflow() {
    2.55 +    if (concurrent()) {
    2.56 +      return force_overflow_conc();
    2.57 +    } else {
    2.58 +      return force_overflow_stw();
    2.59 +    }
    2.60 +  }
    2.61 +
    2.62  public:
    2.63    // Manipulation of the global mark stack.
    2.64    // Notice that the first mark_stack_push is CAS-based, whereas the
     3.1 --- a/src/share/vm/gc_implementation/g1/g1_globals.hpp	Thu Apr 28 15:29:18 2011 -0700
     3.2 +++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp	Fri Apr 29 12:40:49 2011 -0400
     3.3 @@ -311,7 +311,11 @@
     3.4                                                                              \
     3.5    develop(bool, G1ExitOnExpansionFailure, false,                            \
     3.6            "Raise a fatal VM exit out of memory failure in the event "       \
     3.7 -          " that heap expansion fails due to running out of swap.")
     3.8 +          " that heap expansion fails due to running out of swap.")         \
     3.9 +                                                                            \
    3.10 +  develop(uintx, G1ConcMarkForceOverflow, 0,                                \
    3.11 +          "The number of times we'll force an overflow during "             \
    3.12 +          "concurrent marking")
    3.13  
    3.14  G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)
    3.15  

mercurial