diff -r d1bdeef3e3e2 -r ec4b032a4977 src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp --- a/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp Wed Oct 12 10:25:51 2011 -0700 +++ b/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp Thu Oct 13 13:54:29 2011 -0400 @@ -215,20 +215,20 @@ gclog_or_tty->print_cr("[GC concurrent-cleanup-start]"); } - // Now do the remainder of the cleanup operation. + // Now do the concurrent cleanup operation. _cm->completeCleanup(); + // Notify anyone who's waiting that there are no more free - // regions coming. We have to do this before we join the STS, - // otherwise we might deadlock: a GC worker could be blocked - // waiting for the notification whereas this thread will be - // blocked for the pause to finish while it's trying to join - // the STS, which is conditional on the GC workers finishing. + // regions coming. We have to do this before we join the STS + // (in fact, we should not attempt to join the STS in the + // interval between finishing the cleanup pause and clearing + // the free_regions_coming flag) otherwise we might deadlock: + // a GC worker could be blocked waiting for the notification + // whereas this thread will be blocked for the pause to finish + // while it's trying to join the STS, which is conditional on + // the GC workers finishing. g1h->reset_free_regions_coming(); - _sts.join(); - g1_policy->record_concurrent_mark_cleanup_completed(); - _sts.leave(); - double cleanup_end_sec = os::elapsedTime(); if (PrintGC) { gclog_or_tty->date_stamp(PrintGCDateStamps); @@ -240,6 +240,36 @@ guarantee(cm()->cleanup_list_is_empty(), "at this point there should be no regions on the cleanup list"); + // There is a tricky race before recording that the concurrent + // cleanup has completed and a potential Full GC starting around + // the same time. We want to make sure that the Full GC calls + // abort() on concurrent mark after + // record_concurrent_mark_cleanup_completed(), since abort() is + // the method that will reset the concurrent mark state. If we + // end up calling record_concurrent_mark_cleanup_completed() + // after abort() then we might incorrectly undo some of the work + // abort() did. Checking the has_aborted() flag after joining + // the STS allows the correct ordering of the two methods. There + // are two scenarios: + // + // a) If we reach here before the Full GC, the fact that we have + // joined the STS means that the Full GC cannot start until we + // leave the STS, so record_concurrent_mark_cleanup_completed() + // will complete before abort() is called. + // + // b) If we reach here during the Full GC, we'll be held up from + // joining the STS until the Full GC is done, which means that + // abort() will have completed and has_aborted() will return + // true to prevent us from calling + // record_concurrent_mark_cleanup_completed() (and, in fact, it's + // not needed any more as the concurrent mark state has been + // already reset). + _sts.join(); + if (!cm()->has_aborted()) { + g1_policy->record_concurrent_mark_cleanup_completed(); + } + _sts.leave(); + if (cm()->has_aborted()) { if (PrintGC) { gclog_or_tty->date_stamp(PrintGCDateStamps); @@ -248,7 +278,7 @@ } } - // we now want to allow clearing of the marking bitmap to be + // We now want to allow clearing of the marking bitmap to be // suspended by a collection pause. _sts.join(); _cm->clearNextBitmap();