diff -r d1bdeef3e3e2 -r ec4b032a4977 src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp
--- a/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp	Wed Oct 12 10:25:51 2011 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp	Thu Oct 13 13:54:29 2011 -0400
@@ -215,20 +215,20 @@
           gclog_or_tty->print_cr("[GC concurrent-cleanup-start]");
         }
 
-        // Now do the remainder of the cleanup operation.
+        // Now do the concurrent cleanup operation.
         _cm->completeCleanup();
+
         // Notify anyone who's waiting that there are no more free
-        // regions coming. We have to do this before we join the STS,
-        // otherwise we might deadlock: a GC worker could be blocked
-        // waiting for the notification whereas this thread will be
-        // blocked for the pause to finish while it's trying to join
-        // the STS, which is conditional on the GC workers finishing.
+        // regions coming. We have to do this before we join the STS
+        // (in fact, we should not attempt to join the STS in the
+        // interval between finishing the cleanup pause and clearing
+        // the free_regions_coming flag) otherwise we might deadlock:
+        // a GC worker could be blocked waiting for the notification
+        // whereas this thread will be blocked for the pause to finish
+        // while it's trying to join the STS, which is conditional on
+        // the GC workers finishing.
         g1h->reset_free_regions_coming();
 
-        _sts.join();
-        g1_policy->record_concurrent_mark_cleanup_completed();
-        _sts.leave();
-
         double cleanup_end_sec = os::elapsedTime();
         if (PrintGC) {
           gclog_or_tty->date_stamp(PrintGCDateStamps);
@@ -240,6 +240,36 @@
       guarantee(cm()->cleanup_list_is_empty(),
                 "at this point there should be no regions on the cleanup list");
 
+      // There is a tricky race before recording that the concurrent
+      // cleanup has completed and a potential Full GC starting around
+      // the same time. We want to make sure that the Full GC calls
+      // abort() on concurrent mark after
+      // record_concurrent_mark_cleanup_completed(), since abort() is
+      // the method that will reset the concurrent mark state. If we
+      // end up calling record_concurrent_mark_cleanup_completed()
+      // after abort() then we might incorrectly undo some of the work
+      // abort() did. Checking the has_aborted() flag after joining
+      // the STS allows the correct ordering of the two methods. There
+      // are two scenarios:
+      //
+      // a) If we reach here before the Full GC, the fact that we have
+      // joined the STS means that the Full GC cannot start until we
+      // leave the STS, so record_concurrent_mark_cleanup_completed()
+      // will complete before abort() is called.
+      //
+      // b) If we reach here during the Full GC, we'll be held up from
+      // joining the STS until the Full GC is done, which means that
+      // abort() will have completed and has_aborted() will return
+      // true to prevent us from calling
+      // record_concurrent_mark_cleanup_completed() (and, in fact, it's
+      // not needed any more as the concurrent mark state has been
+      // already reset).
+      _sts.join();
+      if (!cm()->has_aborted()) {
+        g1_policy->record_concurrent_mark_cleanup_completed();
+      }
+      _sts.leave();
+
       if (cm()->has_aborted()) {
         if (PrintGC) {
           gclog_or_tty->date_stamp(PrintGCDateStamps);
@@ -248,7 +278,7 @@
         }
       }
 
-      // we now want to allow clearing of the marking bitmap to be
+      // We now want to allow clearing of the marking bitmap to be
       // suspended by a collection pause.
       _sts.join();
       _cm->clearNextBitmap();