diff -r 97fbf5beff7b -r 8b10f48633dc src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -195,7 +195,7 @@
             "Offset of FreeChunk::_prev within FreeChunk must match"
             " that of OopDesc::_klass within OopDesc");
   )
-  if (ParallelGCThreads > 0) {
+  if (CollectedHeap::use_parallel_gc_threads()) {
     typedef CMSParGCThreadState* CMSParGCThreadStatePtr;
     _par_gc_thread_states =
       NEW_C_HEAP_ARRAY(CMSParGCThreadStatePtr, ParallelGCThreads);
@@ -616,7 +616,7 @@
   }
 
   // Support for multi-threaded concurrent phases
-  if (ParallelGCThreads > 0 && CMSConcurrentMTEnabled) {
+  if (CollectedHeap::use_parallel_gc_threads() && CMSConcurrentMTEnabled) {
     if (FLAG_IS_DEFAULT(ConcGCThreads)) {
       // just for now
       FLAG_SET_DEFAULT(ConcGCThreads, (ParallelGCThreads + 3)/4);
@@ -628,6 +628,8 @@
       warning("GC/CMS: _conc_workers allocation failure: "
               "forcing -CMSConcurrentMTEnabled");
       CMSConcurrentMTEnabled = false;
+    } else {
+      _conc_workers->initialize_workers();
     }
   } else {
     CMSConcurrentMTEnabled = false;
@@ -936,7 +938,7 @@
   // along with all the other pointers into the heap but
   // compaction is expected to be a rare event with
   // a heap using cms so don't do it without seeing the need.
-  if (ParallelGCThreads > 0) {
+  if (CollectedHeap::use_parallel_gc_threads()) {
     for (uint i = 0; i < ParallelGCThreads; i++) {
       _par_gc_thread_states[i]->promo.reset();
     }
@@ -2630,7 +2632,8 @@
   // Should call gc_prologue_work() for all cms gens we are responsible for
   bool registerClosure =    _collectorState >= Marking
                          && _collectorState < Sweeping;
-  ModUnionClosure* muc = ParallelGCThreads > 0 ? &_modUnionClosurePar
+  ModUnionClosure* muc = CollectedHeap::use_parallel_gc_threads() ?
+                                                 &_modUnionClosurePar
                                                  : &_modUnionClosure;
   _cmsGen->gc_prologue_work(full, registerClosure, muc);
   _permGen->gc_prologue_work(full, registerClosure, muc);
@@ -2731,7 +2734,7 @@
   collector()->gc_epilogue(full);
 
   // Also reset promotion tracking in par gc thread states.
-  if (ParallelGCThreads > 0) {
+  if (CollectedHeap::use_parallel_gc_threads()) {
     for (uint i = 0; i < ParallelGCThreads; i++) {
       _par_gc_thread_states[i]->promo.stopTrackingPromotions(i);
     }
@@ -3731,7 +3734,6 @@
 // MT Concurrent Marking Task
 class CMSConcMarkingTask: public YieldingFlexibleGangTask {
   CMSCollector* _collector;
-  YieldingFlexibleWorkGang* _workers; // the whole gang
   int           _n_workers;                  // requested/desired # workers
   bool          _asynch;
   bool          _result;
@@ -3751,21 +3753,19 @@
   CMSConcMarkingTask(CMSCollector* collector,
                  CompactibleFreeListSpace* cms_space,
                  CompactibleFreeListSpace* perm_space,
-                 bool asynch, int n_workers,
+                 bool asynch,
                  YieldingFlexibleWorkGang* workers,
                  OopTaskQueueSet* task_queues):
     YieldingFlexibleGangTask("Concurrent marking done multi-threaded"),
     _collector(collector),
     _cms_space(cms_space),
     _perm_space(perm_space),
-    _asynch(asynch), _n_workers(n_workers), _result(true),
-    _workers(workers), _task_queues(task_queues),
-    _term(n_workers, task_queues, _collector, asynch),
+    _asynch(asynch), _n_workers(0), _result(true),
+    _task_queues(task_queues),
+    _term(_n_workers, task_queues, _collector, asynch),
     _bit_map_lock(collector->bitMapLock())
   {
-    assert(n_workers <= workers->total_workers(),
-           "Else termination won't work correctly today"); // XXX FIX ME!
-    _requested_size = n_workers;
+    _requested_size = _n_workers;
     _term.set_task(this);
     assert(_cms_space->bottom() < _perm_space->bottom(),
            "Finger incorrectly initialized below");
@@ -3781,6 +3781,10 @@
 
   CMSConcMarkingTerminator* terminator() { return &_term; }
 
+  virtual void set_for_termination(int active_workers) {
+    terminator()->reset_for_reuse(active_workers);
+  }
+
   void work(int i);
 
   virtual void coordinator_yield(); // stuff done by coordinator
@@ -4220,9 +4224,12 @@
   CompactibleFreeListSpace* cms_space  = _cmsGen->cmsSpace();
   CompactibleFreeListSpace* perm_space = _permGen->cmsSpace();
 
-  CMSConcMarkingTask tsk(this, cms_space, perm_space,
-                         asynch, num_workers /* number requested XXX */,
-                         conc_workers(), task_queues());
+  CMSConcMarkingTask tsk(this,
+                         cms_space,
+                         perm_space,
+                         asynch,
+                         conc_workers(),
+                         task_queues());
 
   // Since the actual number of workers we get may be different
   // from the number we requested above, do we need to do anything different
@@ -4326,6 +4333,10 @@
   verify_overflow_empty();
   _abort_preclean = false;
   if (CMSPrecleaningEnabled) {
+    // Precleaning is currently not MT but the reference processor
+    // may be set for MT.  Disable it temporarily here.
+    ReferenceProcessor* rp = ref_processor();
+    ReferenceProcessorMTProcMutator z(rp, false);
     _eden_chunk_index = 0;
     size_t used = get_eden_used();
     size_t capacity = get_eden_capacity();
@@ -4918,7 +4929,7 @@
   // dirtied since the first checkpoint in this GC cycle and prior to
   // the most recent young generation GC, minus those cleaned up by the
   // concurrent precleaning.
-  if (CMSParallelRemarkEnabled && ParallelGCThreads > 0) {
+  if (CMSParallelRemarkEnabled && CollectedHeap::use_parallel_gc_threads()) {
     TraceTime t("Rescan (parallel) ", PrintGCDetails, false, gclog_or_tty);
     do_remark_parallel();
   } else {
@@ -5012,7 +5023,6 @@
 // Parallel remark task
 class CMSParRemarkTask: public AbstractGangTask {
   CMSCollector* _collector;
-  WorkGang*     _workers;
   int           _n_workers;
   CompactibleFreeListSpace* _cms_space;
   CompactibleFreeListSpace* _perm_space;
@@ -5025,21 +5035,21 @@
   CMSParRemarkTask(CMSCollector* collector,
                    CompactibleFreeListSpace* cms_space,
                    CompactibleFreeListSpace* perm_space,
-                   int n_workers, WorkGang* workers,
+                   int n_workers, FlexibleWorkGang* workers,
                    OopTaskQueueSet* task_queues):
     AbstractGangTask("Rescan roots and grey objects in parallel"),
     _collector(collector),
     _cms_space(cms_space),
     _perm_space(perm_space),
     _n_workers(n_workers),
-    _workers(workers),
     _task_queues(task_queues),
-    _term(workers->total_workers(), task_queues) { }
+    _term(n_workers, task_queues) { }
 
   OopTaskQueueSet* task_queues() { return _task_queues; }
 
   OopTaskQueue* work_queue(int i) { return task_queues()->queue(i); }
 
   ParallelTaskTerminator* terminator() { return &_term; }
+  int n_workers() { return _n_workers; }
 
   void work(int i);
 
@@ -5057,6 +5067,11 @@
   void do_work_steal(int i, Par_MarkRefsIntoAndScanClosure* cl, int* seed);
 };
 
+// work_queue(i) is passed to the closure
+// Par_MarkRefsIntoAndScanClosure.  The "i" parameter
+// also is passed to do_dirty_card_rescan_tasks() and to
+// do_work_steal() to select the i-th task_queue.
+
 void CMSParRemarkTask::work(int i) {
   elapsedTimer _timer;
   ResourceMark rm;
@@ -5128,6 +5143,7 @@
 
   // Do the rescan tasks for each of the two spaces
   // (cms_space and perm_space) in turn.
+  // "i" is passed to select the "i-th" task_queue
   do_dirty_card_rescan_tasks(_cms_space, i, &par_mrias_cl);
   do_dirty_card_rescan_tasks(_perm_space, i, &par_mrias_cl);
   _timer.stop();
@@ -5150,6 +5166,7 @@
   }
 }
 
+// Note that parameter "i" is not used.
 void
 CMSParRemarkTask::do_young_space_rescan(int i,
   Par_MarkRefsIntoAndScanClosure* cl, ContiguousSpace* space,
@@ -5309,8 +5326,13 @@
     size_t num_from_overflow_list = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
                                          (size_t)ParGCDesiredObjsFromOverflowList);
     // Now check if there's any work in the overflow list
+    // Passing ParallelGCThreads as the third parameter, no_of_gc_threads,
+    // only affects the number of attempts made to get work from the
+    // overflow list and does not affect the number of workers.  Just
+    // pass ParallelGCThreads so this behavior is unchanged.
     if (_collector->par_take_from_overflow_list(num_from_overflow_list,
-                                                work_q)) {
+                                                work_q,
+                                                ParallelGCThreads)) {
       // found something in global overflow list;
       // not yet ready to go stealing work from others.
       // We'd like to assert(work_q->size() != 0, ...)
@@ -5367,11 +5389,12 @@
 // Merge the per-thread plab arrays into the global survivor chunk
 // array which will provide the partitioning of the survivor space
 // for CMS rescan.
-void CMSCollector::merge_survivor_plab_arrays(ContiguousSpace* surv) {
+void CMSCollector::merge_survivor_plab_arrays(ContiguousSpace* surv,
+                                              int no_of_gc_threads) {
   assert(_survivor_plab_array  != NULL, "Error");
   assert(_survivor_chunk_array != NULL, "Error");
   assert(_collectorState == FinalMarking, "Error");
-  for (uint j = 0; j < ParallelGCThreads; j++) {
+  for (int j = 0; j < no_of_gc_threads; j++) {
     _cursor[j] = 0;
   }
   HeapWord* top = surv->top();
@@ -5379,7 +5402,7 @@
   for (i = 0; i < _survivor_chunk_capacity; i++) {  // all sca entries
     HeapWord* min_val = top;          // Higher than any PLAB address
     uint      min_tid = 0;            // position of min_val this round
-    for (uint j = 0; j < ParallelGCThreads; j++) {
+    for (int j = 0; j < no_of_gc_threads; j++) {
       ChunkArray* cur_sca = &_survivor_plab_array[j];
       if (_cursor[j] == cur_sca->end()) {
         continue;
       }
@@ -5413,7 +5436,7 @@
   // Verify that we used up all the recorded entries
   #ifdef ASSERT
   size_t total = 0;
-  for (uint j = 0; j < ParallelGCThreads; j++) {
+  for (int j = 0; j < no_of_gc_threads; j++) {
     assert(_cursor[j] == _survivor_plab_array[j].end(), "Ctl pt invariant");
     total += _cursor[j];
   }
@@ -5448,13 +5471,15 @@
     // Each valid entry in [0, _eden_chunk_index) represents a task.
     size_t n_tasks = _eden_chunk_index + 1;
     assert(n_tasks == 1 || _eden_chunk_array != NULL, "Error");
-    pst->set_par_threads(n_threads);
+    // Sets the condition for completion of the subtask (how many threads
+    // need to finish in order to be done).
+    pst->set_n_threads(n_threads);
     pst->set_n_tasks((int)n_tasks);
   }
 
   // Merge the survivor plab arrays into _survivor_chunk_array
   if (_survivor_plab_array != NULL) {
-    merge_survivor_plab_arrays(dng->from());
+    merge_survivor_plab_arrays(dng->from(), n_threads);
   } else {
     assert(_survivor_chunk_index == 0, "Error");
   }
@@ -5463,7 +5488,9 @@
   {
     SequentialSubTasksDone* pst = dng->to()->par_seq_tasks();
     assert(!pst->valid(), "Clobbering existing data?");
-    pst->set_par_threads(n_threads);
+    // Sets the condition for completion of the subtask (how many threads
+    // need to finish in order to be done).
+    pst->set_n_threads(n_threads);
     pst->set_n_tasks(1);
     assert(pst->valid(), "Error");
   }
@@ -5474,7 +5501,9 @@
     assert(!pst->valid(), "Clobbering existing data?");
     size_t n_tasks = _survivor_chunk_index + 1;
     assert(n_tasks == 1 || _survivor_chunk_array != NULL, "Error");
-    pst->set_par_threads(n_threads);
+    // Sets the condition for completion of the subtask (how many threads
+    // need to finish in order to be done).
+    pst->set_n_threads(n_threads);
     pst->set_n_tasks((int)n_tasks);
     assert(pst->valid(), "Error");
   }
@@ -5483,7 +5512,7 @@
 // Parallel version of remark
 void CMSCollector::do_remark_parallel() {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  WorkGang* workers = gch->workers();
+  FlexibleWorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
   int n_workers = workers->total_workers();
   CompactibleFreeListSpace* cms_space = _cmsGen->cmsSpace();
@@ -5636,13 +5665,11 @@
 ////////////////////////////////////////////////////////
 // Parallel Reference Processing Task Proxy Class
 ////////////////////////////////////////////////////////
-class CMSRefProcTaskProxy: public AbstractGangTask {
+class CMSRefProcTaskProxy: public AbstractGangTaskWOopQueues {
   typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
   CMSCollector*          _collector;
   CMSBitMap*             _mark_bit_map;
   const MemRegion        _span;
-  OopTaskQueueSet*       _task_queues;
-  ParallelTaskTerminator _term;
   ProcessTask&           _task;
 
 public:
@@ -5650,24 +5677,21 @@
               CMSCollector* collector,
               const MemRegion& span,
               CMSBitMap* mark_bit_map,
-              int total_workers,
+              AbstractWorkGang* workers,
               OopTaskQueueSet* task_queues):
-    AbstractGangTask("Process referents by policy in parallel"),
+    AbstractGangTaskWOopQueues("Process referents by policy in parallel",
+      task_queues),
     _task(task),
-    _collector(collector), _span(span), _mark_bit_map(mark_bit_map),
-    _task_queues(task_queues),
-    _term(total_workers, task_queues)
+    _collector(collector), _span(span), _mark_bit_map(mark_bit_map)
   {
     assert(_collector->_span.equals(_span) && !_span.is_empty(),
            "Inconsistency in _span");
   }
 
-  OopTaskQueueSet* task_queues() { return _task_queues; }
+  OopTaskQueueSet* task_queues() { return queues(); }
 
   OopTaskQueue* work_queue(int i) { return task_queues()->queue(i); }
 
-  ParallelTaskTerminator* terminator() { return &_term; }
-
   void do_work_steal(int i,
                      CMSParDrainMarkingStackClosure* drain,
                      CMSParKeepAliveClosure* keep_alive,
@@ -5739,8 +5763,13 @@
     size_t num_from_overflow_list = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
                                          (size_t)ParGCDesiredObjsFromOverflowList);
     // Now check if there's any work in the overflow list
+    // Passing ParallelGCThreads as the third parameter, no_of_gc_threads,
+    // only affects the number of attempts made to get work from the
+    // overflow list and does not affect the number of workers.  Just
+    // pass ParallelGCThreads so this behavior is unchanged.
     if (_collector->par_take_from_overflow_list(num_from_overflow_list,
-                                                work_q)) {
+                                                work_q,
+                                                ParallelGCThreads)) {
       // Found something in global overflow list;
       // not yet ready to go stealing work from others.
       // We'd like to assert(work_q->size() != 0, ...)
@@ -5773,13 +5802,12 @@
 
 void CMSRefProcTaskExecutor::execute(ProcessTask& task)
 {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  WorkGang* workers = gch->workers();
+  FlexibleWorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
-  int n_workers = workers->total_workers();
   CMSRefProcTaskProxy rp_task(task, &_collector,
                               _collector.ref_processor()->span(),
                               _collector.markBitMap(),
-                              n_workers, _collector.task_queues());
+                              workers, _collector.task_queues());
   workers->run_task(&rp_task);
 }
@@ -5787,7 +5815,7 @@
 
 {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  WorkGang* workers = gch->workers();
+  FlexibleWorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
   CMSRefEnqueueTaskProxy enq_task(task);
   workers->run_task(&enq_task);
@@ -5814,6 +5842,14 @@
   {
     TraceTime t("weak refs processing", PrintGCDetails, false, gclog_or_tty);
     if (rp->processing_is_mt()) {
+      // Set the degree of MT here.  If the discovery is done MT, there
+      // may have been a different number of threads doing the discovery
+      // and a different number of discovered lists may have Ref objects.
+      // That is OK as long as the Reference lists are balanced (see
+      // balance_all_queues() and balance_queues()).
+
+
+      rp->set_mt_degree(ParallelGCThreads);
       CMSRefProcTaskExecutor task_executor(*this);
       rp->process_discovered_references(&_is_alive_closure,
                                         &cmsKeepAliveClosure,
@@ -5874,6 +5910,7 @@
 
   rp->set_enqueuing_is_done(true);
   if (rp->processing_is_mt()) {
+    rp->balance_all_queues();
     CMSRefProcTaskExecutor task_executor(*this);
     rp->enqueue_discovered_references(&task_executor);
   } else {
@@ -8708,7 +8745,8 @@
 // similar changes might be needed.
 // CR 6797058 has been filed to consolidate the common code.
 bool CMSCollector::par_take_from_overflow_list(size_t num,
-                                               OopTaskQueue* work_q) {
+                                               OopTaskQueue* work_q,
+                                               int no_of_gc_threads) {
   assert(work_q->size() == 0, "First empty local work queue");
   assert(num < work_q->max_elems(), "Can't bite more than we can chew");
   if (_overflow_list == NULL) {
@@ -8717,7 +8755,9 @@
   // Grab the entire list; we'll put back a suffix
   oop prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list);
   Thread* tid = Thread::current();
-  size_t CMSOverflowSpinCount = (size_t)ParallelGCThreads;
+  // Before "no_of_gc_threads" was introduced CMSOverflowSpinCount was
+  // set to ParallelGCThreads.
+  size_t CMSOverflowSpinCount = (size_t) no_of_gc_threads; // was ParallelGCThreads;
   size_t sleep_time_millis = MAX2((size_t)1, num/100);
   // If the list is busy, we spin for a short while,
   // sleeping between attempts to get the list.
@@ -9256,4 +9296,3 @@
                          true   /* recordGCEndTime */,
                          true   /* countCollection */  );
 }
-
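For readers unfamiliar with the work-gang pattern this patch builds on, the following standalone C++11 sketch illustrates the idea behind set_for_termination()/reset_for_reuse(): the number of active workers is chosen per task at dispatch time, and the termination barrier is re-armed for exactly that count before the workers run. This is a hedged analogy only, not HotSpot code; the names SimpleGang, Terminator, GangTask, and DemoTask are hypothetical and the real YieldingFlexibleWorkGang/ParallelTaskTerminator machinery is considerably more involved.

// Illustrative sketch only (not HotSpot code): a tiny "flexible work gang".
// The gang is sized once, but each run_task() call may use fewer active
// workers, and the task's terminator is reset for that count before running,
// mirroring set_for_termination()/reset_for_reuse() in the patch above.
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

class Terminator {                     // counts workers that have finished
  std::mutex              _mu;
  std::condition_variable _cv;
  int                     _active;     // how many workers must check in
  int                     _done;
 public:
  Terminator() : _active(0), _done(0) {}
  void reset_for_reuse(int active_workers) {   // re-arm for the next task
    std::lock_guard<std::mutex> l(_mu);
    _active = active_workers;
    _done   = 0;
  }
  void offer_termination() {                   // a worker has no more work
    std::unique_lock<std::mutex> l(_mu);
    if (++_done == _active) _cv.notify_all();
    else _cv.wait(l, [this] { return _done == _active; });
  }
};

struct GangTask {
  virtual ~GangTask() {}
  virtual void set_for_termination(int active_workers) = 0;
  virtual void work(int worker_id) = 0;
};

class SimpleGang {
  int _total_workers;
 public:
  explicit SimpleGang(int total) : _total_workers(total) {}
  // Choose the active worker count at dispatch time, arm the task's
  // terminator for exactly that count, then fork and join the workers.
  void run_task(GangTask* task, int active_workers) {
    if (active_workers > _total_workers) active_workers = _total_workers;
    task->set_for_termination(active_workers);
    std::vector<std::thread> threads;
    for (int i = 0; i < active_workers; i++) {
      threads.emplace_back([task, i] { task->work(i); });
    }
    for (std::thread& t : threads) t.join();
  }
};

struct DemoTask : GangTask {
  Terminator _term;
  void set_for_termination(int active_workers) {
    _term.reset_for_reuse(active_workers);
  }
  void work(int worker_id) {
    std::printf("worker %d scanning its share\n", worker_id);
    _term.offer_termination();   // block until all active workers are done
  }
};

int main() {
  SimpleGang gang(8);            // gang sized once, like ParallelGCThreads
  DemoTask task;
  gang.run_task(&task, 3);       // only 3 workers are active for this task
  gang.run_task(&task, 5);       // terminator was re-armed, so reuse is safe
  return 0;
}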