6786503: Overflow list performance can be improved

Mon, 26 Jan 2009 12:47:21 -0800

author
ysr
date
Mon, 26 Jan 2009 12:47:21 -0800
changeset 969
5cfd8d19e546
parent 962
2b1de1db9a9d
child 970
4e400c36026f

6786503: Overflow list performance can be improved
Summary: Avoid overflow list walk in CMS & ParNew when it is unnecessary. Fix a couple of correctness issues, including a C-heap leak, in ParNew at the intersection of promotion failure, work queue overflow and object array chunking. Add stress testing option and related assertion checking.
Reviewed-by: jmasa

src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp file | annotate | diff | comparison | revisions
src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp file | annotate | diff | comparison | revisions
src/share/vm/gc_implementation/includeDB_gc_parNew file | annotate | diff | comparison | revisions
src/share/vm/gc_implementation/parNew/parNewGeneration.cpp file | annotate | diff | comparison | revisions
src/share/vm/gc_implementation/parNew/parNewGeneration.hpp file | annotate | diff | comparison | revisions
src/share/vm/memory/referenceProcessor.cpp file | annotate | diff | comparison | revisions
src/share/vm/runtime/globals.hpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Wed Jan 21 13:40:10 2009 -0800
     1.2 +++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Mon Jan 26 12:47:21 2009 -0800
     1.3 @@ -8508,7 +8508,7 @@
     1.4    size_t i = num;
     1.5    oop  cur = _overflow_list;
     1.6    const markOop proto = markOopDesc::prototype();
     1.7 -  NOT_PRODUCT(size_t n = 0;)
     1.8 +  NOT_PRODUCT(ssize_t n = 0;)
     1.9    for (oop next; i > 0 && cur != NULL; cur = next, i--) {
    1.10      next = oop(cur->mark());
    1.11      cur->set_mark(proto);   // until proven otherwise
    1.12 @@ -8525,45 +8525,131 @@
    1.13    return !stack->isEmpty();
    1.14  }
    1.15  
    1.16 -// Multi-threaded; use CAS to break off a prefix
    1.17 +#define BUSY  (oop(0x1aff1aff))
    1.18 +// (MT-safe) Get a prefix of at most "num" from the list.
    1.19 +// The overflow list is chained through the mark word of
    1.20 +// each object in the list. We fetch the entire list,
    1.21 +// break off a prefix of the right size and return the
    1.22 +// remainder. If other threads try to take objects from
    1.23 +// the overflow list at that time, they will wait for
    1.24 +// some time to see if data becomes available. If (and
    1.25 +// only if) another thread places one or more object(s)
    1.26 +// on the global list before we have returned the suffix
    1.27 +// to the global list, we will walk down our local list
    1.28 +// to find its end and append the global list to
    1.29 +// our suffix before returning it. This suffix walk can
    1.30 +// prove to be expensive (quadratic in the amount of traffic)
    1.31 +// when there are many objects in the overflow list and
    1.32 +// there is much producer-consumer contention on the list.
    1.33 +// *NOTE*: The overflow list manipulation code here and
    1.34 +// in ParNewGeneration:: are very similar in shape,
    1.35 +// except that in the ParNew case we use the old (from/eden)
    1.36 +// copy of the object to thread the list via its klass word.
    1.37 +// Because of the common code, if you make any changes in
    1.38 +// the code below, please check the ParNew version to see if
    1.39 +// similar changes might be needed.
    1.40 +// CR 6797058 has been filed to consolidate the common code.
    1.41  bool CMSCollector::par_take_from_overflow_list(size_t num,
    1.42                                                 OopTaskQueue* work_q) {
    1.43 -  assert(work_q->size() == 0, "That's the current policy");
    1.44 +  assert(work_q->size() == 0, "First empty local work queue");
    1.45    assert(num < work_q->max_elems(), "Can't bite more than we can chew");
    1.46    if (_overflow_list == NULL) {
    1.47      return false;
    1.48    }
    1.49    // Grab the entire list; we'll put back a suffix
    1.50 -  oop prefix = (oop)Atomic::xchg_ptr(NULL, &_overflow_list);
    1.51 -  if (prefix == NULL) {  // someone grabbed it before we did ...
    1.52 -    // ... we could spin for a short while, but for now we don't
    1.53 -    return false;
    1.54 -  }
    1.55 +  oop prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list);
    1.56 +  Thread* tid = Thread::current();
    1.57 +  size_t CMSOverflowSpinCount = (size_t)ParallelGCThreads;
    1.58 +  size_t sleep_time_millis = MAX2((size_t)1, num/100);
    1.59 +  // If the list is busy, we spin for a short while,
    1.60 +  // sleeping between attempts to get the list.
    1.61 +  for (size_t spin = 0; prefix == BUSY && spin < CMSOverflowSpinCount; spin++) {
    1.62 +    os::sleep(tid, sleep_time_millis, false);
    1.63 +    if (_overflow_list == NULL) {
    1.64 +      // Nothing left to take
    1.65 +      return false;
    1.66 +    } else if (_overflow_list != BUSY) {
    1.67 +      // Try and grab the prefix
    1.68 +      prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list);
    1.69 +    }
    1.70 +  }
    1.71 +  // If the list was found to be empty, or we spun long
    1.72 +  // enough, we give up and return empty-handed. If we leave
    1.73 +  // the list in the BUSY state below, it must be the case that
    1.74 +  // some other thread holds the overflow list and will set it
    1.75 +  // to a non-BUSY state in the future.
    1.76 +  if (prefix == NULL || prefix == BUSY) {
    1.77 +     // Nothing to take or waited long enough
    1.78 +     if (prefix == NULL) {
    1.79 +       // Write back the NULL in case we overwrote it with BUSY above
    1.80 +       // and it is still the same value.
    1.81 +       (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY);
    1.82 +     }
    1.83 +     return false;
    1.84 +  }
    1.85 +  assert(prefix != NULL && prefix != BUSY, "Error");
    1.86    size_t i = num;
    1.87    oop cur = prefix;
    1.88 +  // Walk down the first "num" objects, unless we reach the end.
    1.89    for (; i > 1 && cur->mark() != NULL; cur = oop(cur->mark()), i--);
    1.90 -  if (cur->mark() != NULL) {
    1.91 +  if (cur->mark() == NULL) {
    1.92 +    // We have "num" or fewer elements in the list, so there
    1.93 +    // is nothing to return to the global list.
    1.94 +    // Write back the NULL in lieu of the BUSY we wrote
    1.95 +    // above, if it is still the same value.
    1.96 +    if (_overflow_list == BUSY) {
    1.97 +      (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY);
    1.98 +    }
    1.99 +  } else {
    1.100 +    // Chop off the suffix and return it to the global list.
   1.101 +    assert(cur->mark() != BUSY, "Error");
   1.102      oop suffix_head = cur->mark(); // suffix will be put back on global list
   1.103      cur->set_mark(NULL);           // break off suffix
   1.104 -    // Find tail of suffix so we can prepend suffix to global list
   1.105 -    for (cur = suffix_head; cur->mark() != NULL; cur = (oop)(cur->mark()));
   1.106 -    oop suffix_tail = cur;
   1.107 -    assert(suffix_tail != NULL && suffix_tail->mark() == NULL,
   1.108 -           "Tautology");
   1.109 +    // It's possible that the list is still in the empty(busy) state
   1.110 +    // we left it in a short while ago; in that case we may be
   1.111 +    // able to place back the suffix without incurring the cost
   1.112 +    // of a walk down the list.
   1.113      oop observed_overflow_list = _overflow_list;
   1.114 -    do {
   1.115 -      cur = observed_overflow_list;
   1.116 -      suffix_tail->set_mark(markOop(cur));
   1.117 +    oop cur_overflow_list = observed_overflow_list;
   1.118 +    bool attached = false;
   1.119 +    while (observed_overflow_list == BUSY || observed_overflow_list == NULL) {
   1.120        observed_overflow_list =
   1.121 -        (oop) Atomic::cmpxchg_ptr(suffix_head, &_overflow_list, cur);
   1.122 -    } while (cur != observed_overflow_list);
   1.123 +        (oop) Atomic::cmpxchg_ptr(suffix_head, &_overflow_list, cur_overflow_list);
   1.124 +      if (cur_overflow_list == observed_overflow_list) {
   1.125 +        attached = true;
   1.126 +        break;
   1.127 +      } else cur_overflow_list = observed_overflow_list;
   1.128 +    }
   1.129 +    if (!attached) {
   1.130 +      // Too bad, someone else sneaked in (at least) an element; we'll need
   1.131 +      // to do a splice. Find tail of suffix so we can prepend suffix to global
   1.132 +      // list.
   1.133 +      for (cur = suffix_head; cur->mark() != NULL; cur = (oop)(cur->mark()));
   1.134 +      oop suffix_tail = cur;
   1.135 +      assert(suffix_tail != NULL && suffix_tail->mark() == NULL,
   1.136 +             "Tautology");
   1.137 +      observed_overflow_list = _overflow_list;
   1.138 +      do {
   1.139 +        cur_overflow_list = observed_overflow_list;
   1.140 +        if (cur_overflow_list != BUSY) {
   1.141 +          // Do the splice ...
   1.142 +          suffix_tail->set_mark(markOop(cur_overflow_list));
   1.143 +        } else { // cur_overflow_list == BUSY
   1.144 +          suffix_tail->set_mark(NULL);
   1.145 +        }
   1.146 +        // ... and try to place spliced list back on overflow_list ...
   1.147 +        observed_overflow_list =
   1.148 +          (oop) Atomic::cmpxchg_ptr(suffix_head, &_overflow_list, cur_overflow_list);
   1.149 +      } while (cur_overflow_list != observed_overflow_list);
   1.150 +      // ... until we have succeeded in doing so.
   1.151 +    }
   1.152    }
   1.153  
   1.154    // Push the prefix elements on work_q
   1.155    assert(prefix != NULL, "control point invariant");
   1.156    const markOop proto = markOopDesc::prototype();
   1.157    oop next;
   1.158 -  NOT_PRODUCT(size_t n = 0;)
   1.159 +  NOT_PRODUCT(ssize_t n = 0;)
   1.160    for (cur = prefix; cur != NULL; cur = next) {
   1.161      next = oop(cur->mark());
   1.162      cur->set_mark(proto);   // until proven otherwise
   1.163 @@ -8597,11 +8683,16 @@
   1.164    oop cur_overflow_list;
   1.165    do {
   1.166      cur_overflow_list = observed_overflow_list;
   1.167 -    p->set_mark(markOop(cur_overflow_list));
   1.168 +    if (cur_overflow_list != BUSY) {
   1.169 +      p->set_mark(markOop(cur_overflow_list));
   1.170 +    } else {
   1.171 +      p->set_mark(NULL);
   1.172 +    }
   1.173      observed_overflow_list =
   1.174        (oop) Atomic::cmpxchg_ptr(p, &_overflow_list, cur_overflow_list);
   1.175    } while (cur_overflow_list != observed_overflow_list);
   1.176  }
   1.177 +#undef BUSY
   1.178  
   1.179  // Single threaded
   1.180  // General Note on GrowableArray: pushes may silently fail
   1.181 @@ -8610,7 +8701,7 @@
   1.182  // a lot of code in the JVM. The prudent thing for GrowableArray
   1.183  // to do (for now) is to exit with an error. However, that may
   1.184  // be too draconian in some cases because the caller may be
   1.185 -// able to recover without much harm. For suych cases, we
   1.186 +// able to recover without much harm. For such cases, we
   1.187  // should probably introduce a "soft_push" method which returns
   1.188  // an indication of success or failure with the assumption that
   1.189  // the caller may be able to recover from a failure; code in
   1.190 @@ -8618,8 +8709,6 @@
   1.191  // failures where possible, thus, incrementally hardening the VM
   1.192  // in such low resource situations.
   1.193  void CMSCollector::preserve_mark_work(oop p, markOop m) {
   1.194 -  int PreserveMarkStackSize = 128;
   1.195 -
   1.196    if (_preserved_oop_stack == NULL) {
   1.197      assert(_preserved_mark_stack == NULL,
   1.198             "bijection with preserved_oop_stack");
     2.1 --- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp	Wed Jan 21 13:40:10 2009 -0800
     2.2 +++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp	Mon Jan 26 12:47:21 2009 -0800
     2.3 @@ -595,7 +595,7 @@
     2.4    size_t        _ser_kac_preclean_ovflw;
     2.5    size_t        _ser_kac_ovflw;
     2.6    size_t        _par_kac_ovflw;
     2.7 -  NOT_PRODUCT(size_t _num_par_pushes;)
     2.8 +  NOT_PRODUCT(ssize_t _num_par_pushes;)
     2.9  
    2.10    // ("Weak") Reference processing support
    2.11    ReferenceProcessor*            _ref_processor;
     3.1 --- a/src/share/vm/gc_implementation/includeDB_gc_parNew	Wed Jan 21 13:40:10 2009 -0800
     3.2 +++ b/src/share/vm/gc_implementation/includeDB_gc_parNew	Mon Jan 26 12:47:21 2009 -0800
     3.3 @@ -77,6 +77,7 @@
     3.4  parNewGeneration.cpp                    sharedHeap.hpp
     3.5  parNewGeneration.cpp                    space.hpp
     3.6  parNewGeneration.cpp                    spaceDecorator.hpp
     3.7 +parNewGeneration.cpp                    thread.hpp
     3.8  parNewGeneration.cpp                    workgroup.hpp
     3.9  
    3.10  parNewGeneration.hpp                    defNewGeneration.hpp
     4.1 --- a/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp	Wed Jan 21 13:40:10 2009 -0800
     4.2 +++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp	Mon Jan 26 12:47:21 2009 -0800
     4.3 @@ -404,6 +404,8 @@
     4.4      if (terminator()->offer_termination()) break;
     4.5      par_scan_state()->end_term_time();
     4.6    }
     4.7 +  assert(par_gen()->_overflow_list == NULL && par_gen()->_num_par_pushes == 0,
     4.8 +         "Broken overflow list?");
     4.9    // Finish the last termination pause.
    4.10    par_scan_state()->end_term_time();
    4.11  }
    4.12 @@ -456,6 +458,8 @@
    4.13    _is_alive_closure(this),
    4.14    _plab_stats(YoungPLABSize, PLABWeight)
    4.15  {
    4.16 +  NOT_PRODUCT(_overflow_counter = ParGCWorkQueueOverflowInterval;)
    4.17 +  NOT_PRODUCT(_num_par_pushes = 0;)
    4.18    _task_queues = new ObjToScanQueueSet(ParallelGCThreads);
    4.19    guarantee(_task_queues != NULL, "task_queues allocation failure.");
    4.20  
    4.21 @@ -993,12 +997,19 @@
    4.22               "push forwarded object");
    4.23      }
    4.24      // Push it on one of the queues of to-be-scanned objects.
    4.25 -    if (!par_scan_state->work_queue()->push(obj_to_push)) {
    4.26 +    bool simulate_overflow = false;
    4.27 +    NOT_PRODUCT(
    4.28 +      if (ParGCWorkQueueOverflowALot && should_simulate_overflow()) {
    4.29 +        // simulate a stack overflow
    4.30 +        simulate_overflow = true;
    4.31 +      }
    4.32 +    )
    4.33 +    if (simulate_overflow || !par_scan_state->work_queue()->push(obj_to_push)) {
    4.34        // Add stats for overflow pushes.
    4.35        if (Verbose && PrintGCDetails) {
    4.36          gclog_or_tty->print("queue overflow!\n");
    4.37        }
    4.38 -      push_on_overflow_list(old);
    4.39 +      push_on_overflow_list(old, par_scan_state);
    4.40        par_scan_state->note_overflow_push();
    4.41      }
    4.42      par_scan_state->note_push();
    4.43 @@ -1110,9 +1121,16 @@
    4.44               "push forwarded object");
    4.45      }
    4.46      // Push it on one of the queues of to-be-scanned objects.
    4.47 -    if (!par_scan_state->work_queue()->push(obj_to_push)) {
    4.48 +    bool simulate_overflow = false;
    4.49 +    NOT_PRODUCT(
    4.50 +      if (ParGCWorkQueueOverflowALot && should_simulate_overflow()) {
    4.51 +        // simulate a stack overflow
    4.52 +        simulate_overflow = true;
    4.53 +      }
    4.54 +    )
    4.55 +    if (simulate_overflow || !par_scan_state->work_queue()->push(obj_to_push)) {
    4.56        // Add stats for overflow pushes.
    4.57 -      push_on_overflow_list(old);
    4.58 +      push_on_overflow_list(old, par_scan_state);
    4.59        par_scan_state->note_overflow_push();
    4.60      }
    4.61      par_scan_state->note_push();
    4.62 @@ -1135,89 +1153,190 @@
    4.63    return forward_ptr;
    4.64  }
    4.65  
    4.66 -void ParNewGeneration::push_on_overflow_list(oop from_space_obj) {
    4.67 -  oop cur_overflow_list = _overflow_list;
    4.68 +#ifndef PRODUCT
    4.69 +// It's OK to call this multi-threaded;  the worst thing
    4.70 +// that can happen is that we'll get a bunch of closely
     4.71 +// spaced simulated overflows, but that's OK, in fact
    4.72 +// probably good as it would exercise the overflow code
    4.73 +// under contention.
    4.74 +bool ParNewGeneration::should_simulate_overflow() {
    4.75 +  if (_overflow_counter-- <= 0) { // just being defensive
    4.76 +    _overflow_counter = ParGCWorkQueueOverflowInterval;
    4.77 +    return true;
    4.78 +  } else {
    4.79 +    return false;
    4.80 +  }
    4.81 +}
    4.82 +#endif
    4.83 +
    4.84 +#define BUSY (oop(0x1aff1aff))
    4.85 +void ParNewGeneration::push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state) {
    4.86    // if the object has been forwarded to itself, then we cannot
    4.87    // use the klass pointer for the linked list.  Instead we have
    4.88    // to allocate an oopDesc in the C-Heap and use that for the linked list.
    4.89 +  // XXX This is horribly inefficient when a promotion failure occurs
    4.90 +  // and should be fixed. XXX FIX ME !!!
    4.91 +#ifndef PRODUCT
    4.92 +  Atomic::inc_ptr(&_num_par_pushes);
    4.93 +  assert(_num_par_pushes > 0, "Tautology");
    4.94 +#endif
    4.95    if (from_space_obj->forwardee() == from_space_obj) {
    4.96      oopDesc* listhead = NEW_C_HEAP_ARRAY(oopDesc, 1);
    4.97      listhead->forward_to(from_space_obj);
    4.98      from_space_obj = listhead;
    4.99    }
   4.100 -  while (true) {
   4.101 -    from_space_obj->set_klass_to_list_ptr(cur_overflow_list);
   4.102 -    oop observed_overflow_list =
   4.103 +  oop observed_overflow_list = _overflow_list;
   4.104 +  oop cur_overflow_list;
   4.105 +  do {
   4.106 +    cur_overflow_list = observed_overflow_list;
   4.107 +    if (cur_overflow_list != BUSY) {
   4.108 +      from_space_obj->set_klass_to_list_ptr(cur_overflow_list);
   4.109 +    } else {
   4.110 +      from_space_obj->set_klass_to_list_ptr(NULL);
   4.111 +    }
   4.112 +    observed_overflow_list =
   4.113        (oop)Atomic::cmpxchg_ptr(from_space_obj, &_overflow_list, cur_overflow_list);
   4.114 -    if (observed_overflow_list == cur_overflow_list) break;
   4.115 -    // Otherwise...
   4.116 -    cur_overflow_list = observed_overflow_list;
   4.117 -  }
   4.118 +  } while (cur_overflow_list != observed_overflow_list);
   4.119  }
   4.120  
   4.121 +// *NOTE*: The overflow list manipulation code here and
   4.122 +// in CMSCollector:: are very similar in shape,
   4.123 +// except that in the CMS case we thread the objects
   4.124 +// directly into the list via their mark word, and do
   4.125 +// not need to deal with special cases below related
   4.126 +// to chunking of object arrays and promotion failure
   4.127 +// handling.
   4.128 +// CR 6797058 has been filed to attempt consolidation of
   4.129 +// the common code.
   4.130 +// Because of the common code, if you make any changes in
   4.131 +// the code below, please check the CMS version to see if
   4.132 +// similar changes might be needed.
   4.133 +// See CMSCollector::par_take_from_overflow_list() for
   4.134 +// more extensive documentation comments.
   4.135  bool
   4.136  ParNewGeneration::take_from_overflow_list(ParScanThreadState* par_scan_state) {
   4.137    ObjToScanQueue* work_q = par_scan_state->work_queue();
   4.138 +  assert(work_q->size() == 0, "Should first empty local work queue");
   4.139    // How many to take?
   4.140 -  int objsFromOverflow = MIN2(work_q->max_elems()/4,
   4.141 -                              (juint)ParGCDesiredObjsFromOverflowList);
   4.142 +  size_t objsFromOverflow = MIN2((size_t)work_q->max_elems()/4,
   4.143 +                                 (size_t)ParGCDesiredObjsFromOverflowList);
   4.144  
   4.145    if (_overflow_list == NULL) return false;
   4.146  
   4.147    // Otherwise, there was something there; try claiming the list.
   4.148 -  oop prefix = (oop)Atomic::xchg_ptr(NULL, &_overflow_list);
   4.149 -
   4.150 -  if (prefix == NULL) {
   4.151 -    return false;
   4.152 +  oop prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list);
   4.153 +  // Trim off a prefix of at most objsFromOverflow items
   4.154 +  Thread* tid = Thread::current();
   4.155 +  size_t spin_count = (size_t)ParallelGCThreads;
   4.156 +  size_t sleep_time_millis = MAX2((size_t)1, objsFromOverflow/100);
   4.157 +  for (size_t spin = 0; prefix == BUSY && spin < spin_count; spin++) {
   4.158 +    // someone grabbed it before we did ...
   4.159 +    // ... we spin for a short while...
   4.160 +    os::sleep(tid, sleep_time_millis, false);
   4.161 +    if (_overflow_list == NULL) {
   4.162 +      // nothing left to take
   4.163 +      return false;
   4.164 +    } else if (_overflow_list != BUSY) {
   4.165 +     // try and grab the prefix
   4.166 +     prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list);
   4.167 +    }
   4.168    }
   4.169 -  // Trim off a prefix of at most objsFromOverflow items
   4.170 -  int i = 1;
   4.171 +  if (prefix == NULL || prefix == BUSY) {
   4.172 +     // Nothing to take or waited long enough
   4.173 +     if (prefix == NULL) {
   4.174 +       // Write back the NULL in case we overwrote it with BUSY above
   4.175 +       // and it is still the same value.
   4.176 +       (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY);
   4.177 +     }
   4.178 +     return false;
   4.179 +  }
   4.180 +  assert(prefix != NULL && prefix != BUSY, "Error");
   4.181 +  size_t i = 1;
   4.182    oop cur = prefix;
   4.183    while (i < objsFromOverflow && cur->klass_or_null() != NULL) {
   4.184      i++; cur = oop(cur->klass());
   4.185    }
   4.186  
   4.187    // Reattach remaining (suffix) to overflow list
   4.188 -  if (cur->klass_or_null() != NULL) {
   4.189 -    oop suffix = oop(cur->klass());
   4.190 -    cur->set_klass_to_list_ptr(NULL);
   4.191 -
   4.192 -    // Find last item of suffix list
   4.193 -    oop last = suffix;
   4.194 -    while (last->klass_or_null() != NULL) {
   4.195 -      last = oop(last->klass());
   4.196 +  if (cur->klass_or_null() == NULL) {
    4.197 +    // Write back the NULL in lieu of the BUSY we wrote
    4.198 +    // above, if it is still the same value.
   4.199 +    if (_overflow_list == BUSY) {
   4.200 +      (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY);
   4.201      }
   4.202 -    // Atomically prepend suffix to current overflow list
   4.203 -    oop cur_overflow_list = _overflow_list;
   4.204 -    while (true) {
   4.205 -      last->set_klass_to_list_ptr(cur_overflow_list);
   4.206 -      oop observed_overflow_list =
   4.207 -        (oop)Atomic::cmpxchg_ptr(suffix, &_overflow_list, cur_overflow_list);
   4.208 -      if (observed_overflow_list == cur_overflow_list) break;
   4.209 -      // Otherwise...
   4.210 -      cur_overflow_list = observed_overflow_list;
   4.211 +  } else {
   4.212 +    assert(cur->klass_or_null() != BUSY, "Error");
   4.213 +    oop suffix = oop(cur->klass());       // suffix will be put back on global list
   4.214 +    cur->set_klass_to_list_ptr(NULL);     // break off suffix
   4.215 +    // It's possible that the list is still in the empty(busy) state
   4.216 +    // we left it in a short while ago; in that case we may be
   4.217 +    // able to place back the suffix.
   4.218 +    oop observed_overflow_list = _overflow_list;
   4.219 +    oop cur_overflow_list = observed_overflow_list;
   4.220 +    bool attached = false;
   4.221 +    while (observed_overflow_list == BUSY || observed_overflow_list == NULL) {
   4.222 +      observed_overflow_list =
   4.223 +        (oop) Atomic::cmpxchg_ptr(suffix, &_overflow_list, cur_overflow_list);
   4.224 +      if (cur_overflow_list == observed_overflow_list) {
   4.225 +        attached = true;
   4.226 +        break;
   4.227 +      } else cur_overflow_list = observed_overflow_list;
   4.228 +    }
   4.229 +    if (!attached) {
   4.230 +      // Too bad, someone else got in in between; we'll need to do a splice.
   4.231 +      // Find the last item of suffix list
   4.232 +      oop last = suffix;
   4.233 +      while (last->klass_or_null() != NULL) {
   4.234 +        last = oop(last->klass());
   4.235 +      }
   4.236 +      // Atomically prepend suffix to current overflow list
   4.237 +      observed_overflow_list = _overflow_list;
   4.238 +      do {
   4.239 +        cur_overflow_list = observed_overflow_list;
   4.240 +        if (cur_overflow_list != BUSY) {
   4.241 +          // Do the splice ...
   4.242 +          last->set_klass_to_list_ptr(cur_overflow_list);
   4.243 +        } else { // cur_overflow_list == BUSY
   4.244 +          last->set_klass_to_list_ptr(NULL);
   4.245 +        }
   4.246 +        observed_overflow_list =
   4.247 +          (oop)Atomic::cmpxchg_ptr(suffix, &_overflow_list, cur_overflow_list);
   4.248 +      } while (cur_overflow_list != observed_overflow_list);
   4.249      }
   4.250    }
   4.251  
   4.252    // Push objects on prefix list onto this thread's work queue
   4.253 -  assert(cur != NULL, "program logic");
   4.254 +  assert(prefix != NULL && prefix != BUSY, "program logic");
   4.255    cur = prefix;
   4.256 -  int n = 0;
   4.257 +  ssize_t n = 0;
   4.258    while (cur != NULL) {
   4.259      oop obj_to_push = cur->forwardee();
   4.260      oop next        = oop(cur->klass_or_null());
   4.261      cur->set_klass(obj_to_push->klass());
   4.262 -    if (par_scan_state->should_be_partially_scanned(obj_to_push, cur)) {
   4.263 +    // This may be an array object that is self-forwarded. In that case, the list pointer
   4.264 +    // space, cur, is not in the Java heap, but rather in the C-heap and should be freed.
   4.265 +    if (!is_in_reserved(cur)) {
   4.266 +      // This can become a scaling bottleneck when there is work queue overflow coincident
   4.267 +      // with promotion failure.
   4.268 +      oopDesc* f = cur;
   4.269 +      FREE_C_HEAP_ARRAY(oopDesc, f);
   4.270 +    } else if (par_scan_state->should_be_partially_scanned(obj_to_push, cur)) {
   4.271 +      assert(arrayOop(cur)->length() == 0, "entire array remaining to be scanned");
   4.272        obj_to_push = cur;
   4.273 -      assert(arrayOop(cur)->length() == 0, "entire array remaining to be scanned");
   4.274      }
   4.275 -    work_q->push(obj_to_push);
   4.276 +    bool ok = work_q->push(obj_to_push);
   4.277 +    assert(ok, "Should have succeeded");
   4.278      cur = next;
   4.279      n++;
   4.280    }
   4.281    par_scan_state->note_overflow_refill(n);
   4.282 +#ifndef PRODUCT
   4.283 +  assert(_num_par_pushes >= n, "Too many pops?");
   4.284 +  Atomic::add_ptr(-(intptr_t)n, &_num_par_pushes);
   4.285 +#endif
   4.286    return true;
   4.287  }
   4.288 +#undef BUSY
   4.289  
   4.290  void ParNewGeneration::ref_processor_init()
   4.291  {
     5.1 --- a/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp	Wed Jan 21 13:40:10 2009 -0800
     5.2 +++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp	Mon Jan 26 12:47:21 2009 -0800
     5.3 @@ -278,6 +278,7 @@
     5.4    friend class ParNewRefProcTask;
     5.5    friend class ParNewRefProcTaskExecutor;
     5.6    friend class ParScanThreadStateSet;
     5.7 +  friend class ParEvacuateFollowersClosure;
     5.8  
     5.9   private:
    5.10    // XXX use a global constant instead of 64!
    5.11 @@ -296,6 +297,7 @@
    5.12    // klass-pointers (klass information already copied to the forwarded
    5.13    // image.)  Manipulated with CAS.
    5.14    oop _overflow_list;
    5.15 +  NOT_PRODUCT(ssize_t _num_par_pushes;)
    5.16  
    5.17    // If true, older generation does not support promotion undo, so avoid.
    5.18    static bool _avoid_promotion_undo;
    5.19 @@ -372,8 +374,12 @@
    5.20    oop copy_to_survivor_space_with_undo(ParScanThreadState* par_scan_state,
    5.21                               oop obj, size_t obj_sz, markOop m);
    5.22  
    5.23 +  // in support of testing overflow code
    5.24 +  NOT_PRODUCT(int _overflow_counter;)
    5.25 +  NOT_PRODUCT(bool should_simulate_overflow();)
    5.26 +
    5.27    // Push the given (from-space) object on the global overflow list.
    5.28 -  void push_on_overflow_list(oop from_space_obj);
    5.29 +  void push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state);
    5.30  
    5.31    // If the global overflow list is non-empty, move some tasks from it
    5.32    // onto "work_q" (which must be empty).  No more than 1/4 of the
     6.1 --- a/src/share/vm/memory/referenceProcessor.cpp	Wed Jan 21 13:40:10 2009 -0800
     6.2 +++ b/src/share/vm/memory/referenceProcessor.cpp	Mon Jan 26 12:47:21 2009 -0800
     6.3 @@ -721,12 +721,6 @@
     6.4                               iter.obj(), iter.obj()->blueprint()->internal_name());
     6.5      }
     6.6      assert(iter.obj()->is_oop(UseConcMarkSweepGC), "Adding a bad reference");
     6.7 -    // If discovery is concurrent, we may have objects with null referents,
     6.8 -    // being those that were concurrently cleared after they were discovered
     6.9 -    // (and not subsequently precleaned).
    6.10 -    assert(   (discovery_is_atomic() && iter.referent()->is_oop())
    6.11 -           || (!discovery_is_atomic() && iter.referent()->is_oop_or_null(UseConcMarkSweepGC)),
    6.12 -           "Adding a bad referent");
    6.13      iter.next();
    6.14    }
    6.15    // Remember to keep sentinel pointer around
     7.1 --- a/src/share/vm/runtime/globals.hpp	Wed Jan 21 13:40:10 2009 -0800
     7.2 +++ b/src/share/vm/runtime/globals.hpp	Mon Jan 26 12:47:21 2009 -0800
     7.3 @@ -1307,7 +1307,14 @@
     7.4    product(intx, ParGCArrayScanChunk, 50,                                    \
     7.5            "Scan a subset and push remainder, if array is bigger than this") \
     7.6                                                                              \
     7.7 -  product(intx, ParGCDesiredObjsFromOverflowList, 20,                       \
     7.8 +  notproduct(bool, ParGCWorkQueueOverflowALot, false,                       \
     7.9 +          "Whether we should simulate work queue overflow in ParNew")       \
    7.10 +                                                                            \
    7.11 +  notproduct(uintx, ParGCWorkQueueOverflowInterval, 1000,                   \
    7.12 +          "An `interval' counter that determines how frequently"            \
    7.13 +          " we simulate overflow; a smaller number increases frequency")    \
    7.14 +                                                                            \
    7.15 +  product(uintx, ParGCDesiredObjsFromOverflowList, 20,                      \
    7.16            "The desired number of objects to claim from the overflow list")  \
    7.17                                                                              \
    7.18    product(uintx, CMSParPromoteBlocksToClaim, 50,                            \
    7.19 @@ -1429,8 +1436,8 @@
    7.20            "Whether we should simulate frequent marking stack / work queue"  \
    7.21            " overflow")                                                      \
    7.22                                                                              \
    7.23 -  notproduct(intx, CMSMarkStackOverflowInterval, 1000,                      \
    7.24 -          "A per-thread `interval' counter that determines how frequently"  \
    7.25 +  notproduct(uintx, CMSMarkStackOverflowInterval, 1000,                     \
    7.26 +          "An `interval' counter that determines how frequently"            \
    7.27            " we simulate overflow; a smaller number increases frequency")    \
    7.28                                                                              \
    7.29    product(uintx, CMSMaxAbortablePrecleanLoops, 0,                           \
    7.30 @@ -1648,7 +1655,7 @@
    7.31    develop(uintx, WorkStealingYieldsBeforeSleep, 1000,                       \
    7.32            "Number of yields before a sleep is done during workstealing")    \
    7.33                                                                              \
    7.34 -  product(uintx, PreserveMarkStackSize, 40,                                 \
    7.35 +  product(uintx, PreserveMarkStackSize, 1024,                               \
    7.36             "Size for stack used in promotion failure handling")             \
    7.37                                                                              \
    7.38    product_pd(bool, UseTLAB, "Use thread-local object allocation")           \

mercurial