6819891: ParNew: Fix work queue overflow code to deal correctly with +UseCompressedOops

Sat, 28 Mar 2009 15:47:29 -0700

author
ysr
date
Sat, 28 Mar 2009 15:47:29 -0700
changeset 1114
cea947c8a988
parent 1113
4ac7d97e6101
child 1115
a80d48f6fde1
child 1130
becb17ad5e51

6819891: ParNew: Fix work queue overflow code to deal correctly with +UseCompressedOops
Summary: When using compressed oops, rather than chaining the overflowed grey objects' pre-images through their klass words, we use GC-worker thread-local overflow stacks.
Reviewed-by: jcoomes, jmasa

src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp file | annotate | diff | comparison | revisions
src/share/vm/gc_implementation/parNew/parNewGeneration.cpp file | annotate | diff | comparison | revisions
src/share/vm/gc_implementation/parNew/parNewGeneration.hpp file | annotate | diff | comparison | revisions
src/share/vm/runtime/globals.hpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Thu Mar 26 08:51:32 2009 -0700
     1.2 +++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Sat Mar 28 15:47:29 2009 -0700
     1.3 @@ -3847,7 +3847,7 @@
     1.4    MutexLockerEx ml(ovflw_stk->par_lock(),
     1.5                     Mutex::_no_safepoint_check_flag);
     1.6    // Grab up to 1/4 the size of the work queue
     1.7 -  size_t num = MIN2((size_t)work_q->max_elems()/4,
     1.8 +  size_t num = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
     1.9                      (size_t)ParGCDesiredObjsFromOverflowList);
    1.10    num = MIN2(num, ovflw_stk->length());
    1.11    for (int i = (int) num; i > 0; i--) {
    1.12 @@ -5204,13 +5204,12 @@
    1.13    NOT_PRODUCT(int num_steals = 0;)
    1.14    oop obj_to_scan;
    1.15    CMSBitMap* bm = &(_collector->_markBitMap);
    1.16 -  size_t num_from_overflow_list =
    1.17 -           MIN2((size_t)work_q->max_elems()/4,
    1.18 -                (size_t)ParGCDesiredObjsFromOverflowList);
    1.19  
    1.20    while (true) {
    1.21      // Completely finish any left over work from (an) earlier round(s)
    1.22      cl->trim_queue(0);
    1.23 +    size_t num_from_overflow_list = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
    1.24 +                                         (size_t)ParGCDesiredObjsFromOverflowList);
    1.25      // Now check if there's any work in the overflow list
    1.26      if (_collector->par_take_from_overflow_list(num_from_overflow_list,
    1.27                                                  work_q)) {
    1.28 @@ -5622,13 +5621,12 @@
    1.29    OopTaskQueue* work_q = work_queue(i);
    1.30    NOT_PRODUCT(int num_steals = 0;)
    1.31    oop obj_to_scan;
    1.32 -  size_t num_from_overflow_list =
    1.33 -           MIN2((size_t)work_q->max_elems()/4,
    1.34 -                (size_t)ParGCDesiredObjsFromOverflowList);
    1.35  
    1.36    while (true) {
    1.37      // Completely finish any left over work from (an) earlier round(s)
    1.38      drain->trim_queue(0);
    1.39 +    size_t num_from_overflow_list = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
    1.40 +                                         (size_t)ParGCDesiredObjsFromOverflowList);
    1.41      // Now check if there's any work in the overflow list
    1.42      if (_collector->par_take_from_overflow_list(num_from_overflow_list,
    1.43                                                  work_q)) {
    1.44 @@ -9021,7 +9019,7 @@
    1.45  // Transfer some number of overflown objects to usual marking
    1.46  // stack. Return true if some objects were transferred.
    1.47  bool MarkRefsIntoAndScanClosure::take_from_overflow_list() {
    1.48 -  size_t num = MIN2((size_t)_mark_stack->capacity()/4,
    1.49 +  size_t num = MIN2((size_t)(_mark_stack->capacity() - _mark_stack->length())/4,
    1.50                      (size_t)ParGCDesiredObjsFromOverflowList);
    1.51  
    1.52    bool res = _collector->take_from_overflow_list(num, _mark_stack);
     2.1 --- a/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp	Thu Mar 26 08:51:32 2009 -0700
     2.2 +++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp	Sat Mar 28 15:47:29 2009 -0700
     2.3 @@ -36,7 +36,7 @@
     2.4                                         ObjToScanQueueSet* work_queue_set_,
     2.5                                         size_t desired_plab_sz_,
     2.6                                         ParallelTaskTerminator& term_) :
     2.7 -  _to_space(to_space_), _old_gen(old_gen_), _thread_num(thread_num_),
     2.8 +  _to_space(to_space_), _old_gen(old_gen_), _young_gen(gen_), _thread_num(thread_num_),
     2.9    _work_queue(work_queue_set_->queue(thread_num_)), _to_space_full(false),
    2.10    _ageTable(false), // false ==> not the global age table, no perf data.
    2.11    _to_space_alloc_buffer(desired_plab_sz_),
    2.12 @@ -57,6 +57,11 @@
    2.13    _start = os::elapsedTime();
    2.14    _old_gen_closure.set_generation(old_gen_);
    2.15    _old_gen_root_closure.set_generation(old_gen_);
    2.16 +  if (UseCompressedOops) {
    2.17 +    _overflow_stack = new (ResourceObj::C_HEAP) GrowableArray<oop>(512, true);
    2.18 +  } else {
    2.19 +    _overflow_stack = NULL;
    2.20 +  }
    2.21  }
    2.22  #ifdef _MSC_VER
    2.23  #pragma warning( pop )
    2.24 @@ -81,7 +86,7 @@
    2.25    assert(old->is_objArray(), "must be obj array");
    2.26    assert(old->is_forwarded(), "must be forwarded");
    2.27    assert(Universe::heap()->is_in_reserved(old), "must be in heap.");
    2.28 -  assert(!_old_gen->is_in(old), "must be in young generation.");
    2.29 +  assert(!old_gen()->is_in(old), "must be in young generation.");
    2.30  
    2.31    objArrayOop obj = objArrayOop(old->forwardee());
    2.32    // Process ParGCArrayScanChunk elements now
    2.33 @@ -119,26 +124,68 @@
    2.34  
    2.35  void ParScanThreadState::trim_queues(int max_size) {
    2.36    ObjToScanQueue* queue = work_queue();
    2.37 -  while (queue->size() > (juint)max_size) {
    2.38 -    oop obj_to_scan;
    2.39 -    if (queue->pop_local(obj_to_scan)) {
    2.40 -      note_pop();
    2.41 -
    2.42 -      if ((HeapWord *)obj_to_scan < young_old_boundary()) {
    2.43 -        if (obj_to_scan->is_objArray() &&
    2.44 -            obj_to_scan->is_forwarded() &&
    2.45 -            obj_to_scan->forwardee() != obj_to_scan) {
    2.46 -          scan_partial_array_and_push_remainder(obj_to_scan);
    2.47 +  do {
    2.48 +    while (queue->size() > (juint)max_size) {
    2.49 +      oop obj_to_scan;
    2.50 +      if (queue->pop_local(obj_to_scan)) {
    2.51 +        note_pop();
    2.52 +        if ((HeapWord *)obj_to_scan < young_old_boundary()) {
    2.53 +          if (obj_to_scan->is_objArray() &&
    2.54 +              obj_to_scan->is_forwarded() &&
    2.55 +              obj_to_scan->forwardee() != obj_to_scan) {
    2.56 +            scan_partial_array_and_push_remainder(obj_to_scan);
    2.57 +          } else {
    2.58 +            // object is in to_space
    2.59 +            obj_to_scan->oop_iterate(&_to_space_closure);
    2.60 +          }
    2.61          } else {
    2.62 -          // object is in to_space
    2.63 -          obj_to_scan->oop_iterate(&_to_space_closure);
    2.64 +          // object is in old generation
    2.65 +          obj_to_scan->oop_iterate(&_old_gen_closure);
    2.66          }
    2.67 -      } else {
    2.68 -        // object is in old generation
    2.69 -        obj_to_scan->oop_iterate(&_old_gen_closure);
    2.70        }
    2.71      }
    2.72 +    // For the  case of compressed oops, we have a private, non-shared
    2.73 +    // overflow stack, so we eagerly drain it so as to more evenly
    2.74 +    // distribute load early. Note: this may be good to do in
    2.75 +    // general rather than delay for the final stealing phase.
    2.76 +    // If applicable, we'll transfer a set of objects over to our
    2.77 +    // work queue, allowing them to be stolen and draining our
    2.78 +    // private overflow stack.
    2.79 +  } while (ParGCTrimOverflow && young_gen()->take_from_overflow_list(this));
    2.80 +}
    2.81 +
    2.82 +bool ParScanThreadState::take_from_overflow_stack() {
    2.83 +  assert(UseCompressedOops, "Else should not call");
    2.84 +  assert(young_gen()->overflow_list() == NULL, "Error");
    2.85 +  ObjToScanQueue* queue = work_queue();
    2.86 +  GrowableArray<oop>* of_stack = overflow_stack();
    2.87 +  uint num_overflow_elems = of_stack->length();
    2.88 +  uint num_take_elems     = MIN2(MIN2((queue->max_elems() - queue->size())/4,
    2.89 +                                      (juint)ParGCDesiredObjsFromOverflowList),
    2.90 +                                 num_overflow_elems);
    2.91 +  // Transfer the most recent num_take_elems from the overflow
    2.92 +  // stack to our work queue.
    2.93 +  for (size_t i = 0; i != num_take_elems; i++) {
    2.94 +    oop cur = of_stack->pop();
    2.95 +    oop obj_to_push = cur->forwardee();
    2.96 +    assert(Universe::heap()->is_in_reserved(cur), "Should be in heap");
    2.97 +    assert(!old_gen()->is_in_reserved(cur), "Should be in young gen");
    2.98 +    assert(Universe::heap()->is_in_reserved(obj_to_push), "Should be in heap");
    2.99 +    if (should_be_partially_scanned(obj_to_push, cur)) {
   2.100 +      assert(arrayOop(cur)->length() == 0, "entire array remaining to be scanned");
   2.101 +      obj_to_push = cur;
   2.102 +    }
   2.103 +    bool ok = queue->push(obj_to_push);
   2.104 +    assert(ok, "Should have succeeded");
   2.105    }
   2.106 +  assert(young_gen()->overflow_list() == NULL, "Error");
   2.107 +  return num_take_elems > 0;  // was something transferred?
   2.108 +}
   2.109 +
   2.110 +void ParScanThreadState::push_on_overflow_stack(oop p) {
   2.111 +  assert(UseCompressedOops, "Else should not call");
   2.112 +  overflow_stack()->push(p);
   2.113 +  assert(young_gen()->overflow_list() == NULL, "Error");
   2.114  }
   2.115  
   2.116  HeapWord* ParScanThreadState::alloc_in_to_space_slow(size_t word_sz) {
   2.117 @@ -425,8 +472,7 @@
   2.118    ResourceMark rm;
   2.119    HandleMark hm;
   2.120    // We would need multiple old-gen queues otherwise.
   2.121 -  guarantee(gch->n_gens() == 2,
   2.122 -     "Par young collection currently only works with one older gen.");
   2.123 +  assert(gch->n_gens() == 2, "Par young collection currently only works with one older gen.");
   2.124  
   2.125    Generation* old_gen = gch->next_gen(_gen);
   2.126  
   2.127 @@ -1169,36 +1215,75 @@
   2.128  }
   2.129  #endif
   2.130  
   2.131 +// In case we are using compressed oops, we need to be careful.
   2.132 +// If the object being pushed is an object array, then its length
   2.133 +// field keeps track of the "grey boundary" at which the next
   2.134 +// incremental scan will be done (see ParGCArrayScanChunk).
   2.135 +// When using compressed oops, this length field is kept in the
   2.136 +// lower 32 bits of the erstwhile klass word and cannot be used
   2.137 +// for the overflow chaining pointer (OCP below). As such the OCP
   2.138 +// would itself need to be compressed into the top 32-bits in this
   2.139 +// case. Unfortunately, see below, in the event that we have a
   2.140 +// promotion failure, the node to be pushed on the list can be
   2.141 +// outside of the Java heap, so the heap-based pointer compression
   2.142 +// would not work (we would have potential aliasing between C-heap
   2.143 +// and Java-heap pointers). For this reason, when using compressed
   2.144 +// oops, we simply use a worker-thread-local, non-shared overflow
   2.145 +// list in the form of a growable array, with a slightly different
   2.146 +// overflow stack draining strategy. If/when we start using fat
   2.147 +// stacks here, we can go back to using (fat) pointer chains
   2.148 +// (although some performance comparisons would be useful since
   2.149 +// single global lists have their own performance disadvantages
   2.150 +// as we were made painfully aware not long ago, see 6786503).
   2.151  #define BUSY (oop(0x1aff1aff))
   2.152  void ParNewGeneration::push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state) {
   2.153 -  // if the object has been forwarded to itself, then we cannot
   2.154 -  // use the klass pointer for the linked list.  Instead we have
   2.155 -  // to allocate an oopDesc in the C-Heap and use that for the linked list.
   2.156 -  // XXX This is horribly inefficient when a promotion failure occurs
   2.157 -  // and should be fixed. XXX FIX ME !!!
   2.158 +  assert(is_in_reserved(from_space_obj), "Should be from this generation");
   2.159 +  if (UseCompressedOops) {
   2.160 +    // In the case of compressed oops, we use a private, not-shared
   2.161 +    // overflow stack.
   2.162 +    par_scan_state->push_on_overflow_stack(from_space_obj);
   2.163 +  } else {
   2.164 +    // if the object has been forwarded to itself, then we cannot
   2.165 +    // use the klass pointer for the linked list.  Instead we have
   2.166 +    // to allocate an oopDesc in the C-Heap and use that for the linked list.
   2.167 +    // XXX This is horribly inefficient when a promotion failure occurs
   2.168 +    // and should be fixed. XXX FIX ME !!!
   2.169  #ifndef PRODUCT
   2.170 -  Atomic::inc_ptr(&_num_par_pushes);
   2.171 -  assert(_num_par_pushes > 0, "Tautology");
   2.172 +    Atomic::inc_ptr(&_num_par_pushes);
   2.173 +    assert(_num_par_pushes > 0, "Tautology");
   2.174  #endif
   2.175 -  if (from_space_obj->forwardee() == from_space_obj) {
   2.176 -    oopDesc* listhead = NEW_C_HEAP_ARRAY(oopDesc, 1);
   2.177 -    listhead->forward_to(from_space_obj);
   2.178 -    from_space_obj = listhead;
   2.179 +    if (from_space_obj->forwardee() == from_space_obj) {
   2.180 +      oopDesc* listhead = NEW_C_HEAP_ARRAY(oopDesc, 1);
   2.181 +      listhead->forward_to(from_space_obj);
   2.182 +      from_space_obj = listhead;
   2.183 +    }
   2.184 +    oop observed_overflow_list = _overflow_list;
   2.185 +    oop cur_overflow_list;
   2.186 +    do {
   2.187 +      cur_overflow_list = observed_overflow_list;
   2.188 +      if (cur_overflow_list != BUSY) {
   2.189 +        from_space_obj->set_klass_to_list_ptr(cur_overflow_list);
   2.190 +      } else {
   2.191 +        from_space_obj->set_klass_to_list_ptr(NULL);
   2.192 +      }
   2.193 +      observed_overflow_list =
   2.194 +        (oop)Atomic::cmpxchg_ptr(from_space_obj, &_overflow_list, cur_overflow_list);
   2.195 +    } while (cur_overflow_list != observed_overflow_list);
   2.196    }
   2.197 -  oop observed_overflow_list = _overflow_list;
   2.198 -  oop cur_overflow_list;
   2.199 -  do {
   2.200 -    cur_overflow_list = observed_overflow_list;
   2.201 -    if (cur_overflow_list != BUSY) {
   2.202 -      from_space_obj->set_klass_to_list_ptr(cur_overflow_list);
   2.203 -    } else {
   2.204 -      from_space_obj->set_klass_to_list_ptr(NULL);
   2.205 -    }
   2.206 -    observed_overflow_list =
   2.207 -      (oop)Atomic::cmpxchg_ptr(from_space_obj, &_overflow_list, cur_overflow_list);
   2.208 -  } while (cur_overflow_list != observed_overflow_list);
   2.209  }
   2.210  
   2.211 +bool ParNewGeneration::take_from_overflow_list(ParScanThreadState* par_scan_state) {
   2.212 +  bool res;
   2.213 +
   2.214 +  if (UseCompressedOops) {
   2.215 +    res = par_scan_state->take_from_overflow_stack();
   2.216 +  } else {
   2.217 +    res = take_from_overflow_list_work(par_scan_state);
   2.218 +  }
   2.219 +  return res;
   2.220 +}
   2.221 +
   2.222 +
   2.223  // *NOTE*: The overflow list manipulation code here and
   2.224  // in CMSCollector:: are very similar in shape,
   2.225  // except that in the CMS case we thread the objects
   2.226 @@ -1213,14 +1298,13 @@
   2.227  // similar changes might be needed.
   2.228  // See CMSCollector::par_take_from_overflow_list() for
   2.229  // more extensive documentation comments.
   2.230 -bool
   2.231 -ParNewGeneration::take_from_overflow_list(ParScanThreadState* par_scan_state) {
   2.232 +bool ParNewGeneration::take_from_overflow_list_work(ParScanThreadState* par_scan_state) {
   2.233    ObjToScanQueue* work_q = par_scan_state->work_queue();
   2.234 -  assert(work_q->size() == 0, "Should first empty local work queue");
   2.235    // How many to take?
   2.236 -  size_t objsFromOverflow = MIN2((size_t)work_q->max_elems()/4,
   2.237 +  size_t objsFromOverflow = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
   2.238                                   (size_t)ParGCDesiredObjsFromOverflowList);
   2.239  
   2.240 +  assert(par_scan_state->overflow_stack() == NULL, "Error");
   2.241    if (_overflow_list == NULL) return false;
   2.242  
   2.243    // Otherwise, there was something there; try claiming the list.
     3.1 --- a/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp	Thu Mar 26 08:51:32 2009 -0700
     3.2 +++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp	Sat Mar 28 15:47:29 2009 -0700
     3.3 @@ -55,6 +55,7 @@
     3.4    friend class ParScanThreadStateSet;
     3.5   private:
     3.6    ObjToScanQueue *_work_queue;
     3.7 +  GrowableArray<oop>* _overflow_stack;
     3.8  
     3.9    ParGCAllocBuffer _to_space_alloc_buffer;
    3.10  
    3.11 @@ -79,6 +80,9 @@
    3.12    Space* _to_space;
    3.13    Space* to_space() { return _to_space; }
    3.14  
    3.15 +  ParNewGeneration* _young_gen;
    3.16 +  ParNewGeneration* young_gen() const { return _young_gen; }
    3.17 +
    3.18    Generation* _old_gen;
    3.19    Generation* old_gen() { return _old_gen; }
    3.20  
    3.21 @@ -134,6 +138,11 @@
    3.22    // Decrease queue size below "max_size".
    3.23    void trim_queues(int max_size);
    3.24  
    3.25 +  // Private overflow stack usage
    3.26 +  GrowableArray<oop>* overflow_stack() { return _overflow_stack; }
    3.27 +  bool take_from_overflow_stack();
    3.28 +  void push_on_overflow_stack(oop p);
    3.29 +
    3.30    // Is new_obj a candidate for scan_partial_array_and_push_remainder method.
    3.31    inline bool should_be_partially_scanned(oop new_obj, oop old_obj) const;
    3.32  
    3.33 @@ -378,13 +387,17 @@
    3.34    NOT_PRODUCT(int _overflow_counter;)
    3.35    NOT_PRODUCT(bool should_simulate_overflow();)
    3.36  
    3.37 +  // Accessor for overflow list
    3.38 +  oop overflow_list() { return _overflow_list; }
    3.39 +
    3.40    // Push the given (from-space) object on the global overflow list.
    3.41    void push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state);
    3.42  
    3.43    // If the global overflow list is non-empty, move some tasks from it
    3.44 -  // onto "work_q" (which must be empty).  No more than 1/4 of the
    3.45 -  // max_elems of "work_q" are moved.
    3.46 +  // onto "work_q" (which need not be empty).  No more than 1/4 of the
    3.47 +  // available space on "work_q" is used.
    3.48    bool take_from_overflow_list(ParScanThreadState* par_scan_state);
    3.49 +  bool take_from_overflow_list_work(ParScanThreadState* par_scan_state);
    3.50  
    3.51    // The task queues to be used by parallel GC threads.
    3.52    ObjToScanQueueSet* task_queues() {
     4.1 --- a/src/share/vm/runtime/globals.hpp	Thu Mar 26 08:51:32 2009 -0700
     4.2 +++ b/src/share/vm/runtime/globals.hpp	Sat Mar 28 15:47:29 2009 -0700
     4.3 @@ -1316,6 +1316,9 @@
     4.4    product(intx, ParGCArrayScanChunk, 50,                                    \
     4.5            "Scan a subset and push remainder, if array is bigger than this") \
     4.6                                                                              \
     4.7 +  product(bool, ParGCTrimOverflow, true,                                    \
     4.8 +          "Eagerly trim the overflow lists (useful for UseCompressedOops")  \
     4.9 +                                                                            \
    4.10    notproduct(bool, ParGCWorkQueueOverflowALot, false,                       \
    4.11            "Whether we should simulate work queue overflow in ParNew")       \
    4.12                                                                              \

mercurial