6821693: 64-bit TaskQueue capacity still too small

Wed, 05 Aug 2009 12:33:29 -0700

author
jcoomes
date
Wed, 05 Aug 2009 12:33:29 -0700
changeset 1342
3ee342e25e57
parent 1326
703065c670fa
child 1343
b1773b9a2ca1

6821693: 64-bit TaskQueue capacity still too small
6821507: Alignment problem in GC taskqueue
Reviewed-by: tonyp, apetrusenko

src/share/vm/utilities/taskqueue.hpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/share/vm/utilities/taskqueue.hpp	Wed Aug 05 18:54:12 2009 -0700
     1.2 +++ b/src/share/vm/utilities/taskqueue.hpp	Wed Aug 05 12:33:29 2009 -0700
     1.3 @@ -22,94 +22,90 @@
     1.4   *
     1.5   */
     1.6  
     1.7 -#ifdef LP64
     1.8 -typedef juint TAG_TYPE;
     1.9 -// for a taskqueue size of 4M
    1.10 -#define LOG_TASKQ_SIZE 22
    1.11 -#else
    1.12 -typedef jushort TAG_TYPE;
    1.13 -// for a taskqueue size of 16K
    1.14 -#define LOG_TASKQ_SIZE 14
    1.15 -#endif
    1.16 -
    1.17  class TaskQueueSuper: public CHeapObj {
    1.18  protected:
    1.19 -  // The first free element after the last one pushed (mod _n).
    1.20 +  // Internal type for indexing the queue; also used for the tag.
    1.21 +  typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t;
    1.22 +
    1.23 +  // The first free element after the last one pushed (mod N).
    1.24    volatile uint _bottom;
    1.25  
    1.26 -  // log2 of the size of the queue.
    1.27 -  enum SomeProtectedConstants {
    1.28 -    Log_n = LOG_TASKQ_SIZE
    1.29 +  enum {
    1.30 +    N = 1 << NOT_LP64(14) LP64_ONLY(17), // Queue size: 16K or 128K
    1.31 +    MOD_N_MASK = N - 1                   // To compute x mod N efficiently.
    1.32    };
    1.33 -#undef LOG_TASKQ_SIZE
    1.34  
    1.35 -  // Size of the queue.
    1.36 -  uint n() { return (1 << Log_n); }
    1.37 -  // For computing "x mod n" efficiently.
    1.38 -  uint n_mod_mask() { return n() - 1; }
    1.39 +  class Age {
    1.40 +  public:
    1.41 +    Age(size_t data = 0)         { _data = data; }
    1.42 +    Age(const Age& age)          { _data = age._data; }
    1.43 +    Age(idx_t top, idx_t tag)    { _fields._top = top; _fields._tag = tag; }
    1.44  
    1.45 -  struct Age {
    1.46 -    TAG_TYPE _top;
    1.47 -    TAG_TYPE _tag;
    1.48 +    Age   get()        const volatile { return _data; }
    1.49 +    void  set(Age age) volatile       { _data = age._data; }
    1.50  
    1.51 -    TAG_TYPE tag() const { return _tag; }
    1.52 -    TAG_TYPE top() const { return _top; }
    1.53 +    idx_t top()        const volatile { return _fields._top; }
    1.54 +    idx_t tag()        const volatile { return _fields._tag; }
    1.55  
    1.56 -    Age() { _tag = 0; _top = 0; }
    1.57 +    // Increment top; if it wraps, increment tag also.
    1.58 +    void increment() {
    1.59 +      _fields._top = increment_index(_fields._top);
    1.60 +      if (_fields._top == 0) ++_fields._tag;
    1.61 +    }
    1.62  
    1.63 -    friend bool operator ==(const Age& a1, const Age& a2) {
    1.64 -      return a1.tag() == a2.tag() && a1.top() == a2.top();
    1.65 +    Age cmpxchg(const Age new_age, const Age old_age) volatile {
    1.66 +      return (size_t) Atomic::cmpxchg_ptr((intptr_t)new_age._data,
    1.67 +                                          (volatile intptr_t *)&_data,
    1.68 +                                          (intptr_t)old_age._data);
    1.69      }
    1.70 +
    1.71 +    bool operator ==(const Age& other) const { return _data == other._data; }
    1.72 +
    1.73 +  private:
    1.74 +    struct fields {
    1.75 +      idx_t _top;
    1.76 +      idx_t _tag;
    1.77 +    };
    1.78 +    union {
    1.79 +      size_t _data;
    1.80 +      fields _fields;
    1.81 +    };
    1.82    };
    1.83 -  Age _age;
    1.84 -  // These make sure we do single atomic reads and writes.
    1.85 -  Age get_age() {
    1.86 -    uint res = *(volatile uint*)(&_age);
    1.87 -    return *(Age*)(&res);
    1.88 +
    1.89 +  volatile Age _age;
    1.90 +
    1.91 +  // These both operate mod N.
    1.92 +  static uint increment_index(uint ind) {
    1.93 +    return (ind + 1) & MOD_N_MASK;
    1.94    }
    1.95 -  void set_age(Age a) {
    1.96 -    *(volatile uint*)(&_age) = *(uint*)(&a);
    1.97 +  static uint decrement_index(uint ind) {
    1.98 +    return (ind - 1) & MOD_N_MASK;
    1.99    }
   1.100  
   1.101 -  TAG_TYPE get_top() {
   1.102 -    return get_age().top();
   1.103 -  }
   1.104 -
   1.105 -  // These both operate mod _n.
   1.106 -  uint increment_index(uint ind) {
   1.107 -    return (ind + 1) & n_mod_mask();
   1.108 -  }
   1.109 -  uint decrement_index(uint ind) {
   1.110 -    return (ind - 1) & n_mod_mask();
   1.111 -  }
   1.112 -
   1.113 -  // Returns a number in the range [0.._n).  If the result is "n-1", it
   1.114 -  // should be interpreted as 0.
   1.115 +  // Returns a number in the range [0..N).  If the result is "N-1", it should be
   1.116 +  // interpreted as 0.
   1.117    uint dirty_size(uint bot, uint top) {
   1.118 -    return ((int)bot - (int)top) & n_mod_mask();
   1.119 +    return (bot - top) & MOD_N_MASK;
   1.120    }
   1.121  
   1.122    // Returns the size corresponding to the given "bot" and "top".
   1.123    uint size(uint bot, uint top) {
   1.124      uint sz = dirty_size(bot, top);
   1.125 -    // Has the queue "wrapped", so that bottom is less than top?
   1.126 -    // There's a complicated special case here.  A pair of threads could
   1.127 -    // perform pop_local and pop_global operations concurrently, starting
   1.128 -    // from a state in which _bottom == _top+1.  The pop_local could
   1.129 -    // succeed in decrementing _bottom, and the pop_global in incrementing
   1.130 -    // _top (in which case the pop_global will be awarded the contested
   1.131 -    // queue element.)  The resulting state must be interpreted as an empty
   1.132 -    // queue.  (We only need to worry about one such event: only the queue
   1.133 -    // owner performs pop_local's, and several concurrent threads
   1.134 -    // attempting to perform the pop_global will all perform the same CAS,
   1.135 -    // and only one can succeed.  Any stealing thread that reads after
   1.136 -    // either the increment or decrement will see an empty queue, and will
   1.137 -    // not join the competitors.  The "sz == -1 || sz == _n-1" state will
   1.138 -    // not be modified  by concurrent queues, so the owner thread can reset
   1.139 -    // the state to _bottom == top so subsequent pushes will be performed
   1.140 -    // normally.
   1.141 -    if (sz == (n()-1)) return 0;
   1.142 -    else return sz;
   1.143 +    // Has the queue "wrapped", so that bottom is less than top?  There's a
   1.144 +    // complicated special case here.  A pair of threads could perform pop_local
   1.145 +    // and pop_global operations concurrently, starting from a state in which
   1.146 +    // _bottom == _top+1.  The pop_local could succeed in decrementing _bottom,
   1.147 +    // and the pop_global in incrementing _top (in which case the pop_global
   1.148 +    // will be awarded the contested queue element.)  The resulting state must
   1.149 +    // be interpreted as an empty queue.  (We only need to worry about one such
   1.150 +    // event: only the queue owner performs pop_local's, and several concurrent
   1.151 +    // threads attempting to perform the pop_global will all perform the same
   1.152 +    // CAS, and only one can succeed.)  Any stealing thread that reads after
   1.153 +    // either the increment or decrement will see an empty queue, and will not
   1.154 +    // join the competitors.  The "sz == -1 || sz == N-1" state will not be
   1.155 +    // modified by concurrent queues, so the owner thread can reset the state to
   1.156 +    // _bottom == top so subsequent pushes will be performed normally.
   1.157 +    return (sz == N - 1) ? 0 : sz;
   1.158    }
   1.159  
   1.160  public:
   1.161 @@ -122,22 +118,21 @@
   1.162    // The "careful" version admits the possibility of pop_local/pop_global
   1.163    // races.
   1.164    uint size() {
   1.165 -    return size(_bottom, get_top());
   1.166 +    return size(_bottom, _age.top());
   1.167    }
   1.168  
   1.169    uint dirty_size() {
   1.170 -    return dirty_size(_bottom, get_top());
   1.171 +    return dirty_size(_bottom, _age.top());
   1.172    }
   1.173  
   1.174    void set_empty() {
   1.175      _bottom = 0;
   1.176 -    _age = Age();
   1.177 +    _age.set(0);
   1.178    }
   1.179  
   1.180    // Maximum number of elements allowed in the queue.  This is two less
   1.181    // than the actual queue size, for somewhat complicated reasons.
   1.182 -  uint max_elems() { return n() - 2; }
   1.183 -
   1.184 +  uint max_elems() { return N - 2; }
   1.185  };
   1.186  
   1.187  template<class E> class GenericTaskQueue: public TaskQueueSuper {
   1.188 @@ -179,12 +174,12 @@
   1.189  
   1.190  template<class E>
   1.191  GenericTaskQueue<E>::GenericTaskQueue():TaskQueueSuper() {
   1.192 -  assert(sizeof(Age) == sizeof(int), "Depends on this.");
   1.193 +  assert(sizeof(Age) == sizeof(size_t), "Depends on this.");
   1.194  }
   1.195  
   1.196  template<class E>
   1.197  void GenericTaskQueue<E>::initialize() {
   1.198 -  _elems = NEW_C_HEAP_ARRAY(E, n());
   1.199 +  _elems = NEW_C_HEAP_ARRAY(E, N);
   1.200    guarantee(_elems != NULL, "Allocation failed.");
   1.201  }
   1.202  
   1.203 @@ -208,14 +203,14 @@
   1.204  
   1.205  template<class E>
   1.206  bool GenericTaskQueue<E>::push_slow(E t, uint dirty_n_elems) {
   1.207 -  if (dirty_n_elems == n() - 1) {
   1.208 +  if (dirty_n_elems == N - 1) {
   1.209      // Actually means 0, so do the push.
   1.210      uint localBot = _bottom;
   1.211      _elems[localBot] = t;
   1.212      _bottom = increment_index(localBot);
   1.213      return true;
   1.214 -  } else
   1.215 -    return false;
   1.216 +  }
   1.217 +  return false;
   1.218  }
   1.219  
   1.220  template<class E>
   1.221 @@ -230,53 +225,45 @@
   1.222    // then have the owner thread do a pop followed by another push.  Without
   1.223    // the incrementing of "tag", the pop_global's CAS could succeed,
   1.224    // allowing it to believe it has claimed the stale element.)
   1.225 -  Age newAge;
   1.226 -  newAge._top = localBot;
   1.227 -  newAge._tag = oldAge.tag() + 1;
   1.228 +  Age newAge((idx_t)localBot, oldAge.tag() + 1);
   1.229    // Perhaps a competing pop_global has already incremented "top", in which
   1.230    // case it wins the element.
   1.231    if (localBot == oldAge.top()) {
   1.232 -    Age tempAge;
   1.233      // No competing pop_global has yet incremented "top"; we'll try to
   1.234      // install new_age, thus claiming the element.
   1.235 -    assert(sizeof(Age) == sizeof(int), "Assumption about CAS unit.");
   1.236 -    *(uint*)&tempAge = Atomic::cmpxchg(*(uint*)&newAge, (volatile uint*)&_age, *(uint*)&oldAge);
   1.237 +    Age tempAge = _age.cmpxchg(newAge, oldAge);
   1.238      if (tempAge == oldAge) {
   1.239        // We win.
   1.240 -      assert(dirty_size(localBot, get_top()) != n() - 1,
   1.241 -             "Shouldn't be possible...");
   1.242 +      assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
   1.243        return true;
   1.244      }
   1.245    }
   1.246 -  // We fail; a completing pop_global gets the element.  But the queue is
   1.247 -  // empty (and top is greater than bottom.)  Fix this representation of
   1.248 -  // the empty queue to become the canonical one.
   1.249 -  set_age(newAge);
   1.250 -  assert(dirty_size(localBot, get_top()) != n() - 1,
   1.251 -         "Shouldn't be possible...");
   1.252 +  // We lose; a completing pop_global gets the element.  But the queue is empty
   1.253 +  // and top is greater than bottom.  Fix this representation of the empty queue
   1.254 +  // to become the canonical one.
   1.255 +  _age.set(newAge);
   1.256 +  assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
   1.257    return false;
   1.258  }
   1.259  
   1.260  template<class E>
   1.261  bool GenericTaskQueue<E>::pop_global(E& t) {
   1.262 -  Age newAge;
   1.263 -  Age oldAge = get_age();
   1.264 +  Age oldAge = _age.get();
   1.265    uint localBot = _bottom;
   1.266    uint n_elems = size(localBot, oldAge.top());
   1.267    if (n_elems == 0) {
   1.268      return false;
   1.269    }
   1.270 +
   1.271    t = _elems[oldAge.top()];
   1.272 -  newAge = oldAge;
   1.273 -  newAge._top = increment_index(newAge.top());
   1.274 -  if ( newAge._top == 0 ) newAge._tag++;
   1.275 -  Age resAge;
   1.276 -  *(uint*)&resAge = Atomic::cmpxchg(*(uint*)&newAge, (volatile uint*)&_age, *(uint*)&oldAge);
   1.277 +  Age newAge(oldAge);
   1.278 +  newAge.increment();
   1.279 +  Age resAge = _age.cmpxchg(newAge, oldAge);
   1.280 +
   1.281    // Note that using "_bottom" here might fail, since a pop_local might
   1.282    // have decremented it.
   1.283 -  assert(dirty_size(localBot, newAge._top) != n() - 1,
   1.284 -         "Shouldn't be possible...");
   1.285 -  return (resAge == oldAge);
   1.286 +  assert(dirty_size(localBot, newAge.top()) != N - 1, "sanity");
   1.287 +  return resAge == oldAge;
   1.288  }
   1.289  
   1.290  template<class E>
   1.291 @@ -459,7 +446,7 @@
   1.292      return offer_termination(NULL);
   1.293    }
   1.294  
   1.295 -  // As above, but it also terminates of the should_exit_termination()
   1.296 +  // As above, but it also terminates if the should_exit_termination()
   1.297    // method of the terminator parameter returns true. If terminator is
   1.298    // NULL, then it is ignored.
   1.299    bool offer_termination(TerminatorTerminator* terminator);
   1.300 @@ -492,11 +479,10 @@
   1.301    }
   1.302  #else
   1.303    uint localBot = _bottom;
   1.304 -  assert((localBot >= 0) && (localBot < n()), "_bottom out of range.");
   1.305 -  TAG_TYPE top = get_top();
   1.306 +  assert((localBot >= 0) && (localBot < N), "_bottom out of range.");
   1.307 +  idx_t top = _age.top();
   1.308    uint dirty_n_elems = dirty_size(localBot, top);
   1.309 -  assert((dirty_n_elems >= 0) && (dirty_n_elems < n()),
   1.310 -         "n_elems out of range.");
   1.311 +  assert((dirty_n_elems >= 0) && (dirty_n_elems < N), "n_elems out of range.");
   1.312    if (dirty_n_elems < max_elems()) {
   1.313      _elems[localBot] = t;
   1.314      _bottom = increment_index(localBot);
   1.315 @@ -517,12 +503,12 @@
   1.316    return true;
   1.317  #else
   1.318    uint localBot = _bottom;
   1.319 -  // This value cannot be n-1.  That can only occur as a result of
   1.320 +  // This value cannot be N-1.  That can only occur as a result of
   1.321    // the assignment to bottom in this method.  If it does, this method
   1.322    // resets the size( to 0 before the next call (which is sequential,
   1.323    // since this is pop_local.)
   1.324 -  uint dirty_n_elems = dirty_size(localBot, get_top());
   1.325 -  assert(dirty_n_elems != n() - 1, "Shouldn't be possible...");
   1.326 +  uint dirty_n_elems = dirty_size(localBot, _age.top());
   1.327 +  assert(dirty_n_elems != N - 1, "Shouldn't be possible...");
   1.328    if (dirty_n_elems == 0) return false;
   1.329    localBot = decrement_index(localBot);
   1.330    _bottom = localBot;
   1.331 @@ -534,15 +520,14 @@
   1.332    // If there's still at least one element in the queue, based on the
   1.333    // "_bottom" and "age" we've read, then there can be no interference with
   1.334    // a "pop_global" operation, and we're done.
   1.335 -  TAG_TYPE tp = get_top();    // XXX
   1.336 +  idx_t tp = _age.top();    // XXX
   1.337    if (size(localBot, tp) > 0) {
   1.338 -    assert(dirty_size(localBot, tp) != n() - 1,
   1.339 -           "Shouldn't be possible...");
   1.340 +    assert(dirty_size(localBot, tp) != N - 1, "sanity");
   1.341      return true;
   1.342    } else {
   1.343      // Otherwise, the queue contained exactly one element; we take the slow
   1.344      // path.
   1.345 -    return pop_local_slow(localBot, get_age());
   1.346 +    return pop_local_slow(localBot, _age.get());
   1.347    }
   1.348  #endif
   1.349  }

mercurial