src/share/vm/utilities/taskqueue.hpp

changeset 0: f90c822e73f8
child 1:     2d8a650513c2
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/share/vm/utilities/taskqueue.hpp	Wed Apr 27 01:25:04 2016 +0800
     1.3 @@ -0,0 +1,831 @@
     1.4 +/*
     1.5 + * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved.
     1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.7 + *
     1.8 + * This code is free software; you can redistribute it and/or modify it
     1.9 + * under the terms of the GNU General Public License version 2 only, as
    1.10 + * published by the Free Software Foundation.
    1.11 + *
    1.12 + * This code is distributed in the hope that it will be useful, but WITHOUT
    1.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    1.14 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    1.15 + * version 2 for more details (a copy is included in the LICENSE file that
    1.16 + * accompanied this code).
    1.17 + *
    1.18 + * You should have received a copy of the GNU General Public License version
    1.19 + * 2 along with this work; if not, write to the Free Software Foundation,
    1.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    1.21 + *
    1.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    1.23 + * or visit www.oracle.com if you need additional information or have any
    1.24 + * questions.
    1.25 + *
    1.26 + */
    1.27 +
    1.28 +#ifndef SHARE_VM_UTILITIES_TASKQUEUE_HPP
    1.29 +#define SHARE_VM_UTILITIES_TASKQUEUE_HPP
    1.30 +
    1.31 +#include "memory/allocation.hpp"
    1.32 +#include "memory/allocation.inline.hpp"
    1.33 +#include "runtime/mutex.hpp"
    1.34 +#include "utilities/stack.hpp"
    1.35 +#ifdef TARGET_OS_ARCH_linux_x86
    1.36 +# include "orderAccess_linux_x86.inline.hpp"
    1.37 +#endif
    1.38 +#ifdef TARGET_OS_ARCH_linux_sparc
    1.39 +# include "orderAccess_linux_sparc.inline.hpp"
    1.40 +#endif
    1.41 +#ifdef TARGET_OS_ARCH_linux_zero
    1.42 +# include "orderAccess_linux_zero.inline.hpp"
    1.43 +#endif
    1.44 +#ifdef TARGET_OS_ARCH_solaris_x86
    1.45 +# include "orderAccess_solaris_x86.inline.hpp"
    1.46 +#endif
    1.47 +#ifdef TARGET_OS_ARCH_solaris_sparc
    1.48 +# include "orderAccess_solaris_sparc.inline.hpp"
    1.49 +#endif
    1.50 +#ifdef TARGET_OS_ARCH_windows_x86
    1.51 +# include "orderAccess_windows_x86.inline.hpp"
    1.52 +#endif
    1.53 +#ifdef TARGET_OS_ARCH_linux_arm
    1.54 +# include "orderAccess_linux_arm.inline.hpp"
    1.55 +#endif
    1.56 +#ifdef TARGET_OS_ARCH_linux_ppc
    1.57 +# include "orderAccess_linux_ppc.inline.hpp"
    1.58 +#endif
    1.59 +#ifdef TARGET_OS_ARCH_aix_ppc
    1.60 +# include "orderAccess_aix_ppc.inline.hpp"
    1.61 +#endif
    1.62 +#ifdef TARGET_OS_ARCH_bsd_x86
    1.63 +# include "orderAccess_bsd_x86.inline.hpp"
    1.64 +#endif
    1.65 +#ifdef TARGET_OS_ARCH_bsd_zero
    1.66 +# include "orderAccess_bsd_zero.inline.hpp"
    1.67 +#endif
    1.68 +
    1.69 +// Simple TaskQueue stats that are collected by default in debug builds.
    1.70 +
    1.71 +#if !defined(TASKQUEUE_STATS) && defined(ASSERT)
    1.72 +#define TASKQUEUE_STATS 1
    1.73 +#elif !defined(TASKQUEUE_STATS)
    1.74 +#define TASKQUEUE_STATS 0
    1.75 +#endif
    1.76 +
    1.77 +#if TASKQUEUE_STATS
    1.78 +#define TASKQUEUE_STATS_ONLY(code) code
    1.79 +#else
    1.80 +#define TASKQUEUE_STATS_ONLY(code)
    1.81 +#endif // TASKQUEUE_STATS
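          +// For example (illustrative), a push path can record a statistic that
          +// compiles away entirely when TASKQUEUE_STATS is 0:
          +//
          +//   TASKQUEUE_STATS_ONLY(stats.record_push());
          +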
    1.82 +
    1.83 +#if TASKQUEUE_STATS
    1.84 +class TaskQueueStats {
    1.85 +public:
    1.86 +  enum StatId {
    1.87 +    push,             // number of taskqueue pushes
    1.88 +    pop,              // number of taskqueue pops
    1.89 +    pop_slow,         // subset of taskqueue pops that were done slow-path
    1.90 +    steal_attempt,    // number of taskqueue steal attempts
    1.91 +    steal,            // number of taskqueue steals
    1.92 +    overflow,         // number of overflow pushes
    1.93 +    overflow_max_len, // max length of overflow stack
    1.94 +    last_stat_id
    1.95 +  };
    1.96 +
    1.97 +public:
    1.98 +  inline TaskQueueStats()       { reset(); }
    1.99 +
   1.100 +  inline void record_push()     { ++_stats[push]; }
   1.101 +  inline void record_pop()      { ++_stats[pop]; }
   1.102 +  inline void record_pop_slow() { record_pop(); ++_stats[pop_slow]; }
   1.103 +  inline void record_steal(bool success);
   1.104 +  inline void record_overflow(size_t new_length);
   1.105 +
   1.106 +  TaskQueueStats & operator +=(const TaskQueueStats & addend);
   1.107 +
   1.108 +  inline size_t get(StatId id) const { return _stats[id]; }
   1.109 +  inline const size_t* get() const   { return _stats; }
   1.110 +
   1.111 +  inline void reset();
   1.112 +
   1.113 +  // Print the specified line of the header (does not include a line separator).
   1.114 +  static void print_header(unsigned int line, outputStream* const stream = tty,
   1.115 +                           unsigned int width = 10);
   1.116 +  // Print the statistics (does not include a line separator).
   1.117 +  void print(outputStream* const stream = tty, unsigned int width = 10) const;
   1.118 +
   1.119 +  DEBUG_ONLY(void verify() const;)
   1.120 +
   1.121 +private:
   1.122 +  size_t                    _stats[last_stat_id];
   1.123 +  static const char * const _names[last_stat_id];
   1.124 +};
   1.125 +
   1.126 +void TaskQueueStats::record_steal(bool success) {
   1.127 +  ++_stats[steal_attempt];
   1.128 +  if (success) ++_stats[steal];
   1.129 +}
   1.130 +
   1.131 +void TaskQueueStats::record_overflow(size_t new_len) {
   1.132 +  ++_stats[overflow];
   1.133 +  if (new_len > _stats[overflow_max_len]) _stats[overflow_max_len] = new_len;
   1.134 +}
   1.135 +
   1.136 +void TaskQueueStats::reset() {
   1.137 +  memset(_stats, 0, sizeof(_stats));
   1.138 +}
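          +// Aggregation sketch (illustrative; assumes a set of queues reachable via
          +// a hypothetical queue(i) accessor): per-queue stats can be summed with
          +// operator+= and printed, e.g. at the end of a parallel phase:
          +//
          +//   TaskQueueStats total;
          +//   for (uint i = 0; i < n_queues; ++i) total += queue(i)->stats;
          +//   TaskQueueStats::print_header(1); tty->cr();
          +//   TaskQueueStats::print_header(2); tty->cr();
          +//   total.print(tty); tty->cr();
          +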
   1.139 +#endif // TASKQUEUE_STATS
   1.140 +
   1.141 +// TaskQueueSuper collects functionality common to all GenericTaskQueue instances.
   1.142 +
   1.143 +template <unsigned int N, MEMFLAGS F>
   1.144 +class TaskQueueSuper: public CHeapObj<F> {
   1.145 +protected:
   1.146 +  // Internal type for indexing the queue; also used for the tag.
   1.147 +  typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t;
   1.148 +
   1.149 +  // The first free element after the last one pushed (mod N).
   1.150 +  volatile uint _bottom;
   1.151 +
   1.152 +  enum { MOD_N_MASK = N - 1 };
   1.153 +
   1.154 +  class Age {
   1.155 +  public:
   1.156 +    Age(size_t data = 0)         { _data = data; }
   1.157 +    Age(const Age& age)          { _data = age._data; }
   1.158 +    Age(idx_t top, idx_t tag)    { _fields._top = top; _fields._tag = tag; }
   1.159 +
   1.160 +    Age   get()        const volatile { return _data; }
   1.161 +    void  set(Age age) volatile       { _data = age._data; }
   1.162 +
   1.163 +    idx_t top()        const volatile { return _fields._top; }
   1.164 +    idx_t tag()        const volatile { return _fields._tag; }
   1.165 +
   1.166 +    // Increment top; if it wraps, increment tag also.
   1.167 +    void increment() {
   1.168 +      _fields._top = increment_index(_fields._top);
   1.169 +      if (_fields._top == 0) ++_fields._tag;
   1.170 +    }
   1.171 +
   1.172 +    Age cmpxchg(const Age new_age, const Age old_age) volatile {
   1.173 +      return (size_t) Atomic::cmpxchg_ptr((intptr_t)new_age._data,
   1.174 +                                          (volatile intptr_t *)&_data,
   1.175 +                                          (intptr_t)old_age._data);
   1.176 +    }
   1.177 +
   1.178 +    bool operator ==(const Age& other) const { return _data == other._data; }
   1.179 +
   1.180 +  private:
   1.181 +    struct fields {
   1.182 +      idx_t _top;
   1.183 +      idx_t _tag;
   1.184 +    };
   1.185 +    union {
   1.186 +      size_t _data;
   1.187 +      fields _fields;
   1.188 +    };
   1.189 +  };
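          +  // Worked example (illustrative): on LP64, idx_t is uint32_t, so _top and
          +  // _tag share one 64-bit word, and a single cmpxchg on _data updates both
          +  // fields atomically:
          +  //
          +  //   Age oldAge = _age.get();          // say top == N-1, tag == 7
          +  //   Age newAge(oldAge);
          +  //   newAge.increment();               // top wraps to 0, tag becomes 8
          +  //   if (_age.cmpxchg(newAge, oldAge) == oldAge) {
          +  //     // both fields were published in one atomic step
          +  //   }
          +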
   1.190 +
   1.191 +  volatile Age _age;
   1.192 +
   1.193 +  // These both operate mod N.
   1.194 +  static uint increment_index(uint ind) {
   1.195 +    return (ind + 1) & MOD_N_MASK;
   1.196 +  }
   1.197 +  static uint decrement_index(uint ind) {
   1.198 +    return (ind - 1) & MOD_N_MASK;
   1.199 +  }
   1.200 +
   1.201 +  // Returns a number in the range [0..N).  If the result is "N-1", it should be
   1.202 +  // interpreted as 0.
   1.203 +  uint dirty_size(uint bot, uint top) const {
   1.204 +    return (bot - top) & MOD_N_MASK;
   1.205 +  }
   1.206 +
   1.207 +  // Returns the size corresponding to the given "bot" and "top".
   1.208 +  uint size(uint bot, uint top) const {
   1.209 +    uint sz = dirty_size(bot, top);
   1.210 +    // Has the queue "wrapped", so that bottom is less than top?  There's a
   1.211 +    // complicated special case here.  A pair of threads could perform pop_local
   1.212 +    // and pop_global operations concurrently, starting from a state in which
   1.213 +    // _bottom == _top+1.  The pop_local could succeed in decrementing _bottom,
   1.214 +    // and the pop_global in incrementing _top (in which case the pop_global
   1.215 +    // will be awarded the contested queue element.)  The resulting state must
   1.216 +    // be interpreted as an empty queue.  (We only need to worry about one such
   1.217 +    // event: only the queue owner performs pop_local's, and several concurrent
   1.218 +    // threads attempting to perform the pop_global will all perform the same
   1.219 +    // CAS, and only one can succeed.)  Any stealing thread that reads after
   1.220 +    // either the increment or decrement will see an empty queue, and will not
   1.221 +    // join the competitors.  The "sz == -1 || sz == N-1" state will not be
   1.222 +    // modified by concurrent queues, so the owner thread can reset the state to
   1.223 +    // _bottom == top so subsequent pushes will be performed normally.
   1.224 +    return (sz == N - 1) ? 0 : sz;
   1.225 +  }
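          +  // Worked example (illustrative, N == 8): bot == 2, top == 3 gives
          +  // dirty_size == (2 - 3) & 7 == 7 == N-1, which is exactly the transient
          +  // state left by the pop_local/pop_global race described above, so size()
          +  // reports 0 (empty) rather than 7.
          +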
   1.226 +
   1.227 +public:
   1.228 +  TaskQueueSuper() : _bottom(0), _age() {}
   1.229 +
    1.230 +  // peek() returns true if the TaskQueue contains any tasks; is_empty()
          +  // returns true if it contains none.
   1.231 +  bool peek()     const { return _bottom != _age.top(); }
   1.232 +  bool is_empty() const { return size() == 0; }
   1.233 +
    1.234 +  // Return an estimate of the number of elements in the queue.
    1.235 +  // size() accounts for the pop_local/pop_global race (see size(bot, top)
    1.236 +  // above) and maps the transient N-1 state to 0; dirty_size() does not.
   1.237 +  uint size() const {
   1.238 +    return size(_bottom, _age.top());
   1.239 +  }
   1.240 +
   1.241 +  uint dirty_size() const {
   1.242 +    return dirty_size(_bottom, _age.top());
   1.243 +  }
   1.244 +
   1.245 +  void set_empty() {
   1.246 +    _bottom = 0;
   1.247 +    _age.set(0);
   1.248 +  }
   1.249 +
    1.250 +  // Maximum number of elements allowed in the queue.  This is two less than
    1.251 +  // the actual queue size: a ring buffer gives up one slot to distinguish
          +  // "full" from "empty" (both have bot == top), and a second slot is given
          +  // up so that the transient dirty_size() == N-1 race state (see size()
          +  // above) can never be produced by a legitimately full queue.
    1.252 +  uint max_elems() const { return N - 2; }
   1.253 +
   1.254 +  // Total size of queue.
   1.255 +  static const uint total_size() { return N; }
   1.256 +
   1.257 +  TASKQUEUE_STATS_ONLY(TaskQueueStats stats;)
   1.258 +};
   1.259 +
   1.260 +//
   1.261 +// GenericTaskQueue implements an ABP, Aurora-Blumofe-Plaxton, double-
   1.262 +// ended-queue (deque), intended for use in work stealing. Queue operations
   1.263 +// are non-blocking.
   1.264 +//
   1.265 +// A queue owner thread performs push() and pop_local() operations on one end
   1.266 +// of the queue, while other threads may steal work using the pop_global()
   1.267 +// method.
   1.268 +//
   1.269 +// The main difference to the original algorithm is that this
   1.270 +// implementation allows wrap-around at the end of its allocated
   1.271 +// storage, which is an array.
   1.272 +//
   1.273 +// The original paper is:
   1.274 +//
   1.275 +// Arora, N. S., Blumofe, R. D., and Plaxton, C. G.
   1.276 +// Thread scheduling for multiprogrammed multiprocessors.
   1.277 +// Theory of Computing Systems 34, 2 (2001), 115-144.
   1.278 +//
    1.279 +// The following paper provides a correctness proof and an
   1.280 +// implementation for weakly ordered memory models including (pseudo-)
   1.281 +// code containing memory barriers for a Chase-Lev deque. Chase-Lev is
   1.282 +// similar to ABP, with the main difference that it allows resizing of the
   1.283 +// underlying storage:
   1.284 +//
    1.285 +// Le, N. M., Pop, A., Cohen, A., and Nardelli, F. Z.
    1.286 +// Correct and efficient work-stealing for weak memory models.
    1.287 +// Proceedings of the 18th ACM SIGPLAN symposium on Principles and
    1.288 +// practice of parallel programming (PPoPP 2013), 69-80.
   1.289 +//
   1.290 +
          +#define TASKQUEUE_SIZE (NOT_LP64(1<<14) LP64_ONLY(1<<17))
          +
    1.291 +template <class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
   1.292 +class GenericTaskQueue: public TaskQueueSuper<N, F> {
   1.293 +  ArrayAllocator<E, F> _array_allocator;
   1.294 +protected:
   1.295 +  typedef typename TaskQueueSuper<N, F>::Age Age;
   1.296 +  typedef typename TaskQueueSuper<N, F>::idx_t idx_t;
   1.297 +
   1.298 +  using TaskQueueSuper<N, F>::_bottom;
   1.299 +  using TaskQueueSuper<N, F>::_age;
   1.300 +  using TaskQueueSuper<N, F>::increment_index;
   1.301 +  using TaskQueueSuper<N, F>::decrement_index;
   1.302 +  using TaskQueueSuper<N, F>::dirty_size;
   1.303 +
   1.304 +public:
   1.305 +  using TaskQueueSuper<N, F>::max_elems;
   1.306 +  using TaskQueueSuper<N, F>::size;
   1.307 +
    1.308 +#if TASKQUEUE_STATS
   1.309 +  using TaskQueueSuper<N, F>::stats;
   1.310 +#endif
   1.311 +
   1.312 +private:
   1.313 +  // Slow paths for push, pop_local.  (pop_global has no fast path.)
   1.314 +  bool push_slow(E t, uint dirty_n_elems);
   1.315 +  bool pop_local_slow(uint localBot, Age oldAge);
   1.316 +
   1.317 +public:
   1.318 +  typedef E element_type;
   1.319 +
   1.320 +  // Initializes the queue to empty.
   1.321 +  GenericTaskQueue();
   1.322 +
   1.323 +  void initialize();
   1.324 +
   1.325 +  // Push the task "t" on the queue.  Returns "false" iff the queue is full.
   1.326 +  inline bool push(E t);
   1.327 +
   1.328 +  // Attempts to claim a task from the "local" end of the queue (the most
   1.329 +  // recently pushed).  If successful, returns true and sets t to the task;
   1.330 +  // otherwise, returns false (the queue is empty).
   1.331 +  inline bool pop_local(volatile E& t);
   1.332 +
   1.333 +  // Like pop_local(), but uses the "global" end of the queue (the least
   1.334 +  // recently pushed).
   1.335 +  bool pop_global(volatile E& t);
   1.336 +
   1.337 +  // Delete any resource associated with the queue.
   1.338 +  ~GenericTaskQueue();
   1.339 +
   1.340 +  // apply the closure to all elements in the task queue
   1.341 +  void oops_do(OopClosure* f);
   1.342 +
   1.343 +private:
   1.344 +  // Element array.
   1.345 +  volatile E* _elems;
   1.346 +};
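          +// Usage sketch (illustrative; `process` and `some_oop` are hypothetical).
          +// The owner thread uses push/pop_local on one end; any other thread may
          +// use pop_global on the other:
          +//
          +//   OopTaskQueue* q = new OopTaskQueue();   // typedef near end of file
          +//   q->initialize();
          +//   oop task = some_oop;
          +//   if (!q->push(task)) { /* queue full; caller must handle it */ }
          +//   if (q->pop_local(task))  process(task); // owner thread only
          +//   if (q->pop_global(task)) process(task); // stealing threads
          +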
   1.347 +
   1.348 +template<class E, MEMFLAGS F, unsigned int N>
   1.349 +GenericTaskQueue<E, F, N>::GenericTaskQueue() {
   1.350 +  assert(sizeof(Age) == sizeof(size_t), "Depends on this.");
   1.351 +}
   1.352 +
   1.353 +template<class E, MEMFLAGS F, unsigned int N>
   1.354 +void GenericTaskQueue<E, F, N>::initialize() {
   1.355 +  _elems = _array_allocator.allocate(N);
   1.356 +}
   1.357 +
   1.358 +template<class E, MEMFLAGS F, unsigned int N>
   1.359 +void GenericTaskQueue<E, F, N>::oops_do(OopClosure* f) {
   1.360 +  // tty->print_cr("START OopTaskQueue::oops_do");
   1.361 +  uint iters = size();
   1.362 +  uint index = _bottom;
   1.363 +  for (uint i = 0; i < iters; ++i) {
   1.364 +    index = decrement_index(index);
   1.365 +    // tty->print_cr("  doing entry %d," INTPTR_T " -> " INTPTR_T,
   1.366 +    //            index, &_elems[index], _elems[index]);
   1.367 +    E* t = (E*)&_elems[index];      // cast away volatility
   1.368 +    oop* p = (oop*)t;
   1.369 +    assert((*t)->is_oop_or_null(), "Not an oop or null");
   1.370 +    f->do_oop(p);
   1.371 +  }
   1.372 +  // tty->print_cr("END OopTaskQueue::oops_do");
   1.373 +}
   1.374 +
   1.375 +template<class E, MEMFLAGS F, unsigned int N>
   1.376 +bool GenericTaskQueue<E, F, N>::push_slow(E t, uint dirty_n_elems) {
   1.377 +  if (dirty_n_elems == N - 1) {
   1.378 +    // Actually means 0, so do the push.
   1.379 +    uint localBot = _bottom;
   1.380 +    // g++ complains if the volatile result of the assignment is
   1.381 +    // unused, so we cast the volatile away.  We cannot cast directly
   1.382 +    // to void, because gcc treats that as not using the result of the
   1.383 +    // assignment.  However, casting to E& means that we trigger an
   1.384 +    // unused-value warning.  So, we cast the E& to void.
   1.385 +    (void)const_cast<E&>(_elems[localBot] = t);
   1.386 +    OrderAccess::release_store(&_bottom, increment_index(localBot));
   1.387 +    TASKQUEUE_STATS_ONLY(stats.record_push());
   1.388 +    return true;
   1.389 +  }
   1.390 +  return false;
   1.391 +}
   1.392 +
   1.393 +// pop_local_slow() is done by the owning thread and is trying to
   1.394 +// get the last task in the queue.  It will compete with pop_global()
   1.395 +// that will be used by other threads.  The tag age is incremented
   1.396 +// whenever the queue goes empty which it will do here if this thread
   1.397 +// gets the last task or in pop_global() if the queue wraps (top == 0
   1.398 +// and pop_global() succeeds, see pop_global()).
   1.399 +template<class E, MEMFLAGS F, unsigned int N>
   1.400 +bool GenericTaskQueue<E, F, N>::pop_local_slow(uint localBot, Age oldAge) {
   1.401 +  // This queue was observed to contain exactly one element; either this
   1.402 +  // thread will claim it, or a competing "pop_global".  In either case,
   1.403 +  // the queue will be logically empty afterwards.  Create a new Age value
   1.404 +  // that represents the empty queue for the given value of "_bottom".  (We
   1.405 +  // must also increment "tag" because of the case where "bottom == 1",
   1.406 +  // "top == 0".  A pop_global could read the queue element in that case,
   1.407 +  // then have the owner thread do a pop followed by another push.  Without
   1.408 +  // the incrementing of "tag", the pop_global's CAS could succeed,
   1.409 +  // allowing it to believe it has claimed the stale element.)
   1.410 +  Age newAge((idx_t)localBot, oldAge.tag() + 1);
   1.411 +  // Perhaps a competing pop_global has already incremented "top", in which
   1.412 +  // case it wins the element.
   1.413 +  if (localBot == oldAge.top()) {
   1.414 +    // No competing pop_global has yet incremented "top"; we'll try to
   1.415 +    // install new_age, thus claiming the element.
   1.416 +    Age tempAge = _age.cmpxchg(newAge, oldAge);
   1.417 +    if (tempAge == oldAge) {
   1.418 +      // We win.
   1.419 +      assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
   1.420 +      TASKQUEUE_STATS_ONLY(stats.record_pop_slow());
   1.421 +      return true;
   1.422 +    }
   1.423 +  }
    1.424 +  // We lose; a competing pop_global gets the element.  But the queue is empty
   1.425 +  // and top is greater than bottom.  Fix this representation of the empty queue
   1.426 +  // to become the canonical one.
   1.427 +  _age.set(newAge);
   1.428 +  assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
   1.429 +  return false;
   1.430 +}
   1.431 +
   1.432 +template<class E, MEMFLAGS F, unsigned int N>
   1.433 +bool GenericTaskQueue<E, F, N>::pop_global(volatile E& t) {
   1.434 +  Age oldAge = _age.get();
   1.435 +  // Architectures with weak memory model require a barrier here
   1.436 +  // to guarantee that bottom is not older than age,
   1.437 +  // which is crucial for the correctness of the algorithm.
   1.438 +#if !(defined SPARC || defined IA32 || defined AMD64)
   1.439 +  OrderAccess::fence();
   1.440 +#endif
   1.441 +  uint localBot = OrderAccess::load_acquire((volatile juint*)&_bottom);
   1.442 +  uint n_elems = size(localBot, oldAge.top());
   1.443 +  if (n_elems == 0) {
   1.444 +    return false;
   1.445 +  }
   1.446 +
   1.447 +  // g++ complains if the volatile result of the assignment is
   1.448 +  // unused, so we cast the volatile away.  We cannot cast directly
   1.449 +  // to void, because gcc treats that as not using the result of the
   1.450 +  // assignment.  However, casting to E& means that we trigger an
   1.451 +  // unused-value warning.  So, we cast the E& to void.
   1.452 +  (void) const_cast<E&>(t = _elems[oldAge.top()]);
   1.453 +  Age newAge(oldAge);
   1.454 +  newAge.increment();
   1.455 +  Age resAge = _age.cmpxchg(newAge, oldAge);
   1.456 +
   1.457 +  // Note that using "_bottom" here might fail, since a pop_local might
   1.458 +  // have decremented it.
   1.459 +  assert(dirty_size(localBot, newAge.top()) != N - 1, "sanity");
   1.460 +  return resAge == oldAge;
   1.461 +}
   1.462 +
   1.463 +template<class E, MEMFLAGS F, unsigned int N>
   1.464 +GenericTaskQueue<E, F, N>::~GenericTaskQueue() {
   1.465 +  FREE_C_HEAP_ARRAY(E, _elems, F);
   1.466 +}
   1.467 +
   1.468 +// OverflowTaskQueue is a TaskQueue that also includes an overflow stack for
   1.469 +// elements that do not fit in the TaskQueue.
   1.470 +//
   1.471 +// This class hides two methods from super classes:
   1.472 +//
   1.473 +// push() - push onto the task queue or, if that fails, onto the overflow stack
   1.474 +// is_empty() - return true if both the TaskQueue and overflow stack are empty
   1.475 +//
   1.476 +// Note that size() is not hidden--it returns the number of elements in the
   1.477 +// TaskQueue, and does not include the size of the overflow stack.  This
   1.478 +// simplifies replacement of GenericTaskQueues with OverflowTaskQueues.
   1.479 +template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
   1.480 +class OverflowTaskQueue: public GenericTaskQueue<E, F, N>
   1.481 +{
   1.482 +public:
   1.483 +  typedef Stack<E, F>               overflow_t;
   1.484 +  typedef GenericTaskQueue<E, F, N> taskqueue_t;
   1.485 +
   1.486 +  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)
   1.487 +
   1.488 +  // Push task t onto the queue or onto the overflow stack.  Return true.
   1.489 +  inline bool push(E t);
   1.490 +
   1.491 +  // Attempt to pop from the overflow stack; return true if anything was popped.
   1.492 +  inline bool pop_overflow(E& t);
   1.493 +
   1.494 +  inline overflow_t* overflow_stack() { return &_overflow_stack; }
   1.495 +
   1.496 +  inline bool taskqueue_empty() const { return taskqueue_t::is_empty(); }
   1.497 +  inline bool overflow_empty()  const { return _overflow_stack.is_empty(); }
   1.498 +  inline bool is_empty()        const {
   1.499 +    return taskqueue_empty() && overflow_empty();
   1.500 +  }
   1.501 +
   1.502 +private:
   1.503 +  overflow_t _overflow_stack;
   1.504 +};
   1.505 +
   1.506 +template <class E, MEMFLAGS F, unsigned int N>
   1.507 +bool OverflowTaskQueue<E, F, N>::push(E t)
   1.508 +{
   1.509 +  if (!taskqueue_t::push(t)) {
   1.510 +    overflow_stack()->push(t);
   1.511 +    TASKQUEUE_STATS_ONLY(stats.record_overflow(overflow_stack()->size()));
   1.512 +  }
   1.513 +  return true;
   1.514 +}
   1.515 +
   1.516 +template <class E, MEMFLAGS F, unsigned int N>
   1.517 +bool OverflowTaskQueue<E, F, N>::pop_overflow(E& t)
   1.518 +{
   1.519 +  if (overflow_empty()) return false;
   1.520 +  t = overflow_stack()->pop();
   1.521 +  return true;
   1.522 +}
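          +// Drain sketch (illustrative; `process` and `my_queue` are hypothetical).
          +// The overflow stack is private to the owner, so it can be drained first,
          +// followed by the stealable task queue:
          +//
          +//   OopStarTaskQueue* q = my_queue;   // typedef near end of file
          +//   StarTask task;
          +//   while (q->pop_overflow(task)) process(task);
          +//   while (q->pop_local(task))    process(task);
          +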
   1.523 +
   1.524 +class TaskQueueSetSuper {
   1.525 +protected:
   1.526 +  static int randomParkAndMiller(int* seed0);
   1.527 +public:
   1.528 +  // Returns "true" if some TaskQueue in the set contains a task.
   1.529 +  virtual bool peek() = 0;
   1.530 +};
   1.531 +
   1.532 +template <MEMFLAGS F> class TaskQueueSetSuperImpl: public CHeapObj<F>, public TaskQueueSetSuper {
   1.533 +};
   1.534 +
   1.535 +template<class T, MEMFLAGS F>
   1.536 +class GenericTaskQueueSet: public TaskQueueSetSuperImpl<F> {
   1.537 +private:
   1.538 +  uint _n;
   1.539 +  T** _queues;
   1.540 +
   1.541 +public:
   1.542 +  typedef typename T::element_type E;
   1.543 +
   1.544 +  GenericTaskQueueSet(int n) : _n(n) {
   1.545 +    typedef T* GenericTaskQueuePtr;
   1.546 +    _queues = NEW_C_HEAP_ARRAY(GenericTaskQueuePtr, n, F);
   1.547 +    for (int i = 0; i < n; i++) {
   1.548 +      _queues[i] = NULL;
   1.549 +    }
   1.550 +  }
   1.551 +
   1.552 +  bool steal_best_of_2(uint queue_num, int* seed, E& t);
   1.553 +
   1.554 +  void register_queue(uint i, T* q);
   1.555 +
   1.556 +  T* queue(uint n);
   1.557 +
   1.558 +  // The thread with queue number "queue_num" (and whose random number seed is
   1.559 +  // at "seed") is trying to steal a task from some other queue.  (It may try
   1.560 +  // several queues, according to some configuration parameter.)  If some steal
   1.561 +  // succeeds, returns "true" and sets "t" to the stolen task, otherwise returns
   1.562 +  // false.
   1.563 +  bool steal(uint queue_num, int* seed, E& t);
   1.564 +
   1.565 +  bool peek();
   1.566 +};
   1.567 +
   1.568 +template<class T, MEMFLAGS F> void
   1.569 +GenericTaskQueueSet<T, F>::register_queue(uint i, T* q) {
   1.570 +  assert(i < _n, "index out of range.");
   1.571 +  _queues[i] = q;
   1.572 +}
   1.573 +
   1.574 +template<class T, MEMFLAGS F> T*
   1.575 +GenericTaskQueueSet<T, F>::queue(uint i) {
   1.576 +  return _queues[i];
   1.577 +}
   1.578 +
   1.579 +template<class T, MEMFLAGS F> bool
   1.580 +GenericTaskQueueSet<T, F>::steal(uint queue_num, int* seed, E& t) {
   1.581 +  for (uint i = 0; i < 2 * _n; i++) {
   1.582 +    if (steal_best_of_2(queue_num, seed, t)) {
   1.583 +      TASKQUEUE_STATS_ONLY(queue(queue_num)->stats.record_steal(true));
   1.584 +      return true;
   1.585 +    }
   1.586 +  }
   1.587 +  TASKQUEUE_STATS_ONLY(queue(queue_num)->stats.record_steal(false));
   1.588 +  return false;
   1.589 +}
   1.590 +
   1.591 +template<class T, MEMFLAGS F> bool
   1.592 +GenericTaskQueueSet<T, F>::steal_best_of_2(uint queue_num, int* seed, E& t) {
   1.593 +  if (_n > 2) {
   1.594 +    uint k1 = queue_num;
   1.595 +    while (k1 == queue_num) k1 = TaskQueueSetSuper::randomParkAndMiller(seed) % _n;
   1.596 +    uint k2 = queue_num;
   1.597 +    while (k2 == queue_num || k2 == k1) k2 = TaskQueueSetSuper::randomParkAndMiller(seed) % _n;
   1.598 +    // Sample both and try the larger.
   1.599 +    uint sz1 = _queues[k1]->size();
   1.600 +    uint sz2 = _queues[k2]->size();
   1.601 +    if (sz2 > sz1) return _queues[k2]->pop_global(t);
   1.602 +    else return _queues[k1]->pop_global(t);
   1.603 +  } else if (_n == 2) {
   1.604 +    // Just try the other one.
   1.605 +    uint k = (queue_num + 1) % 2;
   1.606 +    return _queues[k]->pop_global(t);
   1.607 +  } else {
   1.608 +    assert(_n == 1, "can't be zero.");
   1.609 +    return false;
   1.610 +  }
   1.611 +}
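          +// The method above implements a "best of two choices" steal: two victim
          +// queues are sampled and the longer one is robbed.  For example
          +// (illustrative), with sz1 == 3 and sz2 == 10 the steal goes to
          +// _queues[k2], which balances load noticeably better than a single
          +// uniformly random probe.
          +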
   1.612 +
   1.613 +template<class T, MEMFLAGS F>
   1.614 +bool GenericTaskQueueSet<T, F>::peek() {
   1.615 +  // Try all the queues.
   1.616 +  for (uint j = 0; j < _n; j++) {
   1.617 +    if (_queues[j]->peek())
   1.618 +      return true;
   1.619 +  }
   1.620 +  return false;
   1.621 +}
   1.622 +
   1.623 +// When to terminate from the termination protocol.
   1.624 +class TerminatorTerminator: public CHeapObj<mtInternal> {
   1.625 +public:
   1.626 +  virtual bool should_exit_termination() = 0;
   1.627 +};
   1.628 +
   1.629 +// A class to aid in the termination of a set of parallel tasks using
   1.630 +// TaskQueueSet's for work stealing.
   1.631 +
   1.632 +#undef TRACESPINNING
   1.633 +
   1.634 +class ParallelTaskTerminator: public StackObj {
   1.635 +private:
   1.636 +  int _n_threads;
   1.637 +  TaskQueueSetSuper* _queue_set;
   1.638 +  int _offered_termination;
   1.639 +
   1.640 +#ifdef TRACESPINNING
   1.641 +  static uint _total_yields;
   1.642 +  static uint _total_spins;
   1.643 +  static uint _total_peeks;
   1.644 +#endif
   1.645 +
   1.646 +  bool peek_in_queue_set();
   1.647 +protected:
   1.648 +  virtual void yield();
   1.649 +  void sleep(uint millis);
   1.650 +
   1.651 +public:
   1.652 +
   1.653 +  // "n_threads" is the number of threads to be terminated.  "queue_set" is a
   1.654 +  // queue sets of work queues of other threads.
   1.655 +  ParallelTaskTerminator(int n_threads, TaskQueueSetSuper* queue_set);
   1.656 +
    1.657 +  // The current thread has no work, and is ready to terminate if everyone
    1.658 +  // else is.  If it returns "true", all threads are terminated.  If it
    1.659 +  // returns "false", available work has been observed in one of the task
    1.660 +  // queues, so the global task is not complete.
   1.661 +  bool offer_termination() {
   1.662 +    return offer_termination(NULL);
   1.663 +  }
   1.664 +
   1.665 +  // As above, but it also terminates if the should_exit_termination()
   1.666 +  // method of the terminator parameter returns true. If terminator is
   1.667 +  // NULL, then it is ignored.
   1.668 +  bool offer_termination(TerminatorTerminator* terminator);
   1.669 +
   1.670 +  // Reset the terminator, so that it may be reused again.
   1.671 +  // The caller is responsible for ensuring that this is done
   1.672 +  // in an MT-safe manner, once the previous round of use of
   1.673 +  // the terminator is finished.
   1.674 +  void reset_for_reuse();
   1.675 +  // Same as above but the number of parallel threads is set to the
   1.676 +  // given number.
   1.677 +  void reset_for_reuse(int n_threads);
   1.678 +
   1.679 +#ifdef TRACESPINNING
   1.680 +  static uint total_yields() { return _total_yields; }
   1.681 +  static uint total_spins() { return _total_spins; }
   1.682 +  static uint total_peeks() { return _total_peeks; }
   1.683 +  static void print_termination_counts();
   1.684 +#endif
   1.685 +};
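          +// Typical worker-loop sketch (illustrative; `process`, `my_id`, `seed` and
          +// `n_workers` are hypothetical, and each queue is assumed to have been
          +// created, initialize()d and register_queue()d during setup).  A worker
          +// drains its own queue, then steals, and finally offers termination:
          +//
          +//   OopTaskQueueSet* qset = new OopTaskQueueSet(n_workers);
          +//   ParallelTaskTerminator terminator(n_workers, qset);
          +//   oop task;
          +//   do {
          +//     while (qset->queue(my_id)->pop_local(task) ||
          +//            qset->steal(my_id, &seed, task)) {
          +//       process(task);
          +//     }
          +//   } while (!terminator.offer_termination());
          +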
   1.686 +
   1.687 +template<class E, MEMFLAGS F, unsigned int N> inline bool
   1.688 +GenericTaskQueue<E, F, N>::push(E t) {
   1.689 +  uint localBot = _bottom;
   1.690 +  assert(localBot < N, "_bottom out of range.");
   1.691 +  idx_t top = _age.top();
   1.692 +  uint dirty_n_elems = dirty_size(localBot, top);
   1.693 +  assert(dirty_n_elems < N, "n_elems out of range.");
   1.694 +  if (dirty_n_elems < max_elems()) {
   1.695 +    // g++ complains if the volatile result of the assignment is
   1.696 +    // unused, so we cast the volatile away.  We cannot cast directly
   1.697 +    // to void, because gcc treats that as not using the result of the
   1.698 +    // assignment.  However, casting to E& means that we trigger an
   1.699 +    // unused-value warning.  So, we cast the E& to void.
   1.700 +    (void) const_cast<E&>(_elems[localBot] = t);
   1.701 +    OrderAccess::release_store(&_bottom, increment_index(localBot));
   1.702 +    TASKQUEUE_STATS_ONLY(stats.record_push());
   1.703 +    return true;
   1.704 +  } else {
   1.705 +    return push_slow(t, dirty_n_elems);
   1.706 +  }
   1.707 +}
   1.708 +
   1.709 +template<class E, MEMFLAGS F, unsigned int N> inline bool
   1.710 +GenericTaskQueue<E, F, N>::pop_local(volatile E& t) {
   1.711 +  uint localBot = _bottom;
   1.712 +  // This value cannot be N-1.  That can only occur as a result of
   1.713 +  // the assignment to bottom in this method.  If it does, this method
   1.714 +  // resets the size to 0 before the next call (which is sequential,
   1.715 +  // since this is pop_local.)
   1.716 +  uint dirty_n_elems = dirty_size(localBot, _age.top());
   1.717 +  assert(dirty_n_elems != N - 1, "Shouldn't be possible...");
   1.718 +  if (dirty_n_elems == 0) return false;
   1.719 +  localBot = decrement_index(localBot);
   1.720 +  _bottom = localBot;
   1.721 +  // This is necessary to prevent any read below from being reordered
   1.722 +  // before the store just above.
   1.723 +  OrderAccess::fence();
   1.724 +  // g++ complains if the volatile result of the assignment is
   1.725 +  // unused, so we cast the volatile away.  We cannot cast directly
   1.726 +  // to void, because gcc treats that as not using the result of the
   1.727 +  // assignment.  However, casting to E& means that we trigger an
   1.728 +  // unused-value warning.  So, we cast the E& to void.
   1.729 +  (void) const_cast<E&>(t = _elems[localBot]);
    1.730 +  // This is a second read of "age"; the dirty_size() above is the first.
   1.731 +  // If there's still at least one element in the queue, based on the
   1.732 +  // "_bottom" and "age" we've read, then there can be no interference with
   1.733 +  // a "pop_global" operation, and we're done.
   1.734 +  idx_t tp = _age.top();    // XXX
   1.735 +  if (size(localBot, tp) > 0) {
   1.736 +    assert(dirty_size(localBot, tp) != N - 1, "sanity");
   1.737 +    TASKQUEUE_STATS_ONLY(stats.record_pop());
   1.738 +    return true;
   1.739 +  } else {
   1.740 +    // Otherwise, the queue contained exactly one element; we take the slow
   1.741 +    // path.
   1.742 +    return pop_local_slow(localBot, _age.get());
   1.743 +  }
   1.744 +}
   1.745 +
   1.746 +typedef GenericTaskQueue<oop, mtGC>             OopTaskQueue;
   1.747 +typedef GenericTaskQueueSet<OopTaskQueue, mtGC> OopTaskQueueSet;
   1.748 +
   1.749 +#ifdef _MSC_VER
   1.750 +#pragma warning(push)
   1.751 +// warning C4522: multiple assignment operators specified
   1.752 +#pragma warning(disable:4522)
   1.753 +#endif
   1.754 +
   1.755 +// This is a container class for either an oop* or a narrowOop*.
   1.756 +// Both are pushed onto a task queue and the consumer will test is_narrow()
   1.757 +// to determine which should be processed.
   1.758 +class StarTask {
    1.759 +  void*  _holder;        // either an oop* or a narrowOop*, tagged in the low bit
   1.760 +
   1.761 +  enum { COMPRESSED_OOP_MASK = 1 };
   1.762 +
   1.763 + public:
   1.764 +  StarTask(narrowOop* p) {
   1.765 +    assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!");
   1.766 +    _holder = (void *)((uintptr_t)p | COMPRESSED_OOP_MASK);
   1.767 +  }
   1.768 +  StarTask(oop* p)       {
   1.769 +    assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!");
   1.770 +    _holder = (void*)p;
   1.771 +  }
   1.772 +  StarTask()             { _holder = NULL; }
   1.773 +  operator oop*()        { return (oop*)_holder; }
   1.774 +  operator narrowOop*()  {
   1.775 +    return (narrowOop*)((uintptr_t)_holder & ~COMPRESSED_OOP_MASK);
   1.776 +  }
   1.777 +
   1.778 +  StarTask& operator=(const StarTask& t) {
   1.779 +    _holder = t._holder;
   1.780 +    return *this;
   1.781 +  }
   1.782 +  volatile StarTask& operator=(const volatile StarTask& t) volatile {
   1.783 +    _holder = t._holder;
   1.784 +    return *this;
   1.785 +  }
   1.786 +
   1.787 +  bool is_narrow() const {
   1.788 +    return (((uintptr_t)_holder & COMPRESSED_OOP_MASK) != 0);
   1.789 +  }
   1.790 +};
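          +// Consumer sketch (illustrative; `do_oop_work` and `queue` are
          +// hypothetical): the low tag bit recovers the pointer width on pop.
          +//
          +//   StarTask task;
          +//   if (queue->pop_local(task)) {
          +//     if (task.is_narrow()) do_oop_work((narrowOop*)task);
          +//     else                  do_oop_work((oop*)task);
          +//   }
          +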
   1.791 +
   1.792 +class ObjArrayTask
   1.793 +{
   1.794 +public:
   1.795 +  ObjArrayTask(oop o = NULL, int idx = 0): _obj(o), _index(idx) { }
   1.796 +  ObjArrayTask(oop o, size_t idx): _obj(o), _index(int(idx)) {
   1.797 +    assert(idx <= size_t(max_jint), "too big");
   1.798 +  }
   1.799 +  ObjArrayTask(const ObjArrayTask& t): _obj(t._obj), _index(t._index) { }
   1.800 +
   1.801 +  ObjArrayTask& operator =(const ObjArrayTask& t) {
   1.802 +    _obj = t._obj;
   1.803 +    _index = t._index;
   1.804 +    return *this;
   1.805 +  }
   1.806 +  volatile ObjArrayTask&
   1.807 +  operator =(const volatile ObjArrayTask& t) volatile {
   1.808 +    (void)const_cast<oop&>(_obj = t._obj);
   1.809 +    _index = t._index;
   1.810 +    return *this;
   1.811 +  }
   1.812 +
   1.813 +  inline oop obj()   const { return _obj; }
   1.814 +  inline int index() const { return _index; }
   1.815 +
   1.816 +  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.
   1.817 +
   1.818 +private:
   1.819 +  oop _obj;
   1.820 +  int _index;
   1.821 +};
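          +// Usage note (illustrative): an ObjArrayTask records a partially scanned
          +// object array; e.g. ObjArrayTask(big_array, 128) resumes scanning at
          +// element 128, letting huge arrays be split into stealable chunks.
          +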
   1.822 +
   1.823 +#ifdef _MSC_VER
   1.824 +#pragma warning(pop)
   1.825 +#endif
   1.826 +
   1.827 +typedef OverflowTaskQueue<StarTask, mtClass>           OopStarTaskQueue;
   1.828 +typedef GenericTaskQueueSet<OopStarTaskQueue, mtClass> OopStarTaskQueueSet;
   1.829 +
   1.830 +typedef OverflowTaskQueue<size_t, mtInternal>             RegionTaskQueue;
   1.831 +typedef GenericTaskQueueSet<RegionTaskQueue, mtClass>     RegionTaskQueueSet;
   1.832 +
   1.833 +
   1.834 +#endif // SHARE_VM_UTILITIES_TASKQUEUE_HPP
