1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/share/vm/utilities/taskqueue.hpp Wed Apr 27 01:25:04 2016 +0800 1.3 @@ -0,0 +1,831 @@ 1.4 +/* 1.5 + * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved. 1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 1.7 + * 1.8 + * This code is free software; you can redistribute it and/or modify it 1.9 + * under the terms of the GNU General Public License version 2 only, as 1.10 + * published by the Free Software Foundation. 1.11 + * 1.12 + * This code is distributed in the hope that it will be useful, but WITHOUT 1.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 1.14 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 1.15 + * version 2 for more details (a copy is included in the LICENSE file that 1.16 + * accompanied this code). 1.17 + * 1.18 + * You should have received a copy of the GNU General Public License version 1.19 + * 2 along with this work; if not, write to the Free Software Foundation, 1.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 1.21 + * 1.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 1.23 + * or visit www.oracle.com if you need additional information or have any 1.24 + * questions. 1.25 + * 1.26 + */ 1.27 + 1.28 +#ifndef SHARE_VM_UTILITIES_TASKQUEUE_HPP 1.29 +#define SHARE_VM_UTILITIES_TASKQUEUE_HPP 1.30 + 1.31 +#include "memory/allocation.hpp" 1.32 +#include "memory/allocation.inline.hpp" 1.33 +#include "runtime/mutex.hpp" 1.34 +#include "utilities/stack.hpp" 1.35 +#ifdef TARGET_OS_ARCH_linux_x86 1.36 +# include "orderAccess_linux_x86.inline.hpp" 1.37 +#endif 1.38 +#ifdef TARGET_OS_ARCH_linux_sparc 1.39 +# include "orderAccess_linux_sparc.inline.hpp" 1.40 +#endif 1.41 +#ifdef TARGET_OS_ARCH_linux_zero 1.42 +# include "orderAccess_linux_zero.inline.hpp" 1.43 +#endif 1.44 +#ifdef TARGET_OS_ARCH_solaris_x86 1.45 +# include "orderAccess_solaris_x86.inline.hpp" 1.46 +#endif 1.47 +#ifdef TARGET_OS_ARCH_solaris_sparc 1.48 +# include "orderAccess_solaris_sparc.inline.hpp" 1.49 +#endif 1.50 +#ifdef TARGET_OS_ARCH_windows_x86 1.51 +# include "orderAccess_windows_x86.inline.hpp" 1.52 +#endif 1.53 +#ifdef TARGET_OS_ARCH_linux_arm 1.54 +# include "orderAccess_linux_arm.inline.hpp" 1.55 +#endif 1.56 +#ifdef TARGET_OS_ARCH_linux_ppc 1.57 +# include "orderAccess_linux_ppc.inline.hpp" 1.58 +#endif 1.59 +#ifdef TARGET_OS_ARCH_aix_ppc 1.60 +# include "orderAccess_aix_ppc.inline.hpp" 1.61 +#endif 1.62 +#ifdef TARGET_OS_ARCH_bsd_x86 1.63 +# include "orderAccess_bsd_x86.inline.hpp" 1.64 +#endif 1.65 +#ifdef TARGET_OS_ARCH_bsd_zero 1.66 +# include "orderAccess_bsd_zero.inline.hpp" 1.67 +#endif 1.68 + 1.69 +// Simple TaskQueue stats that are collected by default in debug builds. 1.70 + 1.71 +#if !defined(TASKQUEUE_STATS) && defined(ASSERT) 1.72 +#define TASKQUEUE_STATS 1 1.73 +#elif !defined(TASKQUEUE_STATS) 1.74 +#define TASKQUEUE_STATS 0 1.75 +#endif 1.76 + 1.77 +#if TASKQUEUE_STATS 1.78 +#define TASKQUEUE_STATS_ONLY(code) code 1.79 +#else 1.80 +#define TASKQUEUE_STATS_ONLY(code) 1.81 +#endif // TASKQUEUE_STATS 1.82 + 1.83 +#if TASKQUEUE_STATS 1.84 +class TaskQueueStats { 1.85 +public: 1.86 + enum StatId { 1.87 + push, // number of taskqueue pushes 1.88 + pop, // number of taskqueue pops 1.89 + pop_slow, // subset of taskqueue pops that were done slow-path 1.90 + steal_attempt, // number of taskqueue steal attempts 1.91 + steal, // number of taskqueue steals 1.92 + overflow, // number of overflow pushes 1.93 + overflow_max_len, // max length of overflow stack 1.94 + last_stat_id 1.95 + }; 1.96 + 1.97 +public: 1.98 + inline TaskQueueStats() { reset(); } 1.99 + 1.100 + inline void record_push() { ++_stats[push]; } 1.101 + inline void record_pop() { ++_stats[pop]; } 1.102 + inline void record_pop_slow() { record_pop(); ++_stats[pop_slow]; } 1.103 + inline void record_steal(bool success); 1.104 + inline void record_overflow(size_t new_length); 1.105 + 1.106 + TaskQueueStats & operator +=(const TaskQueueStats & addend); 1.107 + 1.108 + inline size_t get(StatId id) const { return _stats[id]; } 1.109 + inline const size_t* get() const { return _stats; } 1.110 + 1.111 + inline void reset(); 1.112 + 1.113 + // Print the specified line of the header (does not include a line separator). 1.114 + static void print_header(unsigned int line, outputStream* const stream = tty, 1.115 + unsigned int width = 10); 1.116 + // Print the statistics (does not include a line separator). 1.117 + void print(outputStream* const stream = tty, unsigned int width = 10) const; 1.118 + 1.119 + DEBUG_ONLY(void verify() const;) 1.120 + 1.121 +private: 1.122 + size_t _stats[last_stat_id]; 1.123 + static const char * const _names[last_stat_id]; 1.124 +}; 1.125 + 1.126 +void TaskQueueStats::record_steal(bool success) { 1.127 + ++_stats[steal_attempt]; 1.128 + if (success) ++_stats[steal]; 1.129 +} 1.130 + 1.131 +void TaskQueueStats::record_overflow(size_t new_len) { 1.132 + ++_stats[overflow]; 1.133 + if (new_len > _stats[overflow_max_len]) _stats[overflow_max_len] = new_len; 1.134 +} 1.135 + 1.136 +void TaskQueueStats::reset() { 1.137 + memset(_stats, 0, sizeof(_stats)); 1.138 +} 1.139 +#endif // TASKQUEUE_STATS 1.140 + 1.141 +// TaskQueueSuper collects functionality common to all GenericTaskQueue instances. 1.142 + 1.143 +template <unsigned int N, MEMFLAGS F> 1.144 +class TaskQueueSuper: public CHeapObj<F> { 1.145 +protected: 1.146 + // Internal type for indexing the queue; also used for the tag. 1.147 + typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t; 1.148 + 1.149 + // The first free element after the last one pushed (mod N). 1.150 + volatile uint _bottom; 1.151 + 1.152 + enum { MOD_N_MASK = N - 1 }; 1.153 + 1.154 + class Age { 1.155 + public: 1.156 + Age(size_t data = 0) { _data = data; } 1.157 + Age(const Age& age) { _data = age._data; } 1.158 + Age(idx_t top, idx_t tag) { _fields._top = top; _fields._tag = tag; } 1.159 + 1.160 + Age get() const volatile { return _data; } 1.161 + void set(Age age) volatile { _data = age._data; } 1.162 + 1.163 + idx_t top() const volatile { return _fields._top; } 1.164 + idx_t tag() const volatile { return _fields._tag; } 1.165 + 1.166 + // Increment top; if it wraps, increment tag also. 1.167 + void increment() { 1.168 + _fields._top = increment_index(_fields._top); 1.169 + if (_fields._top == 0) ++_fields._tag; 1.170 + } 1.171 + 1.172 + Age cmpxchg(const Age new_age, const Age old_age) volatile { 1.173 + return (size_t) Atomic::cmpxchg_ptr((intptr_t)new_age._data, 1.174 + (volatile intptr_t *)&_data, 1.175 + (intptr_t)old_age._data); 1.176 + } 1.177 + 1.178 + bool operator ==(const Age& other) const { return _data == other._data; } 1.179 + 1.180 + private: 1.181 + struct fields { 1.182 + idx_t _top; 1.183 + idx_t _tag; 1.184 + }; 1.185 + union { 1.186 + size_t _data; 1.187 + fields _fields; 1.188 + }; 1.189 + }; 1.190 + 1.191 + volatile Age _age; 1.192 + 1.193 + // These both operate mod N. 1.194 + static uint increment_index(uint ind) { 1.195 + return (ind + 1) & MOD_N_MASK; 1.196 + } 1.197 + static uint decrement_index(uint ind) { 1.198 + return (ind - 1) & MOD_N_MASK; 1.199 + } 1.200 + 1.201 + // Returns a number in the range [0..N). If the result is "N-1", it should be 1.202 + // interpreted as 0. 1.203 + uint dirty_size(uint bot, uint top) const { 1.204 + return (bot - top) & MOD_N_MASK; 1.205 + } 1.206 + 1.207 + // Returns the size corresponding to the given "bot" and "top". 1.208 + uint size(uint bot, uint top) const { 1.209 + uint sz = dirty_size(bot, top); 1.210 + // Has the queue "wrapped", so that bottom is less than top? There's a 1.211 + // complicated special case here. A pair of threads could perform pop_local 1.212 + // and pop_global operations concurrently, starting from a state in which 1.213 + // _bottom == _top+1. The pop_local could succeed in decrementing _bottom, 1.214 + // and the pop_global in incrementing _top (in which case the pop_global 1.215 + // will be awarded the contested queue element.) The resulting state must 1.216 + // be interpreted as an empty queue. (We only need to worry about one such 1.217 + // event: only the queue owner performs pop_local's, and several concurrent 1.218 + // threads attempting to perform the pop_global will all perform the same 1.219 + // CAS, and only one can succeed.) Any stealing thread that reads after 1.220 + // either the increment or decrement will see an empty queue, and will not 1.221 + // join the competitors. The "sz == -1 || sz == N-1" state will not be 1.222 + // modified by concurrent queues, so the owner thread can reset the state to 1.223 + // _bottom == top so subsequent pushes will be performed normally. 1.224 + return (sz == N - 1) ? 0 : sz; 1.225 + } 1.226 + 1.227 +public: 1.228 + TaskQueueSuper() : _bottom(0), _age() {} 1.229 + 1.230 + // Return true if the TaskQueue contains/does not contain any tasks. 1.231 + bool peek() const { return _bottom != _age.top(); } 1.232 + bool is_empty() const { return size() == 0; } 1.233 + 1.234 + // Return an estimate of the number of elements in the queue. 1.235 + // The "careful" version admits the possibility of pop_local/pop_global 1.236 + // races. 1.237 + uint size() const { 1.238 + return size(_bottom, _age.top()); 1.239 + } 1.240 + 1.241 + uint dirty_size() const { 1.242 + return dirty_size(_bottom, _age.top()); 1.243 + } 1.244 + 1.245 + void set_empty() { 1.246 + _bottom = 0; 1.247 + _age.set(0); 1.248 + } 1.249 + 1.250 + // Maximum number of elements allowed in the queue. This is two less 1.251 + // than the actual queue size, for somewhat complicated reasons. 1.252 + uint max_elems() const { return N - 2; } 1.253 + 1.254 + // Total size of queue. 1.255 + static const uint total_size() { return N; } 1.256 + 1.257 + TASKQUEUE_STATS_ONLY(TaskQueueStats stats;) 1.258 +}; 1.259 + 1.260 +// 1.261 +// GenericTaskQueue implements an ABP, Aurora-Blumofe-Plaxton, double- 1.262 +// ended-queue (deque), intended for use in work stealing. Queue operations 1.263 +// are non-blocking. 1.264 +// 1.265 +// A queue owner thread performs push() and pop_local() operations on one end 1.266 +// of the queue, while other threads may steal work using the pop_global() 1.267 +// method. 1.268 +// 1.269 +// The main difference to the original algorithm is that this 1.270 +// implementation allows wrap-around at the end of its allocated 1.271 +// storage, which is an array. 1.272 +// 1.273 +// The original paper is: 1.274 +// 1.275 +// Arora, N. S., Blumofe, R. D., and Plaxton, C. G. 1.276 +// Thread scheduling for multiprogrammed multiprocessors. 1.277 +// Theory of Computing Systems 34, 2 (2001), 115-144. 1.278 +// 1.279 +// The following paper provides an correctness proof and an 1.280 +// implementation for weakly ordered memory models including (pseudo-) 1.281 +// code containing memory barriers for a Chase-Lev deque. Chase-Lev is 1.282 +// similar to ABP, with the main difference that it allows resizing of the 1.283 +// underlying storage: 1.284 +// 1.285 +// Le, N. M., Pop, A., Cohen A., and Nardell, F. Z. 1.286 +// Correct and efficient work-stealing for weak memory models 1.287 +// Proceedings of the 18th ACM SIGPLAN symposium on Principles and 1.288 +// practice of parallel programming (PPoPP 2013), 69-80 1.289 +// 1.290 + 1.291 +template <class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE> 1.292 +class GenericTaskQueue: public TaskQueueSuper<N, F> { 1.293 + ArrayAllocator<E, F> _array_allocator; 1.294 +protected: 1.295 + typedef typename TaskQueueSuper<N, F>::Age Age; 1.296 + typedef typename TaskQueueSuper<N, F>::idx_t idx_t; 1.297 + 1.298 + using TaskQueueSuper<N, F>::_bottom; 1.299 + using TaskQueueSuper<N, F>::_age; 1.300 + using TaskQueueSuper<N, F>::increment_index; 1.301 + using TaskQueueSuper<N, F>::decrement_index; 1.302 + using TaskQueueSuper<N, F>::dirty_size; 1.303 + 1.304 +public: 1.305 + using TaskQueueSuper<N, F>::max_elems; 1.306 + using TaskQueueSuper<N, F>::size; 1.307 + 1.308 +#if TASKQUEUE_STATS 1.309 + using TaskQueueSuper<N, F>::stats; 1.310 +#endif 1.311 + 1.312 +private: 1.313 + // Slow paths for push, pop_local. (pop_global has no fast path.) 1.314 + bool push_slow(E t, uint dirty_n_elems); 1.315 + bool pop_local_slow(uint localBot, Age oldAge); 1.316 + 1.317 +public: 1.318 + typedef E element_type; 1.319 + 1.320 + // Initializes the queue to empty. 1.321 + GenericTaskQueue(); 1.322 + 1.323 + void initialize(); 1.324 + 1.325 + // Push the task "t" on the queue. Returns "false" iff the queue is full. 1.326 + inline bool push(E t); 1.327 + 1.328 + // Attempts to claim a task from the "local" end of the queue (the most 1.329 + // recently pushed). If successful, returns true and sets t to the task; 1.330 + // otherwise, returns false (the queue is empty). 1.331 + inline bool pop_local(volatile E& t); 1.332 + 1.333 + // Like pop_local(), but uses the "global" end of the queue (the least 1.334 + // recently pushed). 1.335 + bool pop_global(volatile E& t); 1.336 + 1.337 + // Delete any resource associated with the queue. 1.338 + ~GenericTaskQueue(); 1.339 + 1.340 + // apply the closure to all elements in the task queue 1.341 + void oops_do(OopClosure* f); 1.342 + 1.343 +private: 1.344 + // Element array. 1.345 + volatile E* _elems; 1.346 +}; 1.347 + 1.348 +template<class E, MEMFLAGS F, unsigned int N> 1.349 +GenericTaskQueue<E, F, N>::GenericTaskQueue() { 1.350 + assert(sizeof(Age) == sizeof(size_t), "Depends on this."); 1.351 +} 1.352 + 1.353 +template<class E, MEMFLAGS F, unsigned int N> 1.354 +void GenericTaskQueue<E, F, N>::initialize() { 1.355 + _elems = _array_allocator.allocate(N); 1.356 +} 1.357 + 1.358 +template<class E, MEMFLAGS F, unsigned int N> 1.359 +void GenericTaskQueue<E, F, N>::oops_do(OopClosure* f) { 1.360 + // tty->print_cr("START OopTaskQueue::oops_do"); 1.361 + uint iters = size(); 1.362 + uint index = _bottom; 1.363 + for (uint i = 0; i < iters; ++i) { 1.364 + index = decrement_index(index); 1.365 + // tty->print_cr(" doing entry %d," INTPTR_T " -> " INTPTR_T, 1.366 + // index, &_elems[index], _elems[index]); 1.367 + E* t = (E*)&_elems[index]; // cast away volatility 1.368 + oop* p = (oop*)t; 1.369 + assert((*t)->is_oop_or_null(), "Not an oop or null"); 1.370 + f->do_oop(p); 1.371 + } 1.372 + // tty->print_cr("END OopTaskQueue::oops_do"); 1.373 +} 1.374 + 1.375 +template<class E, MEMFLAGS F, unsigned int N> 1.376 +bool GenericTaskQueue<E, F, N>::push_slow(E t, uint dirty_n_elems) { 1.377 + if (dirty_n_elems == N - 1) { 1.378 + // Actually means 0, so do the push. 1.379 + uint localBot = _bottom; 1.380 + // g++ complains if the volatile result of the assignment is 1.381 + // unused, so we cast the volatile away. We cannot cast directly 1.382 + // to void, because gcc treats that as not using the result of the 1.383 + // assignment. However, casting to E& means that we trigger an 1.384 + // unused-value warning. So, we cast the E& to void. 1.385 + (void)const_cast<E&>(_elems[localBot] = t); 1.386 + OrderAccess::release_store(&_bottom, increment_index(localBot)); 1.387 + TASKQUEUE_STATS_ONLY(stats.record_push()); 1.388 + return true; 1.389 + } 1.390 + return false; 1.391 +} 1.392 + 1.393 +// pop_local_slow() is done by the owning thread and is trying to 1.394 +// get the last task in the queue. It will compete with pop_global() 1.395 +// that will be used by other threads. The tag age is incremented 1.396 +// whenever the queue goes empty which it will do here if this thread 1.397 +// gets the last task or in pop_global() if the queue wraps (top == 0 1.398 +// and pop_global() succeeds, see pop_global()). 1.399 +template<class E, MEMFLAGS F, unsigned int N> 1.400 +bool GenericTaskQueue<E, F, N>::pop_local_slow(uint localBot, Age oldAge) { 1.401 + // This queue was observed to contain exactly one element; either this 1.402 + // thread will claim it, or a competing "pop_global". In either case, 1.403 + // the queue will be logically empty afterwards. Create a new Age value 1.404 + // that represents the empty queue for the given value of "_bottom". (We 1.405 + // must also increment "tag" because of the case where "bottom == 1", 1.406 + // "top == 0". A pop_global could read the queue element in that case, 1.407 + // then have the owner thread do a pop followed by another push. Without 1.408 + // the incrementing of "tag", the pop_global's CAS could succeed, 1.409 + // allowing it to believe it has claimed the stale element.) 1.410 + Age newAge((idx_t)localBot, oldAge.tag() + 1); 1.411 + // Perhaps a competing pop_global has already incremented "top", in which 1.412 + // case it wins the element. 1.413 + if (localBot == oldAge.top()) { 1.414 + // No competing pop_global has yet incremented "top"; we'll try to 1.415 + // install new_age, thus claiming the element. 1.416 + Age tempAge = _age.cmpxchg(newAge, oldAge); 1.417 + if (tempAge == oldAge) { 1.418 + // We win. 1.419 + assert(dirty_size(localBot, _age.top()) != N - 1, "sanity"); 1.420 + TASKQUEUE_STATS_ONLY(stats.record_pop_slow()); 1.421 + return true; 1.422 + } 1.423 + } 1.424 + // We lose; a completing pop_global gets the element. But the queue is empty 1.425 + // and top is greater than bottom. Fix this representation of the empty queue 1.426 + // to become the canonical one. 1.427 + _age.set(newAge); 1.428 + assert(dirty_size(localBot, _age.top()) != N - 1, "sanity"); 1.429 + return false; 1.430 +} 1.431 + 1.432 +template<class E, MEMFLAGS F, unsigned int N> 1.433 +bool GenericTaskQueue<E, F, N>::pop_global(volatile E& t) { 1.434 + Age oldAge = _age.get(); 1.435 + // Architectures with weak memory model require a barrier here 1.436 + // to guarantee that bottom is not older than age, 1.437 + // which is crucial for the correctness of the algorithm. 1.438 +#if !(defined SPARC || defined IA32 || defined AMD64) 1.439 + OrderAccess::fence(); 1.440 +#endif 1.441 + uint localBot = OrderAccess::load_acquire((volatile juint*)&_bottom); 1.442 + uint n_elems = size(localBot, oldAge.top()); 1.443 + if (n_elems == 0) { 1.444 + return false; 1.445 + } 1.446 + 1.447 + // g++ complains if the volatile result of the assignment is 1.448 + // unused, so we cast the volatile away. We cannot cast directly 1.449 + // to void, because gcc treats that as not using the result of the 1.450 + // assignment. However, casting to E& means that we trigger an 1.451 + // unused-value warning. So, we cast the E& to void. 1.452 + (void) const_cast<E&>(t = _elems[oldAge.top()]); 1.453 + Age newAge(oldAge); 1.454 + newAge.increment(); 1.455 + Age resAge = _age.cmpxchg(newAge, oldAge); 1.456 + 1.457 + // Note that using "_bottom" here might fail, since a pop_local might 1.458 + // have decremented it. 1.459 + assert(dirty_size(localBot, newAge.top()) != N - 1, "sanity"); 1.460 + return resAge == oldAge; 1.461 +} 1.462 + 1.463 +template<class E, MEMFLAGS F, unsigned int N> 1.464 +GenericTaskQueue<E, F, N>::~GenericTaskQueue() { 1.465 + FREE_C_HEAP_ARRAY(E, _elems, F); 1.466 +} 1.467 + 1.468 +// OverflowTaskQueue is a TaskQueue that also includes an overflow stack for 1.469 +// elements that do not fit in the TaskQueue. 1.470 +// 1.471 +// This class hides two methods from super classes: 1.472 +// 1.473 +// push() - push onto the task queue or, if that fails, onto the overflow stack 1.474 +// is_empty() - return true if both the TaskQueue and overflow stack are empty 1.475 +// 1.476 +// Note that size() is not hidden--it returns the number of elements in the 1.477 +// TaskQueue, and does not include the size of the overflow stack. This 1.478 +// simplifies replacement of GenericTaskQueues with OverflowTaskQueues. 1.479 +template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE> 1.480 +class OverflowTaskQueue: public GenericTaskQueue<E, F, N> 1.481 +{ 1.482 +public: 1.483 + typedef Stack<E, F> overflow_t; 1.484 + typedef GenericTaskQueue<E, F, N> taskqueue_t; 1.485 + 1.486 + TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;) 1.487 + 1.488 + // Push task t onto the queue or onto the overflow stack. Return true. 1.489 + inline bool push(E t); 1.490 + 1.491 + // Attempt to pop from the overflow stack; return true if anything was popped. 1.492 + inline bool pop_overflow(E& t); 1.493 + 1.494 + inline overflow_t* overflow_stack() { return &_overflow_stack; } 1.495 + 1.496 + inline bool taskqueue_empty() const { return taskqueue_t::is_empty(); } 1.497 + inline bool overflow_empty() const { return _overflow_stack.is_empty(); } 1.498 + inline bool is_empty() const { 1.499 + return taskqueue_empty() && overflow_empty(); 1.500 + } 1.501 + 1.502 +private: 1.503 + overflow_t _overflow_stack; 1.504 +}; 1.505 + 1.506 +template <class E, MEMFLAGS F, unsigned int N> 1.507 +bool OverflowTaskQueue<E, F, N>::push(E t) 1.508 +{ 1.509 + if (!taskqueue_t::push(t)) { 1.510 + overflow_stack()->push(t); 1.511 + TASKQUEUE_STATS_ONLY(stats.record_overflow(overflow_stack()->size())); 1.512 + } 1.513 + return true; 1.514 +} 1.515 + 1.516 +template <class E, MEMFLAGS F, unsigned int N> 1.517 +bool OverflowTaskQueue<E, F, N>::pop_overflow(E& t) 1.518 +{ 1.519 + if (overflow_empty()) return false; 1.520 + t = overflow_stack()->pop(); 1.521 + return true; 1.522 +} 1.523 + 1.524 +class TaskQueueSetSuper { 1.525 +protected: 1.526 + static int randomParkAndMiller(int* seed0); 1.527 +public: 1.528 + // Returns "true" if some TaskQueue in the set contains a task. 1.529 + virtual bool peek() = 0; 1.530 +}; 1.531 + 1.532 +template <MEMFLAGS F> class TaskQueueSetSuperImpl: public CHeapObj<F>, public TaskQueueSetSuper { 1.533 +}; 1.534 + 1.535 +template<class T, MEMFLAGS F> 1.536 +class GenericTaskQueueSet: public TaskQueueSetSuperImpl<F> { 1.537 +private: 1.538 + uint _n; 1.539 + T** _queues; 1.540 + 1.541 +public: 1.542 + typedef typename T::element_type E; 1.543 + 1.544 + GenericTaskQueueSet(int n) : _n(n) { 1.545 + typedef T* GenericTaskQueuePtr; 1.546 + _queues = NEW_C_HEAP_ARRAY(GenericTaskQueuePtr, n, F); 1.547 + for (int i = 0; i < n; i++) { 1.548 + _queues[i] = NULL; 1.549 + } 1.550 + } 1.551 + 1.552 + bool steal_best_of_2(uint queue_num, int* seed, E& t); 1.553 + 1.554 + void register_queue(uint i, T* q); 1.555 + 1.556 + T* queue(uint n); 1.557 + 1.558 + // The thread with queue number "queue_num" (and whose random number seed is 1.559 + // at "seed") is trying to steal a task from some other queue. (It may try 1.560 + // several queues, according to some configuration parameter.) If some steal 1.561 + // succeeds, returns "true" and sets "t" to the stolen task, otherwise returns 1.562 + // false. 1.563 + bool steal(uint queue_num, int* seed, E& t); 1.564 + 1.565 + bool peek(); 1.566 +}; 1.567 + 1.568 +template<class T, MEMFLAGS F> void 1.569 +GenericTaskQueueSet<T, F>::register_queue(uint i, T* q) { 1.570 + assert(i < _n, "index out of range."); 1.571 + _queues[i] = q; 1.572 +} 1.573 + 1.574 +template<class T, MEMFLAGS F> T* 1.575 +GenericTaskQueueSet<T, F>::queue(uint i) { 1.576 + return _queues[i]; 1.577 +} 1.578 + 1.579 +template<class T, MEMFLAGS F> bool 1.580 +GenericTaskQueueSet<T, F>::steal(uint queue_num, int* seed, E& t) { 1.581 + for (uint i = 0; i < 2 * _n; i++) { 1.582 + if (steal_best_of_2(queue_num, seed, t)) { 1.583 + TASKQUEUE_STATS_ONLY(queue(queue_num)->stats.record_steal(true)); 1.584 + return true; 1.585 + } 1.586 + } 1.587 + TASKQUEUE_STATS_ONLY(queue(queue_num)->stats.record_steal(false)); 1.588 + return false; 1.589 +} 1.590 + 1.591 +template<class T, MEMFLAGS F> bool 1.592 +GenericTaskQueueSet<T, F>::steal_best_of_2(uint queue_num, int* seed, E& t) { 1.593 + if (_n > 2) { 1.594 + uint k1 = queue_num; 1.595 + while (k1 == queue_num) k1 = TaskQueueSetSuper::randomParkAndMiller(seed) % _n; 1.596 + uint k2 = queue_num; 1.597 + while (k2 == queue_num || k2 == k1) k2 = TaskQueueSetSuper::randomParkAndMiller(seed) % _n; 1.598 + // Sample both and try the larger. 1.599 + uint sz1 = _queues[k1]->size(); 1.600 + uint sz2 = _queues[k2]->size(); 1.601 + if (sz2 > sz1) return _queues[k2]->pop_global(t); 1.602 + else return _queues[k1]->pop_global(t); 1.603 + } else if (_n == 2) { 1.604 + // Just try the other one. 1.605 + uint k = (queue_num + 1) % 2; 1.606 + return _queues[k]->pop_global(t); 1.607 + } else { 1.608 + assert(_n == 1, "can't be zero."); 1.609 + return false; 1.610 + } 1.611 +} 1.612 + 1.613 +template<class T, MEMFLAGS F> 1.614 +bool GenericTaskQueueSet<T, F>::peek() { 1.615 + // Try all the queues. 1.616 + for (uint j = 0; j < _n; j++) { 1.617 + if (_queues[j]->peek()) 1.618 + return true; 1.619 + } 1.620 + return false; 1.621 +} 1.622 + 1.623 +// When to terminate from the termination protocol. 1.624 +class TerminatorTerminator: public CHeapObj<mtInternal> { 1.625 +public: 1.626 + virtual bool should_exit_termination() = 0; 1.627 +}; 1.628 + 1.629 +// A class to aid in the termination of a set of parallel tasks using 1.630 +// TaskQueueSet's for work stealing. 1.631 + 1.632 +#undef TRACESPINNING 1.633 + 1.634 +class ParallelTaskTerminator: public StackObj { 1.635 +private: 1.636 + int _n_threads; 1.637 + TaskQueueSetSuper* _queue_set; 1.638 + int _offered_termination; 1.639 + 1.640 +#ifdef TRACESPINNING 1.641 + static uint _total_yields; 1.642 + static uint _total_spins; 1.643 + static uint _total_peeks; 1.644 +#endif 1.645 + 1.646 + bool peek_in_queue_set(); 1.647 +protected: 1.648 + virtual void yield(); 1.649 + void sleep(uint millis); 1.650 + 1.651 +public: 1.652 + 1.653 + // "n_threads" is the number of threads to be terminated. "queue_set" is a 1.654 + // queue sets of work queues of other threads. 1.655 + ParallelTaskTerminator(int n_threads, TaskQueueSetSuper* queue_set); 1.656 + 1.657 + // The current thread has no work, and is ready to terminate if everyone 1.658 + // else is. If returns "true", all threads are terminated. If returns 1.659 + // "false", available work has been observed in one of the task queues, 1.660 + // so the global task is not complete. 1.661 + bool offer_termination() { 1.662 + return offer_termination(NULL); 1.663 + } 1.664 + 1.665 + // As above, but it also terminates if the should_exit_termination() 1.666 + // method of the terminator parameter returns true. If terminator is 1.667 + // NULL, then it is ignored. 1.668 + bool offer_termination(TerminatorTerminator* terminator); 1.669 + 1.670 + // Reset the terminator, so that it may be reused again. 1.671 + // The caller is responsible for ensuring that this is done 1.672 + // in an MT-safe manner, once the previous round of use of 1.673 + // the terminator is finished. 1.674 + void reset_for_reuse(); 1.675 + // Same as above but the number of parallel threads is set to the 1.676 + // given number. 1.677 + void reset_for_reuse(int n_threads); 1.678 + 1.679 +#ifdef TRACESPINNING 1.680 + static uint total_yields() { return _total_yields; } 1.681 + static uint total_spins() { return _total_spins; } 1.682 + static uint total_peeks() { return _total_peeks; } 1.683 + static void print_termination_counts(); 1.684 +#endif 1.685 +}; 1.686 + 1.687 +template<class E, MEMFLAGS F, unsigned int N> inline bool 1.688 +GenericTaskQueue<E, F, N>::push(E t) { 1.689 + uint localBot = _bottom; 1.690 + assert(localBot < N, "_bottom out of range."); 1.691 + idx_t top = _age.top(); 1.692 + uint dirty_n_elems = dirty_size(localBot, top); 1.693 + assert(dirty_n_elems < N, "n_elems out of range."); 1.694 + if (dirty_n_elems < max_elems()) { 1.695 + // g++ complains if the volatile result of the assignment is 1.696 + // unused, so we cast the volatile away. We cannot cast directly 1.697 + // to void, because gcc treats that as not using the result of the 1.698 + // assignment. However, casting to E& means that we trigger an 1.699 + // unused-value warning. So, we cast the E& to void. 1.700 + (void) const_cast<E&>(_elems[localBot] = t); 1.701 + OrderAccess::release_store(&_bottom, increment_index(localBot)); 1.702 + TASKQUEUE_STATS_ONLY(stats.record_push()); 1.703 + return true; 1.704 + } else { 1.705 + return push_slow(t, dirty_n_elems); 1.706 + } 1.707 +} 1.708 + 1.709 +template<class E, MEMFLAGS F, unsigned int N> inline bool 1.710 +GenericTaskQueue<E, F, N>::pop_local(volatile E& t) { 1.711 + uint localBot = _bottom; 1.712 + // This value cannot be N-1. That can only occur as a result of 1.713 + // the assignment to bottom in this method. If it does, this method 1.714 + // resets the size to 0 before the next call (which is sequential, 1.715 + // since this is pop_local.) 1.716 + uint dirty_n_elems = dirty_size(localBot, _age.top()); 1.717 + assert(dirty_n_elems != N - 1, "Shouldn't be possible..."); 1.718 + if (dirty_n_elems == 0) return false; 1.719 + localBot = decrement_index(localBot); 1.720 + _bottom = localBot; 1.721 + // This is necessary to prevent any read below from being reordered 1.722 + // before the store just above. 1.723 + OrderAccess::fence(); 1.724 + // g++ complains if the volatile result of the assignment is 1.725 + // unused, so we cast the volatile away. We cannot cast directly 1.726 + // to void, because gcc treats that as not using the result of the 1.727 + // assignment. However, casting to E& means that we trigger an 1.728 + // unused-value warning. So, we cast the E& to void. 1.729 + (void) const_cast<E&>(t = _elems[localBot]); 1.730 + // This is a second read of "age"; the "size()" above is the first. 1.731 + // If there's still at least one element in the queue, based on the 1.732 + // "_bottom" and "age" we've read, then there can be no interference with 1.733 + // a "pop_global" operation, and we're done. 1.734 + idx_t tp = _age.top(); // XXX 1.735 + if (size(localBot, tp) > 0) { 1.736 + assert(dirty_size(localBot, tp) != N - 1, "sanity"); 1.737 + TASKQUEUE_STATS_ONLY(stats.record_pop()); 1.738 + return true; 1.739 + } else { 1.740 + // Otherwise, the queue contained exactly one element; we take the slow 1.741 + // path. 1.742 + return pop_local_slow(localBot, _age.get()); 1.743 + } 1.744 +} 1.745 + 1.746 +typedef GenericTaskQueue<oop, mtGC> OopTaskQueue; 1.747 +typedef GenericTaskQueueSet<OopTaskQueue, mtGC> OopTaskQueueSet; 1.748 + 1.749 +#ifdef _MSC_VER 1.750 +#pragma warning(push) 1.751 +// warning C4522: multiple assignment operators specified 1.752 +#pragma warning(disable:4522) 1.753 +#endif 1.754 + 1.755 +// This is a container class for either an oop* or a narrowOop*. 1.756 +// Both are pushed onto a task queue and the consumer will test is_narrow() 1.757 +// to determine which should be processed. 1.758 +class StarTask { 1.759 + void* _holder; // either union oop* or narrowOop* 1.760 + 1.761 + enum { COMPRESSED_OOP_MASK = 1 }; 1.762 + 1.763 + public: 1.764 + StarTask(narrowOop* p) { 1.765 + assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!"); 1.766 + _holder = (void *)((uintptr_t)p | COMPRESSED_OOP_MASK); 1.767 + } 1.768 + StarTask(oop* p) { 1.769 + assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!"); 1.770 + _holder = (void*)p; 1.771 + } 1.772 + StarTask() { _holder = NULL; } 1.773 + operator oop*() { return (oop*)_holder; } 1.774 + operator narrowOop*() { 1.775 + return (narrowOop*)((uintptr_t)_holder & ~COMPRESSED_OOP_MASK); 1.776 + } 1.777 + 1.778 + StarTask& operator=(const StarTask& t) { 1.779 + _holder = t._holder; 1.780 + return *this; 1.781 + } 1.782 + volatile StarTask& operator=(const volatile StarTask& t) volatile { 1.783 + _holder = t._holder; 1.784 + return *this; 1.785 + } 1.786 + 1.787 + bool is_narrow() const { 1.788 + return (((uintptr_t)_holder & COMPRESSED_OOP_MASK) != 0); 1.789 + } 1.790 +}; 1.791 + 1.792 +class ObjArrayTask 1.793 +{ 1.794 +public: 1.795 + ObjArrayTask(oop o = NULL, int idx = 0): _obj(o), _index(idx) { } 1.796 + ObjArrayTask(oop o, size_t idx): _obj(o), _index(int(idx)) { 1.797 + assert(idx <= size_t(max_jint), "too big"); 1.798 + } 1.799 + ObjArrayTask(const ObjArrayTask& t): _obj(t._obj), _index(t._index) { } 1.800 + 1.801 + ObjArrayTask& operator =(const ObjArrayTask& t) { 1.802 + _obj = t._obj; 1.803 + _index = t._index; 1.804 + return *this; 1.805 + } 1.806 + volatile ObjArrayTask& 1.807 + operator =(const volatile ObjArrayTask& t) volatile { 1.808 + (void)const_cast<oop&>(_obj = t._obj); 1.809 + _index = t._index; 1.810 + return *this; 1.811 + } 1.812 + 1.813 + inline oop obj() const { return _obj; } 1.814 + inline int index() const { return _index; } 1.815 + 1.816 + DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid. 1.817 + 1.818 +private: 1.819 + oop _obj; 1.820 + int _index; 1.821 +}; 1.822 + 1.823 +#ifdef _MSC_VER 1.824 +#pragma warning(pop) 1.825 +#endif 1.826 + 1.827 +typedef OverflowTaskQueue<StarTask, mtClass> OopStarTaskQueue; 1.828 +typedef GenericTaskQueueSet<OopStarTaskQueue, mtClass> OopStarTaskQueueSet; 1.829 + 1.830 +typedef OverflowTaskQueue<size_t, mtInternal> RegionTaskQueue; 1.831 +typedef GenericTaskQueueSet<RegionTaskQueue, mtClass> RegionTaskQueueSet; 1.832 + 1.833 + 1.834 +#endif // SHARE_VM_UTILITIES_TASKQUEUE_HPP