Fri, 30 Jan 2009 14:17:52 -0800
6787254: Work queue capacity can be increased substantially on some platforms
Summary: Increased the default and maximum size of the CMS marking stack and the size of the parallel workers' work queues in 64-bit mode. The latter was accomplished by an increase in the width of the Taskqueue's Age struct and its Tag field in 64-bit mode.
Reviewed-by: jmasa, tonyp
1.1 --- a/src/share/vm/runtime/globals.hpp Thu Jan 29 21:25:42 2009 -0800 1.2 +++ b/src/share/vm/runtime/globals.hpp Fri Jan 30 14:17:52 2009 -0800 1.3 @@ -1426,10 +1426,10 @@ 1.4 develop(bool, CMSOverflowEarlyRestoration, false, \ 1.5 "Whether preserved marks should be restored early") \ 1.6 \ 1.7 - product(uintx, CMSMarkStackSize, 32*K, \ 1.8 + product(uintx, CMSMarkStackSize, NOT_LP64(32*K) LP64_ONLY(4*M), \ 1.9 "Size of CMS marking stack") \ 1.10 \ 1.11 - product(uintx, CMSMarkStackSizeMax, 4*M, \ 1.12 + product(uintx, CMSMarkStackSizeMax, NOT_LP64(4*M) LP64_ONLY(512*M), \ 1.13 "Max size of CMS marking stack") \ 1.14 \ 1.15 notproduct(bool, CMSMarkStackOverflowALot, false, \
2.1 --- a/src/share/vm/utilities/taskqueue.cpp Thu Jan 29 21:25:42 2009 -0800 2.2 +++ b/src/share/vm/utilities/taskqueue.cpp Fri Jan 30 14:17:52 2009 -0800 2.3 @@ -69,7 +69,7 @@ 2.4 ParallelTaskTerminator::offer_termination(TerminatorTerminator* terminator) { 2.5 Atomic::inc(&_offered_termination); 2.6 2.7 - juint yield_count = 0; 2.8 + uint yield_count = 0; 2.9 while (true) { 2.10 if (_offered_termination == _n_threads) { 2.11 //inner_termination_loop();
3.1 --- a/src/share/vm/utilities/taskqueue.hpp Thu Jan 29 21:25:42 2009 -0800 3.2 +++ b/src/share/vm/utilities/taskqueue.hpp Fri Jan 30 14:17:52 2009 -0800 3.3 @@ -22,67 +22,76 @@ 3.4 * 3.5 */ 3.6 3.7 +#ifdef LP64 3.8 +typedef juint TAG_TYPE; 3.9 +// for a taskqueue size of 4M 3.10 +#define LOG_TASKQ_SIZE 22 3.11 +#else 3.12 +typedef jushort TAG_TYPE; 3.13 +// for a taskqueue size of 16K 3.14 +#define LOG_TASKQ_SIZE 14 3.15 +#endif 3.16 + 3.17 class TaskQueueSuper: public CHeapObj { 3.18 protected: 3.19 // The first free element after the last one pushed (mod _n). 3.20 - // (For now we'll assume only 32-bit CAS). 3.21 - volatile juint _bottom; 3.22 + volatile uint _bottom; 3.23 3.24 // log2 of the size of the queue. 3.25 enum SomeProtectedConstants { 3.26 - Log_n = 14 3.27 + Log_n = LOG_TASKQ_SIZE 3.28 }; 3.29 +#undef LOG_TASKQ_SIZE 3.30 3.31 // Size of the queue. 3.32 - juint n() { return (1 << Log_n); } 3.33 + uint n() { return (1 << Log_n); } 3.34 // For computing "x mod n" efficiently. 3.35 - juint n_mod_mask() { return n() - 1; } 3.36 + uint n_mod_mask() { return n() - 1; } 3.37 3.38 struct Age { 3.39 - jushort _top; 3.40 - jushort _tag; 3.41 + TAG_TYPE _top; 3.42 + TAG_TYPE _tag; 3.43 3.44 - jushort tag() const { return _tag; } 3.45 - jushort top() const { return _top; } 3.46 + TAG_TYPE tag() const { return _tag; } 3.47 + TAG_TYPE top() const { return _top; } 3.48 3.49 Age() { _tag = 0; _top = 0; } 3.50 3.51 friend bool operator ==(const Age& a1, const Age& a2) { 3.52 return a1.tag() == a2.tag() && a1.top() == a2.top(); 3.53 } 3.54 - 3.55 }; 3.56 Age _age; 3.57 // These make sure we do single atomic reads and writes. 3.58 Age get_age() { 3.59 - jint res = *(volatile jint*)(&_age); 3.60 + uint res = *(volatile uint*)(&_age); 3.61 return *(Age*)(&res); 3.62 } 3.63 void set_age(Age a) { 3.64 - *(volatile jint*)(&_age) = *(int*)(&a); 3.65 + *(volatile uint*)(&_age) = *(uint*)(&a); 3.66 } 3.67 3.68 - jushort get_top() { 3.69 + TAG_TYPE get_top() { 3.70 return get_age().top(); 3.71 } 3.72 3.73 // These both operate mod _n. 3.74 - juint increment_index(juint ind) { 3.75 + uint increment_index(uint ind) { 3.76 return (ind + 1) & n_mod_mask(); 3.77 } 3.78 - juint decrement_index(juint ind) { 3.79 + uint decrement_index(uint ind) { 3.80 return (ind - 1) & n_mod_mask(); 3.81 } 3.82 3.83 // Returns a number in the range [0.._n). If the result is "n-1", it 3.84 // should be interpreted as 0. 3.85 - juint dirty_size(juint bot, juint top) { 3.86 - return ((jint)bot - (jint)top) & n_mod_mask(); 3.87 + uint dirty_size(uint bot, uint top) { 3.88 + return ((int)bot - (int)top) & n_mod_mask(); 3.89 } 3.90 3.91 // Returns the size corresponding to the given "bot" and "top". 3.92 - juint size(juint bot, juint top) { 3.93 - juint sz = dirty_size(bot, top); 3.94 + uint size(uint bot, uint top) { 3.95 + uint sz = dirty_size(bot, top); 3.96 // Has the queue "wrapped", so that bottom is less than top? 3.97 // There's a complicated special case here. A pair of threads could 3.98 // perform pop_local and pop_global operations concurrently, starting 3.99 @@ -94,7 +103,7 @@ 3.100 // owner performs pop_local's, and several concurrent threads 3.101 // attempting to perform the pop_global will all perform the same CAS, 3.102 // and only one can succeed. Any stealing thread that reads after 3.103 - // either the increment or decrement will seen an empty queue, and will 3.104 + // either the increment or decrement will see an empty queue, and will 3.105 // not join the competitors. The "sz == -1 || sz == _n-1" state will 3.106 // not be modified by concurrent queues, so the owner thread can reset 3.107 // the state to _bottom == top so subsequent pushes will be performed 3.108 @@ -112,11 +121,11 @@ 3.109 // Return an estimate of the number of elements in the queue. 3.110 // The "careful" version admits the possibility of pop_local/pop_global 3.111 // races. 3.112 - juint size() { 3.113 + uint size() { 3.114 return size(_bottom, get_top()); 3.115 } 3.116 3.117 - juint dirty_size() { 3.118 + uint dirty_size() { 3.119 return dirty_size(_bottom, get_top()); 3.120 } 3.121 3.122 @@ -127,15 +136,15 @@ 3.123 3.124 // Maximum number of elements allowed in the queue. This is two less 3.125 // than the actual queue size, for somewhat complicated reasons. 3.126 - juint max_elems() { return n() - 2; } 3.127 + uint max_elems() { return n() - 2; } 3.128 3.129 }; 3.130 3.131 template<class E> class GenericTaskQueue: public TaskQueueSuper { 3.132 private: 3.133 // Slow paths for push, pop_local. (pop_global has no fast path.) 3.134 - bool push_slow(E t, juint dirty_n_elems); 3.135 - bool pop_local_slow(juint localBot, Age oldAge); 3.136 + bool push_slow(E t, uint dirty_n_elems); 3.137 + bool pop_local_slow(uint localBot, Age oldAge); 3.138 3.139 public: 3.140 // Initializes the queue to empty. 3.141 @@ -170,7 +179,7 @@ 3.142 3.143 template<class E> 3.144 GenericTaskQueue<E>::GenericTaskQueue():TaskQueueSuper() { 3.145 - assert(sizeof(Age) == sizeof(jint), "Depends on this."); 3.146 + assert(sizeof(Age) == sizeof(int), "Depends on this."); 3.147 } 3.148 3.149 template<class E> 3.150 @@ -182,9 +191,9 @@ 3.151 template<class E> 3.152 void GenericTaskQueue<E>::oops_do(OopClosure* f) { 3.153 // tty->print_cr("START OopTaskQueue::oops_do"); 3.154 - int iters = size(); 3.155 - juint index = _bottom; 3.156 - for (int i = 0; i < iters; ++i) { 3.157 + uint iters = size(); 3.158 + uint index = _bottom; 3.159 + for (uint i = 0; i < iters; ++i) { 3.160 index = decrement_index(index); 3.161 // tty->print_cr(" doing entry %d," INTPTR_T " -> " INTPTR_T, 3.162 // index, &_elems[index], _elems[index]); 3.163 @@ -198,10 +207,10 @@ 3.164 3.165 3.166 template<class E> 3.167 -bool GenericTaskQueue<E>::push_slow(E t, juint dirty_n_elems) { 3.168 +bool GenericTaskQueue<E>::push_slow(E t, uint dirty_n_elems) { 3.169 if (dirty_n_elems == n() - 1) { 3.170 // Actually means 0, so do the push. 3.171 - juint localBot = _bottom; 3.172 + uint localBot = _bottom; 3.173 _elems[localBot] = t; 3.174 _bottom = increment_index(localBot); 3.175 return true; 3.176 @@ -211,7 +220,7 @@ 3.177 3.178 template<class E> 3.179 bool GenericTaskQueue<E>:: 3.180 -pop_local_slow(juint localBot, Age oldAge) { 3.181 +pop_local_slow(uint localBot, Age oldAge) { 3.182 // This queue was observed to contain exactly one element; either this 3.183 // thread will claim it, or a competing "pop_global". In either case, 3.184 // the queue will be logically empty afterwards. Create a new Age value 3.185 @@ -230,9 +239,8 @@ 3.186 Age tempAge; 3.187 // No competing pop_global has yet incremented "top"; we'll try to 3.188 // install new_age, thus claiming the element. 3.189 - assert(sizeof(Age) == sizeof(jint) && sizeof(jint) == sizeof(juint), 3.190 - "Assumption about CAS unit."); 3.191 - *(jint*)&tempAge = Atomic::cmpxchg(*(jint*)&newAge, (volatile jint*)&_age, *(jint*)&oldAge); 3.192 + assert(sizeof(Age) == sizeof(int), "Assumption about CAS unit."); 3.193 + *(uint*)&tempAge = Atomic::cmpxchg(*(uint*)&newAge, (volatile uint*)&_age, *(uint*)&oldAge); 3.194 if (tempAge == oldAge) { 3.195 // We win. 3.196 assert(dirty_size(localBot, get_top()) != n() - 1, 3.197 @@ -253,8 +261,8 @@ 3.198 bool GenericTaskQueue<E>::pop_global(E& t) { 3.199 Age newAge; 3.200 Age oldAge = get_age(); 3.201 - juint localBot = _bottom; 3.202 - juint n_elems = size(localBot, oldAge.top()); 3.203 + uint localBot = _bottom; 3.204 + uint n_elems = size(localBot, oldAge.top()); 3.205 if (n_elems == 0) { 3.206 return false; 3.207 } 3.208 @@ -263,7 +271,7 @@ 3.209 newAge._top = increment_index(newAge.top()); 3.210 if ( newAge._top == 0 ) newAge._tag++; 3.211 Age resAge; 3.212 - *(jint*)&resAge = Atomic::cmpxchg(*(jint*)&newAge, (volatile jint*)&_age, *(jint*)&oldAge); 3.213 + *(uint*)&resAge = Atomic::cmpxchg(*(uint*)&newAge, (volatile uint*)&_age, *(uint*)&oldAge); 3.214 // Note that using "_bottom" here might fail, since a pop_local might 3.215 // have decremented it. 3.216 assert(dirty_size(localBot, newAge._top) != n() - 1, 3.217 @@ -287,7 +295,7 @@ 3.218 3.219 template<class E> class GenericTaskQueueSet: public TaskQueueSetSuper { 3.220 private: 3.221 - int _n; 3.222 + uint _n; 3.223 GenericTaskQueue<E>** _queues; 3.224 3.225 public: 3.226 @@ -300,51 +308,51 @@ 3.227 } 3.228 } 3.229 3.230 - bool steal_1_random(int queue_num, int* seed, E& t); 3.231 - bool steal_best_of_2(int queue_num, int* seed, E& t); 3.232 - bool steal_best_of_all(int queue_num, int* seed, E& t); 3.233 + bool steal_1_random(uint queue_num, int* seed, E& t); 3.234 + bool steal_best_of_2(uint queue_num, int* seed, E& t); 3.235 + bool steal_best_of_all(uint queue_num, int* seed, E& t); 3.236 3.237 - void register_queue(int i, GenericTaskQueue<E>* q); 3.238 + void register_queue(uint i, GenericTaskQueue<E>* q); 3.239 3.240 - GenericTaskQueue<E>* queue(int n); 3.241 + GenericTaskQueue<E>* queue(uint n); 3.242 3.243 // The thread with queue number "queue_num" (and whose random number seed 3.244 // is at "seed") is trying to steal a task from some other queue. (It 3.245 // may try several queues, according to some configuration parameter.) 3.246 // If some steal succeeds, returns "true" and sets "t" the stolen task, 3.247 // otherwise returns false. 3.248 - bool steal(int queue_num, int* seed, E& t); 3.249 + bool steal(uint queue_num, int* seed, E& t); 3.250 3.251 bool peek(); 3.252 }; 3.253 3.254 template<class E> 3.255 -void GenericTaskQueueSet<E>::register_queue(int i, GenericTaskQueue<E>* q) { 3.256 - assert(0 <= i && i < _n, "index out of range."); 3.257 +void GenericTaskQueueSet<E>::register_queue(uint i, GenericTaskQueue<E>* q) { 3.258 + assert(i < _n, "index out of range."); 3.259 _queues[i] = q; 3.260 } 3.261 3.262 template<class E> 3.263 -GenericTaskQueue<E>* GenericTaskQueueSet<E>::queue(int i) { 3.264 +GenericTaskQueue<E>* GenericTaskQueueSet<E>::queue(uint i) { 3.265 return _queues[i]; 3.266 } 3.267 3.268 template<class E> 3.269 -bool GenericTaskQueueSet<E>::steal(int queue_num, int* seed, E& t) { 3.270 - for (int i = 0; i < 2 * _n; i++) 3.271 +bool GenericTaskQueueSet<E>::steal(uint queue_num, int* seed, E& t) { 3.272 + for (uint i = 0; i < 2 * _n; i++) 3.273 if (steal_best_of_2(queue_num, seed, t)) 3.274 return true; 3.275 return false; 3.276 } 3.277 3.278 template<class E> 3.279 -bool GenericTaskQueueSet<E>::steal_best_of_all(int queue_num, int* seed, E& t) { 3.280 +bool GenericTaskQueueSet<E>::steal_best_of_all(uint queue_num, int* seed, E& t) { 3.281 if (_n > 2) { 3.282 int best_k; 3.283 - jint best_sz = 0; 3.284 - for (int k = 0; k < _n; k++) { 3.285 + uint best_sz = 0; 3.286 + for (uint k = 0; k < _n; k++) { 3.287 if (k == queue_num) continue; 3.288 - jint sz = _queues[k]->size(); 3.289 + uint sz = _queues[k]->size(); 3.290 if (sz > best_sz) { 3.291 best_sz = sz; 3.292 best_k = k; 3.293 @@ -362,9 +370,9 @@ 3.294 } 3.295 3.296 template<class E> 3.297 -bool GenericTaskQueueSet<E>::steal_1_random(int queue_num, int* seed, E& t) { 3.298 +bool GenericTaskQueueSet<E>::steal_1_random(uint queue_num, int* seed, E& t) { 3.299 if (_n > 2) { 3.300 - int k = queue_num; 3.301 + uint k = queue_num; 3.302 while (k == queue_num) k = randomParkAndMiller(seed) % _n; 3.303 return _queues[2]->pop_global(t); 3.304 } else if (_n == 2) { 3.305 @@ -378,20 +386,20 @@ 3.306 } 3.307 3.308 template<class E> 3.309 -bool GenericTaskQueueSet<E>::steal_best_of_2(int queue_num, int* seed, E& t) { 3.310 +bool GenericTaskQueueSet<E>::steal_best_of_2(uint queue_num, int* seed, E& t) { 3.311 if (_n > 2) { 3.312 - int k1 = queue_num; 3.313 + uint k1 = queue_num; 3.314 while (k1 == queue_num) k1 = randomParkAndMiller(seed) % _n; 3.315 - int k2 = queue_num; 3.316 + uint k2 = queue_num; 3.317 while (k2 == queue_num || k2 == k1) k2 = randomParkAndMiller(seed) % _n; 3.318 // Sample both and try the larger. 3.319 - juint sz1 = _queues[k1]->size(); 3.320 - juint sz2 = _queues[k2]->size(); 3.321 + uint sz1 = _queues[k1]->size(); 3.322 + uint sz2 = _queues[k2]->size(); 3.323 if (sz2 > sz1) return _queues[k2]->pop_global(t); 3.324 else return _queues[k1]->pop_global(t); 3.325 } else if (_n == 2) { 3.326 // Just try the other one. 3.327 - int k = (queue_num + 1) % 2; 3.328 + uint k = (queue_num + 1) % 2; 3.329 return _queues[k]->pop_global(t); 3.330 } else { 3.331 assert(_n == 1, "can't be zero."); 3.332 @@ -402,7 +410,7 @@ 3.333 template<class E> 3.334 bool GenericTaskQueueSet<E>::peek() { 3.335 // Try all the queues. 3.336 - for (int j = 0; j < _n; j++) { 3.337 + for (uint j = 0; j < _n; j++) { 3.338 if (_queues[j]->peek()) 3.339 return true; 3.340 } 3.341 @@ -422,7 +430,7 @@ 3.342 private: 3.343 int _n_threads; 3.344 TaskQueueSetSuper* _queue_set; 3.345 - jint _offered_termination; 3.346 + int _offered_termination; 3.347 3.348 bool peek_in_queue_set(); 3.349 protected: 3.350 @@ -460,7 +468,7 @@ 3.351 3.352 template<class E> inline bool GenericTaskQueue<E>::push(E t) { 3.353 #if SIMPLE_STACK 3.354 - juint localBot = _bottom; 3.355 + uint localBot = _bottom; 3.356 if (_bottom < max_elems()) { 3.357 _elems[localBot] = t; 3.358 _bottom = localBot + 1; 3.359 @@ -469,10 +477,10 @@ 3.360 return false; 3.361 } 3.362 #else 3.363 - juint localBot = _bottom; 3.364 + uint localBot = _bottom; 3.365 assert((localBot >= 0) && (localBot < n()), "_bottom out of range."); 3.366 - jushort top = get_top(); 3.367 - juint dirty_n_elems = dirty_size(localBot, top); 3.368 + TAG_TYPE top = get_top(); 3.369 + uint dirty_n_elems = dirty_size(localBot, top); 3.370 assert((dirty_n_elems >= 0) && (dirty_n_elems < n()), 3.371 "n_elems out of range."); 3.372 if (dirty_n_elems < max_elems()) { 3.373 @@ -487,19 +495,19 @@ 3.374 3.375 template<class E> inline bool GenericTaskQueue<E>::pop_local(E& t) { 3.376 #if SIMPLE_STACK 3.377 - juint localBot = _bottom; 3.378 + uint localBot = _bottom; 3.379 assert(localBot > 0, "precondition."); 3.380 localBot--; 3.381 t = _elems[localBot]; 3.382 _bottom = localBot; 3.383 return true; 3.384 #else 3.385 - juint localBot = _bottom; 3.386 + uint localBot = _bottom; 3.387 // This value cannot be n-1. That can only occur as a result of 3.388 // the assignment to bottom in this method. If it does, this method 3.389 // resets the size( to 0 before the next call (which is sequential, 3.390 // since this is pop_local.) 3.391 - juint dirty_n_elems = dirty_size(localBot, get_top()); 3.392 + uint dirty_n_elems = dirty_size(localBot, get_top()); 3.393 assert(dirty_n_elems != n() - 1, "Shouldn't be possible..."); 3.394 if (dirty_n_elems == 0) return false; 3.395 localBot = decrement_index(localBot); 3.396 @@ -512,7 +520,7 @@ 3.397 // If there's still at least one element in the queue, based on the 3.398 // "_bottom" and "age" we've read, then there can be no interference with 3.399 // a "pop_global" operation, and we're done. 3.400 - juint tp = get_top(); 3.401 + TAG_TYPE tp = get_top(); // XXX 3.402 if (size(localBot, tp) > 0) { 3.403 assert(dirty_size(localBot, tp) != n() - 1, 3.404 "Shouldn't be possible..."); 3.405 @@ -581,7 +589,7 @@ 3.406 bool is_empty(); 3.407 bool stealable_is_empty(); 3.408 bool overflow_is_empty(); 3.409 - juint stealable_size() { return _region_queue.size(); } 3.410 + uint stealable_size() { return _region_queue.size(); } 3.411 RegionTaskQueue* task_queue() { return &_region_queue; } 3.412 }; 3.413