Mon, 26 Jan 2009 12:47:21 -0800
6786503: Overflow list performance can be improved
Summary: Avoid overflow list walk in CMS & ParNew when it is unnecessary. Fix a couple of correctness issues, including a C-heap leak, in ParNew at the intersection of promotion failure, work queue overflow and object array chunking. Add stress testing option and related assertion checking.
Reviewed-by: jmasa
/*
 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 */

# include "incls/_precompiled.incl"
# include "incls/_parCardTableModRefBS.cpp.incl"
void CardTableModRefBS::par_non_clean_card_iterate_work(Space* sp, MemRegion mr,
                                                        DirtyCardToOopClosure* dcto_cl,
                                                        MemRegionClosure* cl,
                                                        bool clear,
                                                        int n_threads) {
  if (n_threads > 0) {
    assert(n_threads == (int)ParallelGCThreads, "# worker threads != # requested!");

    // Make sure the LNC array is valid for the space.
    jbyte**   lowest_non_clean;
    uintptr_t lowest_non_clean_base_chunk_index;
    size_t    lowest_non_clean_chunk_size;
    get_LNC_array_for_space(sp, lowest_non_clean,
                            lowest_non_clean_base_chunk_index,
                            lowest_non_clean_chunk_size);

    int n_strides = n_threads * StridesPerThread;
    SequentialSubTasksDone* pst = sp->par_seq_tasks();
    pst->set_par_threads(n_threads);
    pst->set_n_tasks(n_strides);

    int stride = 0;
    while (!pst->is_task_claimed(/* reference */ stride)) {
      process_stride(sp, mr, stride, n_strides, dcto_cl, cl, clear,
                     lowest_non_clean,
                     lowest_non_clean_base_chunk_index,
                     lowest_non_clean_chunk_size);
    }
    if (pst->all_tasks_completed()) {
      // Clear lowest_non_clean array for next time.
      uintptr_t first_chunk_index = addr_to_chunk_index(mr.start());
      uintptr_t last_chunk_index  = addr_to_chunk_index(mr.last());
      for (uintptr_t ch = first_chunk_index; ch <= last_chunk_index; ch++) {
        intptr_t ind = ch - lowest_non_clean_base_chunk_index;
        assert(0 <= ind && ind < (intptr_t)lowest_non_clean_chunk_size,
               "Bounds error");
        lowest_non_clean[ind] = NULL;
      }
    }
  }
}

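// Illustrative sketch (added commentary, not part of the original change):
// the claiming loop above parcels the space out in "strides"; chunk k of
// the space belongs to stride (k % n_strides), so adjacent chunks are
// handled by different strides and, typically, by different threads. A
// hypothetical helper making the mapping explicit:
//
//   static uintptr_t stride_for_chunk(uintptr_t chunk_index, int n_strides) {
//     return chunk_index % (uintptr_t)n_strides;  // round-robin assignment
//   }
//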
void
CardTableModRefBS::
process_stride(Space* sp,
               MemRegion used,
               jint stride, int n_strides,
               DirtyCardToOopClosure* dcto_cl,
               MemRegionClosure* cl,
               bool clear,
               jbyte** lowest_non_clean,
               uintptr_t lowest_non_clean_base_chunk_index,
               size_t    lowest_non_clean_chunk_size) {
  // We don't have to go downwards here; it wouldn't help anyway,
  // because of parallelism.

  // Find the first card address of the first chunk in the stride that is
  // at least "bottom" of the used region.
  jbyte*    start_card  = byte_for(used.start());
  jbyte*    end_card    = byte_after(used.last());
  uintptr_t start_chunk = addr_to_chunk_index(used.start());
  uintptr_t start_chunk_stride_num = start_chunk % n_strides;
  jbyte* chunk_card_start;

  if ((uintptr_t)stride >= start_chunk_stride_num) {
    chunk_card_start = (jbyte*)(start_card +
                                (stride - start_chunk_stride_num) *
                                CardsPerStrideChunk);
  } else {
    // Go ahead to the next chunk group boundary, then to the requested stride.
    chunk_card_start = (jbyte*)(start_card +
                                (n_strides - start_chunk_stride_num + stride) *
                                CardsPerStrideChunk);
  }
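  // Worked example (added commentary): with n_strides = 4 and
  // start_chunk_stride_num = 3, each stride's first chunk lies at the
  // following chunk offsets from start_card:
  //   stride 3 -> 0 (the first chunk itself), via the first branch;
  //   stride 0 -> (4 - 3 + 0) = 1, stride 1 -> 2, stride 2 -> 3, via the
  //   second branch, which skips ahead to the next chunk group.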

  while (chunk_card_start < end_card) {
    // We don't have to go downwards here; it wouldn't help anyway,
    // because of parallelism. (We take care with "min_done"; see below.)
    // Invariant: chunk_mr should be fully contained within the "used" region.
    jbyte*    chunk_card_end = chunk_card_start + CardsPerStrideChunk;
    MemRegion chunk_mr       = MemRegion(addr_for(chunk_card_start),
                                         chunk_card_end >= end_card ?
                                           used.end() : addr_for(chunk_card_end));
    assert(chunk_mr.word_size() > 0, "[chunk_card_start > used_end)");
    assert(used.contains(chunk_mr), "chunk_mr should be subset of used");

    // Process the chunk.
    process_chunk_boundaries(sp,
                             dcto_cl,
                             chunk_mr,
                             used,
                             lowest_non_clean,
                             lowest_non_clean_base_chunk_index,
                             lowest_non_clean_chunk_size);

    non_clean_card_iterate_work(chunk_mr, cl, clear);

    // Find the next chunk of the stride.
    chunk_card_start += CardsPerStrideChunk * n_strides;
  }
}

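// Added commentary (not in the original change): the boundary situation the
// function below must handle. A block may span the boundary between two
// chunks that different threads may be processing concurrently:
//
//     chunk N-1               chunk N
//   ...---[ first_block ...........)---...
//                          ^chunk_mr.start()
//
// The processor of chunk N (a) bounds how far beyond its own chunk the
// closure may run, via set_min_done(max_to_do), and (b) records in
// lowest_non_clean[] the first card of chunk N that will be scanned, which
// the processor of chunk N-1 consults when computing its own max_to_do.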
void
CardTableModRefBS::
process_chunk_boundaries(Space* sp,
                         DirtyCardToOopClosure* dcto_cl,
                         MemRegion chunk_mr,
                         MemRegion used,
                         jbyte** lowest_non_clean,
                         uintptr_t lowest_non_clean_base_chunk_index,
                         size_t    lowest_non_clean_chunk_size)
{
  // We must worry about the chunk boundaries.

  // First, set our max_to_do:
  HeapWord* max_to_do = NULL;
  uintptr_t cur_chunk_index = addr_to_chunk_index(chunk_mr.start());
  cur_chunk_index = cur_chunk_index - lowest_non_clean_base_chunk_index;

  if (chunk_mr.end() < used.end()) {
    // This is not the last chunk in the used region. What is the last
    // object?
    HeapWord* last_block = sp->block_start(chunk_mr.end());
    assert(last_block <= chunk_mr.end(), "In case this property changes.");
    if (last_block == chunk_mr.end()
        || !sp->block_is_obj(last_block)) {
      max_to_do = chunk_mr.end();

    } else {
      // It is an object and starts before the end of the current chunk.
      // last_obj_card is the card corresponding to the start of the last object
      // in the chunk. Note that the last object may not start in
      // the chunk.
      jbyte* last_obj_card = byte_for(last_block);
      if (!card_may_have_been_dirty(*last_obj_card)) {
        // The card containing the head is not dirty. Any marks in
        // subsequent cards still in this chunk must have been made
        // precisely; we can cap processing at the end.
        max_to_do = chunk_mr.end();
      } else {
        // The last object must be considered dirty, and extends onto the
        // following chunk. Look for a dirty card in that chunk that will
        // bound our processing.
        jbyte* limit_card = NULL;
        size_t last_block_size = sp->block_size(last_block);
        jbyte* last_card_of_last_obj =
          byte_for(last_block + last_block_size - 1);
        jbyte* first_card_of_next_chunk = byte_for(chunk_mr.end());
        // This search potentially goes a long distance looking
        // for the next card that will be scanned. For example,
        // an object that is an array of primitives will not
        // have any cards covering regions interior to the array
        // that will need to be scanned. The scan can be terminated
        // at the last card of the next chunk. That would leave
        // limit_card as NULL and would result in "max_to_do"
        // being set with the LNC value or with the end
        // of the last block.
        jbyte* last_card_of_next_chunk = first_card_of_next_chunk +
          CardsPerStrideChunk;
        assert(byte_for(chunk_mr.end()) - byte_for(chunk_mr.start())
               == CardsPerStrideChunk, "last card of next chunk may be wrong");
        jbyte* last_card_to_check = (jbyte*) MIN2(last_card_of_last_obj,
                                                  last_card_of_next_chunk);
        for (jbyte* cur = first_card_of_next_chunk;
             cur <= last_card_to_check; cur++) {
          if (card_will_be_scanned(*cur)) {
            limit_card = cur; break;
          }
        }
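        // Added note: if no card up to last_card_to_check will be scanned,
        // limit_card remains NULL and max_to_do falls back to the next
        // chunk's LNC entry or, failing that, to the end of the last object.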
        assert(0 <= cur_chunk_index+1 &&
               cur_chunk_index+1 < lowest_non_clean_chunk_size,
               "Bounds error.");
        // LNC for the next chunk
        jbyte* lnc_card = lowest_non_clean[cur_chunk_index+1];
        if (limit_card == NULL) {
          limit_card = lnc_card;
        }
        if (limit_card != NULL) {
          if (lnc_card != NULL) {
            limit_card = (jbyte*)MIN2((intptr_t)limit_card,
                                      (intptr_t)lnc_card);
          }
          max_to_do = addr_for(limit_card);
        } else {
          max_to_do = last_block + last_block_size;
        }
      }
    }
    assert(max_to_do != NULL, "OOPS!");
  } else {
    max_to_do = used.end();
  }
  // Now we can set the closure we're using so it doesn't go beyond
  // max_to_do.
  dcto_cl->set_min_done(max_to_do);
#ifndef PRODUCT
  dcto_cl->set_last_bottom(max_to_do);
#endif

  // Now we set *our* lowest_non_clean entry.
  // Find the object that spans our boundary, if one exists.
  // Nothing to do on the first chunk.
  if (chunk_mr.start() > used.start()) {
    // first_block is the block possibly spanning the chunk start
    HeapWord* first_block = sp->block_start(chunk_mr.start());
    // Does the block span the start of the chunk and is it
    // an object?
    if (first_block < chunk_mr.start() &&
        sp->block_is_obj(first_block)) {
      jbyte* first_dirty_card = NULL;
      jbyte* last_card_of_first_obj =
        byte_for(first_block + sp->block_size(first_block) - 1);
      jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start());
      jbyte* last_card_of_cur_chunk  = byte_for(chunk_mr.last());
      jbyte* last_card_to_check =
        (jbyte*) MIN2((intptr_t) last_card_of_cur_chunk,
                      (intptr_t) last_card_of_first_obj);
      for (jbyte* cur = first_card_of_cur_chunk;
           cur <= last_card_to_check; cur++) {
        if (card_will_be_scanned(*cur)) {
          first_dirty_card = cur; break;
        }
      }
      if (first_dirty_card != NULL) {
        assert(0 <= cur_chunk_index &&
               cur_chunk_index < lowest_non_clean_chunk_size,
               "Bounds error.");
        lowest_non_clean[cur_chunk_index] = first_dirty_card;
      }
    }
  }
}

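// Added note: the lowest_non_clean[cur_chunk_index] entry written above for
// a chunk is the same entry read, as lnc_card, by the processor of the
// preceding chunk when it computes its max_to_do; the LNC array is how
// adjacent chunk processors agree on where responsibility for a spanning
// object's cards is handed off.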
void
CardTableModRefBS::
get_LNC_array_for_space(Space* sp,
                        jbyte**& lowest_non_clean,
                        uintptr_t& lowest_non_clean_base_chunk_index,
                        size_t& lowest_non_clean_chunk_size) {

  int i = find_covering_region_containing(sp->bottom());
  MemRegion covered = _covered[i];
  size_t n_chunks = chunks_to_cover(covered);

  // Only the first thread to obtain the lock will resize the
  // LNC array for the covered region. Any later expansion can't affect
  // the used_at_save_marks region.
  // (I observed a bug in which the first thread to execute this would
  // resize, and then it would cause "expand_and_allocate" calls that would
  // increase the number of chunks in the covered region. Then a second
  // thread would come and execute this, see that the size didn't match,
  // and free and allocate again. So the first thread would be using a
  // freed "_lowest_non_clean" array.)

  // Do a dirty read here. If we pass the conditional then take the rare
  // event lock and do the read again in case some other thread had already
  // succeeded and done the resize.
  int cur_collection = Universe::heap()->total_collections();
  if (_last_LNC_resizing_collection[i] != cur_collection) {
    MutexLocker x(ParGCRareEvent_lock);
    if (_last_LNC_resizing_collection[i] != cur_collection) {
      if (_lowest_non_clean[i] == NULL ||
          n_chunks != _lowest_non_clean_chunk_size[i]) {

        // Should we delete the old?
        if (_lowest_non_clean[i] != NULL) {
          assert(n_chunks != _lowest_non_clean_chunk_size[i],
                 "logical consequence");
          FREE_C_HEAP_ARRAY(CardPtr, _lowest_non_clean[i]);
          _lowest_non_clean[i] = NULL;
        }
        // Now allocate a new one if necessary.
        if (_lowest_non_clean[i] == NULL) {
          _lowest_non_clean[i]                  = NEW_C_HEAP_ARRAY(CardPtr, n_chunks);
          _lowest_non_clean_chunk_size[i]       = n_chunks;
          _lowest_non_clean_base_chunk_index[i] = addr_to_chunk_index(covered.start());
          for (int j = 0; j < (int)n_chunks; j++)
            _lowest_non_clean[i][j] = NULL;
        }
      }
      _last_LNC_resizing_collection[i] = cur_collection;
    }
  }
  // In any case, now do the initialization.
  lowest_non_clean                  = _lowest_non_clean[i];
  lowest_non_clean_base_chunk_index = _lowest_non_clean_base_chunk_index[i];
  lowest_non_clean_chunk_size      = _lowest_non_clean_chunk_size[i];
}
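
// Illustrative sketch (added commentary, not in the original change): the
// check/lock/re-check idiom used in get_LNC_array_for_space() above, with
// hypothetical names. Only the rare mismatching case pays for the lock, and
// the re-check under the lock ensures exactly one thread performs the
// resize per collection:
//
//   if (_last_resize_collection != cur_collection) {      // dirty read
//     MutexLocker x(RareEvent_lock);                      // rare path only
//     if (_last_resize_collection != cur_collection) {    // re-check held
//       resize_if_needed();                               // done by one thread
//       _last_resize_collection = cur_collection;
//     }
//   }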