Thu, 02 Jun 2011 10:23:36 -0700
7048782: CMS: assert(last_chunk_index_to_check<= last_chunk_index) failed: parCardTableModRefBS.cpp:359
Summary: The LNC array is sized before the start of a scavenge, while the heap may expand during a scavenge. With CMS, the last block of an arbitrary suffix of the LNC array may expand due to coalescence with the expansion delta. We now take care not to attempt access past the end of the LNC array. LNC array code will be cleaned up and suitably encapsulated as part of the forthcoming performance RFE 7043675.
Reviewed-by: brutisso
1 /*
2 * Copyright (c) 2007, 2011 Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
25 #include "precompiled.hpp"
26 #include "memory/allocation.inline.hpp"
27 #include "memory/cardTableModRefBS.hpp"
28 #include "memory/cardTableRS.hpp"
29 #include "memory/sharedHeap.hpp"
30 #include "memory/space.inline.hpp"
31 #include "memory/universe.hpp"
32 #include "oops/oop.inline.hpp"
33 #include "runtime/java.hpp"
34 #include "runtime/mutexLocker.hpp"
35 #include "runtime/virtualspace.hpp"
37 void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
38 OopsInGenClosure* cl,
39 CardTableRS* ct,
40 int n_threads) {
41 assert(n_threads > 0, "Error: expected n_threads > 0");
42 assert((n_threads == 1 && ParallelGCThreads == 0) ||
43 n_threads <= (int)ParallelGCThreads,
44 "# worker threads != # requested!");
45 // Make sure the LNC array is valid for the space.
46 jbyte** lowest_non_clean;
47 uintptr_t lowest_non_clean_base_chunk_index;
48 size_t lowest_non_clean_chunk_size;
49 get_LNC_array_for_space(sp, lowest_non_clean,
50 lowest_non_clean_base_chunk_index,
51 lowest_non_clean_chunk_size);
53 int n_strides = n_threads * ParGCStridesPerThread;
54 SequentialSubTasksDone* pst = sp->par_seq_tasks();
55 pst->set_n_threads(n_threads);
56 pst->set_n_tasks(n_strides);
58 int stride = 0;
59 while (!pst->is_task_claimed(/* reference */ stride)) {
60 process_stride(sp, mr, stride, n_strides, cl, ct,
61 lowest_non_clean,
62 lowest_non_clean_base_chunk_index,
63 lowest_non_clean_chunk_size);
64 }
65 if (pst->all_tasks_completed()) {
66 // Clear lowest_non_clean array for next time.
67 intptr_t first_chunk_index = addr_to_chunk_index(mr.start());
68 uintptr_t last_chunk_index = addr_to_chunk_index(mr.last());
69 for (uintptr_t ch = first_chunk_index; ch <= last_chunk_index; ch++) {
70 intptr_t ind = ch - lowest_non_clean_base_chunk_index;
71 assert(0 <= ind && ind < (intptr_t)lowest_non_clean_chunk_size,
72 "Bounds error");
73 lowest_non_clean[ind] = NULL;
74 }
75 }
76 }
// Process one stride of the card table covering "used" in space "sp".
// A stride is the set of every n_strides-th chunk (each chunk being
// ParGCCardsPerStrideChunk cards wide), starting at chunk number "stride"
// within the first chunk group.  For each chunk in the stride we first fix
// up the chunk-boundary bookkeeping (LNC entry and min_done) and then scan
// and clear its dirty cards.
void
CardTableModRefBS::
process_stride(Space* sp,
               MemRegion used,
               jint stride, int n_strides,
               OopsInGenClosure* cl,
               CardTableRS* ct,
               jbyte** lowest_non_clean,
               uintptr_t lowest_non_clean_base_chunk_index,
               size_t lowest_non_clean_chunk_size) {
  // We go from higher to lower addresses here; it wouldn't help that much
  // because of the strided parallelism pattern used here.

  // Find the first card address of the first chunk in the stride that is
  // at least "bottom" of the used region.
  jbyte* start_card = byte_for(used.start());
  jbyte* end_card = byte_after(used.last());
  uintptr_t start_chunk = addr_to_chunk_index(used.start());
  uintptr_t start_chunk_stride_num = start_chunk % n_strides;
  jbyte* chunk_card_start;

  if ((uintptr_t)stride >= start_chunk_stride_num) {
    // Our stride's first chunk lies in the first chunk group at or after
    // the group's starting stride number.
    chunk_card_start = (jbyte*)(start_card +
                                (stride - start_chunk_stride_num) *
                                ParGCCardsPerStrideChunk);
  } else {
    // Go ahead to the next chunk group boundary, then to the requested stride.
    chunk_card_start = (jbyte*)(start_card +
                                (n_strides - start_chunk_stride_num + stride) *
                                ParGCCardsPerStrideChunk);
  }

  while (chunk_card_start < end_card) {
    // Even though we go from lower to higher addresses below, the
    // strided parallelism can interleave the actual processing of the
    // dirty pages in various ways. For a specific chunk within this
    // stride, we take care to avoid double scanning or missing a card
    // by suitably initializing the "min_done" field in process_chunk_boundaries()
    // below, together with the dirty region extension accomplished in
    // DirtyCardToOopClosure::do_MemRegion().
    jbyte* chunk_card_end = chunk_card_start + ParGCCardsPerStrideChunk;
    // Invariant: chunk_mr should be fully contained within the "used" region.
    // The last chunk of the stride is clipped to used.end().
    MemRegion chunk_mr = MemRegion(addr_for(chunk_card_start),
                                   chunk_card_end >= end_card ?
                                     used.end() : addr_for(chunk_card_end));
    assert(chunk_mr.word_size() > 0, "[chunk_card_start > used_end)");
    assert(used.contains(chunk_mr), "chunk_mr should be subset of used");

    DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, precision(),
                                                     cl->gen_boundary());
    ClearNoncleanCardWrapper clear_cl(dcto_cl, ct);

    // Process the chunk.
    process_chunk_boundaries(sp,
                             dcto_cl,
                             chunk_mr,
                             used,
                             lowest_non_clean,
                             lowest_non_clean_base_chunk_index,
                             lowest_non_clean_chunk_size);

    // We want the LNC array updates above in process_chunk_boundaries
    // to be visible before any of the card table value changes as a
    // result of the dirty card iteration below.
    OrderAccess::storestore();

    // We do not call the non_clean_card_iterate_serial() version because
    // we want to clear the cards: clear_cl here does the work of finding
    // contiguous dirty ranges of cards to process and clear.
    clear_cl.do_MemRegion(chunk_mr);

    // Find the next chunk of the stride.
    chunk_card_start += ParGCCardsPerStrideChunk * n_strides;
  }
}
156 // If you want a talkative process_chunk_boundaries,
157 // then #define NOISY(x) x
158 #ifdef NOISY
159 #error "Encountered a global preprocessor flag, NOISY, which might clash with local definition to follow"
160 #else
161 #define NOISY(x)
162 #endif
// Fix up the bookkeeping at the two boundaries of "chunk_mr" so that
// neighbouring chunks, scanned by other threads, neither miss nor
// double-scan any card:
//  - publish this chunk's lowest non-clean card in the LNC array, so the
//    left neighbour knows where to stop scanning an object that straddles
//    our mutual boundary; and
//  - compute "max_to_do", the exclusive upper bound past our right boundary
//    up to which this thread may scan (for an object straddling it), and
//    install it via dcto_cl->set_min_done().
void
CardTableModRefBS::
process_chunk_boundaries(Space* sp,
                         DirtyCardToOopClosure* dcto_cl,
                         MemRegion chunk_mr,
                         MemRegion used,
                         jbyte** lowest_non_clean,
                         uintptr_t lowest_non_clean_base_chunk_index,
                         size_t    lowest_non_clean_chunk_size)
{
  // We must worry about non-array objects that cross chunk boundaries,
  // because such objects are both precisely and imprecisely marked:
  // .. if the head of such an object is dirty, the entire object
  //    needs to be scanned, under the interpretation that this
  //    was an imprecise mark
  // .. if the head of such an object is not dirty, we can assume
  //    precise marking and it's efficient to scan just the dirty
  //    cards.
  // In either case, each scanned reference must be scanned precisely
  // once so as to avoid cloning of a young referent. For efficiency,
  // our closures depend on this property and do not protect against
  // double scans.

  // Index of this chunk within the LNC array (base-relative).
  uintptr_t cur_chunk_index = addr_to_chunk_index(chunk_mr.start());
  cur_chunk_index           = cur_chunk_index - lowest_non_clean_base_chunk_index;

  NOISY(tty->print_cr("===========================================================================");)
  NOISY(tty->print_cr(" process_chunk_boundary: Called with [" PTR_FORMAT "," PTR_FORMAT ")",
                      chunk_mr.start(), chunk_mr.end());)

  // First, set "our" lowest_non_clean entry, which would be
  // used by the thread scanning an adjoining left chunk with
  // a non-array object straddling the mutual boundary.
  // Find the object that spans our boundary, if one exists.
  // first_block is the block possibly straddling our left boundary.
  HeapWord* first_block = sp->block_start(chunk_mr.start());
  assert((chunk_mr.start() != used.start()) || (first_block == chunk_mr.start()),
         "First chunk should always have a co-initial block");
  // Does the block straddle the chunk's left boundary, and is it
  // a non-array object?
  if (first_block < chunk_mr.start()        // first block straddles left bdry
      && sp->block_is_obj(first_block)      // first block is an object
      && !(oop(first_block)->is_objArray()  // first block is not an array (arrays are precisely dirtied)
           || oop(first_block)->is_typeArray())) {
    // Find our least non-clean card, so that a left neighbour
    // does not scan an object straddling the mutual boundary
    // too far to the right, and attempt to scan a portion of
    // that object twice.
    jbyte* first_dirty_card = NULL;
    jbyte* last_card_of_first_obj =
        byte_for(first_block + sp->block_size(first_block) - 1);
    jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start());
    jbyte* last_card_of_cur_chunk = byte_for(chunk_mr.last());
    // Scan no further than the straddling object's last card or the
    // chunk's last card, whichever comes first.
    jbyte* last_card_to_check =
      (jbyte*) MIN2((intptr_t) last_card_of_cur_chunk,
                    (intptr_t) last_card_of_first_obj);
    // Note that this does not need to go beyond our last card
    // if our first object completely straddles this chunk.
    for (jbyte* cur = first_card_of_cur_chunk;
         cur <= last_card_to_check; cur++) {
      jbyte val = *cur;
      if (card_will_be_scanned(val)) {
        first_dirty_card = cur; break;
      } else {
        assert(!card_may_have_been_dirty(val), "Error");
      }
    }
    if (first_dirty_card != NULL) {
      NOISY(tty->print_cr(" LNC: Found a dirty card at " PTR_FORMAT " in current chunk",
                    first_dirty_card);)
      assert(0 <= cur_chunk_index && cur_chunk_index < lowest_non_clean_chunk_size,
             "Bounds error.");
      assert(lowest_non_clean[cur_chunk_index] == NULL,
             "Write exactly once : value should be stable hereafter for this round");
      lowest_non_clean[cur_chunk_index] = first_dirty_card;
    } NOISY(else {
      tty->print_cr(" LNC: Found no dirty card in current chunk; leaving LNC entry NULL");
      // In the future, we could have this thread look for a non-NULL value to copy from its
      // right neighbour (up to the end of the first object).
      if (last_card_of_cur_chunk < last_card_of_first_obj) {
        tty->print_cr(" LNC: BEWARE!!! first obj straddles past right end of chunk:\n"
                      "   might be efficient to get value from right neighbour?");
      }
    })
  } else {
    // In this case we can help our neighbour by just asking them
    // to stop at our first card (even though it may not be dirty).
    NOISY(tty->print_cr(" LNC: first block is not a non-array object; setting LNC to first card of current chunk");)
    assert(lowest_non_clean[cur_chunk_index] == NULL, "Write once : value should be stable hereafter");
    jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start());
    lowest_non_clean[cur_chunk_index] = first_card_of_cur_chunk;
  }
  NOISY(tty->print_cr(" process_chunk_boundary: lowest_non_clean[" INTPTR_FORMAT "] = " PTR_FORMAT
                "   which corresponds to the heap address " PTR_FORMAT,
                cur_chunk_index, lowest_non_clean[cur_chunk_index],
                (lowest_non_clean[cur_chunk_index] != NULL)
                ? addr_for(lowest_non_clean[cur_chunk_index])
                : NULL);)
  NOISY(tty->print_cr("---------------------------------------------------------------------------");)

  // Next, set our own max_to_do, which will strictly/exclusively bound
  // the highest address that we will scan past the right end of our chunk.
  HeapWord* max_to_do = NULL;
  if (chunk_mr.end() < used.end()) {
    // This is not the last chunk in the used region.
    // What is our last block? We check the first block of
    // the next (right) chunk rather than strictly check our last block
    // because it's potentially more efficient to do so.
    HeapWord* const last_block = sp->block_start(chunk_mr.end());
    assert(last_block <= chunk_mr.end(), "In case this property changes.");
    if ((last_block == chunk_mr.end())     // our last block does not straddle boundary
        || !sp->block_is_obj(last_block)   // last_block isn't an object
        || oop(last_block)->is_objArray()  // last_block is an array (precisely marked)
        || oop(last_block)->is_typeArray()) {
      max_to_do = chunk_mr.end();
      NOISY(tty->print_cr(" process_chunk_boundary: Last block on this card is not a non-array object;\n"
                          "   max_to_do left at " PTR_FORMAT, max_to_do);)
    } else {
      assert(last_block < chunk_mr.end(), "Tautology");
      // It is a non-array object that straddles the right boundary of this chunk.
      // last_obj_card is the card corresponding to the start of the last object
      // in the chunk.  Note that the last object may not start in
      // the chunk.
      jbyte* const last_obj_card = byte_for(last_block);
      const jbyte val = *last_obj_card;
      if (!card_will_be_scanned(val)) {
        assert(!card_may_have_been_dirty(val), "Error");
        // The card containing the head is not dirty.  Any marks on
        // subsequent cards still in this chunk must have been made
        // precisely; we can cap processing at the end of our chunk.
        max_to_do = chunk_mr.end();
        NOISY(tty->print_cr(" process_chunk_boundary: Head of last object on this card is not dirty;\n"
                            "   max_to_do left at " PTR_FORMAT,
                            max_to_do);)
      } else {
        // The last object must be considered dirty, and extends onto the
        // following chunk.  Look for a dirty card in that chunk that will
        // bound our processing.
        jbyte* limit_card = NULL;
        const size_t last_block_size = sp->block_size(last_block);
        jbyte* const last_card_of_last_obj =
          byte_for(last_block + last_block_size - 1);
        jbyte* const first_card_of_next_chunk = byte_for(chunk_mr.end());
        // This search potentially goes a long distance looking
        // for the next card that will be scanned, terminating
        // at the end of the last_block, if no earlier dirty card
        // is found.
        assert(byte_for(chunk_mr.end()) - byte_for(chunk_mr.start()) == ParGCCardsPerStrideChunk,
               "last card of next chunk may be wrong");
        for (jbyte* cur = first_card_of_next_chunk;
             cur <= last_card_of_last_obj; cur++) {
          const jbyte val = *cur;
          if (card_will_be_scanned(val)) {
            NOISY(tty->print_cr(" Found a non-clean card " PTR_FORMAT " with value 0x%x",
                                cur, (int)val);)
            limit_card = cur; break;
          } else {
            assert(!card_may_have_been_dirty(val), "Error: card can't be skipped");
          }
        }
        if (limit_card != NULL) {
          max_to_do = addr_for(limit_card);
          assert(limit_card != NULL && max_to_do != NULL, "Error");
          NOISY(tty->print_cr(" process_chunk_boundary: Found a dirty card at " PTR_FORMAT
                        "   max_to_do set at " PTR_FORMAT " which is before end of last block in chunk: "
                        PTR_FORMAT " + " PTR_FORMAT " = " PTR_FORMAT,
                        limit_card, max_to_do, last_block, last_block_size, (last_block+last_block_size));)
        } else {
          // The following is a pessimistic value, because it's possible
          // that a dirty card on a subsequent chunk has been cleared by
          // the time we get to look at it; we'll correct for that further below,
          // using the LNC array which records the least non-clean card
          // before cards were cleared in a particular chunk.
          limit_card = last_card_of_last_obj;
          max_to_do = last_block + last_block_size;
          assert(limit_card != NULL && max_to_do != NULL, "Error");
          NOISY(tty->print_cr(" process_chunk_boundary: Found no dirty card before end of last block in chunk\n"
                              "   Setting limit_card to " PTR_FORMAT
                              " and max_to_do " PTR_FORMAT " + " PTR_FORMAT " = " PTR_FORMAT,
                              limit_card, last_block, last_block_size, max_to_do);)
        }
        assert(0 < cur_chunk_index+1 && cur_chunk_index+1 < lowest_non_clean_chunk_size,
               "Bounds error.");
        // It is possible that a dirty card for the last object may have been
        // cleared before we had a chance to examine it. In that case, the value
        // will have been logged in the LNC for that chunk.
        // We need to examine as many chunks to the right as this object
        // covers. However, we need to bound this checking to the largest
        // entry in the LNC array: this is because the heap may expand
        // after the LNC array has been created but before we reach this point,
        // and the last block in our chunk may have been expanded to include
        // the expansion delta (and possibly subsequently allocated from, so
        // it wouldn't be sufficient to check whether that last block was
        // or was not an object at this point).
        uintptr_t last_chunk_index_to_check = addr_to_chunk_index(last_block + last_block_size - 1)
          - lowest_non_clean_base_chunk_index;
        const uintptr_t last_chunk_index    = addr_to_chunk_index(used.last())
          - lowest_non_clean_base_chunk_index;
        if (last_chunk_index_to_check > last_chunk_index) {
          // The heap expanded during this scavenge and our straddling object
          // now extends past the end of the LNC array (the fix for 7048782):
          // clamp the check to the last valid LNC entry.
          assert(last_block + last_block_size > used.end(),
                 err_msg("Inconsistency detected: last_block [" PTR_FORMAT "," PTR_FORMAT "]"
                         " does not exceed used.end() = " PTR_FORMAT ","
                         " yet last_chunk_index_to_check " INTPTR_FORMAT
                         " exceeds last_chunk_index " INTPTR_FORMAT,
                         last_chunk_index_to_check, last_chunk_index));
          assert(sp->used_region().end() > used.end(),
                 err_msg("Expansion did not happen: "
                         "[" PTR_FORMAT "," PTR_FORMAT ") -> [" PTR_FORMAT "," PTR_FORMAT ")",
                         sp->used_region().start(), sp->used_region().end(), used.start(), used.end()));
          NOISY(tty->print_cr(" process_chunk_boundary: heap expanded; explicitly bounding last_chunk");)
          last_chunk_index_to_check = last_chunk_index;
        }
        for (uintptr_t lnc_index = cur_chunk_index + 1;
             lnc_index <= last_chunk_index_to_check;
             lnc_index++) {
          jbyte* lnc_card = lowest_non_clean[lnc_index];
          if (lnc_card != NULL) {
            // we can stop at the first non-NULL entry we find
            if (lnc_card <= limit_card) {
              NOISY(tty->print_cr(" process_chunk_boundary: LNC card " PTR_FORMAT " is lower than limit_card " PTR_FORMAT,
                                  "   max_to_do will be lowered to " PTR_FORMAT " from " PTR_FORMAT,
                                  lnc_card, limit_card, addr_for(lnc_card), max_to_do);)
              limit_card = lnc_card;
              max_to_do = addr_for(limit_card);
              assert(limit_card != NULL && max_to_do != NULL, "Error");
            }
            // In any case, we break now
            break;
          }  // else continue to look for a non-NULL entry if any
        }
        assert(limit_card != NULL && max_to_do != NULL, "Error");
      }
      assert(max_to_do != NULL, "OOPS 1 !");
    }
    assert(max_to_do != NULL, "OOPS 2!");
  } else {
    // Last chunk in the used region: nothing lies to our right.
    max_to_do = used.end();
    NOISY(tty->print_cr(" process_chunk_boundary: Last chunk of this space;\n"
                  "   max_to_do left at " PTR_FORMAT,
                  max_to_do);)
  }
  assert(max_to_do != NULL, "OOPS 3!");
  // Now we can set the closure we're using so it doesn't to beyond
  // max_to_do.
  dcto_cl->set_min_done(max_to_do);
#ifndef PRODUCT
  dcto_cl->set_last_bottom(max_to_do);
#endif
  NOISY(tty->print_cr("===========================================================================\n");)
}
415 #undef NOISY
// Return (via the reference out-parameters) the lowest_non_clean (LNC)
// array, its base chunk index, and its size for the covered region
// containing "sp"'s bottom.  At most once per collection the array is
// (re)sized to match the number of chunks needed to cover the region,
// using double-checked locking on ParGCRareEvent_lock keyed by
// _last_LNC_resizing_collection[i].
void
CardTableModRefBS::
get_LNC_array_for_space(Space* sp,
                        jbyte**& lowest_non_clean,
                        uintptr_t& lowest_non_clean_base_chunk_index,
                        size_t& lowest_non_clean_chunk_size) {

  int i        = find_covering_region_containing(sp->bottom());
  MemRegion covered = _covered[i];
  size_t n_chunks = chunks_to_cover(covered);

  // Only the first thread to obtain the lock will resize the
  // LNC array for the covered region.  Any later expansion can't affect
  // the used_at_save_marks region.
  // (I observed a bug in which the first thread to execute this would
  // resize, and then it would cause "expand_and_allocate" that would
  // increase the number of chunks in the covered region.  Then a second
  // thread would come and execute this, see that the size didn't match,
  // and free and allocate again.  So the first thread would be using a
  // freed "_lowest_non_clean" array.)

  // Do a dirty read here. If we pass the conditional then take the rare
  // event lock and do the read again in case some other thread had already
  // succeeded and done the resize.
  int cur_collection = Universe::heap()->total_collections();
  if (_last_LNC_resizing_collection[i] != cur_collection) {
    MutexLocker x(ParGCRareEvent_lock);
    // Re-check under the lock: another thread may have resized already.
    if (_last_LNC_resizing_collection[i] != cur_collection) {
      if (_lowest_non_clean[i] == NULL ||
          n_chunks != _lowest_non_clean_chunk_size[i]) {

        // Should we delete the old?
        if (_lowest_non_clean[i] != NULL) {
          assert(n_chunks != _lowest_non_clean_chunk_size[i],
                 "logical consequence");
          FREE_C_HEAP_ARRAY(CardPtr, _lowest_non_clean[i]);
          _lowest_non_clean[i] = NULL;
        }
        // Now allocate a new one if necessary.
        if (_lowest_non_clean[i] == NULL) {
          _lowest_non_clean[i]                  = NEW_C_HEAP_ARRAY(CardPtr, n_chunks);
          _lowest_non_clean_chunk_size[i]       = n_chunks;
          _lowest_non_clean_base_chunk_index[i] = addr_to_chunk_index(covered.start());
          // Entries start NULL; process_chunk_boundaries() fills them in.
          for (int j = 0; j < (int)n_chunks; j++)
            _lowest_non_clean[i][j] = NULL;
        }
      }
      _last_LNC_resizing_collection[i] = cur_collection;
    }
  }
  // In any case, now do the initialization.
  lowest_non_clean                  = _lowest_non_clean[i];
  lowest_non_clean_base_chunk_index = _lowest_non_clean_base_chunk_index[i];
  lowest_non_clean_chunk_size       = _lowest_non_clean_chunk_size[i];
}