-
Notifications
You must be signed in to change notification settings - Fork 5.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
8310031: Parallel: Implement better work distribution for large object arrays in old gen #14846
Changes from 1 commit
53c7b66
0eb924e
5b802ed
d7ab2b0
67edf28
d535a10
9a2b230
3e6c1b7
71a3c44
bba1d2a
ac6bddb
86747ff
268e208
6033358
d4b5fd4
a4d6af9
d75bd60
50737dd
780a03d
817b164
22fe849
78a08cf
d845e65
8b13c83
272ab97
8b544d8
8b22c28
e212413
1462bbf
c9e040f
443f482
381e001
d12e96e
607f0c2
bbb128e
f796551
26f0636
7843a02
bd853c4
fd5d072
7c20c9f
67416b2
a443042
71b0848
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -124,60 +124,6 @@ static void prefetch_write(void *p) { | |||||
} | ||||||
} | ||||||
|
||||||
// postcondition: ret is a dirty card or end_card | ||||||
CardTable::CardValue* PSCardTable::find_first_dirty_card(CardValue* const start_card, | ||||||
CardValue* const end_card) { | ||||||
for (CardValue* i_card = start_card; i_card < end_card; ++i_card) { | ||||||
if (is_dirty(i_card)) { | ||||||
return i_card; | ||||||
} | ||||||
} | ||||||
return end_card; | ||||||
} | ||||||
|
||||||
// postcondition: ret is a clean card or end_card | ||||||
// Note: if a part of an object is on a dirty card, all cards this object | ||||||
// resides on are considered dirty. | ||||||
template <typename T> | ||||||
CardTable::CardValue* PSCardTable::find_first_clean_card(T start_cache, | ||||||
CardValue* const start_card, | ||||||
CardValue* const end_card) { | ||||||
assert(start_card == end_card || | ||||||
*start_card != PSCardTable::clean_card_val(), "precondition"); | ||||||
// Skip the first dirty card. | ||||||
CardValue* i_card = start_card + 1; | ||||||
while (i_card < end_card) { | ||||||
if (*i_card != PSCardTable::clean_card_val()) { | ||||||
i_card++; | ||||||
continue; | ||||||
} | ||||||
assert(i_card - 1 >= start_card, "inv"); | ||||||
assert(*(i_card - 1) != PSCardTable::clean_card_val(), "prev card must be dirty"); | ||||||
// Find the final obj on the prev dirty card. | ||||||
HeapWord* obj_addr = start_cache.object_start(addr_for(i_card)-1); | ||||||
if (cast_to_oop(obj_addr)->is_objArray()) { | ||||||
// Object arrays are precisely marked. | ||||||
return i_card; | ||||||
} | ||||||
HeapWord* obj_end_addr = obj_addr + cast_to_oop(obj_addr)->size(); | ||||||
CardValue* final_card_by_obj = byte_for(obj_end_addr - 1); | ||||||
if (final_card_by_obj <= i_card) { | ||||||
return i_card; | ||||||
} | ||||||
// This final obj extends beyond i_card. Check if it extends beyond end_card. | ||||||
if (final_card_by_obj >= end_card) { | ||||||
return end_card; | ||||||
} | ||||||
// Check if this new card is dirty. | ||||||
if (*final_card_by_obj == PSCardTable::clean_card_val()) { | ||||||
return final_card_by_obj; | ||||||
} | ||||||
// This new card is dirty, continuing the search... | ||||||
i_card = final_card_by_obj + 1; | ||||||
} | ||||||
return end_card; | ||||||
} | ||||||
|
||||||
void PSCardTable::scan_obj(PSPromotionManager* pm, | ||||||
oop obj) { | ||||||
prefetch_write(obj); | ||||||
|
@@ -192,66 +138,29 @@ void PSCardTable::scan_obj_with_limit(PSPromotionManager* pm, | |||||
pm->push_contents_bounded(obj, start, end); | ||||||
} | ||||||
|
||||||
// ObjectStartArray queries can get expensive if the start is far. The | ||||||
// information can be cached if iterating monotonically from lower to higher | ||||||
// addresses. This is vital with parallel processing of large objects. | ||||||
class ObjStartCache : public StackObj { | ||||||
HeapWord* _obj_start; | ||||||
HeapWord* _obj_end; | ||||||
ObjectStartArray* _start_array; | ||||||
DEBUG_ONLY(HeapWord* _prev_query); | ||||||
public: | ||||||
ObjStartCache(ObjectStartArray* start_array) : _obj_start(nullptr), _obj_end(nullptr), | ||||||
_start_array(start_array) | ||||||
DEBUG_ONLY(COMMA _prev_query(nullptr)) {} | ||||||
HeapWord* object_start(HeapWord* addr) { | ||||||
assert(_prev_query <= addr, "precondition"); | ||||||
DEBUG_ONLY(_prev_query = addr); | ||||||
if (addr >= _obj_end) { | ||||||
_obj_start = _start_array->object_start(addr); | ||||||
_obj_end = _obj_start + cast_to_oop(_obj_start)->size(); | ||||||
} | ||||||
assert(_obj_start != nullptr, "postcondition"); | ||||||
return _obj_start; | ||||||
} | ||||||
}; | ||||||
|
||||||
void PSCardTable::pre_scavenge(HeapWord* old_gen_bottom, uint active_workers) { | ||||||
_pre_scavenge_active_workers = active_workers; | ||||||
_pre_scavenge_current_goal_active_workers = active_workers; | ||||||
_pre_scavenge_current_goal = old_gen_bottom + _pre_scavenge_sync_interval; | ||||||
_pre_scavenge_completed_top = nullptr; | ||||||
} | ||||||
|
||||||
void PSCardTable::clear_cards(CardValue* const start, CardValue* const end) { | ||||||
for (CardValue* i_card = start; i_card < end; ++i_card) { | ||||||
*i_card = clean_card_val(); | ||||||
} | ||||||
} | ||||||
|
||||||
// Find cards within [start, end) marked dirty, clear corresponding parts of the | ||||||
// card table and scan objects on dirty cards. | ||||||
// Scanning of objects is limited to [start, end) of a stripe. This way the | ||||||
// scanning of objects crossing stripe boundaries is distributed. For this the | ||||||
// dirty marks of imprecisely marked non-array objects are propagated | ||||||
// pre-scavenge to the stripes they extend to. | ||||||
// Except for the limitation to the [start, end) stripe non-array objects are scanned completely. | ||||||
// Object arrays are marked precisely. Therefore the scanning is limited to dirty cards. | ||||||
template <typename T> | ||||||
void PSCardTable::process_range(T& start_cache, | ||||||
// Scavenge objects on dirty cards of the given stripe [start, end). Accesses to | ||||||
// the card table and scavenging is strictly limited to the stripe. The work on | ||||||
// objects covering multiple stripes is shared among the worker threads owning the | ||||||
// stripes. To support this the card table is preprocessed before | ||||||
// scavenge. Imprecise dirty marks of non-objArrays are copied from start stripes | ||||||
// to all stripes (if any) they extend to. | ||||||
// A copy of card table entries corresponding to the stripe called "shadow" table | ||||||
// is used to separate card reading, clearing and redirtying. | ||||||
template <typename Func> | ||||||
void PSCardTable::process_range(Func&& object_start, | ||||||
PSPromotionManager* pm, | ||||||
HeapWord* const start, | ||||||
HeapWord* const end) { | ||||||
assert(start < end, "precondition"); | ||||||
assert(is_card_aligned(start), "precondition"); | ||||||
|
||||||
// end might not be card-aligned | ||||||
CardValue* itr_limit_r = byte_for(end - 1) + 1; | ||||||
CardValue* clr_limit_r = byte_for(end); | ||||||
|
||||||
CardValue* dirty_l; | ||||||
CardValue* dirty_r; | ||||||
|
||||||
// Helper struct to keep the following code compact. | ||||||
struct Obj { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The scan-dirty-chunk code below is already quite compact; this struct-def is bulky in contrast. Maybe using plain variables inside while-loop is sufficient. Not sure either is definitely superior though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. Done. |
||||||
HeapWord* addr; | ||||||
|
@@ -270,63 +179,62 @@ void PSCardTable::process_range(T& start_cache, | |||||
} | ||||||
}; | ||||||
|
||||||
for (CardValue* cur_card = byte_for(start); cur_card < itr_limit_r; cur_card = dirty_r + 1) { | ||||||
dirty_l = find_first_dirty_card(cur_card, itr_limit_r); | ||||||
dirty_r = find_first_clean_card(start_cache, dirty_l, itr_limit_r); | ||||||
StripeShadowTable sct(this, MemRegion(start, end)); | ||||||
|
||||||
// end might not be card-aligned | ||||||
reinrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
const CardValue* end_card = sct.card_for(end - 1) + 1; | ||||||
|
||||||
for (HeapWord* i_addr = start; i_addr < end; /* empty */) { | ||||||
const CardValue* dirty_l = sct.find_first_dirty_card(sct.card_for(i_addr), end_card); | ||||||
const CardValue* dirty_r = sct.find_first_clean_card(dirty_l, end_card); | ||||||
|
||||||
assert(dirty_l <= dirty_r, "inv"); | ||||||
|
||||||
if (dirty_l == dirty_r) { | ||||||
assert(dirty_r == itr_limit_r, "inv"); | ||||||
assert(dirty_r == end_card, "inv"); | ||||||
break; | ||||||
} | ||||||
|
||||||
// Located a non-empty dirty chunk [dirty_l, dirty_r) | ||||||
reinrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
HeapWord* addr_l = addr_for(dirty_l); | ||||||
HeapWord* addr_r = MIN2(addr_for(dirty_r), end); | ||||||
|
||||||
// Clear the cards before scanning. | ||||||
clear_cards(dirty_l, MIN2(dirty_r, clr_limit_r)); | ||||||
HeapWord* addr_l = sct.addr_for(dirty_l); | ||||||
HeapWord* addr_r = MIN2(sct.addr_for(dirty_r), end); | ||||||
|
||||||
// Scan objects overlapping [addr_l, addr_r) limited to [start, end) | ||||||
reinrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
Obj obj(start_cache.object_start(addr_l)); | ||||||
Obj obj(object_start(addr_l)); | ||||||
|
||||||
// Scan non-objArray reaching into stripe and into [addr_l, addr_r). | ||||||
if (!obj.is_obj_array && obj.addr < start) { | ||||||
scan_obj_with_limit(pm, obj.obj, start, MIN2(obj.end_addr, end)); | ||||||
if (obj.end_addr >= addr_r) { | ||||||
// Last object in [addr_l, addr_r) | ||||||
continue; | ||||||
} | ||||||
// move to next obj inside this dirty chunk | ||||||
obj.next(); | ||||||
} | ||||||
while (true) { | ||||||
assert(obj.addr < addr_r, "inv"); | ||||||
|
||||||
// Scan objects overlapping [addr_l, addr_r). | ||||||
// Non-objArrays are known to start within the stripe. They are scanned completely. | ||||||
// Scanning of objArrays is limited to the dirty chunk [addr_l, addr_r). | ||||||
while (obj.end_addr < addr_r) { | ||||||
if (obj.is_obj_array) { | ||||||
// precisely marked | ||||||
// precise-marked | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
scan_obj_with_limit(pm, obj.obj, addr_l, addr_r); | ||||||
} else { | ||||||
assert(obj.addr >= start, "handled before"); | ||||||
// scan whole obj | ||||||
scan_obj(pm, obj.obj); | ||||||
if (obj.addr < addr_l) { | ||||||
if (sct.is_any_dirty(sct.card_for(MAX2(obj.addr, start)), dirty_l)) { | ||||||
// already scanned | ||||||
} else { | ||||||
// treat it as semi-precise-marked, [addr_l, obj-end) | ||||||
scan_obj_with_limit(pm, obj.obj, addr_l, MIN2(obj.end_addr, end)); | ||||||
} | ||||||
} else { | ||||||
// obj-start is dirty | ||||||
if (obj.end_addr <= end) { | ||||||
// scan whole obj if it does not extend beyond the stripe end | ||||||
scan_obj(pm, obj.obj); | ||||||
} else { | ||||||
// otherwise scan limit the scan | ||||||
scan_obj_with_limit(pm, obj.obj, addr_l, end); | ||||||
} | ||||||
} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if the else branch can be replaced by: if (obj.addr < i_addr && obj.addr > start) {
// already-scanned
} else {
scan_obj_with_limit(pm, obj.obj, addr_l, end);
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You mean to replace L220-L227, right? There we know There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I mean L212 to L228. "Comment on lines +212 to +228" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes indeed. Looks correct to me. Nice simplification! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I looked at this at first but thought it is obviously wrong if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get It reproduces well but not 100%. Need to look into it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
} | ||||||
|
||||||
// move to next obj | ||||||
obj.next(); | ||||||
} | ||||||
if (obj.end_addr >= addr_r) { | ||||||
i_addr = obj.is_obj_array ? addr_r : obj.end_addr; | ||||||
break; | ||||||
} | ||||||
|
||||||
// Scan object that extends beyond [addr_l, addr_r) and maybe even beyond the stripe. | ||||||
assert(obj.addr < addr_r, "inv"); | ||||||
if (obj.is_obj_array) { | ||||||
// precise-marked | ||||||
scan_obj_with_limit(pm, obj.obj, addr_l, addr_r); | ||||||
} else { | ||||||
assert(obj.addr >= start, "handled before"); | ||||||
scan_obj_with_limit(pm, obj.obj, obj.addr, MIN2(obj.end_addr, end)); | ||||||
// move to next obj inside this dirty chunk | ||||||
reinrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
obj.next(); | ||||||
} | ||||||
|
||||||
// Finished a dirty chunk | ||||||
reinrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
@@ -336,22 +244,22 @@ void PSCardTable::process_range(T& start_cache, | |||||
|
||||||
// Propagate imprecise card marks from object start to the stripes an object extends to. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
(I saw that this is actually duplicated from the .hpp file. Better to improve the one in the .hpp file and remove this one) |
||||||
// Pre-scavenging and scavenging can overlap. | ||||||
void PSCardTable::pre_scavenge_parallel(ObjectStartArray* start_array, | ||||||
HeapWord* old_gen_bottom, | ||||||
HeapWord* old_gen_top, | ||||||
uint stripe_index, | ||||||
uint n_stripes) { | ||||||
template <typename Func> | ||||||
void PSCardTable::pre_scavenge_parallel(Func&& object_start, | ||||||
HeapWord* old_gen_bottom, | ||||||
HeapWord* old_gen_top, | ||||||
uint stripe_index, | ||||||
uint n_stripes) { | ||||||
const uint active_workers = n_stripes; | ||||||
reinrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
const size_t num_cards_in_slice = num_cards_in_stripe * n_stripes; | ||||||
CardValue* cur_card = byte_for(old_gen_bottom) + stripe_index * num_cards_in_stripe; | ||||||
CardValue* const end_card = byte_for(old_gen_top - 1) + 1; | ||||||
HeapWord* signaled_goal = nullptr; | ||||||
reinrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
ObjStartCache start_cache(start_array); | ||||||
|
||||||
for ( /* empty */ ; cur_card < end_card; cur_card += num_cards_in_slice) { | ||||||
reinrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
HeapWord* stripe_addr = addr_for(cur_card); | ||||||
if (!is_dirty(cur_card)) { | ||||||
HeapWord* first_obj_addr = start_cache.object_start(stripe_addr); | ||||||
HeapWord* first_obj_addr = object_start(stripe_addr); | ||||||
if (first_obj_addr < stripe_addr) { | ||||||
oop first_obj = cast_to_oop(first_obj_addr); | ||||||
if (!first_obj->is_array() && is_dirty(byte_for(first_obj_addr))) { | ||||||
|
@@ -422,15 +330,38 @@ void PSCardTable::scavenge_contents_parallel(ObjectStartArray* start_array, | |||||
PSPromotionManager* pm, | ||||||
uint stripe_index, | ||||||
uint n_stripes) { | ||||||
struct { | ||||||
HeapWord* start_addr; | ||||||
HeapWord* end_addr; | ||||||
DEBUG_ONLY(HeapWord* _prev_query); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This debug-field clutters the real logic, IMO. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've removed it. |
||||||
} cached_obj {nullptr, old_gen_bottom DEBUG_ONLY(COMMA nullptr)}; | ||||||
|
||||||
auto object_start = [&] (HeapWord* addr) { | ||||||
assert(cached_obj._prev_query <= addr, "precondition"); | ||||||
DEBUG_ONLY(cached_obj._prev_query = addr); | ||||||
if (addr < cached_obj.end_addr) { | ||||||
assert(cached_obj.start_addr != nullptr, "inv"); | ||||||
return cached_obj.start_addr; | ||||||
} | ||||||
HeapWord* result = start_array->object_start(addr); | ||||||
|
||||||
cached_obj.start_addr = result; | ||||||
cached_obj.end_addr = result + cast_to_oop(result)->size(); | ||||||
|
||||||
return result; | ||||||
}; | ||||||
|
||||||
const size_t stripe_size_in_words = num_cards_in_stripe * _card_size_in_words; | ||||||
reinrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
const size_t slice_size_in_words = stripe_size_in_words * n_stripes; | ||||||
|
||||||
// Prepare scavenge | ||||||
reinrich marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
pre_scavenge_parallel(start_array, old_gen_bottom, old_gen_top, stripe_index, n_stripes); | ||||||
pre_scavenge_parallel(object_start, old_gen_bottom, old_gen_top, stripe_index, n_stripes); | ||||||
|
||||||
// Reset cached object | ||||||
cached_obj = {nullptr, old_gen_bottom DEBUG_ONLY(COMMA nullptr)}; | ||||||
|
||||||
// Scavenge | ||||||
HeapWord* cur_stripe_addr = old_gen_bottom + stripe_index * stripe_size_in_words; | ||||||
ObjStartCache start_cache(start_array); | ||||||
bool pre_scavenge_complete = false; | ||||||
for (/* empty */; cur_stripe_addr < old_gen_top; cur_stripe_addr += slice_size_in_words) { | ||||||
HeapWord* const stripe_l = cur_stripe_addr; | ||||||
|
@@ -447,7 +378,7 @@ void PSCardTable::scavenge_contents_parallel(ObjectStartArray* start_array, | |||||
pre_scavenge_complete = Atomic::load_acquire(&_pre_scavenge_active_workers) == 0; | ||||||
} | ||||||
|
||||||
process_range(start_cache, pm, stripe_l, stripe_r); | ||||||
process_range(object_start, pm, stripe_l, stripe_r); | ||||||
} | ||||||
} | ||||||
|
||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like this documentation (at least the last part starting with the second sentence) should be placed to
scavenge_contents_parallel
because this method does not do preprocessing (butscavenge_contents_parallel
).(And the first sentence should be in the definitions in the hpp file)
Generally I have a feeling that the documentation is scattered across too many places and that imo makes things harder to understand than necessary.
I.e. I would really prefer some larger block summarizing how things work with at most small details added to the methods.
The interface (.hpp) file has no documentation whatsoever, although I would assume that this would be the entry point trying to understand what is happening here.