Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8310031: Parallel: Implement better work distribution for large object arrays in old gen #14846

Closed
Changes from 1 commit
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
53c7b66
8310031: Parallel: Implement better work distribution for large objec…
reinrich Jun 20, 2023
0eb924e
Make sure to skip stripes where no object starts
reinrich Jul 26, 2023
5b802ed
Limit effect of previous commit to large array handling
reinrich Jul 27, 2023
d7ab2b0
Apply Thomas' suggestions
reinrich Sep 11, 2023
67edf28
Merge branch 'master'
reinrich Sep 11, 2023
d535a10
objArrayOopDesc::oop_oop_iterate_bounded must be defined in objArrayO…
reinrich Sep 12, 2023
9a2b230
100 stripes per active worker thread
reinrich Sep 14, 2023
3e6c1b7
Scan large array stripe from first dirty card to stripe end
reinrich Sep 18, 2023
71a3c44
Revert back to precise scanning of large object arrays
reinrich Sep 19, 2023
bba1d2a
find_first_clean_card: avoid expensive start array queries on long ar…
reinrich Sep 20, 2023
ac6bddb
Avoid expensive start array queries on long arrays
reinrich Sep 20, 2023
86747ff
Feedback Thomas
reinrich Sep 22, 2023
268e208
Small clean-ups
reinrich Sep 25, 2023
6033358
Limit stripe size to 1m with at least 8 threads
reinrich Sep 25, 2023
d4b5fd4
Do all large array scanning in separate method
reinrich Sep 22, 2023
a4d6af9
First card of large array should be cleared if dirty
reinrich Sep 25, 2023
d75bd60
Eliminate special case for scanning the large array end
reinrich Sep 26, 2023
50737dd
Remove stripe size adaptations and cache potentially expensive start …
reinrich Sep 27, 2023
780a03d
Reset to master
reinrich Sep 28, 2023
817b164
Split work strictly at stripe boundaries
reinrich Oct 5, 2023
22fe849
Parallel copying of imprecise marks to stripes
reinrich Oct 5, 2023
78a08cf
Overlap scavenge with pre-scavenge
reinrich Oct 6, 2023
d845e65
Missed acquire semantics
reinrich Oct 6, 2023
8b13c83
Cleanup
reinrich Oct 9, 2023
272ab97
find_first_clean_card: return end_card if final object extends beyond…
reinrich Oct 6, 2023
8b544d8
Shadow table per stripe
reinrich Oct 9, 2023
8b22c28
Cleanup
reinrich Oct 11, 2023
e212413
Don't overlap card table processing with scavenging for simplicity
reinrich Oct 11, 2023
1462bbf
Simplification suggested by Albert
reinrich Oct 11, 2023
c9e040f
Make sure to scan obj reaching in just once
reinrich Oct 12, 2023
443f482
Re-cleanup (was accidentally reverted)
reinrich Oct 12, 2023
381e001
Merge branch 'master'
reinrich Oct 12, 2023
d12e96e
Feedback Albert
reinrich Oct 12, 2023
607f0c2
Remove obsolete comment
reinrich Oct 13, 2023
bbb128e
Merge branch 'master'
reinrich Oct 19, 2023
f796551
Use better name: _preprocessing_active_workers
reinrich Oct 19, 2023
26f0636
preprocess_card_table_parallel should be private
reinrich Oct 19, 2023
7843a02
Small cleanup changes suggested by Thomas.
reinrich Oct 19, 2023
bd853c4
More small changes Thomas suggested (line-breaks needed)
reinrich Oct 19, 2023
fd5d072
Review Thomas
reinrich Oct 19, 2023
7c20c9f
Cleanup/improve comments
reinrich Oct 20, 2023
67416b2
Accepting Thomas' (smaller) suggestions
reinrich Oct 20, 2023
a443042
Review Thomas (PSStripeShadowCardTable)
reinrich Oct 20, 2023
71b0848
Forgot to move comment to PSStripeShadowCardTable.
reinrich Oct 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
231 changes: 81 additions & 150 deletions src/hotspot/share/gc/parallel/psCardTable.cpp
Original file line number Diff line number Diff line change
@@ -124,60 +124,6 @@ static void prefetch_write(void *p) {
}
}

// postcondition: ret is a dirty card or end_card
CardTable::CardValue* PSCardTable::find_first_dirty_card(CardValue* const start_card,
CardValue* const end_card) {
for (CardValue* i_card = start_card; i_card < end_card; ++i_card) {
if (is_dirty(i_card)) {
return i_card;
}
}
return end_card;
}

// postcondition: ret is a clean card or end_card
// Note: if a part of an object is on a dirty card, all cards this object
// resides on are considered dirty.
template <typename T>
CardTable::CardValue* PSCardTable::find_first_clean_card(T start_cache,
CardValue* const start_card,
CardValue* const end_card) {
assert(start_card == end_card ||
*start_card != PSCardTable::clean_card_val(), "precondition");
// Skip the first dirty card.
CardValue* i_card = start_card + 1;
while (i_card < end_card) {
if (*i_card != PSCardTable::clean_card_val()) {
i_card++;
continue;
}
assert(i_card - 1 >= start_card, "inv");
assert(*(i_card - 1) != PSCardTable::clean_card_val(), "prev card must be dirty");
// Find the final obj on the prev dirty card.
HeapWord* obj_addr = start_cache.object_start(addr_for(i_card)-1);
if (cast_to_oop(obj_addr)->is_objArray()) {
// Object arrays are precisely marked.
return i_card;
}
HeapWord* obj_end_addr = obj_addr + cast_to_oop(obj_addr)->size();
CardValue* final_card_by_obj = byte_for(obj_end_addr - 1);
if (final_card_by_obj <= i_card) {
return i_card;
}
// This final obj extends beyond i_card. Check if it extends beyond end_card.
if (final_card_by_obj >= end_card) {
return end_card;
}
// Check if this new card is dirty.
if (*final_card_by_obj == PSCardTable::clean_card_val()) {
return final_card_by_obj;
}
// This new card is dirty, continuing the search...
i_card = final_card_by_obj + 1;
}
return end_card;
}

void PSCardTable::scan_obj(PSPromotionManager* pm,
oop obj) {
prefetch_write(obj);
@@ -192,66 +138,29 @@ void PSCardTable::scan_obj_with_limit(PSPromotionManager* pm,
pm->push_contents_bounded(obj, start, end);
}

// ObjectStartArray queries can get expensive if the start is far. The
// information can be cached if iterating monotonically from lower to higher
// addresses. This is vital with parallel processing of large objects.
class ObjStartCache : public StackObj {
HeapWord* _obj_start;
HeapWord* _obj_end;
ObjectStartArray* _start_array;
DEBUG_ONLY(HeapWord* _prev_query);
public:
ObjStartCache(ObjectStartArray* start_array) : _obj_start(nullptr), _obj_end(nullptr),
_start_array(start_array)
DEBUG_ONLY(COMMA _prev_query(nullptr)) {}
HeapWord* object_start(HeapWord* addr) {
assert(_prev_query <= addr, "precondition");
DEBUG_ONLY(_prev_query = addr);
if (addr >= _obj_end) {
_obj_start = _start_array->object_start(addr);
_obj_end = _obj_start + cast_to_oop(_obj_start)->size();
}
assert(_obj_start != nullptr, "postcondition");
return _obj_start;
}
};

void PSCardTable::pre_scavenge(HeapWord* old_gen_bottom, uint active_workers) {
_pre_scavenge_active_workers = active_workers;
_pre_scavenge_current_goal_active_workers = active_workers;
_pre_scavenge_current_goal = old_gen_bottom + _pre_scavenge_sync_interval;
_pre_scavenge_completed_top = nullptr;
}

void PSCardTable::clear_cards(CardValue* const start, CardValue* const end) {
for (CardValue* i_card = start; i_card < end; ++i_card) {
*i_card = clean_card_val();
}
}

// Find cards within [start, end) marked dirty, clear corresponding parts of the
// card table and scan objects on dirty cards.
// Scanning of objects is limited to [start, end) of a stripe. This way the
// scanning of objects crossing stripe boundaries is distributed. For this the
// dirty marks of imprecisely marked non-array objects are propagated
// pre-scavenge to the stripes they extend to.
// Except for the limitation to the [start, end) stripe non-array objects are scanned completely.
// Object arrays are marked precisely. Therefore the scanning is limited to dirty cards.
template <typename T>
void PSCardTable::process_range(T& start_cache,
// Scavenge objects on dirty cards of the given stripe [start, end). Accesses to
// the card table and scavenging is strictly limited to the stripe. The work on
// objects covering multiple stripes is shared among the worker threads owning the
// stripes. To support this the card table is preprocessed before
// scavenge. Imprecise dirty marks of non-objArrays are copied from start stripes
// to all stripes (if any) they extend to.
// A copy of card table entries corresponding to the stripe called "shadow" table
// is used to separate card reading, clearing and redirtying.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this documentation (at least the last part starting with the second sentence) should be placed to scavenge_contents_parallel because this method does not do preprocessing (but scavenge_contents_parallel).
(And the first sentence should be in the definitions in the hpp file)

Generally I have a feeling that the documentation is scattered across too many places and that imo makes things harder to understand than necessary.

I.e. I would really prefer some larger block summarizing how things work with at most small details added to the methods.

The interface (.hpp) file has no documentation whatsoever, although I would assume that this would be the entry point trying to understand what is happening here.

template <typename Func>
void PSCardTable::process_range(Func&& object_start,
PSPromotionManager* pm,
HeapWord* const start,
HeapWord* const end) {
assert(start < end, "precondition");
assert(is_card_aligned(start), "precondition");

// end might not be card-aligned
CardValue* itr_limit_r = byte_for(end - 1) + 1;
CardValue* clr_limit_r = byte_for(end);

CardValue* dirty_l;
CardValue* dirty_r;

// Helper struct to keep the following code compact.
struct Obj {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The scan-dirty-chunk code below is already quite compact; this struct-def is bulky in contrast. Maybe using plain variables inside while-loop is sufficient. Not sure either is definitely superior though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. Done.

HeapWord* addr;
@@ -270,63 +179,62 @@ void PSCardTable::process_range(T& start_cache,
}
};

for (CardValue* cur_card = byte_for(start); cur_card < itr_limit_r; cur_card = dirty_r + 1) {
dirty_l = find_first_dirty_card(cur_card, itr_limit_r);
dirty_r = find_first_clean_card(start_cache, dirty_l, itr_limit_r);
StripeShadowTable sct(this, MemRegion(start, end));

// end might not be card-aligned
const CardValue* end_card = sct.card_for(end - 1) + 1;

for (HeapWord* i_addr = start; i_addr < end; /* empty */) {
const CardValue* dirty_l = sct.find_first_dirty_card(sct.card_for(i_addr), end_card);
const CardValue* dirty_r = sct.find_first_clean_card(dirty_l, end_card);

assert(dirty_l <= dirty_r, "inv");

if (dirty_l == dirty_r) {
assert(dirty_r == itr_limit_r, "inv");
assert(dirty_r == end_card, "inv");
break;
}

// Located a non-empty dirty chunk [dirty_l, dirty_r)
HeapWord* addr_l = addr_for(dirty_l);
HeapWord* addr_r = MIN2(addr_for(dirty_r), end);

// Clear the cards before scanning.
clear_cards(dirty_l, MIN2(dirty_r, clr_limit_r));
HeapWord* addr_l = sct.addr_for(dirty_l);
HeapWord* addr_r = MIN2(sct.addr_for(dirty_r), end);

// Scan objects overlapping [addr_l, addr_r) limited to [start, end)
Obj obj(start_cache.object_start(addr_l));
Obj obj(object_start(addr_l));

// Scan non-objArray reaching into stripe and into [addr_l, addr_r).
if (!obj.is_obj_array && obj.addr < start) {
scan_obj_with_limit(pm, obj.obj, start, MIN2(obj.end_addr, end));
if (obj.end_addr >= addr_r) {
// Last object in [addr_l, addr_r)
continue;
}
// move to next obj inside this dirty chunk
obj.next();
}
while (true) {
assert(obj.addr < addr_r, "inv");

// Scan objects overlapping [addr_l, addr_r).
// Non-objArrays are known to start within the stripe. They are scanned completely.
// Scanning of objArrays is limited to the dirty chunk [addr_l, addr_r).
while (obj.end_addr < addr_r) {
if (obj.is_obj_array) {
// precisely marked
// precise-marked
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// precise-marked
// Always scan obj arrays precisely (they are always marked precisely) to avoid unnecessary work.

scan_obj_with_limit(pm, obj.obj, addr_l, addr_r);
} else {
assert(obj.addr >= start, "handled before");
// scan whole obj
scan_obj(pm, obj.obj);
if (obj.addr < addr_l) {
if (sct.is_any_dirty(sct.card_for(MAX2(obj.addr, start)), dirty_l)) {
// already scanned
} else {
// treat it as semi-precise-marked, [addr_l, obj-end)
scan_obj_with_limit(pm, obj.obj, addr_l, MIN2(obj.end_addr, end));
}
} else {
// obj-start is dirty
if (obj.end_addr <= end) {
// scan whole obj if it does not extend beyond the stripe end
scan_obj(pm, obj.obj);
} else {
// otherwise scan limit the scan
scan_obj_with_limit(pm, obj.obj, addr_l, end);
}
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if the else branch can be replaced by:

  if (obj.addr < i_addr && obj.addr > start) {
    // already-scanned
  } else {
    scan_obj_with_limit(pm, obj.obj, addr_l, end);
  }

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean to replace L220-L227, right? There we know obj.addr >= addr_l. obj.addr < i_addr cannot be true then IMO. It would overlap with an objArray if i_addr was set there. If i_addr was set at the end of a non-objArray then objects starting before i_addr cannot start in or after addr_l.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I mean L212 to L228. "Comment on lines +212 to +228"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes indeed. Looks correct to me. Nice simplification!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I mean L212 to L228. "Comment on lines +212 to +228"

I looked at this at first but thought it is obviously wrong if obj is precisely marked after a previous young collection. But if it is precisely marked it is of course correct scan from the beginning of the dirty chunk (addr_l) to the obj/stripe end.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get assert(should_scavenge(p, true)) failed: revisiting object? with test/jdk/java/lang/Thread/virtual/stress/Skynet.java when the change is applied.

It reproduces well but not 100%. Need to look into it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

obj can be scanned twice if it reaches into the stripe because obj.addr > start will still false upon the 2nd time. Fix is to check i_addr > start.

}

// move to next obj
obj.next();
}
if (obj.end_addr >= addr_r) {
i_addr = obj.is_obj_array ? addr_r : obj.end_addr;
break;
}

// Scan object that extends beyond [addr_l, addr_r) and maybe even beyond the stripe.
assert(obj.addr < addr_r, "inv");
if (obj.is_obj_array) {
// precise-marked
scan_obj_with_limit(pm, obj.obj, addr_l, addr_r);
} else {
assert(obj.addr >= start, "handled before");
scan_obj_with_limit(pm, obj.obj, obj.addr, MIN2(obj.end_addr, end));
// move to next obj inside this dirty chunk
obj.next();
}

// Finished a dirty chunk
@@ -336,22 +244,22 @@ void PSCardTable::process_range(T& start_cache,

// Propagate imprecise card marks from object start to the stripes an object extends to.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Propagate imprecise card marks from object start to the stripes an object extends to.
// Propagate imprecise card marks from object start to all stripes an object extends to this thread is assigned to.

(I saw that this is actually duplicated from the .hpp file. Better to improve the one in the .hpp file and remove this one)

// Pre-scavenging and scavenging can overlap.
void PSCardTable::pre_scavenge_parallel(ObjectStartArray* start_array,
HeapWord* old_gen_bottom,
HeapWord* old_gen_top,
uint stripe_index,
uint n_stripes) {
template <typename Func>
void PSCardTable::pre_scavenge_parallel(Func&& object_start,
HeapWord* old_gen_bottom,
HeapWord* old_gen_top,
uint stripe_index,
uint n_stripes) {
const uint active_workers = n_stripes;
const size_t num_cards_in_slice = num_cards_in_stripe * n_stripes;
CardValue* cur_card = byte_for(old_gen_bottom) + stripe_index * num_cards_in_stripe;
CardValue* const end_card = byte_for(old_gen_top - 1) + 1;
HeapWord* signaled_goal = nullptr;
ObjStartCache start_cache(start_array);

for ( /* empty */ ; cur_card < end_card; cur_card += num_cards_in_slice) {
HeapWord* stripe_addr = addr_for(cur_card);
if (!is_dirty(cur_card)) {
HeapWord* first_obj_addr = start_cache.object_start(stripe_addr);
HeapWord* first_obj_addr = object_start(stripe_addr);
if (first_obj_addr < stripe_addr) {
oop first_obj = cast_to_oop(first_obj_addr);
if (!first_obj->is_array() && is_dirty(byte_for(first_obj_addr))) {
@@ -422,15 +330,38 @@ void PSCardTable::scavenge_contents_parallel(ObjectStartArray* start_array,
PSPromotionManager* pm,
uint stripe_index,
uint n_stripes) {
struct {
HeapWord* start_addr;
HeapWord* end_addr;
DEBUG_ONLY(HeapWord* _prev_query);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This debug-field clutters the real logic, IMO.

Copy link
Member Author

@reinrich reinrich Oct 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've removed it.

} cached_obj {nullptr, old_gen_bottom DEBUG_ONLY(COMMA nullptr)};

auto object_start = [&] (HeapWord* addr) {
assert(cached_obj._prev_query <= addr, "precondition");
DEBUG_ONLY(cached_obj._prev_query = addr);
if (addr < cached_obj.end_addr) {
assert(cached_obj.start_addr != nullptr, "inv");
return cached_obj.start_addr;
}
HeapWord* result = start_array->object_start(addr);

cached_obj.start_addr = result;
cached_obj.end_addr = result + cast_to_oop(result)->size();

return result;
};

const size_t stripe_size_in_words = num_cards_in_stripe * _card_size_in_words;
const size_t slice_size_in_words = stripe_size_in_words * n_stripes;

// Prepare scavenge
pre_scavenge_parallel(start_array, old_gen_bottom, old_gen_top, stripe_index, n_stripes);
pre_scavenge_parallel(object_start, old_gen_bottom, old_gen_top, stripe_index, n_stripes);

// Reset cached object
cached_obj = {nullptr, old_gen_bottom DEBUG_ONLY(COMMA nullptr)};

// Scavenge
HeapWord* cur_stripe_addr = old_gen_bottom + stripe_index * stripe_size_in_words;
ObjStartCache start_cache(start_array);
bool pre_scavenge_complete = false;
for (/* empty */; cur_stripe_addr < old_gen_top; cur_stripe_addr += slice_size_in_words) {
HeapWord* const stripe_l = cur_stripe_addr;
@@ -447,7 +378,7 @@ void PSCardTable::scavenge_contents_parallel(ObjectStartArray* start_array,
pre_scavenge_complete = Atomic::load_acquire(&_pre_scavenge_active_workers) == 0;
}

process_range(start_cache, pm, stripe_l, stripe_r);
process_range(object_start, pm, stripe_l, stripe_r);
}
}

Loading