Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8290688: Optimize x86_64 nmethod entry barriers #9569

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
Expand Up @@ -28,6 +28,8 @@
// C2_MacroAssembler contains high-level macros for C2

public:
// No out-of-line entry barrier stubs on this platform; the nmethod entry
// barrier (if any) is emitted entirely inline, so the stub emitter is a no-op
// and no stub space needs to be reserved.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }

void string_compare(Register str1, Register str2,
Register cnt1, Register cnt2, Register result,
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/arm/c2_MacroAssembler_arm.hpp
Expand Up @@ -28,6 +28,9 @@
// C2_MacroAssembler contains high-level macros for C2

public:
// No out-of-line entry barrier stubs on this platform; the nmethod entry
// barrier (if any) is emitted entirely inline, so the stub emitter is a no-op
// and no stub space needs to be reserved.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }

// Compare char[] arrays aligned to 4 bytes.
void char_arrays_equals(Register ary1, Register ary2,
Register limit, Register result,
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/ppc/c2_MacroAssembler_ppc.hpp
Expand Up @@ -28,6 +28,9 @@
// C2_MacroAssembler contains high-level macros for C2

public:
// No out-of-line entry barrier stubs on this platform; the nmethod entry
// barrier (if any) is emitted entirely inline, so the stub emitter is a no-op
// and no stub space needs to be reserved.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }

// Intrinsics for CompactStrings
// Compress char[] to byte[] by compressing 16 bytes at once.
void string_compress_16(Register src, Register dst, Register cnt,
Expand Down
2 changes: 2 additions & 0 deletions src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
Expand Up @@ -36,6 +36,8 @@
VectorRegister vrs,
bool is_latin, Label& DONE);
public:
// No out-of-line entry barrier stubs on this platform; the nmethod entry
// barrier (if any) is emitted entirely inline, so the stub emitter is a no-op
// and no stub space needs to be reserved.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }

void string_compare(Register str1, Register str2,
Register cnt1, Register cnt2, Register result,
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/s390/c2_MacroAssembler_s390.hpp
Expand Up @@ -29,6 +29,9 @@
// C2_MacroAssembler contains high-level macros for C2

public:
// No out-of-line entry barrier stubs on this platform; the nmethod entry
// barrier (if any) is emitted entirely inline, so the stub emitter is a no-op
// and no stub space needs to be reserved.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }

//-------------------------------------------
// Special String Intrinsics Implementation.
//-------------------------------------------
Expand Down
3 changes: 2 additions & 1 deletion src/hotspot/cpu/x86/c1_MacroAssembler_x86.cpp
Expand Up @@ -325,7 +325,8 @@ void C1_MacroAssembler::build_frame(int frame_size_in_bytes, int bang_size_in_by
decrement(rsp, frame_size_in_bytes); // does not emit code for frame_size == 0

BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->nmethod_entry_barrier(this);
// C1 code is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
}


Expand Down
31 changes: 30 additions & 1 deletion src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Expand Up @@ -30,6 +30,7 @@
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
Expand Down Expand Up @@ -128,10 +129,38 @@ void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool

if (!is_stub) {
BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->nmethod_entry_barrier(this);
#ifdef _LP64
if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
// We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
Label dummy_slow_path;
Label dummy_continuation;
Label* slow_path = &dummy_slow_path;
Label* continuation = &dummy_continuation;
if (!Compile::current()->output()->in_scratch_emit_size()) {
// Use real labels from actual stub when not emitting code for the purpose of measuring its size
C2EntryBarrierStub* stub = Compile::current()->output()->entry_barrier_table()->add_entry_barrier();
slow_path = &stub->slow_path();
continuation = &stub->continuation();
}
bs->nmethod_entry_barrier(this, slow_path, continuation);
}
#else
// Don't bother with out-of-line nmethod entry barrier stub for x86_32.
bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
#endif
}
}

// Emit the out-of-line slow path of the nmethod entry barrier: when the inline
// cmp finds the barrier armed, control jumps here, the runtime barrier stub is
// called, and execution resumes at the continuation label in the prologue.
void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
  Label& slow = stub->slow_path();
  Label& cont = stub->continuation();

  bind(slow);
  call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
  // Force the long jmp encoding so the stub size stays fixed (see
  // entry_barrier_stub_size()).
  jmp(cont, false /* maybe_short */);
}

// Exact size in bytes of the code emitted by emit_entry_barrier_stub():
// call rel32 (5 bytes) + long jmp rel32 (5 bytes).
int C2_MacroAssembler::entry_barrier_stub_size() {
  return 5 + 5;
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
switch (vlen_in_bytes) {
case 4: // fall-through
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Expand Up @@ -31,6 +31,9 @@
// C2 compiled method's prolog code.
void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub);

// Emits the out-of-line nmethod entry barrier slow path: binds the stub's
// slow_path label, calls the runtime entry barrier, and jumps back to the
// stub's continuation label.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub);
// Fixed size in bytes of one entry barrier stub; used to reserve stub space.
static int entry_barrier_stub_size();

Assembler::AvxVectorLen vector_length_encoding(int vlen_in_bytes);

// Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
Expand Down
26 changes: 19 additions & 7 deletions src/hotspot/cpu/x86/gc/shared/barrierSetAssembler_x86.cpp
Expand Up @@ -309,22 +309,34 @@ void BarrierSetAssembler::incr_allocated_bytes(MacroAssembler* masm, Register th
}

#ifdef _LP64
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation) {
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
if (bs_nm == NULL) {
return;
}
Label continuation;
Register thread = r15_thread;
Address disarmed_addr(thread, in_bytes(bs_nm->thread_disarmed_offset()));
__ align(8);
// The immediate is the last 4 bytes, so if we align the start of the cmp
// instruction to 4 bytes, we know that the second half of it is also 4
// byte aligned, which means that the immediate will not cross a cache line
__ align(4);
uintptr_t before_cmp = (uintptr_t)__ pc();
__ cmpl(disarmed_addr, 0);
__ jcc(Assembler::equal, continuation);
__ call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
__ bind(continuation);
uintptr_t after_cmp = (uintptr_t)__ pc();
guarantee(after_cmp - before_cmp == 8, "Wrong assumed instruction length");

if (slow_path != NULL) {
__ jcc(Assembler::notEqual, *slow_path);
__ bind(*continuation);
} else {
Label done;
__ jccb(Assembler::equal, done);
__ call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
__ bind(done);
}
}
#else
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation) {
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
if (bs_nm == NULL) {
return;
Expand Down
2 changes: 1 addition & 1 deletion src/hotspot/cpu/x86/gc/shared/barrierSetAssembler_x86.hpp
Expand Up @@ -68,7 +68,7 @@ class BarrierSetAssembler: public CHeapObj<mtGC> {

virtual void barrier_stubs_init() {}

virtual void nmethod_entry_barrier(MacroAssembler* masm);
virtual void nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation);
virtual void c2i_entry_barrier(MacroAssembler* masm);
};

Expand Down
17 changes: 14 additions & 3 deletions src/hotspot/cpu/x86/gc/shared/barrierSetNMethod_x86.cpp
Expand Up @@ -32,6 +32,7 @@
#include "runtime/sharedRuntime.hpp"
#include "utilities/align.hpp"
#include "utilities/debug.hpp"
#include "utilities/macros.hpp"

class NativeNMethodCmpBarrier: public NativeInstruction {
public:
Expand Down Expand Up @@ -62,7 +63,7 @@ class NativeNMethodCmpBarrier: public NativeInstruction {

#ifdef _LP64
void NativeNMethodCmpBarrier::verify() const {
if (((uintptr_t) instruction_address()) & 0x7) {
if (((uintptr_t) instruction_address()) & 0x3) {
fatal("Not properly aligned");
}

Expand Down Expand Up @@ -156,10 +157,20 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
// NativeNMethodCmpBarrier::verify() will immediately complain when it does
// not find the expected native instruction at this offset, which needs updating.
// Note that this offset is invariant of PreserveFramePointer.
static const int entry_barrier_offset = LP64_ONLY(-19) NOT_LP64(-18);
// Offset (negative, relative to frame_complete_offset) of the nmethod entry
// barrier's cmp instruction. On x86_64 the cmp is 8 bytes (guaranteed by the
// alignment check in BarrierSetAssembler::nmethod_entry_barrier). C2 follows it
// with a long jcc to the out-of-line stub (6 bytes per the Jcc rel32 encoding):
// 8 + 6 = 14. C1 and the native wrapper use the inline form, a short jccb
// (2 bytes) plus a call rel32 (5 bytes): 8 + 2 + 5 = 15.
// NOTE: a by-value return type should not be const (it is meaningless and
// flagged by clang-tidy readability-const-return-type), so plain int is used.
static int entry_barrier_offset(nmethod* nm) {
#ifdef _LP64
  if (nm->is_compiled_by_c2()) {
    return -14;
  } else {
    return -15;
  }
#else
  return -18;
#endif
}

static NativeNMethodCmpBarrier* native_nmethod_barrier(nmethod* nm) {
address barrier_address = nm->code_begin() + nm->frame_complete_offset() + entry_barrier_offset;
address barrier_address = nm->code_begin() + nm->frame_complete_offset() + entry_barrier_offset(nm);
NativeNMethodCmpBarrier* barrier = reinterpret_cast<NativeNMethodCmpBarrier*>(barrier_address);
debug_only(barrier->verify());
return barrier;
Expand Down
3 changes: 2 additions & 1 deletion src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp
Expand Up @@ -1744,7 +1744,8 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
__ subptr(rsp, stack_size - 2*wordSize);

BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->nmethod_entry_barrier(masm);
// The native wrapper is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */);

// Frame is now completed as far as size and linkage.
int frame_complete = ((intptr_t)__ pc()) - start;
Expand Down
2 changes: 2 additions & 0 deletions src/hotspot/share/opto/c2_MacroAssembler.hpp
Expand Up @@ -29,6 +29,8 @@
#include "asm/macroAssembler.inline.hpp"
#include "utilities/macros.hpp"

class C2EntryBarrierStub;

class C2_MacroAssembler: public MacroAssembler {
public:
// creation
Expand Down
45 changes: 45 additions & 0 deletions src/hotspot/share/opto/output.cpp
Expand Up @@ -39,6 +39,7 @@
#include "opto/ad.hpp"
#include "opto/block.hpp"
#include "opto/c2compiler.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/callnode.hpp"
#include "opto/cfgnode.hpp"
#include "opto/locknode.hpp"
Expand Down Expand Up @@ -284,12 +285,51 @@ int C2SafepointPollStubTable::estimate_stub_size() const {
return result;
}

// Nmethod entry barrier stubs
// Nmethod entry barrier stubs
// Allocates (in the compile arena) the single entry barrier stub for this
// compilation and records it in the table. A method has exactly one verified
// entry, so requesting a second stub is a programming error.
C2EntryBarrierStub* C2EntryBarrierStubTable::add_entry_barrier() {
  assert(_stub == NULL, "There can only be one entry barrier stub");
  C2EntryBarrierStub* const stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  _stub = stub;
  return stub;
}

// Emits the registered entry barrier stub (if any) into the code buffer and
// verifies that its emitted size matches the statically reserved estimate.
// Records a compile failure if the code cache cannot supply the space.
void C2EntryBarrierStubTable::emit(CodeBuffer& cb) {
  if (_stub == NULL) {
    return;  // no entry barrier stub was requested for this compilation
  }

  C2_MacroAssembler masm(&cb);
  // Grow the instruction section if needed; expansion can fail when the
  // CodeCache is exhausted, in which case the compilation is abandoned.
  if (cb.insts()->maybe_expand_to_ensure_remaining(PhaseOutput::MAX_inst_size) && cb.blob() == NULL) {
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }

  const intptr_t start_offset = masm.offset();
  masm.emit_entry_barrier_stub(_stub);
  const intptr_t end_offset = masm.offset();

  // The stub size must be exact: entry_barrier_offset() and the space
  // reservation in init_buffer() both depend on it.
  const int actual_size = (int)(end_offset - start_offset);
  const int expected_size = masm.entry_barrier_stub_size();
  assert(actual_size == expected_size, "Estimated size is wrong, expected %d, was %d", expected_size, actual_size);
}

// Upper bound on stub-section space needed for the entry barrier stub.
// When the GC provides no nmethod entry barrier, no stub will ever be
// added, so no space is reserved.
int C2EntryBarrierStubTable::estimate_stub_size() const {
  if (BarrierSet::barrier_set()->barrier_set_nmethod() == NULL) {
    return 0;
  }
  return C2_MacroAssembler::entry_barrier_stub_size();
}

PhaseOutput::PhaseOutput()
: Phase(Phase::Output),
_code_buffer("Compile::Fill_buffer"),
_first_block_size(0),
_handler_table(),
_inc_table(),
_safepoint_poll_table(),
_entry_barrier_table(),
_oop_map_set(NULL),
_scratch_buffer_blob(NULL),
_scratch_locs_memory(NULL),
Expand Down Expand Up @@ -1302,6 +1342,7 @@ CodeBuffer* PhaseOutput::init_buffer() {
BarrierSetC2* bs = BarrierSet::barrier_set()->barrier_set_c2();
stub_req += bs->estimate_stub_size();
stub_req += safepoint_poll_table()->estimate_stub_size();
stub_req += entry_barrier_table()->estimate_stub_size();

// nmethod and CodeBuffer count stubs & constants as part of method's code.
// class HandlerImpl is platform-specific and defined in the *.ad files.
Expand Down Expand Up @@ -1812,6 +1853,10 @@ void PhaseOutput::fill_buffer(CodeBuffer* cb, uint* blk_starts) {
safepoint_poll_table()->emit(*cb);
if (C->failing()) return;

// Fill in stubs for calling the runtime from nmethod entries.
entry_barrier_table()->emit(*cb);
if (C->failing()) return;

#ifndef PRODUCT
// Information on the size of the method, without the extraneous code
Scheduling::increment_method_size(cb->insts_size());
Expand Down
29 changes: 29 additions & 0 deletions src/hotspot/share/opto/output.hpp
Expand Up @@ -40,6 +40,7 @@ class Arena;
class Bundle;
class Block;
class Block_Array;
class C2_MacroAssembler;
class ciMethod;
class Compile;
class MachNode;
Expand Down Expand Up @@ -113,6 +114,30 @@ class C2SafepointPollStubTable {
void emit(CodeBuffer& cb);
};

// We move non-hot code of the nmethod entry barrier to an out-of-line stub
// We move non-hot code of the nmethod entry barrier to an out-of-line stub.
// This object carries the two labels that tie the inline fast path to that
// stub: slow_path is bound at the start of the stub, and continuation is the
// point in the prologue the stub jumps back to.
class C2EntryBarrierStub: public ResourceObj {
  Label _slow_path;     // entry of the out-of-line slow path
  Label _continuation;  // resume point after the slow path completes

public:
  C2EntryBarrierStub() {}

  Label& slow_path()    { return _slow_path; }
  Label& continuation() { return _continuation; }
};

// Tracks the (at most one) out-of-line nmethod entry barrier stub of a
// compilation: add_entry_barrier() registers it, estimate_stub_size()
// reserves space for it in the code buffer, and emit() writes it out after
// the main instruction stream.
class C2EntryBarrierStubTable {
  C2EntryBarrierStub* _stub;  // NULL until add_entry_barrier() is called

public:
  C2EntryBarrierStubTable() : _stub(NULL) {}
  C2EntryBarrierStub* add_entry_barrier();
  int estimate_stub_size() const;
  void emit(CodeBuffer& cb);
};

class PhaseOutput : public Phase {
private:
// Instruction bits passed off to the VM
Expand All @@ -122,6 +147,7 @@ class PhaseOutput : public Phase {
ExceptionHandlerTable _handler_table; // Table of native-code exception handlers
ImplicitExceptionTable _inc_table; // Table of implicit null checks in native code
C2SafepointPollStubTable _safepoint_poll_table;// Table for safepoint polls
C2EntryBarrierStubTable _entry_barrier_table; // Table for entry barrier stubs
OopMapSet* _oop_map_set; // Table of oop maps (one for each safepoint location)
BufferBlob* _scratch_buffer_blob; // For temporary code buffers.
relocInfo* _scratch_locs_memory; // For temporary code buffers.
Expand Down Expand Up @@ -172,6 +198,9 @@ class PhaseOutput : public Phase {
// Safepoint poll table
C2SafepointPollStubTable* safepoint_poll_table() { return &_safepoint_poll_table; }

// Entry barrier table
C2EntryBarrierStubTable* entry_barrier_table() { return &_entry_barrier_table; }

// Code emission iterator
Block* block() { return _block; }
int index() { return _index; }
Expand Down