Skip to content

Commit 45b581b

Browse files
changpeng1997 authored and e1iu committed Jun 27, 2023
8309583: AArch64: Optimize firstTrue() when amount of elements < 8
Reviewed-by: aph, eliu
1 parent 87e6fab commit 45b581b

File tree

2 files changed

+26
-70
lines changed

2 files changed

+26
-70
lines changed
 

‎src/hotspot/cpu/aarch64/aarch64_vector.ad

+13-35
Original file line numberDiff line numberDiff line change
@@ -5534,39 +5534,10 @@ instruct vstoremask_truecount_neon(iRegINoSp dst, vReg src, immI_gt_1 size, vReg
55345534

55355535
// first true
55365536

5537-
instruct vmask_firsttrue_lt8e(iRegINoSp dst, vReg src, rFlagsReg cr) %{
5538-
predicate(UseSVE == 0 &&
5539-
Matcher::vector_length(n->in(1)) < 8);
5540-
match(Set dst (VectorMaskFirstTrue src));
5541-
effect(KILL cr);
5542-
format %{ "vmask_firsttrue_lt8e $dst, $src\t# vector < 8 elements (neon). KILL cr" %}
5543-
ins_encode %{
5544-
// Returns the index of the first active lane of the
5545-
// vector mask, or VLENGTH if no lane is active.
5546-
//
5547-
// Input "src" is a vector of boolean represented as
5548-
// bytes with 0x00/0x01 as element values.
5549-
//
5550-
// Computed by reversing the bits and counting the leading
5551-
// zero bytes.
5552-
BasicType bt = Matcher::vector_element_basic_type(this, $src);
5553-
assert(bt == T_BOOLEAN, "unsupported type");
5554-
__ fmovd($dst$$Register, $src$$FloatRegister);
5555-
__ rbit($dst$$Register, $dst$$Register);
5556-
__ clz($dst$$Register, $dst$$Register);
5557-
__ lsrw($dst$$Register, $dst$$Register, 3);
5558-
__ movw(rscratch1, Matcher::vector_length(this, $src));
5559-
__ cmpw($dst$$Register, rscratch1);
5560-
__ cselw($dst$$Register, rscratch1, $dst$$Register, Assembler::GE);
5561-
%}
5562-
ins_pipe(pipe_slow);
5563-
%}
5564-
5565-
instruct vmask_firsttrue_8or16e(iRegINoSp dst, vReg src) %{
5566-
predicate(UseSVE == 0 &&
5567-
(Matcher::vector_length(n->in(1)) == 8 || Matcher::vector_length(n->in(1)) == 16));
5537+
instruct vmask_firsttrue_neon(iRegINoSp dst, vReg src) %{
5538+
predicate(UseSVE == 0);
55685539
match(Set dst (VectorMaskFirstTrue src));
5569-
format %{ "vmask_firsttrue_8or16e $dst, $src\t# vector 8B/16B (neon)" %}
5540+
format %{ "vmask_firsttrue_neon $dst, $src" %}
55705541
ins_encode %{
55715542
// Returns the index of the first active lane of the
55725543
// vector mask, or VLENGTH if no lane is active.
@@ -5579,14 +5550,21 @@ instruct vmask_firsttrue_8or16e(iRegINoSp dst, vReg src) %{
55795550

55805551
BasicType bt = Matcher::vector_element_basic_type(this, $src);
55815552
assert(bt == T_BOOLEAN, "unsupported type");
5582-
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
5583-
if (length_in_bytes == 8) {
5553+
uint vlength = Matcher::vector_length(this, $src);
5554+
if (vlength <= 8) {
55845555
__ fmovd($dst$$Register, $src$$FloatRegister);
5556+
if (vlength == 2 || vlength == 4) {
5557+
// Special handling for 2B or 4B cases:
5558+
// Vector mask is moved to a 64-bit general register, but only the low 16/32 bits are
5559+
// significant for 2B/4B cases. We set bit 16/32 (the first bit above the mask) to 1, so that
5560+
// rbit + clz produce the expected result (i.e. VLENGTH) when all lanes are zero.
5561+
__ orr($dst$$Register, $dst$$Register, vlength == 2 ? 0x10000 : 0x100000000);
5562+
}
55855563
__ rbit($dst$$Register, $dst$$Register);
55865564
__ clz($dst$$Register, $dst$$Register);
55875565
__ lsrw($dst$$Register, $dst$$Register, 3);
55885566
} else {
5589-
assert(length_in_bytes == 16, "must be");
5567+
assert(vlength == 16, "must be");
55905568
Label FIRST_TRUE_INDEX;
55915569

55925570
// Try to compute the result from lower 64 bits.

‎src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

+13-35
Original file line numberDiff line numberDiff line change
@@ -3844,39 +3844,10 @@ instruct vstoremask_truecount_neon(iRegINoSp dst, vReg src, immI_gt_1 size, vReg
38443844

38453845
// first true
38463846

3847-
instruct vmask_firsttrue_lt8e(iRegINoSp dst, vReg src, rFlagsReg cr) %{
3848-
predicate(UseSVE == 0 &&
3849-
Matcher::vector_length(n->in(1)) < 8);
3850-
match(Set dst (VectorMaskFirstTrue src));
3851-
effect(KILL cr);
3852-
format %{ "vmask_firsttrue_lt8e $dst, $src\t# vector < 8 elements (neon). KILL cr" %}
3853-
ins_encode %{
3854-
// Returns the index of the first active lane of the
3855-
// vector mask, or VLENGTH if no lane is active.
3856-
//
3857-
// Input "src" is a vector of boolean represented as
3858-
// bytes with 0x00/0x01 as element values.
3859-
//
3860-
// Computed by reversing the bits and counting the leading
3861-
// zero bytes.
3862-
BasicType bt = Matcher::vector_element_basic_type(this, $src);
3863-
assert(bt == T_BOOLEAN, "unsupported type");
3864-
__ fmovd($dst$$Register, $src$$FloatRegister);
3865-
__ rbit($dst$$Register, $dst$$Register);
3866-
__ clz($dst$$Register, $dst$$Register);
3867-
__ lsrw($dst$$Register, $dst$$Register, 3);
3868-
__ movw(rscratch1, Matcher::vector_length(this, $src));
3869-
__ cmpw($dst$$Register, rscratch1);
3870-
__ cselw($dst$$Register, rscratch1, $dst$$Register, Assembler::GE);
3871-
%}
3872-
ins_pipe(pipe_slow);
3873-
%}
3874-
3875-
instruct vmask_firsttrue_8or16e(iRegINoSp dst, vReg src) %{
3876-
predicate(UseSVE == 0 &&
3877-
(Matcher::vector_length(n->in(1)) == 8 || Matcher::vector_length(n->in(1)) == 16));
3847+
instruct vmask_firsttrue_neon(iRegINoSp dst, vReg src) %{
3848+
predicate(UseSVE == 0);
38783849
match(Set dst (VectorMaskFirstTrue src));
3879-
format %{ "vmask_firsttrue_8or16e $dst, $src\t# vector 8B/16B (neon)" %}
3850+
format %{ "vmask_firsttrue_neon $dst, $src" %}
38803851
ins_encode %{
38813852
// Returns the index of the first active lane of the
38823853
// vector mask, or VLENGTH if no lane is active.
@@ -3889,14 +3860,21 @@ instruct vmask_firsttrue_8or16e(iRegINoSp dst, vReg src) %{
38893860

38903861
BasicType bt = Matcher::vector_element_basic_type(this, $src);
38913862
assert(bt == T_BOOLEAN, "unsupported type");
3892-
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
3893-
if (length_in_bytes == 8) {
3863+
uint vlength = Matcher::vector_length(this, $src);
3864+
if (vlength <= 8) {
38943865
__ fmovd($dst$$Register, $src$$FloatRegister);
3866+
if (vlength == 2 || vlength == 4) {
3867+
// Special handling for 2B or 4B cases:
3868+
// Vector mask is moved to a 64-bit general register, but only the low 16/32 bits are
3869+
// significant for 2B/4B cases. We set bit 16/32 (the first bit above the mask) to 1, so that
3870+
// rbit + clz produce the expected result (i.e. VLENGTH) when all lanes are zero.
3871+
__ orr($dst$$Register, $dst$$Register, vlength == 2 ? 0x10000 : 0x100000000);
3872+
}
38953873
__ rbit($dst$$Register, $dst$$Register);
38963874
__ clz($dst$$Register, $dst$$Register);
38973875
__ lsrw($dst$$Register, $dst$$Register, 3);
38983876
} else {
3899-
assert(length_in_bytes == 16, "must be");
3877+
assert(vlength == 16, "must be");
39003878
Label FIRST_TRUE_INDEX;
39013879

39023880
// Try to compute the result from lower 64 bits.

0 commit comments

Comments
 (0)
Please sign in to comment.