Skip to content

Commit 07c7977

Browse files
Bhavana-Kilambi and nick-arm
authored and committed Aug 22, 2022
8290249: Vectorize signum on AArch64
Reviewed-by: aph, ngasson
1 parent a3ec0bb commit 07c7977

10 files changed

+460
-328
lines changed
 

‎src/hotspot/cpu/aarch64/aarch64_vector.ad

+33
Original file line numberDiff line numberDiff line change
@@ -6359,3 +6359,36 @@ instruct vexpand(vReg dst, vReg src, pRegGov pg) %{
63596359
%}
63606360
ins_pipe(pipe_slow);
63616361
%}
6362+
6363+
// ------------------------------ Vector signum --------------------------------
6364+
6365+
// Vector Math.signum
6366+
6367+
// Math.signum on vectors of at most 16 bytes (see predicate). Matches both the
// float (SignumVF) and the double (SignumVD) node and emits vector_signum_neon.
instruct vsignum_le128b(vReg dst, vReg src, vReg zero, vReg one) %{
6368+
predicate(Matcher::vector_length_in_bytes(n) <= 16);
6369+
match(Set dst (SignumVF src (Binary zero one)));
6370+
match(Set dst (SignumVD src (Binary zero one)));
6371+
// NOTE(review): TEMP_DEF presumably keeps dst in a register distinct from the
// inputs, which vector_signum_neon asserts -- confirm against ADLC semantics.
effect(TEMP_DEF dst);
6372+
format %{ "vsignum_le128b $dst, $src\t# vector <= 128 bits" %}
6373+
ins_encode %{
6374+
// The NEON arrangement passed to the helper is obtained from this node via get_arrangement.
__ vector_signum_neon($dst$$FloatRegister, $src$$FloatRegister, $zero$$FloatRegister,
6375+
$one$$FloatRegister, get_arrangement(this));
6376+
%}
6377+
ins_pipe(pipe_slow);
6378+
%}
6379+
6380+
// Math.signum on vectors longer than 16 bytes (see predicate). Matches both the
// float (SignumVF) and the double (SignumVD) node and emits vector_signum_sve.
// tmp (vector) and pgtmp (governing predicate) are scratch operands declared TEMP below.
instruct vsignum_gt128b(vReg dst, vReg src, vReg zero, vReg one, vReg tmp, pRegGov pgtmp) %{
6381+
predicate(Matcher::vector_length_in_bytes(n) > 16);
6382+
match(Set dst (SignumVF src (Binary zero one)));
6383+
match(Set dst (SignumVD src (Binary zero one)));
6384+
// NOTE(review): TEMP_DEF/TEMP presumably keep dst and tmp distinct from the inputs,
// which vector_signum_sve asserts -- confirm against ADLC semantics.
effect(TEMP_DEF dst, TEMP tmp, TEMP pgtmp);
6385+
format %{ "vsignum_gt128b $dst, $src\t# vector > 128 bits. KILL $tmp, $pgtmp" %}
6386+
ins_encode %{
6387+
assert(UseSVE > 0, "must be sve");
6388+
// Element basic type of this node; converted to an SVE register variant below.
BasicType bt = Matcher::vector_element_basic_type(this);
6389+
__ vector_signum_sve($dst$$FloatRegister, $src$$FloatRegister, $zero$$FloatRegister,
6390+
$one$$FloatRegister, $tmp$$FloatRegister, $pgtmp$$PRegister,
6391+
__ elemType_to_regVariant(bt));
6392+
%}
6393+
ins_pipe(pipe_slow);
6394+
%}

‎src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

+33
Original file line numberDiff line numberDiff line change
@@ -4699,3 +4699,36 @@ instruct vexpand(vReg dst, vReg src, pRegGov pg) %{
46994699
%}
47004700
ins_pipe(pipe_slow);
47014701
%}
4702+
4703+
// ------------------------------ Vector signum --------------------------------
4704+
4705+
// Vector Math.signum
4706+
4707+
// Math.signum on vectors of at most 16 bytes, as stated by the predicate. Matches both
// the float (SignumVF) and the double (SignumVD) node and emits vector_signum_neon.
instruct vsignum_le128b(vReg dst, vReg src, vReg zero, vReg one) %{
4708+
predicate(Matcher::vector_length_in_bytes(n) <= 16);
4709+
match(Set dst (SignumVF src (Binary zero one)));
4710+
match(Set dst (SignumVD src (Binary zero one)));
4711+
// NOTE(review): TEMP_DEF presumably keeps dst in a register distinct from the
// inputs, which vector_signum_neon asserts -- confirm against ADLC semantics.
effect(TEMP_DEF dst);
4712+
format %{ "vsignum_le128b $dst, $src\t# vector <= 128 bits" %}
4713+
ins_encode %{
4714+
// The NEON arrangement passed to the helper is obtained from this node via get_arrangement.
__ vector_signum_neon($dst$$FloatRegister, $src$$FloatRegister, $zero$$FloatRegister,
4715+
$one$$FloatRegister, get_arrangement(this));
4716+
%}
4717+
ins_pipe(pipe_slow);
4718+
%}
4719+
4720+
// Math.signum on vectors longer than 16 bytes, as stated by the predicate. Matches both
// the float (SignumVF) and the double (SignumVD) node and emits vector_signum_sve.
// tmp and pgtmp are scratch operands declared TEMP below.
instruct vsignum_gt128b(vReg dst, vReg src, vReg zero, vReg one, vReg tmp, pRegGov pgtmp) %{
4721+
predicate(Matcher::vector_length_in_bytes(n) > 16);
4722+
match(Set dst (SignumVF src (Binary zero one)));
4723+
match(Set dst (SignumVD src (Binary zero one)));
4724+
// NOTE(review): TEMP_DEF/TEMP presumably keep dst and tmp distinct from the inputs,
// which vector_signum_sve asserts -- confirm against ADLC semantics.
effect(TEMP_DEF dst, TEMP tmp, TEMP pgtmp);
4725+
format %{ "vsignum_gt128b $dst, $src\t# vector > 128 bits. KILL $tmp, $pgtmp" %}
4726+
ins_encode %{
4727+
assert(UseSVE > 0, "must be sve");
4728+
// Element basic type of this node; converted to an SVE register variant below.
BasicType bt = Matcher::vector_element_basic_type(this);
4729+
__ vector_signum_sve($dst$$FloatRegister, $src$$FloatRegister, $zero$$FloatRegister,
4730+
$one$$FloatRegister, $tmp$$FloatRegister, $pgtmp$$PRegister,
4731+
__ elemType_to_regVariant(bt));
4732+
%}
4733+
ins_pipe(pipe_slow);
4734+
%}

‎src/hotspot/cpu/aarch64/assembler_aarch64.hpp

+15-9
Original file line numberDiff line numberDiff line change
@@ -2566,6 +2566,7 @@ void mvnw(Register Rd, Register Rm,
25662566
INSN(fcmeq, 0, 0, 0b111001);
25672567
INSN(fcmgt, 1, 1, 0b111001);
25682568
INSN(fcmge, 1, 0, 0b111001);
2569+
INSN(facgt, 1, 1, 0b111011);
25692570

25702571
#undef INSN
25712572

@@ -3512,18 +3513,22 @@ void mvnw(Register Rd, Register Rm,
35123513
void NAME(Condition cond, PRegister Pd, SIMD_RegVariant T, PRegister Pg, \
35133514
FloatRegister Zn, FloatRegister Zm) { \
35143515
starti; \
3515-
if (fp == 0) { \
3516-
assert(T != Q, "invalid size"); \
3517-
} else { \
3518-
assert(T != B && T != Q, "invalid size"); \
3519-
assert(cond != HI && cond != HS, "invalid condition for fcm"); \
3516+
assert(T != Q, "invalid size"); \
3517+
bool is_absolute = op2 == 0b11; \
3518+
if (fp == 1) { \
3519+
assert(T != B, "invalid size"); \
3520+
if (is_absolute) { \
3521+
assert(cond == GT || cond == GE, "invalid condition for fac"); \
3522+
} else { \
3523+
assert(cond != HI && cond != HS, "invalid condition for fcm"); \
3524+
} \
35203525
} \
35213526
int cond_op; \
35223527
switch(cond) { \
35233528
case EQ: cond_op = (op2 << 2) | 0b10; break; \
35243529
case NE: cond_op = (op2 << 2) | 0b11; break; \
3525-
case GE: cond_op = (op2 << 2) | 0b00; break; \
3526-
case GT: cond_op = (op2 << 2) | 0b01; break; \
3530+
case GE: cond_op = (op2 << 2) | (is_absolute ? 0b01 : 0b00); break; \
3531+
case GT: cond_op = (op2 << 2) | (is_absolute ? 0b11 : 0b01); break; \
35273532
case HI: cond_op = 0b0001; break; \
35283533
case HS: cond_op = 0b0000; break; \
35293534
default: \
@@ -3533,8 +3538,9 @@ void mvnw(Register Rd, Register Rm,
35333538
pgrf(Pg, 10), rf(Zn, 5), f(cond_op & 1, 4), prf(Pd, 0); \
35343539
}
35353540

3536-
INSN(sve_cmp, 0b00100100, 0b10, 0);
3537-
INSN(sve_fcm, 0b01100101, 0b01, 1);
3541+
INSN(sve_cmp, 0b00100100, 0b10, 0); // Integer compare vectors
3542+
INSN(sve_fcm, 0b01100101, 0b01, 1); // Floating-point compare vectors
3543+
INSN(sve_fac, 0b01100101, 0b11, 1); // Floating-point absolute compare vectors
35383544
#undef INSN
35393545

35403546
// SVE Integer Compare - Signed Immediate

‎src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp

+35-1
Original file line numberDiff line numberDiff line change
@@ -2009,6 +2009,41 @@ void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, F
20092009
// result in dst
20102010
}
20112011

2012+
// NEON implementation of vector Math.signum for the 2S, 4S and 2D arrangements.
// dst receives the result and must not alias src, zero or one (asserted below).
// NOTE(review): zero and one are expected to hold 0.0 and 1.0 in every lane --
// that is set up by the caller and not visible here; confirm.
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2013+
FloatRegister one, SIMD_Arrangement T) {
2014+
assert_different_registers(dst, src, zero, one);
2015+
assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2016+
2017+
// Absolute compare of src against zero, written to dst as a per-lane mask.
facgt(dst, T, src, zero);
2018+
// Shift the mask right by one so that the sign bit of each lane is cleared.
ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2019+
// Bitwise select with dst as the mask: set mask bits take the bit from one, clear
// mask bits take it from src. Lanes with mask 0 therefore keep src unchanged, and
// the other lanes get the magnitude bits of one combined with the sign bit of src.
bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2020+
}
2021+
2022+
// SVE implementation of vector Math.signum for S (float) and D (double) lanes.
// dst receives the result; vtmp and pgtmp are clobbered as scratch registers.
// Note: one is only referenced by the register-distinctness assert below; the
// +1.0 bit pattern is supplied as an immediate instead.
// NOTE(review): zero is expected to hold 0.0 in every lane -- set up by the caller; confirm.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2023+
FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2024+
assert_different_registers(dst, src, zero, one, vtmp);
2025+
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2026+
2027+
// OR of src with itself: copies src into vtmp so the sign bits can be extracted there.
sve_orr(vtmp, src, src);
2028+
sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2029+
switch (T) {
2030+
case S:
2031+
sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2032+
sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2033+
// on the sign of the float value
2034+
break;
2035+
case D:
2036+
// Same as the S case, using the 64-bit sign mask and the bit pattern of 1.0 as a double.
sve_and(vtmp, T, min_jlong);
2037+
sve_orr(vtmp, T, jlong_cast(1.0));
2038+
break;
2039+
default:
2040+
assert(false, "unsupported");
2041+
ShouldNotReachHere();
2042+
}
2043+
sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2044+
// Result in dst
2045+
}
2046+
20122047
bool C2_MacroAssembler::in_scratch_emit_size() {
20132048
if (ciEnv::current()->task() != NULL) {
20142049
PhaseOutput* phase_output = Compile::current()->output();
@@ -2018,4 +2053,3 @@ bool C2_MacroAssembler::in_scratch_emit_size() {
20182053
}
20192054
return MacroAssembler::in_scratch_emit_size();
20202055
}
2021-

‎src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -165,4 +165,11 @@
165165

166166
void neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);
167167

168+
// java.lang.Math::signum intrinsics
// Both helpers write the result to dst and assert that dst does not alias src, zero or one.
// NOTE(review): zero and one presumably carry 0.0 and 1.0 in every lane -- confirm at the call sites.
169+
// NEON variant; T must be one of the 2S, 4S or 2D arrangements.
void vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
170+
FloatRegister one, SIMD_Arrangement T);
171+
172+
// SVE variant; T must be S or D. vtmp and pgtmp are scratch registers, and pgtmp must be
// a governing predicate register.
void vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
173+
FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T);
174+
168175
#endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP

‎test/hotspot/gtest/aarch64/aarch64-asmtest.py

+8
Original file line numberDiff line numberDiff line change
@@ -1613,6 +1613,8 @@ def generate(kind, names):
16131613
["cmge", "cmge", "2D"],
16141614
["fcmge", "fcmge", "2S"], ["fcmge", "fcmge", "4S"],
16151615
["fcmge", "fcmge", "2D"],
1616+
["facgt", "facgt", "2S"], ["facgt", "facgt", "4S"],
1617+
["facgt", "facgt", "2D"],
16161618
])
16171619

16181620
generate(SVEComparisonWithZero, ["EQ", "GT", "GE", "LT", "LE", "NE"])
@@ -1822,6 +1824,12 @@ def generate(kind, names):
18221824
["compact", "__ sve_compact(z16, __ S, z16, p1);", "compact\tz16.s, p1, z16.s"],
18231825
["compact", "__ sve_compact(z16, __ D, z16, p1);", "compact\tz16.d, p1, z16.d"],
18241826
["ext", "__ sve_ext(z17, z16, 63);", "ext\tz17.b, z17.b, z16.b, #63"],
1827+
["facgt", "__ sve_fac(Assembler::GT, p1, __ H, p2, z4, z5);", "facgt\tp1.h, p2/z, z4.h, z5.h"],
1828+
["facgt", "__ sve_fac(Assembler::GT, p1, __ S, p2, z4, z5);", "facgt\tp1.s, p2/z, z4.s, z5.s"],
1829+
["facgt", "__ sve_fac(Assembler::GT, p1, __ D, p2, z4, z5);", "facgt\tp1.d, p2/z, z4.d, z5.d"],
1830+
["facge", "__ sve_fac(Assembler::GE, p1, __ H, p2, z4, z5);", "facge\tp1.h, p2/z, z4.h, z5.h"],
1831+
["facge", "__ sve_fac(Assembler::GE, p1, __ S, p2, z4, z5);", "facge\tp1.s, p2/z, z4.s, z5.s"],
1832+
["facge", "__ sve_fac(Assembler::GE, p1, __ D, p2, z4, z5);", "facge\tp1.d, p2/z, z4.d, z5.d"],
18251833
# SVE2 instructions
18261834
["histcnt", "__ sve_histcnt(z16, __ S, p0, z16, z16);", "histcnt\tz16.s, p0/z, z16.s, z16.s"],
18271835
["histcnt", "__ sve_histcnt(z17, __ D, p0, z17, z17);", "histcnt\tz17.d, p0/z, z17.d, z17.d"],

‎test/hotspot/gtest/aarch64/asmtest.out.h

+316-304
Large diffs are not rendered by default.

‎test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@
2626
* @bug 6340864
2727
* @summary Implement vectorization optimizations in hotspot-server
2828
*
29-
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m compiler.c2.cr6340864.TestDoubleVect
30-
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:MaxVectorSize=8 compiler.c2.cr6340864.TestDoubleVect
31-
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:MaxVectorSize=16 compiler.c2.cr6340864.TestDoubleVect
32-
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:MaxVectorSize=32 compiler.c2.cr6340864.TestDoubleVect
29+
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:+UnlockDiagnosticVMOptions -XX:+UseSignumIntrinsic compiler.c2.cr6340864.TestDoubleVect
30+
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:+UnlockDiagnosticVMOptions -XX:+UseSignumIntrinsic -XX:MaxVectorSize=8 compiler.c2.cr6340864.TestDoubleVect
31+
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:+UnlockDiagnosticVMOptions -XX:+UseSignumIntrinsic -XX:MaxVectorSize=16 compiler.c2.cr6340864.TestDoubleVect
32+
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:+UnlockDiagnosticVMOptions -XX:+UseSignumIntrinsic -XX:MaxVectorSize=32 compiler.c2.cr6340864.TestDoubleVect
3333
*/
3434

3535
package compiler.c2.cr6340864;

‎test/hotspot/jtreg/compiler/c2/cr6340864/TestFloatVect.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@
2626
* @bug 6340864
2727
* @summary Implement vectorization optimizations in hotspot-server
2828
*
29-
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m compiler.c2.cr6340864.TestFloatVect
30-
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:MaxVectorSize=8 compiler.c2.cr6340864.TestFloatVect
31-
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:MaxVectorSize=16 compiler.c2.cr6340864.TestFloatVect
32-
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:MaxVectorSize=32 compiler.c2.cr6340864.TestFloatVect
29+
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:+UnlockDiagnosticVMOptions -XX:+UseSignumIntrinsic compiler.c2.cr6340864.TestFloatVect
30+
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:+UnlockDiagnosticVMOptions -XX:+UseSignumIntrinsic -XX:MaxVectorSize=8 compiler.c2.cr6340864.TestFloatVect
31+
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:+UnlockDiagnosticVMOptions -XX:+UseSignumIntrinsic -XX:MaxVectorSize=16 compiler.c2.cr6340864.TestFloatVect
32+
* @run main/othervm -Xbatch -XX:CompileCommand=exclude,*::test() -Xmx128m -XX:+UnlockDiagnosticVMOptions -XX:+UseSignumIntrinsic -XX:MaxVectorSize=32 compiler.c2.cr6340864.TestFloatVect
3333
*/
3434

3535
package compiler.c2.cr6340864;

‎test/hotspot/jtreg/compiler/vectorization/TestSignumVector.java

+5-6
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,10 @@
2323

2424
/**
2525
* @test
26-
* @bug 8282711
27-
* @summary Accelerate Math.signum function for AVX and AVX512.
26+
* @bug 8282711 8290249
27+
* @summary Accelerate Math.signum function for AVX, AVX512 and aarch64 (Neon and SVE)
2828
* @requires vm.compiler2.enabled
29-
* @requires vm.cpu.features ~= ".*avx.*"
30-
* @requires os.simpleArch == "x64"
29+
* @requires (os.simpleArch == "x64" & vm.cpu.features ~= ".*avx.*") | os.arch == "aarch64"
3130
* @library /test/lib /
3231
* @run driver compiler.vectorization.TestSignumVector
3332
*/
@@ -46,8 +45,8 @@ public class TestSignumVector {
4645
private static float [] fout;
4746

4847
public static void main(String args[]) {
49-
TestFramework.runWithFlags("-XX:-TieredCompilation",
50-
"-XX:CompileThresholdScaling=0.3");
48+
TestFramework.runWithFlags("-XX:-TieredCompilation", "-XX:+UnlockDiagnosticVMOptions",
49+
"-XX:+UseSignumIntrinsic", "-XX:CompileThresholdScaling=0.3");
5150
System.out.println("PASSED");
5251
}
5352

0 commit comments

Comments
 (0)
Please sign in to comment.