Skip to content

Commit ba942c2

Browse files
Fei Gao, Ningsheng Jian
Fei Gao
authored and
Ningsheng Jian
committed Dec 19, 2022
8298244: AArch64: Optimize vector implementation of AddReduction for floating point
Reviewed-by: aph, xgong
1 parent 7938f8c commit ba942c2

File tree

5 files changed

+496
-468
lines changed

5 files changed

+496
-468
lines changed
 

‎src/hotspot/cpu/aarch64/aarch64_vector.ad

+31-20
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,9 @@ source %{
134134
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
135135
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
136136
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
137+
// The vector implementation of Op_AddReductionVD/F is for the Vector API only.
138+
// It is not suitable for auto-vectorization because it does not add the elements
139+
// in the same order as sequential code, and FP addition is non-associative.
137140
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
138141
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
139142
opcode == Op_MulVL) {
@@ -2876,23 +2879,30 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
28762879
%}
28772880

28782881
// reduction addF
2882+
// Floating-point addition is not associative, so the rules for AddReductionVF
2883+
// on NEON can't be used to auto-vectorize floating-point reduce-add.
2884+
// Currently, on NEON, AddReductionVF is only generated by Vector API.
2885+
instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
2886+
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
2887+
match(Set dst (AddReductionVF fsrc vsrc));
2888+
effect(TEMP_DEF dst);
2889+
format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
2890+
ins_encode %{
2891+
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
2892+
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
2893+
%}
2894+
ins_pipe(pipe_slow);
2895+
%}
28792896

2880-
instruct reduce_addF_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2881-
predicate(UseSVE == 0);
2897+
instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2898+
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
28822899
match(Set dst (AddReductionVF fsrc vsrc));
28832900
effect(TEMP_DEF dst, TEMP tmp);
2884-
format %{ "reduce_addF_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
2901+
format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
28852902
ins_encode %{
2886-
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
2887-
__ fadds($dst$$FloatRegister, $fsrc$$FloatRegister, $vsrc$$FloatRegister);
2888-
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
2889-
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
2890-
if (length_in_bytes == 16) {
2891-
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);
2892-
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
2893-
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);
2894-
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
2895-
}
2903+
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
2904+
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
2905+
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
28962906
%}
28972907
ins_pipe(pipe_slow);
28982908
%}
@@ -2910,16 +2920,17 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
29102920
%}
29112921

29122922
// reduction addD
2913-
2914-
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
2923+
// Floating-point addition is not associative, so the rule for AddReductionVD
2924+
// on NEON can't be used to auto-vectorize floating-point reduce-add.
2925+
// Currently, on NEON, AddReductionVD is only generated by Vector API.
2926+
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
29152927
predicate(UseSVE == 0);
29162928
match(Set dst (AddReductionVD dsrc vsrc));
2917-
effect(TEMP_DEF dst, TEMP tmp);
2918-
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
2929+
effect(TEMP_DEF dst);
2930+
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
29192931
ins_encode %{
2920-
__ faddd($dst$$FloatRegister, $dsrc$$FloatRegister, $vsrc$$FloatRegister);
2921-
__ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);
2922-
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
2932+
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
2933+
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
29232934
%}
29242935
ins_pipe(pipe_slow);
29252936
%}

‎src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

+31-20
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,9 @@ source %{
124124
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
125125
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
126126
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
127+
// The vector implementation of Op_AddReductionVD/F is for the Vector API only.
128+
// It is not suitable for auto-vectorization because it does not add the elements
129+
// in the same order as sequential code, and FP addition is non-associative.
127130
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
128131
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
129132
opcode == Op_MulVL) {
@@ -1808,23 +1811,30 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(I, iRegIorL2I)
18081811
REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL)
18091812

18101813
// reduction addF
1814+
// Floating-point addition is not associative, so the rules for AddReductionVF
1815+
// on NEON can't be used to auto-vectorize floating-point reduce-add.
1816+
// Currently, on NEON, AddReductionVF is only generated by Vector API.
1817+
instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
1818+
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
1819+
match(Set dst (AddReductionVF fsrc vsrc));
1820+
effect(TEMP_DEF dst);
1821+
format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
1822+
ins_encode %{
1823+
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
1824+
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
1825+
%}
1826+
ins_pipe(pipe_slow);
1827+
%}
18111828

1812-
instruct reduce_addF_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
1813-
predicate(UseSVE == 0);
1829+
instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
1830+
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
18141831
match(Set dst (AddReductionVF fsrc vsrc));
18151832
effect(TEMP_DEF dst, TEMP tmp);
1816-
format %{ "reduce_addF_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
1833+
format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
18171834
ins_encode %{
1818-
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
1819-
__ fadds($dst$$FloatRegister, $fsrc$$FloatRegister, $vsrc$$FloatRegister);
1820-
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
1821-
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
1822-
if (length_in_bytes == 16) {
1823-
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);
1824-
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
1825-
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);
1826-
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
1827-
}
1835+
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
1836+
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
1837+
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
18281838
%}
18291839
ins_pipe(pipe_slow);
18301840
%}
@@ -1847,16 +1857,17 @@ dnl
18471857
REDUCE_ADD_FP_SVE(F, S)
18481858

18491859
// reduction addD
1850-
1851-
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
1860+
// Floating-point addition is not associative, so the rule for AddReductionVD
1861+
// on NEON can't be used to auto-vectorize floating-point reduce-add.
1862+
// Currently, on NEON, AddReductionVD is only generated by Vector API.
1863+
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
18521864
predicate(UseSVE == 0);
18531865
match(Set dst (AddReductionVD dsrc vsrc));
1854-
effect(TEMP_DEF dst, TEMP tmp);
1855-
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
1866+
effect(TEMP_DEF dst);
1867+
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
18561868
ins_encode %{
1857-
__ faddd($dst$$FloatRegister, $dsrc$$FloatRegister, $vsrc$$FloatRegister);
1858-
__ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);
1859-
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
1869+
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
1870+
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
18601871
%}
18611872
ins_pipe(pipe_slow);
18621873
%}

‎src/hotspot/cpu/aarch64/assembler_aarch64.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -2716,6 +2716,7 @@ template<typename R, typename... Rx>
27162716
INSN(fabd, 1, 1, 0b110101);
27172717
INSN(fadd, 0, 0, 0b110101);
27182718
INSN(fdiv, 1, 0, 0b111111);
2719+
INSN(faddp, 1, 0, 0b110101);
27192720
INSN(fmul, 1, 0, 0b110111);
27202721
INSN(fsub, 0, 1, 0b110101);
27212722
INSN(fmla, 0, 0, 0b110011);

‎test/hotspot/gtest/aarch64/aarch64-asmtest.py

+2
Original file line numberDiff line numberDiff line change
@@ -1564,6 +1564,8 @@ def generate(kind, names):
15641564
["mulv", "mul", "2S"], ["mulv", "mul", "4S"],
15651565
["fabd", "fabd", "2S"], ["fabd", "fabd", "4S"],
15661566
["fabd", "fabd", "2D"],
1567+
["faddp", "faddp", "2S"], ["faddp", "faddp", "4S"],
1568+
["faddp", "faddp", "2D"],
15671569
["fmul", "fmul", "2S"], ["fmul", "fmul", "4S"],
15681570
["fmul", "fmul", "2D"],
15691571
["mlav", "mla", "4H"], ["mlav", "mla", "8H"],

‎test/hotspot/gtest/aarch64/asmtest.out.h

+431-428
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.