@@ -134,6 +134,9 @@ source %{
134
134
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
135
135
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
136
136
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
137
+ // The vector implementation of Op_AddReductionVD/F is for the Vector API only.
138
+ // It is not suitable for auto-vectorization because it does not add the elements
139
+ // in the same order as sequential code, and FP addition is non-associative.
137
140
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
138
141
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
139
142
opcode == Op_MulVL) {
@@ -2876,23 +2879,30 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
2876
2879
%}
2877
2880
2878
2881
// reduction addF
2882
+ // Floating-point addition is not associative, so the rules for AddReductionVF
2883
+ // on NEON can't be used to auto-vectorize floating-point reduce-add.
2884
+ // Currently, on NEON, AddReductionVF is only generated by Vector API.
2885
// Diff note: every line of this rule carries a '+' marker, i.e. the rule is newly added.
// Purpose: match rule for AddReductionVF that produces a scalar float in dst from the
// scalar input fsrc and the vector input vsrc.
+ instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
2886
// Selected only when UseSVE == 0 and the vector length of the node's second input
// (presumably vsrc -- confirm against the match operand order) is exactly 2.
+ predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
2887
+ match(Set dst (AddReductionVF fsrc vsrc));
2888
// dst is written by the first emitted instruction while fsrc is still needed by the
// second one; presumably TEMP_DEF is what keeps dst from sharing a register with an input.
+ effect(TEMP_DEF dst);
2889
+ format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
2890
+ ins_encode %{
2891
// Step 1: faddp with the S size writes dst from vsrc alone (presumably the pairwise
// sum of the two float lanes of vsrc).
+ __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
2892
// Step 2: the scalar input fsrc is added last, i.e. after the vector lanes have been
// combined, not before them.
+ __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
2893
+ %}
2894
+ ins_pipe(pipe_slow);
2895
+ %}
2879
2896
2880
// Diff note: '-' lines belong to the removed rule reduce_addF_neon, '+' lines to its
// replacement reduce_add4F_neon; unmarked lines are unchanged context shared by both.
// Both versions match AddReductionVF and produce a scalar float in dst from the scalar
// input fsrc and the vector input vsrc, using tmp as a scratch vector register.
- instruct reduce_addF_neon (vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2881
- predicate(UseSVE == 0);
2897
+ instruct reduce_add4F_neon (vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2898
// The new predicate additionally requires the vector length of the node's second input
// (presumably vsrc -- confirm) to be exactly 4; the removed predicate had no length test.
+ predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4 );
2882
2899
match(Set dst (AddReductionVF fsrc vsrc));
2883
2900
effect(TEMP_DEF dst, TEMP tmp);
2884
- format %{ "reduce_addF_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
2901
+ format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
2885
2902
ins_encode %{
2886
// Removed encoding: starts from fsrc + vsrc (scalar fadds), then repeatedly uses ins to
// bring another S element of vsrc into tmp (indices 1, then 2 and 3 only when the vector
// is 16 bytes long) and accumulates it into dst with fadds, one element at a time.
- uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
2887
- __ fadds($dst$$FloatRegister, $fsrc$$FloatRegister, $vsrc$$FloatRegister);
2888
- __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
2889
- __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
2890
- if (length_in_bytes == 16) {
2891
- __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);
2892
- __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
2893
- __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);
2894
- __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
2895
- }
2903
// New encoding: a T4S faddp of vsrc with itself into tmp, a scalar (S) faddp of tmp
// into dst, and finally a scalar fadds that adds fsrc. The vector elements are thus
// combined with each other first and fsrc is added last, which is a different addition
// order from the removed sequence above.
+ __ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
2904
+ __ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
2905
+ __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
2896
2906
%}
2897
2907
ins_pipe(pipe_slow);
2898
2908
%}
@@ -2910,16 +2920,17 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
2910
2920
%}
2911
2921
2912
2922
// reduction addD
2913
-
2914
// Diff note: '-' lines are the removed form of reduce_addD_neon, '+' lines the new form;
// unmarked lines are unchanged context. The rule matches AddReductionVD and produces a
// scalar double in dst from the scalar input dsrc and the vector input vsrc. The new
// form drops the tmp operand from the signature, the effect list and the format string.
- instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
2923
+ // Floating-point addition is not associative, so the rule for AddReductionVD
2924
+ // on NEON can't be used to auto-vectorize floating-point reduce-add.
2925
+ // Currently, on NEON, AddReductionVD is only generated by Vector API.
2926
+ instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
2915
2927
predicate(UseSVE == 0);
2916
2928
match(Set dst (AddReductionVD dsrc vsrc));
2917
- effect(TEMP_DEF dst, TEMP tmp );
2918
- format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D. KILL $tmp " %}
2929
+ effect(TEMP_DEF dst);
2930
+ format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
2919
2931
ins_encode %{
2920
// Removed encoding: dsrc + vsrc via scalar faddd, then ins brings D element 1 of vsrc
// into tmp and a second faddd accumulates it into dst.
- __ faddd($dst$$FloatRegister, $dsrc$$FloatRegister, $vsrc$$FloatRegister);
2921
- __ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);
2922
- __ faddd($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
2932
// New encoding: a scalar (D) faddp writes dst from vsrc alone (presumably the sum of
// its two double lanes), then faddd adds dsrc last. No scratch register is needed, and
// dsrc is read only after dst has already been written.
+ __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
2933
+ __ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
2923
2934
%}
2924
2935
ins_pipe(pipe_slow);
2925
2936
%}
0 commit comments