Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8290204: FP16 initial backend implementation #204

Closed
6 changes: 2 additions & 4 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Original file line number Diff line number Diff line change
@@ -11102,11 +11102,9 @@ void Assembler::evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool evex_r, boo
// instead of not'd
int byte2 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0) | (evex_r ? EVEX_Rb : 0);
byte2 = (~byte2) & 0xF0;
// confine opc opcode extensions in mm bits to lower two bits
// of form {0F, 0F_38, 0F_3A}
// For MAP5 and MAP6, we use 3 bits in mmm bits
// confine opc opcode extensions in mmm bits to lower three bits
// of form {0F, 0F_38, 0F_3A, MAP5, MAP6}
byte2 |= opc;
emit_int8(byte2);

// P1: byte 3 as Wvvvv1pp
int byte3 = ((~nds_enc) & 0xf) << 3;
20 changes: 20 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
@@ -1965,6 +1965,26 @@ void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src,
reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}

// Emits a strictly ordered add-reduction of the eight 16-bit half-float lanes
// held in the low 128 bits of src, accumulated onto the scalar passed in dst.
//
//   dst   - general purpose register carrying the incoming scalar; receives the result.
//   src   - vector whose lanes 0..7 are added in ascending lane order.
//   vtmp  - scratch register used to rotate each lane into element 0.
//   vtmp1 - scratch register used as the running accumulator.
//   vtmp2 - scratch register holding the upper 64 bits of src.
//
// The emitted instruction sequence is fully unrolled; the loops below only
// exist at code-generation time and produce the lanes in the order 1, 2, 3.
void C2_MacroAssembler::reduce8HF(Register dst, XMMRegister src, XMMRegister vtmp, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Seed the accumulator from the general purpose register.
  movdl(vtmp1, dst);

  // Lanes 0..3: lane 0 is already in element 0 of src; for the others the
  // pshuflw immediate selects which low word lands in element 0.
  evaddsh(vtmp1, vtmp1, src);
  for (int lane = 1; lane <= 3; lane++) {
    pshuflw(vtmp, src, lane);
    evaddsh(vtmp1, vtmp1, vtmp);
  }

  // Lanes 4..7: bring the upper 64 bits of src down (pshufd 0x0E) and repeat
  // the same walk over the copy in vtmp2.
  pshufd(vtmp2, src, 0x0E);
  evaddsh(vtmp1, vtmp1, vtmp2);
  for (int lane = 1; lane <= 3; lane++) {
    pshuflw(vtmp, vtmp2, lane);
    evaddsh(vtmp1, vtmp1, vtmp);
  }

  // Hand the accumulated value back in the general purpose register.
  movdl(dst, vtmp1);
}

void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
reduce_operation_128(T_DOUBLE, opcode, dst, src);
pshufd(vtmp, src, 0xE);
1 change: 1 addition & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
@@ -171,6 +171,7 @@
void reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid,
XMMRegister dst, XMMRegister src,
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg);
// Adds the eight half-float lanes in the low 128 bits of src, in lane order, onto the
// scalar held in dst and writes the result back to dst. vtmp, vtmp1 and vtmp2 are clobbered.
void reduce8HF(Register dst, XMMRegister src, XMMRegister vtmp, XMMRegister vtmp1, XMMRegister vtmp2);
private:
void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
3 changes: 3 additions & 0 deletions src/hotspot/cpu/x86/stubGenerator_x86_32.cpp
Original file line number Diff line number Diff line change
@@ -4107,6 +4107,9 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");

StubRoutines::x86::_vector_halffloat_sign_mask = generate_vector_fp_mask("vector_halffloat_sign_mask", 0x7FFF7FFF);
StubRoutines::x86::_vector_halffloat_sign_flip = generate_vector_fp_mask("vector_halffloat_sign_flip", 0x80008000);

if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
72 changes: 16 additions & 56 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
@@ -1951,6 +1951,16 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false;
}
break;
case Op_AddVHF:
case Op_SubVHF:
case Op_MulVHF:
case Op_DivVHF:
case Op_AbsVHF:
case Op_NegVHF:
if (bt != T_SHORT && !VM_Version::supports_avx512_fp16()) {
return false;
}
break;
Comment on lines +1954 to +1963
Copy link
Member

@jatin-bhateja jatin-bhateja Jan 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if the target does not support AVX512BW and the vector length is 512 bits? In that case the vector size compatibility check will fail upfront and we will never reach this point, because the BasicType for halffloat is currently T_SHORT:
https://github.com/openjdk/panama-vector/pull/204/files#diff-d6a3624f0f0af65a98a47378a5c146eed5016ca09b4de1acd0a3acc823242e82L1690

That said, this may not show up in practice, since the FP16 ISA arrives with next-generation Xeons, which support both AVX512_FP16 and AVX512BW.

}
return true; // Per default match rules are supported.
}
@@ -2109,16 +2119,6 @@ const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, Bas
if ((bt == T_INT || bt == T_LONG) && VM_Version::supports_avx512cd()) {
return true;
}
case Op_AddVHF:
case Op_SubVHF:
case Op_MulVHF:
case Op_DivVHF:
case Op_AbsVHF:
case Op_NegVHF:
if(bt != T_SHORT && !VM_Version::supports_avx512_fp16()) {
return false;
}
return true;

default:
return false;
@@ -4848,50 +4848,16 @@ instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
%}

// =======================Half Float Reduction==========================================
instruct reduction8HF(rRegI dst, vec src2, vec tmp, vec tmp1, vec tmp2) %{
predicate(UseAVX > 2);
instruct reduction8HF(rRegI dst, vec src2, vec vtmp, vec vtmp1, vec vtmp2) %{
match(Set dst (AddReductionVHF dst src2));
effect(TEMP tmp, TEMP tmp1, TEMP tmp2);
format %{"movdl $tmp1, $dst\n\t"
"evaddsh $tmp1,$tmp1,$src2\n\t"
"pshuflw $tmp,$src2,0x01\n\t"
"evaddsh $tmp1,$tmp1,$tmp\n\t"
"pshuflw $tmp,$src2,0x02\n\t"
"evaddsh $tmp1,$tmp1,$tmp\n\t"
"pshuflw $tmp,$src2,0x03\n\t"
"evaddsh $tmp1,$tmp1,$tmp\n\t"
"pshufd $tmp2,$src2, 0x0E\n\t"
"evaddsh $tmp1,$tmp1,$tmp2\n\t"
"pshuflw $tmp,$tmp2,0x01\n\t"
"evaddsh $tmp1,$tmp1,$tmp\n\t"
"pshuflw $tmp,$tmp2,0x02\n\t"
"evaddsh $tmp1,$tmp1,$tmp\n\t"
"pshuflw $tmp,$tmp2,0x03\n\t"
"evaddsh $tmp1,$tmp1,$tmp\t"
"movdl $dst, $tmp1\t! add reductionHF" %}
ins_encode %{
__ movdl($tmp1$$XMMRegister, $dst$$Register);
__ evaddsh($tmp1$$XMMRegister, $tmp1$$XMMRegister, $src2$$XMMRegister);
__ pshuflw($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ evaddsh($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp$$XMMRegister);
__ pshuflw($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ evaddsh($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp$$XMMRegister);
__ pshuflw($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ evaddsh($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x0E);
__ evaddsh($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister);
__ pshuflw($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ evaddsh($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp$$XMMRegister);
__ pshuflw($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ evaddsh($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp$$XMMRegister);
__ pshuflw($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ evaddsh($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp$$XMMRegister);
__ movdl($dst$$Register, $tmp1$$XMMRegister);
effect(TEMP vtmp, TEMP vtmp1, TEMP vtmp2);
format %{ "reduction_halffloat $dst, $src2\t using $vtmp, $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
__ reduce8HF($dst$$Register, $src2$$XMMRegister, $vtmp$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}


// =======================Double Reduction==========================================

instruct reduction2D(regD dst, vec src, vec vtmp) %{
@@ -5408,7 +5374,6 @@ instruct vaddD_mem(vec dst, vec src, memory mem) %{

// Halffloat vector add
instruct vaddHF_reg(vec dst, vec src1, vec src2) %{
predicate(UseAVX > 2);
match(Set dst (AddVHF src1 src2));
format %{ "evaddph $dst,$src1,$src2\t! add packedHF" %}
ins_encode %{
@@ -5628,7 +5593,6 @@ instruct vsubD_mem(vec dst, vec src, memory mem) %{

// Halffloat vector sub
instruct vsubHF_reg(vec dst, vec src1, vec src2) %{
predicate(UseAVX > 2);
match(Set dst (SubVHF src1 src2));
format %{ "evsubph $dst,$src1,$src2\t! sub packedHF" %}
ins_encode %{
@@ -5969,9 +5933,8 @@ instruct vmulD_mem(vec dst, vec src, memory mem) %{

//Halffloat vector mul
instruct vmulHF_reg(vec dst, vec src1, vec src2) %{
predicate(UseAVX > 2);
match(Set dst (MulVHF src1 src2));
format %{ "vmulph $dst,$src1,$src2\t! mul packedHF" %}
format %{ "evmulph $dst,$src1,$src2\t! mul packedHF" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ evmulph($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
@@ -6087,7 +6050,6 @@ instruct vdivD_mem(vec dst, vec src, memory mem) %{

// Halffloat vector div
instruct vdivHF_reg(vec dst, vec src1, vec src2) %{
predicate(UseAVX > 2);
match(Set dst (DivVHF src1 src2));
format %{ "evdivph $dst,$src1,$src2\t! div packedHF" %}
ins_encode %{
@@ -7987,7 +7949,6 @@ instruct vabsL_reg(vec dst, vec src) %{

// Half float abs
instruct vabsHF_reg(vec dst, vec src) %{
predicate(UseAVX > 2);
match(Set dst (AbsVHF src));
format %{ "vandps $dst,$src\t# $dst = |$src| abs packedHF" %}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Incorrect format string.

ins_encode %{
@@ -7999,7 +7960,6 @@ instruct vabsHF_reg(vec dst, vec src) %{

// Halffloat neg
instruct vnegHF_reg(vec dst, vec src) %{
predicate(UseAVX > 2);
match(Set dst (NegVHF src));
format %{"vxorps $dst, $src\t $dst = -$src neg packedHF" %}
ins_encode %{
Original file line number Diff line number Diff line change
@@ -223,6 +223,7 @@ public enum CPUFeature implements CPUFeatureName {
SERIALIZE,
GFNI,
AVX512_BITALG,
AVX512_FP16,
}

private final EnumSet<CPUFeature> features;