Skip to content

Commit dc9a6ef

Browse files
Jatin Bhateja, Vladimir Ivanov
Jatin Bhateja
and
Vladimir Ivanov
committed Nov 21, 2024
8341137: Optimize long vector multiplication using x86 VPMUL[U]DQ instruction
Co-authored-by: Vladimir Ivanov <vlivanov@openjdk.org>
Reviewed-by: vlivanov, sviswanathan
1 parent 191b38e commit dc9a6ef

File tree

7 files changed

+544
-1
lines changed

7 files changed

+544
-1
lines changed
 

‎src/hotspot/cpu/x86/x86.ad

+28
Original file line number | Diff line number | Diff line change
@@ -6179,6 +6179,7 @@ instruct evmulL_reg(vec dst, vec src1, vec src2) %{
61796179
VM_Version::supports_avx512dq()) ||
61806180
VM_Version::supports_avx512vldq());
61816181
match(Set dst (MulVL src1 src2));
6182+
ins_cost(500);
61826183
format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
61836184
ins_encode %{
61846185
assert(UseAVX > 2, "required");
@@ -6195,6 +6196,7 @@ instruct evmulL_mem(vec dst, vec src, memory mem) %{
61956196
VM_Version::supports_avx512vldq()));
61966197
match(Set dst (MulVL src (LoadVector mem)));
61976198
format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
6199+
ins_cost(500);
61986200
ins_encode %{
61996201
assert(UseAVX > 2, "required");
62006202
int vlen_enc = vector_length_encoding(this);
@@ -6206,6 +6208,7 @@ instruct evmulL_mem(vec dst, vec src, memory mem) %{
62066208
instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
62076209
predicate(UseAVX == 0);
62086210
match(Set dst (MulVL src1 src2));
6211+
ins_cost(500);
62096212
effect(TEMP dst, TEMP xtmp);
62106213
format %{ "mulVL $dst, $src1, $src2\t! using $xtmp as TEMP" %}
62116214
ins_encode %{
@@ -6232,6 +6235,7 @@ instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
62326235
!VM_Version::supports_avx512vldq())));
62336236
match(Set dst (MulVL src1 src2));
62346237
effect(TEMP xtmp1, TEMP xtmp2);
6238+
ins_cost(500);
62356239
format %{ "vmulVL $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
62366240
ins_encode %{
62376241
int vlen_enc = vector_length_encoding(this);
@@ -6248,6 +6252,30 @@ instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
62486252
ins_pipe( pipe_slow );
62496253
%}
62506254

6255+
// Long vector multiply for the case where MulVLNode::has_uint_inputs() holds
// for the matched node. VPMULUDQ multiplies the low unsigned doubleword of
// each quadword lane and produces the full 64-bit product.
// The rule is given ins_cost(100), lower than the ins_cost(500) assigned to
// the other MulVL rules in this file.
instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
  match(Set dst (MulVL src1 src2));
  ins_cost(100);
  format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
  ins_encode %{
    // The vector length encoding is derived from this node.
    __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
                vector_length_encoding(this));
  %}
  ins_pipe( pipe_slow );
%}
6266+
6267+
// Long vector multiply for the case where MulVLNode::has_int_inputs() holds
// for the matched node. VPMULDQ multiplies the low signed doubleword of each
// quadword lane and produces the full 64-bit product.
// The rule is given ins_cost(100), lower than the ins_cost(500) assigned to
// the other MulVL rules in this file.
instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
  match(Set dst (MulVL src1 src2));
  ins_cost(100);
  format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
  ins_encode %{
    // The vector length encoding is derived from this node.
    __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
               vector_length_encoding(this));
  %}
  ins_pipe( pipe_slow );
%}
6278+
62516279
// Floats vector mul
62526280
instruct vmulF(vec dst, vec src) %{
62536281
predicate(UseAVX == 0);

‎src/hotspot/share/opto/node.hpp

+3
Original file line number | Diff line number | Diff line change
@@ -193,6 +193,7 @@ class VectorUnboxNode;
193193
class VectorSet;
194194
class VectorReinterpretNode;
195195
class ShiftVNode;
196+
class MulVLNode;
196197
class ExpandVNode;
197198
class CompressVNode;
198199
class CompressMNode;
@@ -743,6 +744,7 @@ class Node {
743744
DEFINE_CLASS_ID(Reduction, Vector, 7)
744745
DEFINE_CLASS_ID(NegV, Vector, 8)
745746
DEFINE_CLASS_ID(SaturatingVector, Vector, 9)
747+
DEFINE_CLASS_ID(MulVL, Vector, 10)
746748
DEFINE_CLASS_ID(Con, Type, 8)
747749
DEFINE_CLASS_ID(ConI, Con, 0)
748750
DEFINE_CLASS_ID(SafePointScalarMerge, Type, 9)
@@ -970,6 +972,7 @@ class Node {
970972
DEFINE_CLASS_QUERY(Mul)
971973
DEFINE_CLASS_QUERY(Multi)
972974
DEFINE_CLASS_QUERY(MultiBranch)
975+
DEFINE_CLASS_QUERY(MulVL)
973976
DEFINE_CLASS_QUERY(Neg)
974977
DEFINE_CLASS_QUERY(NegV)
975978
DEFINE_CLASS_QUERY(NeverBranch)

‎src/hotspot/share/opto/vectornode.cpp

+49
Original file line number | Diff line number | Diff line change
@@ -2085,6 +2085,55 @@ Node* VectorBlendNode::Identity(PhaseGVN* phase) {
20852085
}
20862086
return this;
20872087
}
2088+
static bool is_replicate_uint_constant(const Node* n) {
2089+
return n->Opcode() == Op_Replicate &&
2090+
n->in(1)->is_Con() &&
2091+
n->in(1)->bottom_type()->isa_long() &&
2092+
n->in(1)->bottom_type()->is_long()->get_con() <= 0xFFFFFFFFL;
2093+
}
2094+
2095+
// Recognizes the two node shapes this file treats as producing long lanes
// that fit in an unsigned 32-bit value:
//   (AndV SRC (Replicate C))         C accepted by is_replicate_uint_constant(),
//                                    on either input of the AndV
//   (URShiftVL SRC (RShiftCntV S))   S an int constant >= 32
static bool has_vector_elements_fit_uint(Node* n) {
  // Lower doubleword mask pattern.
  if (n->Opcode() == Op_AndV &&
      (is_replicate_uint_constant(n->in(1)) ||
       is_replicate_uint_constant(n->in(2)))) {
    return true;
  }

  // Unsigned right shift by a constant of at least 32.
  if (n->Opcode() != Op_URShiftVL) {
    return false;
  }
  const Node* shift_cnt = n->in(2);
  if (shift_cnt->Opcode() != Op_RShiftCntV || !shift_cnt->in(1)->is_Con()) {
    return false;
  }
  const Node* amount = shift_cnt->in(1);
  return amount->bottom_type()->isa_int() &&
         amount->bottom_type()->is_int()->get_con() >= 32;
}
2111+
2112+
// Recognizes the two node shapes this file treats as producing long lanes
// that fit in a signed 32-bit value:
//   (VectorCastI2X SRC)             with T_LONG as the vector element type
//   (RShiftVL SRC (RShiftCntV S))   S an int constant >= 32
static bool has_vector_elements_fit_int(Node* n) {
  // Integer-to-long cast pattern.
  if (n->Opcode() == Op_VectorCastI2X &&
      Matcher::vector_element_basic_type(n) == T_LONG) {
    return true;
  }

  // Signed right shift by a constant of at least 32.
  if (n->Opcode() != Op_RShiftVL) {
    return false;
  }
  const Node* shift_cnt = n->in(2);
  if (shift_cnt->Opcode() != Op_RShiftCntV || !shift_cnt->in(1)->is_Con()) {
    return false;
  }
  const Node* amount = shift_cnt->in(1);
  return amount->bottom_type()->isa_int() &&
         amount->bottom_type()->is_int()->get_con() >= 32;
}
2127+
2128+
bool MulVLNode::has_int_inputs() const {
2129+
return has_vector_elements_fit_int(in(1)) &&
2130+
has_vector_elements_fit_int(in(2));
2131+
}
2132+
2133+
bool MulVLNode::has_uint_inputs() const {
2134+
return has_vector_elements_fit_uint(in(1)) &&
2135+
has_vector_elements_fit_uint(in(2));
2136+
}
20882137

20892138
#ifndef PRODUCT
20902139
void VectorBoxAllocateNode::dump_spec(outputStream *st) const {

‎src/hotspot/share/opto/vectornode.hpp

+5-1
Original file line number | Diff line number | Diff line change
@@ -441,8 +441,12 @@ class MulVINode : public VectorNode {
441441
// Vector multiply long.
// This hunk replaces the empty-bodied constructor with one that registers
// Class_MulVL as the node's class id, and adds two input-pattern queries.
442442
class MulVLNode : public VectorNode {
443443
public:
444-
MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
444+
MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {
445+
// Tags the node with the MulVL class id declared in node.hpp.
init_class_id(Class_MulVL);
446+
}
445447
virtual int Opcode() const;
448+
// True when both inputs match has_vector_elements_fit_int() (defined in vectornode.cpp).
bool has_int_inputs() const;
449+
// True when both inputs match has_vector_elements_fit_uint() (defined in vectornode.cpp).
bool has_uint_inputs() const;
446450
};
447451

448452
//------------------------------MulVFNode--------------------------------------

1 commit comment

Comments
 (1)

openjdk-notifier[bot] commented on Nov 21, 2024

@openjdk-notifier[bot]
Please sign in to comment.