Commit 185e711

Author: Jatin Bhateja
Committed: Apr 21, 2024

8318650: Optimized subword gather for x86 targets.
Reviewed-by: sviswanathan, epeter, psandoz

1 parent: 6d56996

32 files changed: +1157 -49 lines
 

src/hotspot/cpu/aarch64/aarch64_vector.ad (+6 -2)

@@ -169,9 +169,7 @@ source %{
     case Op_VectorMaskGen:
     case Op_LoadVectorMasked:
     case Op_StoreVectorMasked:
-    case Op_LoadVectorGather:
     case Op_StoreVectorScatter:
-    case Op_LoadVectorGatherMasked:
     case Op_StoreVectorScatterMasked:
     case Op_PopulateIndex:
     case Op_CompressM:
@@ -180,6 +178,12 @@ source %{
         return false;
       }
       break;
+    case Op_LoadVectorGather:
+    case Op_LoadVectorGatherMasked:
+      if (UseSVE == 0 || is_subword_type(bt)) {
+        return false;
+      }
+      break;
     case Op_MulAddVS2VI:
       if (length_in_bytes != 16) {
         return false;

src/hotspot/cpu/aarch64/matcher_aarch64.hpp (+5)

@@ -133,6 +133,11 @@
     return true;
   }

+  // Does target support predicated operation emulation.
+  static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
+    return false;
+  }
+
   // Does the CPU supports vector variable rotate instructions?
   static constexpr bool supports_vector_variable_rotates(void) {
     return false;

src/hotspot/cpu/arm/matcher_arm.hpp (+5)

@@ -126,6 +126,11 @@
     return VM_Version::has_simd();
   }

+  // Does target support predicated operation emulation.
+  static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
+    return false;
+  }
+
   // Does the CPU supports vector variable rotate instructions?
   static constexpr bool supports_vector_variable_rotates(void) {
     return false; // not supported

src/hotspot/cpu/ppc/matcher_ppc.hpp (+5)

@@ -133,6 +133,11 @@
     return false;
   }

+  // Does target support predicated operation emulation.
+  static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
+    return false;
+  }
+
   // Does the CPU supports vector variable rotate instructions?
   static constexpr bool supports_vector_variable_rotates(void) {
     return false;

src/hotspot/cpu/riscv/matcher_riscv.hpp (+5)

@@ -132,6 +132,11 @@
     return false;
   }

+  // Does target support predicated operation emulation.
+  static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
+    return false;
+  }
+
   // Does the CPU supports vector variable rotate instructions?
   static constexpr bool supports_vector_variable_rotates(void) {
     return false;

src/hotspot/cpu/riscv/riscv_v.ad (+5)

@@ -73,6 +73,11 @@ source %{
        return false;
      }
      break;
+    case Op_LoadVectorGatherMasked:
+      if (is_subword_type(bt)) {
+        return false;
+      }
+      break;
     case Op_VectorCastHF2F:
     case Op_VectorCastF2HF:
       return UseZvfh;

src/hotspot/cpu/s390/matcher_s390.hpp (+5)

@@ -124,6 +124,11 @@
     return false;
   }

+  // Does target support predicated operation emulation.
+  static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
+    return false;
+  }
+
   // Does the CPU supports vector variable rotate instructions?
   static constexpr bool supports_vector_variable_rotates(void) {
     return false;

src/hotspot/cpu/x86/assembler_x86.cpp (+5 -1)

@@ -13652,9 +13652,13 @@ void Assembler::notq(Register dst) {
   emit_int16((unsigned char)0xF7, (0xD0 | encode));
 }

+void Assembler::btq(Register dst, Register src) {
+  int encode = prefixq_and_encode(src->encoding(), dst->encoding());
+  emit_int24(0x0F, (unsigned char)0xA3, (encode | 0xC0));
+}
+
 void Assembler::btq(Register src, int imm8) {
   assert(isByte(imm8), "not a byte");
-  InstructionMark im(this);
   int encode = prefixq_and_encode(src->encoding());
   emit_int16(0x0f, 0xba);
   emit_int8(0xe0|encode);
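
The new two-register btq form is what lets the masked-gather loop in the c2_MacroAssembler_x86.cpp change further down test one mask bit per element and branch on the carry flag. A hedged scalar sketch of the semantics the btq/jccb(carryClear, ...) pair relies on (plain C++, illustrative only, not HotSpot code):

    #include <cstdint>

    // BT r64, r64 copies bit (mask_idx mod 64) of mask into CF;
    // jccb(carryClear, skip_load) then skips the element load when
    // that bit is 0.
    bool mask_bit_set(uint64_t mask, uint64_t mask_idx) {
      return (mask >> (mask_idx & 63)) & 1;  // models CF after BT
    }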

src/hotspot/cpu/x86/assembler_x86.hpp (+1)

@@ -1736,6 +1736,7 @@ class Assembler : public AbstractAssembler {
   void btrq(Address dst, int imm8);
   void btq(Register src, int imm8);
 #endif
+  void btq(Register dst, Register src);

   void orw(Register dst, Register src);
‎src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

+124
Original file line numberDiff line numberDiff line change
@@ -1796,6 +1796,130 @@ void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src,
17961796
}
17971797
}
17981798

1799+
#ifdef _LP64
1800+
void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1801+
XMMRegister dst, Register base,
1802+
Register idx_base,
1803+
Register offset, Register mask,
1804+
Register mask_idx, Register rtmp,
1805+
int vlen_enc) {
1806+
vpxor(dst, dst, dst, vlen_enc);
1807+
if (elem_bt == T_SHORT) {
1808+
for (int i = 0; i < 4; i++) {
1809+
// dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1810+
Label skip_load;
1811+
btq(mask, mask_idx);
1812+
jccb(Assembler::carryClear, skip_load);
1813+
movl(rtmp, Address(idx_base, i * 4));
1814+
if (offset != noreg) {
1815+
addl(rtmp, offset);
1816+
}
1817+
pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1818+
bind(skip_load);
1819+
incq(mask_idx);
1820+
}
1821+
} else {
1822+
assert(elem_bt == T_BYTE, "");
1823+
for (int i = 0; i < 8; i++) {
1824+
// dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1825+
Label skip_load;
1826+
btq(mask, mask_idx);
1827+
jccb(Assembler::carryClear, skip_load);
1828+
movl(rtmp, Address(idx_base, i * 4));
1829+
if (offset != noreg) {
1830+
addl(rtmp, offset);
1831+
}
1832+
pinsrb(dst, Address(base, rtmp), i);
1833+
bind(skip_load);
1834+
incq(mask_idx);
1835+
}
1836+
}
1837+
}
1838+
#endif // _LP64
1839+
1840+
void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1841+
Register base, Register idx_base,
1842+
Register offset, Register rtmp,
1843+
int vlen_enc) {
1844+
vpxor(dst, dst, dst, vlen_enc);
1845+
if (elem_bt == T_SHORT) {
1846+
for (int i = 0; i < 4; i++) {
1847+
// dst[i] = src[offset + idx_base[i]]
1848+
movl(rtmp, Address(idx_base, i * 4));
1849+
if (offset != noreg) {
1850+
addl(rtmp, offset);
1851+
}
1852+
pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1853+
}
1854+
} else {
1855+
assert(elem_bt == T_BYTE, "");
1856+
for (int i = 0; i < 8; i++) {
1857+
// dst[i] = src[offset + idx_base[i]]
1858+
movl(rtmp, Address(idx_base, i * 4));
1859+
if (offset != noreg) {
1860+
addl(rtmp, offset);
1861+
}
1862+
pinsrb(dst, Address(base, rtmp), i);
1863+
}
1864+
}
1865+
}
1866+
1867+
/*
1868+
* Gather using hybrid algorithm, first partially unroll scalar loop
1869+
* to accumulate values from gather indices into a quad-word(64bit) slice.
1870+
* A slice may hold 8 bytes or 4 short values. This is followed by a vector
1871+
* permutation to place the slice into appropriate vector lane
1872+
* locations in destination vector. Following pseudo code describes the
1873+
* algorithm in detail:
1874+
*
1875+
* DST_VEC = ZERO_VEC
1876+
* PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1877+
* TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1878+
* FOREACH_ITER:
1879+
* TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1880+
* TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1881+
* DST_VEC = DST_VEC OR TEMP_PERM_VEC
1882+
* PERM_INDEX = PERM_INDEX - TWO_VEC
1883+
*
1884+
* With each iteration, doubleword permute indices (0,1) corresponding
1885+
* to gathered quadword gets right shifted by two lane positions.
1886+
*
1887+
*/
1888+
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1889+
Register base, Register idx_base,
1890+
Register offset, Register mask,
1891+
XMMRegister xtmp1, XMMRegister xtmp2,
1892+
XMMRegister temp_dst, Register rtmp,
1893+
Register mask_idx, Register length,
1894+
int vector_len, int vlen_enc) {
1895+
Label GATHER8_LOOP;
1896+
assert(is_subword_type(elem_ty), "");
1897+
movl(length, vector_len);
1898+
vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1899+
vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1900+
vallones(xtmp2, vlen_enc);
1901+
vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1902+
vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1903+
load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1904+
1905+
bind(GATHER8_LOOP);
1906+
// TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1907+
if (mask == noreg) {
1908+
vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1909+
} else {
1910+
LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1911+
}
1912+
// TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1913+
vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1914+
// PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1915+
vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1916+
// DST_VEC = DST_VEC OR TEMP_PERM_VEC
1917+
vpor(dst, dst, temp_dst, vlen_enc);
1918+
addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
1919+
subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1920+
jcc(Assembler::notEqual, GATHER8_LOOP);
1921+
}
1922+
17991923
void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
18001924
switch(typ) {
18011925
case T_INT:
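
As a reading aid for the hybrid algorithm above, here is a hedged scalar reference model in plain C++. Everything in it is illustrative (names, signature, and the memcpy-based lane packing are assumptions, not HotSpot code); it mirrors one GATHER8_LOOP round per 64-bit slice, with the vpermd/vpor lane placement modeled as a plain store at the slice's destination offset:

    #include <cstdint>
    #include <cstring>

    // Hedged scalar model of vgather_subword. Gathers vector_len subword
    // elements of size elem_size (1 = byte, 2 = short) into a packed
    // destination, one 64-bit slice (8 bytes or 4 shorts) per iteration.
    // mask == nullptr models the unmasked path (vgather8b_offset); a clear
    // mask bit yields a zero lane (vgather8b_masked_offset).
    void gather_subword_model(uint8_t* dst, const uint8_t* base,
                              const int32_t* idx, int64_t offset,
                              const bool* mask, int vector_len, int elem_size) {
      int per_slice = 8 / elem_size;           // 8 bytes or 4 shorts per slice
      for (int slice = 0; slice * per_slice < vector_len; slice++) {
        uint64_t tmp = 0;                      // TMP_VEC_64
        for (int i = 0; i < per_slice; i++) {  // scalar accumulation loop
          int lane = slice * per_slice + i;
          if (mask == nullptr || mask[lane]) {
            uint64_t v = 0;
            memcpy(&v, base + (idx[lane] + offset) * elem_size, elem_size);
            tmp |= v << (i * elem_size * 8);   // pack into the 64-bit slice
          }
        }
        // The vpermd + vpor pair places the slice at its final lane
        // positions; the scalar equivalent is a store at the slice offset.
        memcpy(dst + slice * 8, &tmp, 8);
      }
    }

The design point the model makes visible: the loads stay scalar (x86 has no native subword gather), while the per-slice permute-and-OR amortizes the vector assembly over eight gathered bytes at a time.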

src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp (+12)

@@ -500,4 +500,16 @@
   void vector_rearrange_int_float(BasicType bt, XMMRegister dst, XMMRegister shuffle,
                                   XMMRegister src, int vlen_enc);

+
+  void vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, Register offset,
+                       Register mask, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
+                       Register midx, Register length, int vector_len, int vlen_enc);
+
+#ifdef _LP64
+  void vgather8b_masked_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base,
+                               Register offset, Register mask, Register midx, Register rtmp, int vlen_enc);
+#endif
+  void vgather8b_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base,
+                        Register offset, Register rtmp, int vlen_enc);
+
 #endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP

src/hotspot/cpu/x86/matcher_x86.hpp (+13)

@@ -154,6 +154,16 @@
     return (UseAVX >= 2);
   }

+  // Does target support predicated operation emulation.
+  static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
+    switch(vopc) {
+      case Op_LoadVectorGatherMasked:
+        return is_subword_type(bt) && VM_Version::supports_avx2();
+      default:
+        return false;
+    }
+  }
+
   // Does the CPU supports vector variable rotate instructions?
   static constexpr bool supports_vector_variable_rotates(void) {
     return true;
@@ -214,6 +224,9 @@
       return 7;
     case Op_MulVL:
       return VM_Version::supports_avx512vldq() ? 0 : 6;
+    case Op_LoadVectorGather:
+    case Op_LoadVectorGatherMasked:
+      return is_subword_type(ety) ? 50 : 0;
     case Op_VectorCastF2X: // fall through
     case Op_VectorCastD2X:
       return is_floating_point_type(ety) ? 0 : (is_subword_type(ety) ? 35 : 30);
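
Taken together, the two hunks tell C2 that a masked subword gather has no native x86 instruction but may be emulated when AVX2 is available, and that any subword gather should be costed at 50 so auto-vectorization picks it only when it still pays. A minimal standalone model of those two answers (stand-in names and types; the real call sites live in shared C2 code not shown in this diff):

    #include <cstdio>

    enum BasicType { T_BYTE, T_SHORT, T_INT, T_LONG };

    static bool is_subword(BasicType bt) { return bt == T_BYTE || bt == T_SHORT; }

    // Mirrors the Op_LoadVectorGatherMasked case of the new hook above.
    static bool supports_masked_gather_emulation(BasicType bt, bool has_avx2) {
      return is_subword(bt) && has_avx2;
    }

    // Mirrors the Op_LoadVectorGather[Masked] case of the cost hunk above.
    static int gather_cost_estimate(BasicType bt) {
      return is_subword(bt) ? 50 : 0;
    }

    int main() {
      printf("byte gather: emulated=%d cost=%d\n",
             supports_masked_gather_emulation(T_BYTE, true),
             gather_cost_estimate(T_BYTE));
    }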
