@@ -1796,6 +1796,130 @@ void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src,
 }
 }
 
+#ifdef _LP64
+void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
+                                                XMMRegister dst, Register base,
+                                                Register idx_base,
+                                                Register offset, Register mask,
+                                                Register mask_idx, Register rtmp,
+                                                int vlen_enc) {
+  vpxor(dst, dst, dst, vlen_enc);
+  if (elem_bt == T_SHORT) {
+    for (int i = 0; i < 4; i++) {
+      // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
+      Label skip_load;
+      btq(mask, mask_idx);
+      jccb(Assembler::carryClear, skip_load);
+      movl(rtmp, Address(idx_base, i * 4));
+      if (offset != noreg) {
+        addl(rtmp, offset);
+      }
+      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
+      bind(skip_load);
+      incq(mask_idx);
+    }
+  } else {
+    assert(elem_bt == T_BYTE, "");
+    for (int i = 0; i < 8; i++) {
+      // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
+      Label skip_load;
+      btq(mask, mask_idx);
+      jccb(Assembler::carryClear, skip_load);
+      movl(rtmp, Address(idx_base, i * 4));
+      if (offset != noreg) {
+        addl(rtmp, offset);
+      }
+      pinsrb(dst, Address(base, rtmp), i);
+      bind(skip_load);
+      incq(mask_idx);
+    }
+  }
+}
+#endif // _LP64
+
+void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
+                                         Register base, Register idx_base,
+                                         Register offset, Register rtmp,
+                                         int vlen_enc) {
+  vpxor(dst, dst, dst, vlen_enc);
+  if (elem_bt == T_SHORT) {
+    for (int i = 0; i < 4; i++) {
+      // dst[i] = src[offset + idx_base[i]]
+      movl(rtmp, Address(idx_base, i * 4));
+      if (offset != noreg) {
+        addl(rtmp, offset);
+      }
+      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
+    }
+  } else {
+    assert(elem_bt == T_BYTE, "");
+    for (int i = 0; i < 8; i++) {
+      // dst[i] = src[offset + idx_base[i]]
+      movl(rtmp, Address(idx_base, i * 4));
+      if (offset != noreg) {
+        addl(rtmp, offset);
+      }
+      pinsrb(dst, Address(base, rtmp), i);
+    }
+  }
+}
+
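The two slice-gather helpers above follow the same pattern: gather indices are stored as 32-bit ints at `idx_base`, an optional scalar `offset` biases each index, and the loaded elements are packed into the low 64 bits of `dst` with `pinsrb`/`pinsrw`; the masked variant additionally tests one bit of `mask` per lane and leaves skipped lanes zero. A minimal C++ model of that behaviour is sketched below, assuming plain scalar semantics; the name `gather8b_slice` and its signature are illustrative only, not HotSpot code.

```cpp
#include <cstdint>
#include <cstring>

// Illustrative scalar model of the 8-byte slice gather emitted above.
// ELEM is int8_t for T_BYTE or int16_t for T_SHORT.
template <typename ELEM>
uint64_t gather8b_slice(const ELEM* base,       // 'base' register
                        const int32_t* idx,     // 'idx_base': indices stored as 32-bit ints
                        int32_t offset,         // optional bias; pass 0 to model 'offset == noreg'
                        uint64_t mask,          // per-lane mask bits; ~0ULL models the unmasked variant
                        uint64_t& mask_idx) {   // running bit position, advanced like incq(mask_idx)
  constexpr int lanes = int(8 / sizeof(ELEM));  // 8 bytes or 4 shorts per 64-bit slice
  ELEM slice[lanes] = {};                       // vpxor(dst, dst, dst, ...) zeroes the slice first
  for (int i = 0; i < lanes; i++) {
    if ((mask >> (mask_idx & 63)) & 1) {        // btq(mask, mask_idx) uses the bit index modulo 64
      slice[i] = base[idx[i] + offset];         // movl + addl + pinsrb/pinsrw
    }
    mask_idx++;                                 // advance mask position even for skipped lanes
  }
  uint64_t packed;
  std::memcpy(&packed, slice, sizeof(packed));  // the slice occupies the low 64 bits of dst
  return packed;
}
```

For a 256-bit byte gather, the loop added below calls this 8-byte slice gather four times and stitches the four quadwords together with a permute, which is what the comment that follows describes.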
+/*
+ * Gather using hybrid algorithm, first partially unroll scalar loop
+ * to accumulate values from gather indices into a quad-word(64bit) slice.
+ * A slice may hold 8 bytes or 4 short values. This is followed by a vector
+ * permutation to place the slice into appropriate vector lane
+ * locations in destination vector. Following pseudo code describes the
+ * algorithm in detail:
+ *
+ * DST_VEC = ZERO_VEC
+ * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
+ * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
+ * FOREACH_ITER:
+ *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
+ *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
+ *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
+ *     PERM_INDEX = PERM_INDEX - TWO_VEC
+ *
+ * With each iteration, the doubleword permute indices (0, 1) corresponding
+ * to the gathered quadword get right shifted by two lane positions.
+ *
+ */
+void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
+                                        Register base, Register idx_base,
+                                        Register offset, Register mask,
+                                        XMMRegister xtmp1, XMMRegister xtmp2,
+                                        XMMRegister temp_dst, Register rtmp,
+                                        Register mask_idx, Register length,
+                                        int vector_len, int vlen_enc) {
+  Label GATHER8_LOOP;
+  assert(is_subword_type(elem_ty), "");
+  movl(length, vector_len);
+  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
+  vpxor(dst, dst, dst, vlen_enc);       // dst = {0, ...}
+  vallones(xtmp2, vlen_enc);
+  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
+  vpslld(xtmp2, xtmp2, 1, vlen_enc);    // xtmp2 = {2, 2, ...}
+  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
+
+  bind(GATHER8_LOOP);
+  // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
+  if (mask == noreg) {
+    vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
+  } else {
+    LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
+  }
+  // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
+  vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
+  // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
+  vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
+  // DST_VEC = DST_VEC OR TEMP_PERM_VEC
+  vpor(dst, dst, temp_dst, vlen_enc);
+  addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
+  subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
+  jcc(Assembler::notEqual, GATHER8_LOOP);
+}
+
 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
   switch (typ) {
     case T_INT:
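To make the pseudo code in the new comment concrete, here is a hedged scalar model of the loop in `vgather_subword`: each iteration gathers one 64-bit slice, a `vpermd`-style doubleword permute places that slice two lanes further right than the previous one, the permuted slices are OR-ed into the destination, and the permute indices are decremented by two for the next round. Everything below (`gather_subword_model`, its parameters, the power-of-two lane count) is an illustration under those assumptions, not HotSpot code; the masked path is omitted for brevity.

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative scalar model of the hybrid subword gather.
// ELEM is int8_t for T_BYTE or int16_t for T_SHORT.
template <typename ELEM>
std::vector<ELEM> gather_subword_model(const ELEM* base, const int32_t* idx,
                                       int32_t offset, int vector_len) {
  constexpr int per_slice = int(8 / sizeof(ELEM));     // elements per 64-bit slice
  const int ndw = int(vector_len * sizeof(ELEM) / 4);  // doublewords in the destination vector
  std::vector<uint32_t> dst(ndw, 0);                   // DST_VEC = ZERO_VEC
  std::vector<int32_t> perm(ndw);                      // PERM_INDEX = {0, 1, 2, ...}
  for (int j = 0; j < ndw; j++) perm[j] = j;

  for (int done = 0; done < vector_len; done += per_slice) {
    // TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES (see vgather8b_offset above)
    ELEM s[per_slice] = {};
    for (int i = 0; i < per_slice; i++) {
      s[i] = base[idx[done + i] + offset];
    }
    uint32_t slice[2];
    std::memcpy(slice, s, sizeof(slice));              // the slice viewed as two doublewords

    // TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX, then DST_VEC |= TEMP_PERM_VEC.
    // vpermd wraps indices modulo the lane count; doublewords 2.. of the slice
    // register are zero, so only permute indices 0 and 1 contribute data.
    for (int j = 0; j < ndw; j++) {
      int src_dw = perm[j] & (ndw - 1);
      if (src_dw < 2) dst[j] |= slice[src_dw];
    }
    // PERM_INDEX = PERM_INDEX - TWO_VEC: the indices selecting the slice move
    // two doubleword positions to the right each iteration.
    for (int j = 0; j < ndw; j++) perm[j] -= 2;
  }
  std::vector<ELEM> out(vector_len);
  std::memcpy(out.data(), dst.data(), vector_len * sizeof(ELEM));
  return out;
}
```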