Skip to content

Commit

Permalink
8282664: Unroll by hand StringUTF16 and StringLatin1 polynomial hash …
Browse files Browse the repository at this point in the history
…loops

Co-authored-by: Sandhya Viswanathan <sviswanathan@openjdk.org>
Co-authored-by: Ludovic Henry <luhenry@openjdk.org>
Co-authored-by: Claes Redestad <redestad@openjdk.org>
Reviewed-by: vlivanov, sviswanathan, luhenry
  • Loading branch information
3 people committed Jan 17, 2023
1 parent ade08e1 commit e37078f
Show file tree
Hide file tree
Showing 33 changed files with 1,053 additions and 87 deletions.
218 changes: 217 additions & 1 deletion src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -3203,6 +3203,195 @@ void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Regis
bind(DONE_LABEL);
} // stringL_indexof_char

// Returns the size in bytes of a single array element of type 'eltype'.
// Only the element types handled by the arrays_hashcode intrinsic are
// accepted; any other type hits ShouldNotReachHere() and yields -1.
int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
  int size_in_bytes = -1;
  switch (eltype) {
    case T_BOOLEAN:
      size_in_bytes = static_cast<int>(sizeof(jboolean));
      break;
    case T_BYTE:
      size_in_bytes = static_cast<int>(sizeof(jbyte));
      break;
    case T_SHORT:
      size_in_bytes = static_cast<int>(sizeof(jshort));
      break;
    case T_CHAR:
      size_in_bytes = static_cast<int>(sizeof(jchar));
      break;
    case T_INT:
      size_in_bytes = static_cast<int>(sizeof(jint));
      break;
    default:
      // Unsupported element type: size_in_bytes keeps its -1 sentinel.
      ShouldNotReachHere();
  }
  return size_in_bytes;
}

// Emits a scalar load of one element of type 'eltype' from 'src' into the
// 32-bit register 'dst', using the move instruction that matches the element's
// width and signedness. T_BOOLEAN is used as a surrogate for unsigned byte.
void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
  if (eltype == T_BOOLEAN) {
    // unsigned byte
    movzbl(dst, src);
  } else if (eltype == T_BYTE) {
    // signed byte
    movsbl(dst, src);
  } else if (eltype == T_SHORT) {
    // signed 16-bit
    movswl(dst, src);
  } else if (eltype == T_CHAR) {
    // unsigned 16-bit
    movzwl(dst, src);
  } else if (eltype == T_INT) {
    // already 32-bit, plain move
    movl(dst, src);
  } else {
    ShouldNotReachHere();
  }
}

// Emits a vector load from the memory operand 'src' into 'dst'. The length
// passed to load_vector is 8 times the element size of 'eltype', i.e. eight
// elements' worth of data (presumably a byte count -- confirm against
// load_vector's declaration).
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
  const int elems_per_load = 8;
  const int load_length = arrays_hashcode_elsize(eltype) * elems_per_load;
  load_vector(dst, src, load_length);
}

// AddressLiteral flavour of arrays_hashcode_elvload: emits a vector load from
// the literal address 'src' into 'dst'. The length passed to load_vector is
// 8 times the element size of 'eltype' (presumably a byte count -- confirm
// against load_vector's declaration).
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
  const int elems_per_load = 8;
  const int load_length = arrays_hashcode_elsize(eltype) * elems_per_load;
  load_vector(dst, src, load_length);
}

// Widens, in place, the elements of type 'eltype' held in 'dst' to T_INT
// lanes using a 256-bit vector encoding. Unsigned types (T_BOOLEAN as
// unsigned byte, T_CHAR) go through vector_unsigned_cast, signed types
// (T_BYTE, T_SHORT) through vector_signed_cast. T_INT needs no conversion.
void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
  if (eltype == T_INT) {
    // Elements are already ints: nothing to emit.
    return;
  }

  const int vlen_enc = Assembler::AVX_256bit;
  switch (eltype) {
    case T_BOOLEAN:
      vector_unsigned_cast(dst, dst, vlen_enc, T_BYTE, T_INT);
      break;
    case T_CHAR:
      vector_unsigned_cast(dst, dst, vlen_enc, T_SHORT, T_INT);
      break;
    case T_BYTE:
      vector_signed_cast(dst, dst, vlen_enc, T_BYTE, T_INT);
      break;
    case T_SHORT:
      vector_signed_cast(dst, dst, vlen_enc, T_SHORT, T_INT);
      break;
    default:
      ShouldNotReachHere();
  }
}

// Emits an AVX2 instruction sequence that folds the cnt1 elements starting at
// ary1 into 'result' with the polynomial hash step
//   result = 31 * result + element
// applied once per element, in order.
//
//   ary1   - address of the first element; advanced past the vectorized
//            prefix when the vector loop runs (clobbered)
//   cnt1   - number of elements; reduced by the number of elements consumed
//            by the vector loop (clobbered)
//   result - in/out: the incoming value is the initial hash, the outgoing
//            value is the final hash
//   index, tmp2, tmp3 - scratch general-purpose registers
//   vnext, vcoef0-3, vresult0-3, vtmp0-3 - vector registers used as temporaries
//   eltype - element type; selects the element size and how elements are
//            extended to 32 bits (T_BOOLEAN stands for unsigned byte)
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
Register index, Register tmp2, Register tmp3, XMMRegister vnext,
XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
BasicType eltype) {
// NOTE(review): presumably checks that the short-form branches (jccb) below
// stay within reach -- confirm against ShortBranchVerifier.
ShortBranchVerifier sbv(this);
assert(UseAVX >= 2, "AVX2 intrinsics are required");
assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
SHORT_UNROLLED_LOOP_EXIT,
UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
UNROLLED_VECTOR_LOOP_BEGIN,
END;
switch (eltype) {
case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
default: BLOCK_COMMENT("arrays_hashcode {"); break;
}

// "Renaming" the individual registers into arrays for readability of the code
XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
vresult[] = { vresult0, vresult1, vresult2, vresult3 },
vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

const int elsize = arrays_hashcode_elsize(eltype);

/*
Overall shape of the generated code:

if (cnt1 >= 2) {
if (cnt1 >= 32) {
UNROLLED VECTOR LOOP
}
UNROLLED SCALAR LOOP
}
SINGLE SCALAR
*/

// Fewer than 32 elements: skip the vector loop entirely.
cmpl(cnt1, 32);
jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

// cnt1 >= 32 && generate_vectorized_loop
xorl(index, index);

// vresult = IntVector.zero(I256);
for (int idx = 0; idx < 4; idx++) {
vpxor(vresult[idx], vresult[idx]);
}
// vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
// tmp2 first holds the table address and is then reused as 'bound';
// tmp3 ('next') keeps the scalar multiplier for the whole vector loop.
Register bound = tmp2;
Register next = tmp3;
lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
movl(next, Address(tmp2, 0));
movdl(vnext, next);
vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

// index = 0;
// bound = cnt1 & ~(32 - 1);   (cnt1 rounded down to a multiple of 32)
movl(bound, cnt1);
andl(bound, ~(32 - 1));
// for (; index < bound; index += 32) {
bind(UNROLLED_VECTOR_LOOP_BEGIN);
// result *= next;
imull(result, next);
// loop fission to upfront the cost of fetching from memory, OOO execution
// can then hopefully do a better job of prefetching
// Each of the four loads covers 8 consecutive elements, 32 per iteration.
for (int idx = 0; idx < 4; idx++) {
arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
}
// vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
for (int idx = 0; idx < 4; idx++) {
vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
// widen the freshly loaded elements to 32-bit lanes before adding them
arrays_hashcode_elvcast(vtmp[idx], eltype);
vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
}
// index += 32;
addl(index, 32);
// index < bound;
cmpl(index, bound);
jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
// }

// Step past the vectorized prefix: ary1 now addresses the first unprocessed
// element and cnt1 holds the number of elements left (cnt1 - bound).
lea(ary1, Address(ary1, bound, Address::times(elsize)));
subl(cnt1, bound);
// release bound (tmp2 is reused as the table pointer below)

// vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
// Accumulator idx is multiplied by the 8 table entries starting at index 8 * idx + 1.
for (int idx = 0; idx < 4; idx++) {
lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
}
// result += vresult.reduceLanes(ADD);
// 256/(sizeof(jint) * 8) is the number of jint lanes in a 256-bit vector; the
// vtmp registers are free again here and are passed as the last two arguments
// (presumably reduction temporaries -- confirm against reduceI).
for (int idx = 0; idx < 4; idx++) {
reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
}

// Scalar code: entered directly when cnt1 < 32, and by fall-through from the
// vector path above with the remaining elements (ary1/cnt1 already adjusted).

bind(SHORT_UNROLLED_BEGIN);
// int i = 1;
movl(index, 1);
cmpl(index, cnt1);
jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

// Two elements per iteration:
//   result = result * 961 + 31 * ary1[i-1] + ary1[i]      (961 == 31 * 31)
// for (; i < cnt1 ; i += 2) {
bind(SHORT_UNROLLED_LOOP_BEGIN);
movl(tmp3, 961);
imull(result, tmp3);
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
// tmp3 = (tmp2 << 5) - tmp2 == 31 * tmp2
movl(tmp3, tmp2);
shll(tmp3, 5);
subl(tmp3, tmp2);
addl(result, tmp3);
arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
addl(result, tmp3);
addl(index, 2);
cmpl(index, cnt1);
jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

// }
// Both ways of reaching this label come straight from a cmpl(index, cnt1)
// with index >= cnt1, and the branch below reuses those flags:
//   i >  cnt1 : every element has been consumed -> END
//   i == cnt1 : exactly one element (ary1[i-1]) is left
// if (i == cnt1) {
bind(SHORT_UNROLLED_LOOP_EXIT);
jccb(Assembler::greater, END);
// result = (result << 5) - result + ary1[i-1] == 31 * result + ary1[i-1]
movl(tmp2, result);
shll(result, 5);
subl(result, tmp2);
arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
addl(result, tmp3);
// }
bind(END);

BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode

// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
Address::ScaleFactor scale, Address::ScaleFactor scale1,
Expand Down Expand Up @@ -4685,6 +4874,33 @@ void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, i
}
}

// Emits the sign-extending vector move (vpmovsx*) that widens the elements of
// 'src' from 'from_elem_bt' to 'to_elem_bt' and writes them to 'dst', using
// the vector length encoding 'vlen_enc'.
// Supported conversions: T_BYTE -> T_SHORT/T_INT/T_LONG,
// T_SHORT -> T_INT/T_LONG and T_INT -> T_LONG.
void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                           BasicType from_elem_bt, BasicType to_elem_bt) {
  if (from_elem_bt == T_BYTE) {
    switch (to_elem_bt) {
      case T_SHORT:
        vpmovsxbw(dst, src, vlen_enc);
        break;
      case T_INT:
        vpmovsxbd(dst, src, vlen_enc);
        break;
      case T_LONG:
        vpmovsxbq(dst, src, vlen_enc);
        break;
      default:
        ShouldNotReachHere();
    }
  } else if (from_elem_bt == T_SHORT) {
    switch (to_elem_bt) {
      case T_INT:
        vpmovsxwd(dst, src, vlen_enc);
        break;
      case T_LONG:
        vpmovsxwq(dst, src, vlen_enc);
        break;
      default:
        ShouldNotReachHere();
    }
  } else if (from_elem_bt == T_INT) {
    // The only widening target for an int is long.
    assert(to_elem_bt == T_LONG, "");
    vpmovsxdq(dst, src, vlen_enc);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
BasicType dst_bt, BasicType src_bt, int vlen) {
int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
Expand Down
18 changes: 17 additions & 1 deletion src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -292,6 +292,19 @@
Register limit, Register result, Register chr,
XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask = knoreg);

// Emits code that folds cnt1 elements of type 'eltype' starting at str1 into
// 'result' (in/out) using the polynomial step result = 31 * result + element.
// The general-purpose and vector register arguments other than 'result' are
// used as temporaries. Note that the definition names the first and fourth
// parameters 'ary1' and 'index'.
void arrays_hashcode(Register str1, Register cnt1, Register result,
Register tmp1, Register tmp2, Register tmp3, XMMRegister vnext,
XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
BasicType eltype);

// helper functions for arrays_hashcode
// elsize:  size in bytes of one element of type eltype
// elload:  scalar load of one element into a 32-bit register, sign- or
//          zero-extended according to eltype
// elvload: vector load whose length is 8 times the element size
// elvcast: widens the loaded elements in dst to T_INT lanes (no-op for T_INT)
int arrays_hashcode_elsize(BasicType eltype);
void arrays_hashcode_elload(Register dst, Address src, BasicType eltype);
void arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype);
void arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype);
void arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype);

void evmasked_op(int ideal_opc, BasicType eType, KRegister mask,
XMMRegister dst, XMMRegister src1, XMMRegister src2,
Expand All @@ -310,6 +323,9 @@
void vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
BasicType from_elem_bt, BasicType to_elem_bt);

// Sign-extending counterpart of vector_unsigned_cast: widens the elements of
// src from from_elem_bt to to_elem_bt (byte/short/int to a wider integral
// type) and writes them to dst.
void vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
BasicType from_elem_bt, BasicType to_elem_bt);

void vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
XMMRegister xtmp, Register rscratch, int vec_enc);

Expand Down
39 changes: 38 additions & 1 deletion src/hotspot/cpu/x86/stubRoutines_x86.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -227,6 +227,43 @@ juint StubRoutines::x86::_shuf_table_crc32_avx512[] =
};
#endif // _LP64

// Coefficient table for the vectorized arrays_hashcode intrinsic: 33 entries
// holding the powers of 31 in descending order, ending with 31^7, ..., 31^1,
// 31^0, each reduced to a wrapping 32-bit jint (e.g. 31^7 == 1742810335
// modulo 2^32). Entry 0 is read as the per-iteration multiplier of the
// 32-element vector loop (presumably 31^32 -- the tail of the table was
// verified, the head follows the same pattern); entries 1..32 are loaded as
// four 8-lane coefficient vectors.
jint StubRoutines::x86::_arrays_hashcode_powers_of_31[] =
{
2111290369,
-2010103841,
350799937,
11316127,
693101697,
-254736545,
961614017,
31019807,
-2077209343,
-67006753,
1244764481,
-2038056289,
211350913,
-408824225,
-844471871,
-997072353,
1353309697,
-510534177,
1507551809,
-505558625,
-293403007,
129082719,
-1796951359,
-196513505,
-1807454463,
1742810335,
887503681,
28629151,
923521,
29791,
961,
31,
1,
};

#define D 32
#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)

Expand Down
5 changes: 4 additions & 1 deletion src/hotspot/cpu/x86/stubRoutines_x86.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -137,6 +137,8 @@ class x86 {
#endif // _LP64
// table for CRC32C
static juint* _crc32c_table;
// table for arrays_hashcode: powers of 31 (as wrapping 32-bit values) used
// as multipliers by the vectorized hash code intrinsic
static jint _arrays_hashcode_powers_of_31[];

// upper word mask for sha1
static address _upper_word_mask_addr;
Expand Down Expand Up @@ -325,6 +327,7 @@ class x86 {
static address base64_decoding_table_addr() { return _decoding_table_base64; }
#endif
static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
// Address of the first entry of the powers-of-31 table used by arrays_hashcode.
static address arrays_hashcode_powers_of_31() { return (address)_arrays_hashcode_powers_of_31; }
static void generate_CRC32C_table(bool is_pclmulqdq_supported);
};

Expand Down
15 changes: 14 additions & 1 deletion src/hotspot/cpu/x86/vm_version_x86.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -1695,13 +1695,26 @@ void VM_Version::get_processor_features() {
warning("vectorizedMismatch intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
}
if (UseAVX >= 2) {
FLAG_SET_DEFAULT(UseVectorizedHashCodeIntrinsic, true);
} else if (UseVectorizedHashCodeIntrinsic) {
if (!FLAG_IS_DEFAULT(UseVectorizedHashCodeIntrinsic))
warning("vectorizedHashCode intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseVectorizedHashCodeIntrinsic, false);
}
#else
if (UseVectorizedMismatchIntrinsic) {
if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
warning("vectorizedMismatch intrinsic is not available in 32-bit VM");
}
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
}
if (UseVectorizedHashCodeIntrinsic) {
if (!FLAG_IS_DEFAULT(UseVectorizedHashCodeIntrinsic)) {
warning("vectorizedHashCode intrinsic is not available in 32-bit VM");
}
FLAG_SET_DEFAULT(UseVectorizedHashCodeIntrinsic, false);
}
#endif // _LP64

// Use count leading zeros count instruction if available.
Expand Down

1 comment on commit e37078f

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.