Skip to content

Commit 4fc6d41

Browse files
committed Nov 5, 2024
8341194: [REDO] Implement C2 VectorizedHashCode on AArch64
Reviewed-by: aph, adinn
1 parent abf2dc7 commit 4fc6d41

12 files changed

+1346
-580
lines changed
 

‎src/hotspot/cpu/aarch64/aarch64.ad

+44
Original file line numberDiff line numberDiff line change
@@ -5028,6 +5028,24 @@ operand vRegD_V7()
50285028
interface(REG_INTER);
50295029
%}
50305030

5031+
operand vRegD_V12()
5032+
%{
5033+
constraint(ALLOC_IN_RC(v12_reg));
5034+
match(RegD);
5035+
op_cost(0);
5036+
format %{ %}
5037+
interface(REG_INTER);
5038+
%}
5039+
5040+
operand vRegD_V13()
5041+
%{
5042+
constraint(ALLOC_IN_RC(v13_reg));
5043+
match(RegD);
5044+
op_cost(0);
5045+
format %{ %}
5046+
interface(REG_INTER);
5047+
%}
5048+
50315049
operand pReg()
50325050
%{
50335051
constraint(ALLOC_IN_RC(pr_reg));
@@ -16770,6 +16788,32 @@ instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
1677016788
ins_pipe(pipe_class_memory);
1677116789
%}
1677216790

16791+
instruct arrays_hashcode(iRegP_R1 ary, iRegI_R2 cnt, iRegI_R0 result, immI basic_type,
16792+
vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3,
16793+
vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7,
16794+
vRegD_V12 vtmp8, vRegD_V13 vtmp9, rFlagsReg cr)
16795+
%{
16796+
match(Set result (VectorizedHashCode (Binary ary cnt) (Binary result basic_type)));
16797+
effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, TEMP vtmp6,
16798+
TEMP vtmp7, TEMP vtmp8, TEMP vtmp9, USE_KILL ary, USE_KILL cnt, USE basic_type, KILL cr);
16799+
16800+
format %{ "Array HashCode array[] $ary,$cnt,$result,$basic_type -> $result // KILL all" %}
16801+
ins_encode %{
16802+
address tpc = __ arrays_hashcode($ary$$Register, $cnt$$Register, $result$$Register,
16803+
$vtmp3$$FloatRegister, $vtmp2$$FloatRegister,
16804+
$vtmp1$$FloatRegister, $vtmp0$$FloatRegister,
16805+
$vtmp4$$FloatRegister, $vtmp5$$FloatRegister,
16806+
$vtmp6$$FloatRegister, $vtmp7$$FloatRegister,
16807+
$vtmp8$$FloatRegister, $vtmp9$$FloatRegister,
16808+
(BasicType)$basic_type$$constant);
16809+
if (tpc == nullptr) {
16810+
ciEnv::current()->record_failure("CodeCache is full");
16811+
return;
16812+
}
16813+
%}
16814+
ins_pipe(pipe_class_memory);
16815+
%}
16816+
1677316817
instruct count_positives(iRegP_R1 ary1, iRegI_R2 len, iRegI_R0 result, rFlagsReg cr)
1677416818
%{
1677516819
match(Set result (CountPositives ary1 len));

‎src/hotspot/cpu/aarch64/assembler_aarch64.hpp

+67-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
33
* Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
44
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55
*
@@ -287,6 +287,11 @@ class Instruction_aarch64 {
287287
f(r->raw_encoding(), lsb + 4, lsb);
288288
}
289289

290+
//<0-15>reg: As `rf(FloatRegister)`, but only the lower 16 FloatRegisters are allowed.
291+
void lrf(FloatRegister r, int lsb) {
292+
f(r->raw_encoding(), lsb + 3, lsb);
293+
}
294+
290295
void prf(PRegister r, int lsb) {
291296
f(r->raw_encoding(), lsb + 3, lsb);
292297
}
@@ -765,6 +770,7 @@ class Assembler : public AbstractAssembler {
765770
#define f current_insn.f
766771
#define sf current_insn.sf
767772
#define rf current_insn.rf
773+
#define lrf current_insn.lrf
768774
#define srf current_insn.srf
769775
#define zrf current_insn.zrf
770776
#define prf current_insn.prf
@@ -1590,6 +1596,16 @@ class Assembler : public AbstractAssembler {
15901596

15911597
#undef INSN
15921598

1599+
// Load/store a register, but with a BasicType parameter. Loaded signed integer values are
1600+
// extended to 64 bits.
1601+
void load(Register Rt, const Address &adr, BasicType bt) {
1602+
int op = (is_signed_subword_type(bt) || bt == T_INT) ? 0b10 : 0b01;
1603+
ld_st2(Rt, adr, exact_log2(type2aelembytes(bt)), op);
1604+
}
1605+
void store(Register Rt, const Address &adr, BasicType bt) {
1606+
ld_st2(Rt, adr, exact_log2(type2aelembytes(bt)), 0b00);
1607+
}
1608+
15931609
/* SIMD extensions
15941610
*
15951611
* We just use FloatRegister in the following. They are exactly the same
@@ -2587,6 +2603,7 @@ template<typename R, typename... Rx>
25872603
INSN(addpv, 0, 0b101111, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
25882604
INSN(smullv, 0, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
25892605
INSN(umullv, 1, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
2606+
INSN(smlalv, 0, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
25902607
INSN(umlalv, 1, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
25912608
INSN(maxv, 0, 0b011001, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
25922609
INSN(minv, 0, 0b011011, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
@@ -2860,6 +2877,28 @@ template<typename R, typename... Rx>
28602877
// FMULX - Vector - Scalar
28612878
INSN(fmulxvs, 1, 0b1001);
28622879

2880+
#undef INSN
2881+
2882+
#define INSN(NAME, op1, op2) \
2883+
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm, int index) { \
2884+
starti; \
2885+
assert(T == T4H || T == T8H || T == T2S || T == T4S, "invalid arrangement"); \
2886+
assert(index >= 0 && \
2887+
((T == T2S && index <= 1) || (T != T2S && index <= 3) || (T == T8H && index <= 7)), \
2888+
"invalid index"); \
2889+
assert((T != T4H && T != T8H) || Vm->encoding() < 16, "invalid source SIMD&FP register"); \
2890+
f(0, 31), f((int)T & 1, 30), f(op1, 29), f(0b01111, 28, 24); \
2891+
if (T == T4H || T == T8H) { \
2892+
f(0b01, 23, 22), f(index & 0b11, 21, 20), lrf(Vm, 16), f(index >> 2 & 1, 11); \
2893+
} else { \
2894+
f(0b10, 23, 22), f(index & 1, 21), rf(Vm, 16), f(index >> 1, 11); \
2895+
} \
2896+
f(op2, 15, 12), f(0, 10), rf(Vn, 5), rf(Vd, 0); \
2897+
}
2898+
2899+
// MUL - Vector - Scalar
2900+
INSN(mulvs, 0, 0b1000);
2901+
28632902
#undef INSN
28642903

28652904
// Floating-point Reciprocal Estimate
@@ -3023,6 +3062,33 @@ template<typename R, typename... Rx>
30233062
umov(Xd, Vn, T, index);
30243063
}
30253064

3065+
protected:
3066+
void _xaddwv(bool is_unsigned, FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement Ta,
3067+
FloatRegister Vm, SIMD_Arrangement Tb) {
3068+
starti;
3069+
assert((Tb >> 1) + 1 == (Ta >> 1), "Incompatible arrangement");
3070+
f(0, 31), f((int)Tb & 1, 30), f(is_unsigned ? 1 : 0, 29), f(0b01110, 28, 24);
3071+
f((int)(Ta >> 1) - 1, 23, 22), f(1, 21), rf(Vm, 16), f(0b000100, 15, 10), rf(Vn, 5), rf(Vd, 0);
3072+
}
3073+
3074+
public:
3075+
#define INSN(NAME, assertion, is_unsigned) \
3076+
void NAME(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement Ta, FloatRegister Vm, \
3077+
SIMD_Arrangement Tb) { \
3078+
assert((assertion), "invalid arrangement"); \
3079+
_xaddwv(is_unsigned, Vd, Vn, Ta, Vm, Tb); \
3080+
}
3081+
3082+
public:
3083+
3084+
INSN(uaddwv, Tb == T8B || Tb == T4H || Tb == T2S, /*is_unsigned*/true)
3085+
INSN(uaddwv2, Tb == T16B || Tb == T8H || Tb == T4S, /*is_unsigned*/true)
3086+
INSN(saddwv, Tb == T8B || Tb == T4H || Tb == T2S, /*is_unsigned*/false)
3087+
INSN(saddwv2, Tb == T16B || Tb == T8H || Tb == T4S, /*is_unsigned*/false)
3088+
3089+
#undef INSN
3090+
3091+
30263092
private:
30273093
void _pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
30283094
starti;

‎src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp

+96
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "opto/subnode.hpp"
3434
#include "runtime/stubRoutines.hpp"
3535
#include "utilities/globalDefinitions.hpp"
36+
#include "utilities/powerOfTwo.hpp"
3637

3738
#ifdef PRODUCT
3839
#define BLOCK_COMMENT(str) /* nothing */
@@ -46,6 +47,101 @@
4647

4748
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4849

50+
// jdk.internal.util.ArraysSupport.vectorizedHashCode
51+
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
52+
FloatRegister vdata0, FloatRegister vdata1,
53+
FloatRegister vdata2, FloatRegister vdata3,
54+
FloatRegister vmul0, FloatRegister vmul1,
55+
FloatRegister vmul2, FloatRegister vmul3,
56+
FloatRegister vpow, FloatRegister vpowm,
57+
BasicType eltype) {
58+
ARRAYS_HASHCODE_REGISTERS;
59+
60+
Register tmp1 = rscratch1, tmp2 = rscratch2;
61+
62+
Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
63+
64+
// Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
65+
// use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
66+
// use 4H for chars and shorts instead, but using 8H gives better performance.
67+
const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
68+
: eltype == T_CHAR || eltype == T_SHORT ? 8
69+
: eltype == T_INT ? 4
70+
: 0;
71+
guarantee(vf, "unsupported eltype");
72+
73+
// Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
74+
const size_t unroll_factor = 4;
75+
76+
switch (eltype) {
77+
case T_BOOLEAN:
78+
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
79+
break;
80+
case T_CHAR:
81+
BLOCK_COMMENT("arrays_hashcode(char) {");
82+
break;
83+
case T_BYTE:
84+
BLOCK_COMMENT("arrays_hashcode(byte) {");
85+
break;
86+
case T_SHORT:
87+
BLOCK_COMMENT("arrays_hashcode(short) {");
88+
break;
89+
case T_INT:
90+
BLOCK_COMMENT("arrays_hashcode(int) {");
91+
break;
92+
default:
93+
ShouldNotReachHere();
94+
}
95+
96+
// large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
97+
// implemented by the stub executes just once. Call the stub only if at least two iterations will
98+
// be executed.
99+
const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
100+
cmpw(cnt, large_threshold);
101+
br(Assembler::HS, LARGE);
102+
103+
bind(TAIL);
104+
105+
// The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
106+
// uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
107+
// Iteration eats up the remainder, uf elements at a time.
108+
assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
109+
andr(tmp2, cnt, unroll_factor - 1);
110+
adr(tmp1, BR_BASE);
111+
sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
112+
movw(tmp2, 0x1f);
113+
br(tmp1);
114+
115+
bind(LOOP);
116+
for (size_t i = 0; i < unroll_factor; ++i) {
117+
load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
118+
maddw(result, result, tmp2, tmp1);
119+
}
120+
bind(BR_BASE);
121+
subsw(cnt, cnt, unroll_factor);
122+
br(Assembler::HS, LOOP);
123+
124+
b(DONE);
125+
126+
bind(LARGE);
127+
128+
RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
129+
assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
130+
address tpc = trampoline_call(stub);
131+
if (tpc == nullptr) {
132+
DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
133+
postcond(pc() == badAddress);
134+
return nullptr;
135+
}
136+
137+
bind(DONE);
138+
139+
BLOCK_COMMENT("} // arrays_hashcode");
140+
141+
postcond(pc() != badAddress);
142+
return pc();
143+
}
144+
49145
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
50146
Register tmp2Reg, Register tmp3Reg) {
51147
Register oop = objectReg;

‎src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@
3535
enum shift_kind kind = Assembler::LSL, unsigned shift = 0);
3636

3737
public:
38+
// jdk.internal.util.ArraysSupport.vectorizedHashCode
39+
address arrays_hashcode(Register ary, Register cnt, Register result, FloatRegister vdata0,
40+
FloatRegister vdata1, FloatRegister vdata2, FloatRegister vdata3,
41+
FloatRegister vmul0, FloatRegister vmul1, FloatRegister vmul2,
42+
FloatRegister vmul3, FloatRegister vpow, FloatRegister vpowm,
43+
BasicType eltype);
44+
3845
// Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
3946
void fast_lock(Register object, Register box, Register tmp, Register tmp2, Register tmp3);
4047
void fast_unlock(Register object, Register box, Register tmp, Register tmp2);

‎src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp

+18
Original file line numberDiff line numberDiff line change
@@ -1439,6 +1439,24 @@ class MacroAssembler: public Assembler {
14391439
address arrays_equals(Register a1, Register a2, Register result, Register cnt1,
14401440
Register tmp1, Register tmp2, Register tmp3, int elem_size);
14411441

1442+
// Ensure that the inline code and the stub use the same registers.
1443+
#define ARRAYS_HASHCODE_REGISTERS \
1444+
do { \
1445+
assert(result == r0 && \
1446+
ary == r1 && \
1447+
cnt == r2 && \
1448+
vdata0 == v3 && \
1449+
vdata1 == v2 && \
1450+
vdata2 == v1 && \
1451+
vdata3 == v0 && \
1452+
vmul0 == v4 && \
1453+
vmul1 == v5 && \
1454+
vmul2 == v6 && \
1455+
vmul3 == v7 && \
1456+
vpow == v12 && \
1457+
vpowm == v13, "registers must match aarch64.ad"); \
1458+
} while (0)
1459+
14421460
void string_equals(Register a1, Register a2, Register result, Register cnt1);
14431461

14441462
void fill_words(Register base, Register cnt, Register value);

‎src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

+310
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@
5454
#include "runtime/stubRoutines.hpp"
5555
#include "utilities/align.hpp"
5656
#include "utilities/checkedCast.hpp"
57+
#include "utilities/debug.hpp"
5758
#include "utilities/globalDefinitions.hpp"
59+
#include "utilities/intpow.hpp"
5860
#include "utilities/powerOfTwo.hpp"
5961
#ifdef COMPILER2
6062
#include "opto/runtime.hpp"
@@ -5320,6 +5322,307 @@ class StubGenerator: public StubCodeGenerator {
53205322
return entry;
53215323
}
53225324

5325+
// result = r0 - return value. Contains initial hashcode value on entry.
5326+
// ary = r1 - array address
5327+
// cnt = r2 - elements count
5328+
// Clobbers: v0-v13, rscratch1, rscratch2
5329+
address generate_large_arrays_hashcode(BasicType eltype) {
5330+
const Register result = r0, ary = r1, cnt = r2;
5331+
const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
5332+
const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
5333+
const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
5334+
const FloatRegister vpowm = v13;
5335+
5336+
ARRAYS_HASHCODE_REGISTERS;
5337+
5338+
Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
5339+
5340+
unsigned int vf; // vectorization factor
5341+
bool multiply_by_halves;
5342+
Assembler::SIMD_Arrangement load_arrangement;
5343+
switch (eltype) {
5344+
case T_BOOLEAN:
5345+
case T_BYTE:
5346+
load_arrangement = Assembler::T8B;
5347+
multiply_by_halves = true;
5348+
vf = 8;
5349+
break;
5350+
case T_CHAR:
5351+
case T_SHORT:
5352+
load_arrangement = Assembler::T8H;
5353+
multiply_by_halves = true;
5354+
vf = 8;
5355+
break;
5356+
case T_INT:
5357+
load_arrangement = Assembler::T4S;
5358+
multiply_by_halves = false;
5359+
vf = 4;
5360+
break;
5361+
default:
5362+
ShouldNotReachHere();
5363+
}
5364+
5365+
// Unroll factor
5366+
const unsigned uf = 4;
5367+
5368+
// Effective vectorization factor
5369+
const unsigned evf = vf * uf;
5370+
5371+
__ align(CodeEntryAlignment);
5372+
5373+
const char *mark_name = "";
5374+
switch (eltype) {
5375+
case T_BOOLEAN:
5376+
mark_name = "_large_arrays_hashcode_boolean";
5377+
break;
5378+
case T_BYTE:
5379+
mark_name = "_large_arrays_hashcode_byte";
5380+
break;
5381+
case T_CHAR:
5382+
mark_name = "_large_arrays_hashcode_char";
5383+
break;
5384+
case T_SHORT:
5385+
mark_name = "_large_arrays_hashcode_short";
5386+
break;
5387+
case T_INT:
5388+
mark_name = "_large_arrays_hashcode_int";
5389+
break;
5390+
default:
5391+
mark_name = "_large_arrays_hashcode_incorrect_type";
5392+
__ should_not_reach_here();
5393+
};
5394+
5395+
StubCodeMark mark(this, "StubRoutines", mark_name);
5396+
5397+
address entry = __ pc();
5398+
__ enter();
5399+
5400+
// Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
5401+
// the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
5402+
// value shouldn't change throughout both loops.
5403+
__ movw(rscratch1, intpow(31U, 3));
5404+
__ mov(vpow, Assembler::S, 0, rscratch1);
5405+
__ movw(rscratch1, intpow(31U, 2));
5406+
__ mov(vpow, Assembler::S, 1, rscratch1);
5407+
__ movw(rscratch1, intpow(31U, 1));
5408+
__ mov(vpow, Assembler::S, 2, rscratch1);
5409+
__ movw(rscratch1, intpow(31U, 0));
5410+
__ mov(vpow, Assembler::S, 3, rscratch1);
5411+
5412+
__ mov(vmul0, Assembler::T16B, 0);
5413+
__ mov(vmul0, Assembler::S, 3, result);
5414+
5415+
__ andr(rscratch2, cnt, (uf - 1) * vf);
5416+
__ cbz(rscratch2, LARGE_LOOP_PREHEADER);
5417+
5418+
__ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
5419+
__ mov(vpowm, Assembler::S, 0, rscratch1);
5420+
5421+
// SMALL LOOP
5422+
__ bind(SMALL_LOOP);
5423+
5424+
__ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
5425+
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5426+
__ subsw(rscratch2, rscratch2, vf);
5427+
5428+
if (load_arrangement == Assembler::T8B) {
5429+
// Extend 8B to 8H to be able to use vector multiply
5430+
// instructions
5431+
assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5432+
if (is_signed_subword_type(eltype)) {
5433+
__ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5434+
} else {
5435+
__ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5436+
}
5437+
}
5438+
5439+
switch (load_arrangement) {
5440+
case Assembler::T4S:
5441+
__ addv(vmul0, load_arrangement, vmul0, vdata0);
5442+
break;
5443+
case Assembler::T8B:
5444+
case Assembler::T8H:
5445+
assert(is_subword_type(eltype), "subword type expected");
5446+
if (is_signed_subword_type(eltype)) {
5447+
__ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5448+
} else {
5449+
__ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5450+
}
5451+
break;
5452+
default:
5453+
__ should_not_reach_here();
5454+
}
5455+
5456+
// Process the upper half of a vector
5457+
if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5458+
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5459+
if (is_signed_subword_type(eltype)) {
5460+
__ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5461+
} else {
5462+
__ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5463+
}
5464+
}
5465+
5466+
__ br(Assembler::HI, SMALL_LOOP);
5467+
5468+
// SMALL LOOP'S EPILOGUE
5469+
__ lsr(rscratch2, cnt, exact_log2(evf));
5470+
__ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
5471+
5472+
__ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5473+
__ addv(vmul0, Assembler::T4S, vmul0);
5474+
__ umov(result, vmul0, Assembler::S, 0);
5475+
5476+
// TAIL
5477+
__ bind(TAIL);
5478+
5479+
// The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
5480+
// of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
5481+
assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
5482+
__ andr(rscratch2, cnt, vf - 1);
5483+
__ bind(TAIL_SHORTCUT);
5484+
__ adr(rscratch1, BR_BASE);
5485+
__ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
5486+
__ movw(rscratch2, 0x1f);
5487+
__ br(rscratch1);
5488+
5489+
for (size_t i = 0; i < vf - 1; ++i) {
5490+
__ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
5491+
eltype);
5492+
__ maddw(result, result, rscratch2, rscratch1);
5493+
}
5494+
__ bind(BR_BASE);
5495+
5496+
__ leave();
5497+
__ ret(lr);
5498+
5499+
// LARGE LOOP
5500+
__ bind(LARGE_LOOP_PREHEADER);
5501+
5502+
__ lsr(rscratch2, cnt, exact_log2(evf));
5503+
5504+
if (multiply_by_halves) {
5505+
// 31^4 - multiplier between lower and upper parts of a register
5506+
__ movw(rscratch1, intpow(31U, vf / 2));
5507+
__ mov(vpowm, Assembler::S, 1, rscratch1);
5508+
// 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
5509+
__ movw(rscratch1, intpow(31U, evf - vf / 2));
5510+
__ mov(vpowm, Assembler::S, 0, rscratch1);
5511+
} else {
5512+
// 31^16
5513+
__ movw(rscratch1, intpow(31U, evf));
5514+
__ mov(vpowm, Assembler::S, 0, rscratch1);
5515+
}
5516+
5517+
__ mov(vmul3, Assembler::T16B, 0);
5518+
__ mov(vmul2, Assembler::T16B, 0);
5519+
__ mov(vmul1, Assembler::T16B, 0);
5520+
5521+
__ bind(LARGE_LOOP);
5522+
5523+
__ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
5524+
__ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
5525+
__ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
5526+
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5527+
5528+
__ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
5529+
Address(__ post(ary, evf * type2aelembytes(eltype))));
5530+
5531+
if (load_arrangement == Assembler::T8B) {
5532+
// Extend 8B to 8H to be able to use vector multiply
5533+
// instructions
5534+
assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5535+
if (is_signed_subword_type(eltype)) {
5536+
__ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5537+
__ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5538+
__ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5539+
__ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5540+
} else {
5541+
__ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5542+
__ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5543+
__ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5544+
__ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5545+
}
5546+
}
5547+
5548+
switch (load_arrangement) {
5549+
case Assembler::T4S:
5550+
__ addv(vmul3, load_arrangement, vmul3, vdata3);
5551+
__ addv(vmul2, load_arrangement, vmul2, vdata2);
5552+
__ addv(vmul1, load_arrangement, vmul1, vdata1);
5553+
__ addv(vmul0, load_arrangement, vmul0, vdata0);
5554+
break;
5555+
case Assembler::T8B:
5556+
case Assembler::T8H:
5557+
assert(is_subword_type(eltype), "subword type expected");
5558+
if (is_signed_subword_type(eltype)) {
5559+
__ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5560+
__ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5561+
__ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5562+
__ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5563+
} else {
5564+
__ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5565+
__ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5566+
__ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5567+
__ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5568+
}
5569+
break;
5570+
default:
5571+
__ should_not_reach_here();
5572+
}
5573+
5574+
// Process the upper half of a vector
5575+
if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5576+
__ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
5577+
__ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
5578+
__ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
5579+
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
5580+
if (is_signed_subword_type(eltype)) {
5581+
__ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5582+
__ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5583+
__ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5584+
__ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5585+
} else {
5586+
__ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5587+
__ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5588+
__ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5589+
__ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5590+
}
5591+
}
5592+
5593+
__ subsw(rscratch2, rscratch2, 1);
5594+
__ br(Assembler::HI, LARGE_LOOP);
5595+
5596+
__ mulv(vmul3, Assembler::T4S, vmul3, vpow);
5597+
__ addv(vmul3, Assembler::T4S, vmul3);
5598+
__ umov(result, vmul3, Assembler::S, 0);
5599+
5600+
__ mov(rscratch2, intpow(31U, vf));
5601+
5602+
__ mulv(vmul2, Assembler::T4S, vmul2, vpow);
5603+
__ addv(vmul2, Assembler::T4S, vmul2);
5604+
__ umov(rscratch1, vmul2, Assembler::S, 0);
5605+
__ maddw(result, result, rscratch2, rscratch1);
5606+
5607+
__ mulv(vmul1, Assembler::T4S, vmul1, vpow);
5608+
__ addv(vmul1, Assembler::T4S, vmul1);
5609+
__ umov(rscratch1, vmul1, Assembler::S, 0);
5610+
__ maddw(result, result, rscratch2, rscratch1);
5611+
5612+
__ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5613+
__ addv(vmul0, Assembler::T4S, vmul0);
5614+
__ umov(rscratch1, vmul0, Assembler::S, 0);
5615+
__ maddw(result, result, rscratch2, rscratch1);
5616+
5617+
__ andr(rscratch2, cnt, vf - 1);
5618+
__ cbnz(rscratch2, TAIL_SHORTCUT);
5619+
5620+
__ leave();
5621+
__ ret(lr);
5622+
5623+
return entry;
5624+
}
5625+
53235626
address generate_dsin_dcos(bool isCos) {
53245627
__ align(CodeEntryAlignment);
53255628
StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
@@ -8361,6 +8664,13 @@ class StubGenerator: public StubCodeGenerator {
83618664
StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
83628665
}
83638666

8667+
// arrays_hashcode stub for large arrays.
8668+
StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
8669+
StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
8670+
StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
8671+
StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
8672+
StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
8673+
83648674
// byte_array_inflate stub for large arrays.
83658675
StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
83668676

‎src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
33
* Copyright (c) 2014, Red Hat Inc. All rights reserved.
44
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55
*
@@ -48,6 +48,11 @@ address StubRoutines::aarch64::_zero_blocks = nullptr;
4848
address StubRoutines::aarch64::_count_positives = nullptr;
4949
address StubRoutines::aarch64::_count_positives_long = nullptr;
5050
address StubRoutines::aarch64::_large_array_equals = nullptr;
51+
address StubRoutines::aarch64::_large_arrays_hashcode_boolean = nullptr;
52+
address StubRoutines::aarch64::_large_arrays_hashcode_byte = nullptr;
53+
address StubRoutines::aarch64::_large_arrays_hashcode_char = nullptr;
54+
address StubRoutines::aarch64::_large_arrays_hashcode_int = nullptr;
55+
address StubRoutines::aarch64::_large_arrays_hashcode_short = nullptr;
5156
address StubRoutines::aarch64::_compare_long_string_LL = nullptr;
5257
address StubRoutines::aarch64::_compare_long_string_UU = nullptr;
5358
address StubRoutines::aarch64::_compare_long_string_LU = nullptr;

‎src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp

+25-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
33
* Copyright (c) 2014, Red Hat Inc. All rights reserved.
44
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55
*
@@ -62,6 +62,11 @@ class aarch64 {
6262
static address _zero_blocks;
6363

6464
static address _large_array_equals;
65+
static address _large_arrays_hashcode_boolean;
66+
static address _large_arrays_hashcode_byte;
67+
static address _large_arrays_hashcode_char;
68+
static address _large_arrays_hashcode_int;
69+
static address _large_arrays_hashcode_short;
6570
static address _compare_long_string_LL;
6671
static address _compare_long_string_LU;
6772
static address _compare_long_string_UL;
@@ -145,6 +150,25 @@ class aarch64 {
145150
return _large_array_equals;
146151
}
147152

153+
static address large_arrays_hashcode(BasicType eltype) {
154+
switch (eltype) {
155+
case T_BOOLEAN:
156+
return _large_arrays_hashcode_boolean;
157+
case T_BYTE:
158+
return _large_arrays_hashcode_byte;
159+
case T_CHAR:
160+
return _large_arrays_hashcode_char;
161+
case T_SHORT:
162+
return _large_arrays_hashcode_short;
163+
case T_INT:
164+
return _large_arrays_hashcode_int;
165+
default:
166+
ShouldNotReachHere();
167+
}
168+
169+
return nullptr;
170+
}
171+
148172
static address compare_long_string_LL() {
149173
return _compare_long_string_LL;
150174
}

‎src/hotspot/cpu/aarch64/vm_version_aarch64.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,10 @@ void VM_Version::initialize() {
577577
if (FLAG_IS_DEFAULT(UsePoly1305Intrinsics)) {
578578
FLAG_SET_DEFAULT(UsePoly1305Intrinsics, true);
579579
}
580+
581+
if (FLAG_IS_DEFAULT(UseVectorizedHashCodeIntrinsic)) {
582+
FLAG_SET_DEFAULT(UseVectorizedHashCodeIntrinsic, true);
583+
}
580584
#endif
581585

582586
_spin_wait = get_spin_wait_desc();
+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Copyright (c) 2024, Arm Limited. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*
23+
*/
24+
25+
#ifndef SHARE_UTILITIES_INTPOW_HPP
26+
#define SHARE_UTILITIES_INTPOW_HPP
27+
28+
#include "metaprogramming/enableIf.hpp"
29+
#include <limits>
30+
#include <type_traits>
31+
32+
// Raise v to the power p mod 2**N, where N is the width of the type T.
33+
template <typename T, ENABLE_IF(std::is_integral<T>::value && std::is_unsigned<T>::value)>
34+
static constexpr T intpow(T v, unsigned p) {
35+
if (p == 0) {
36+
return 1;
37+
}
38+
39+
// We use exponentiation by squaring to calculate the required power.
40+
T a = intpow(v, p / 2);
41+
T b = (p % 2) ? v : 1;
42+
43+
return a * a * b;
44+
}
45+
46+
#endif // SHARE_UTILITIES_INTPOW_HPP

‎test/hotspot/gtest/aarch64/aarch64-asmtest.py

+111
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,29 @@ class FloatRegister(Register):
7777
def __str__(self):
7878
return self.astr("v")
7979

80+
def generate(self):
81+
self.number = random.randint(0, 31)
82+
return self
83+
8084
def nextReg(self):
8185
next = FloatRegister()
8286
next.number = (self.number + 1) % 32
8387
return next
8488

89+
class LowFloatRegister(Register):
    # A SIMD/FP register restricted to the low half of the file, v0..v15.
    # Used where an instruction (or a register sequence started here) must
    # stay within the first sixteen registers.

    def __str__(self):
        # Render as a SIMD/FP register name, e.g. "v7".
        return self.astr("v")

    def generate(self):
        # Restrict the choice to v0..v15.
        self.number = random.randint(0, 15)
        return self

    def nextReg(self):
        # Successor register, wrapping v15 -> v0.  Returned as a plain
        # FloatRegister, matching how callers consume register sequences.
        successor = FloatRegister()
        successor.number = (self.number + 1) % 16
        return successor
85103
class GeneralRegister(Register):
86104

87105
def __str__(self):
@@ -1271,6 +1289,75 @@ def astr(self):
12711289
def aname(self):
12721290
return self._name
12731291

1292+
class VectorScalarNEONInstruction(Instruction):
    # NEON vector-by-scalar-element instruction, e.g.
    #   mul v0.4S, v1.4S, v2.S[3]
    # args = (generator name, assembler mnemonic, arrangement specifier).

    def __init__(self, args):
        self._name, self.insname, self.arrangement = args

    def generate(self):
        # Lane count per arrangement; the element index must name a valid lane.
        lanes = {"8B" : 8, "16B" : 16, "4H" : 4, "8H" : 8,
                 "2S" : 2, "4S" : 4, "1D" : 1, "2D" : 2} [self.arrangement]
        self.elemIndex = random.randrange(0, lanes)
        # The last character of the arrangement (B/H/S/D) names the element
        # size of the scalar operand.
        self.elemSizeSpecifier = self.arrangement[-1:]
        # NOTE(review): the starting register is drawn from v0..v15 —
        # presumably because the by-element encoding restricts the scalar
        # register number; confirm against the ISA reference.
        self._firstSIMDreg = LowFloatRegister().generate()
        self.numRegs = 3
        return self

    def cstr(self):
        # C-source form of the assembler call; operand registers are
        # consecutive starting at _firstSIMDreg.
        parts = [Instruction.cstr(self) + str(self._firstSIMDreg)]
        parts.append('__ T%s' % self.arrangement)
        reg = self._firstSIMDreg
        for _ in range(1, self.numRegs - 1):
            reg = reg.nextReg()
            parts.append(str(reg))
        parts.append('%s, %d' % (reg.nextReg(), self.elemIndex))
        return ', '.join(parts) + ');'

    def astr(self):
        # Assembly form, e.g. 'mul\tv0.4S, v1.4S, v2.S[3]'.
        parts = ['%s\t%s.%s' % (self.insname, self._firstSIMDreg, self.arrangement)]
        reg = self._firstSIMDreg
        for _ in range(1, self.numRegs - 1):
            reg = reg.nextReg()
            parts.append('%s.%s' % (reg, self.arrangement))
        parts.append('%s.%s[%d]' % (reg.nextReg(), self.elemSizeSpecifier, self.elemIndex))
        return ', '.join(parts)

    def aname(self):
        return self._name
1325+
1326+
class WideningNEONInstruction(Instruction):
    # NEON widening instruction over consecutive registers, e.g.
    #   saddw v0.8H, v1.8H, v2.8B
    # args = (generator name, mnemonic, wide arrangement, narrow arrangement).
    # Subclasses supply numWiderRegs / numNarrowerRegs.

    def __init__(self, args):
        self._name, self.insname, self.widerArrangement, self.narrowerArrangement = args

    def generate(self):
        self._firstSIMDreg = FloatRegister().generate()
        return self

    def cstr(self):
        # C-source form of the assembler call: the wide operands come first,
        # then their arrangement, then the narrow operands and theirs.
        parts = [Instruction.cstr(self) + str(self._firstSIMDreg)]
        reg = self._firstSIMDreg
        for _ in range(1, self.numWiderRegs):
            reg = reg.nextReg()
            parts.append(str(reg))
        parts.append('__ T%s' % self.widerArrangement)
        for _ in range(self.numNarrowerRegs):
            reg = reg.nextReg()
            parts.append(str(reg))
        parts.append('__ T%s' % self.narrowerArrangement)
        return ', '.join(parts) + ');'

    def astr(self):
        # Assembly form, e.g. 'saddw\tv0.8H, v1.8H, v2.8B'.
        parts = ['%s\t%s.%s' % (self.insname, self._firstSIMDreg, self.widerArrangement)]
        reg = self._firstSIMDreg
        for _ in range(1, self.numWiderRegs):
            reg = reg.nextReg()
            parts.append('%s.%s' % (reg, self.widerArrangement))
        for _ in range(self.numNarrowerRegs):
            reg = reg.nextReg()
            parts.append('%s.%s' % (reg, self.narrowerArrangement))
        return ', '.join(parts)

    def aname(self):
        return self._name
1360+
12741361
class SHA512SIMDOp(Instruction):
12751362

12761363
def generate(self):
@@ -1390,6 +1477,10 @@ class TwoRegNEONOp(CommonNEONInstruction):
13901477
class ThreeRegNEONOp(TwoRegNEONOp):
    # Three-operand NEON op: destination plus two source registers.
    numRegs = 3
13921479

1480+
class AddWideNEONOp(WideningNEONInstruction):
    # Widening add (saddw/uaddw family): two wide operands (destination and
    # first source) and one narrow operand (second source).
    numWiderRegs = 2
    numNarrowerRegs = 1
1483+
13931484
class NEONFloatCompareWithZero(TwoRegNEONOp):
13941485
def __init__(self, args):
13951486
self._name = 'fcm'
@@ -1748,6 +1839,17 @@ def generate(kind, names):
17481839
["facgt", "facgt", "2D"],
17491840
])
17501841

1842+
# Vector-by-scalar-element test cases.
# NOTE(review): the ["mulvs", "mul", "4S"] row recurs several times in this
# table — presumably intentional padding of the test mix; confirm against the
# expected output in asmtest.out.h.
generate(VectorScalarNEONInstruction,
         [["fmlavs", "fmla", "2S"], ["mulvs", "mul", "4S"],
          ["fmlavs", "fmla", "2D"],
          ["fmlsvs", "fmls", "2S"], ["mulvs", "mul", "4S"],
          ["fmlsvs", "fmls", "2D"],
          ["fmulxvs", "fmulx", "2S"], ["mulvs", "mul", "4S"],
          ["fmulxvs", "fmulx", "2D"],
          ["mulvs", "mul", "4H"], ["mulvs", "mul", "8H"],
          ["mulvs", "mul", "2S"], ["mulvs", "mul", "4S"],
          ])
1852+
17511853
neonVectorCompareInstructionPrefix = ['cm', 'fcm']
17521854
neonIntegerVectorCompareConditions = ['GT', 'GE', 'EQ', 'HI', 'HS']
17531855
neonFloatVectorCompareConditions = ['EQ', 'GT', 'GE']
@@ -2081,6 +2183,15 @@ def generate(kind, names):
20812183
generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0],
20822184
["fminv", 2], ["fmaxv", 2], ["fadda", 2], ["uaddv", 0]])
20832185

2186+
# Widening-add test cases: signed/unsigned (saddw/uaddw) pairs, each in the
# lower-half and upper-half ("2" suffix) variants, for every widening shape.
generate(AddWideNEONOp,
         [["saddwv", "saddw", "8H", "8B"], ["saddwv2", "saddw2", "8H", "16B"],
          ["saddwv", "saddw", "4S", "4H"], ["saddwv2", "saddw2", "4S", "8H"],
          ["saddwv", "saddw", "2D", "2S"], ["saddwv2", "saddw2", "2D", "4S"],
          ["uaddwv", "uaddw", "8H", "8B"], ["uaddwv2", "uaddw2", "8H", "16B"],
          ["uaddwv", "uaddw", "4S", "4H"], ["uaddwv2", "uaddw2", "4S", "8H"],
          ["uaddwv", "uaddw", "2D", "2S"], ["uaddwv2", "uaddw2", "2D", "4S"],
          ])
2194+
20842195
print "\n __ bind(forth);"
20852196
outfile.write("forth:\n")
20862197

‎test/hotspot/gtest/aarch64/asmtest.out.h

+612-577
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.