Skip to content

Commit 5c4623b

Browse files
chhagedorn authored and slowhog committed Jul 18, 2023
8308682: Enhance AES performance
Reviewed-by: rhalade, dlong, kvn
1 parent cb5f1b7 commit 5c4623b

7 files changed

+106
-37
lines changed
 

‎src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

+34-14
Original file line numberDiff line numberDiff line change
@@ -2944,6 +2944,23 @@ class StubGenerator: public StubCodeGenerator {
29442944
return start;
29452945
}
29462946

2947+
// Big-endian 128-bit + 64-bit -> 128-bit addition.
// Inputs: 128-bits. in is preserved.
// The least-significant 64-bit word is in the upper dword of each vector.
// inc (the 64-bit increment) is preserved. Its lower dword must be zero.
// Output: result
// NOTE(review): carry propagation relies on tmp being all-ones (-1) in a lane
// exactly when the unsigned compare below trips; subtracting -1 is the +1 carry.
void be_add_128_64(FloatRegister result, FloatRegister in,
                   FloatRegister inc, FloatRegister tmp) {
  assert_different_registers(result, tmp, inc);

  __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
                                         // input
  __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing: unsigned
                                         // inc > result in a lane means the 64-bit
                                         // add wrapped, so a carry is pending
  __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
                                         // MSD == 0 (must be!) to LSD
  __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
}
2963+
29472964
// CTR AES crypt.
29482965
// Arguments:
29492966
//
@@ -3053,13 +3070,16 @@ class StubGenerator: public StubCodeGenerator {
30533070
// Setup the counter
30543071
__ movi(v4, __ T4S, 0);
30553072
__ movi(v5, __ T4S, 1);
3056-
__ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
3073+
__ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
30573074

3058-
__ ld1(v0, __ T16B, counter); // Load the counter into v0
3059-
__ rev32(v16, __ T16B, v0);
3060-
__ addv(v16, __ T4S, v16, v4);
3061-
__ rev32(v16, __ T16B, v16);
3062-
__ st1(v16, __ T16B, counter); // Save the incremented counter back
3075+
// 128-bit big-endian increment
3076+
__ ld1(v0, __ T16B, counter);
3077+
__ rev64(v16, __ T16B, v0);
3078+
be_add_128_64(v16, v16, v4, /*tmp*/v5);
3079+
__ rev64(v16, __ T16B, v16);
3080+
__ st1(v16, __ T16B, counter);
3081+
// Previous counter value is in v0
3082+
// v4 contains { 0, 1 }
30633083

30643084
{
30653085
// We have fewer than bulk_width blocks of data left. Encrypt
@@ -3091,9 +3111,9 @@ class StubGenerator: public StubCodeGenerator {
30913111

30923112
// Increment the counter, store it back
30933113
__ orr(v0, __ T16B, v16, v16);
3094-
__ rev32(v16, __ T16B, v16);
3095-
__ addv(v16, __ T4S, v16, v4);
3096-
__ rev32(v16, __ T16B, v16);
3114+
__ rev64(v16, __ T16B, v16);
3115+
be_add_128_64(v16, v16, v4, /*tmp*/v5);
3116+
__ rev64(v16, __ T16B, v16);
30973117
__ st1(v16, __ T16B, counter); // Save the incremented counter back
30983118

30993119
__ b(inner_loop);
@@ -3141,7 +3161,7 @@ class StubGenerator: public StubCodeGenerator {
31413161
// Keys should already be loaded into the correct registers
31423162

31433163
__ ld1(v0, __ T16B, counter); // v0 contains the first counter
3144-
__ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3164+
__ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
31453165

31463166
// AES/CTR loop
31473167
{
@@ -3151,12 +3171,12 @@ class StubGenerator: public StubCodeGenerator {
31513171
// Setup the counters
31523172
__ movi(v8, __ T4S, 0);
31533173
__ movi(v9, __ T4S, 1);
3154-
__ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3174+
__ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
31553175

31563176
for (int i = 0; i < bulk_width; i++) {
31573177
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3158-
__ rev32(v0_ofs, __ T16B, v16);
3159-
__ addv(v16, __ T4S, v16, v8);
3178+
__ rev64(v0_ofs, __ T16B, v16);
3179+
be_add_128_64(v16, v16, v8, /*tmp*/v9);
31603180
}
31613181

31623182
__ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
@@ -3186,7 +3206,7 @@ class StubGenerator: public StubCodeGenerator {
31863206
}
31873207

31883208
// Save the counter back where it goes
3189-
__ rev32(v16, __ T16B, v16);
3209+
__ rev64(v16, __ T16B, v16);
31903210
__ st1(v16, __ T16B, counter);
31913211

31923212
__ pop(saved_regs, sp);

‎src/hotspot/cpu/x86/assembler_x86.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -4431,6 +4431,14 @@ void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, Compa
44314431
emit_int24(0x3E, (0xC0 | encode), vcc);
44324432
}
44334433

4434+
// VPCMPUQ: EVEX-encoded unsigned quadword compare. Compares each 64-bit lane
// of nds against src using predicate vcc and writes the per-lane outcome as a
// bit in opmask kdst. Requires AVX-512VL (for the 128/256-bit vector_len forms).
void Assembler::evpcmpuq(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) {
  assert(VM_Version::supports_avx512vl(), "");
  // EVEX.66.0F3A.W1 1E /r ib — rex_w selects the quadword (uq) form.
  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attributes.set_is_evex_instruction();
  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
  emit_int24(0x1E, (0xC0 | encode), vcc);  // opcode, modrm, immediate predicate
}
4441+
44344442
void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len) {
44354443
assert(VM_Version::supports_avx512vlbw(), "");
44364444
InstructionMark im(this);

‎src/hotspot/cpu/x86/assembler_x86.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -1806,6 +1806,8 @@ class Assembler : public AbstractAssembler {
18061806
void evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
18071807
void evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len);
18081808

1809+
void evpcmpuq(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
1810+
18091811
void pcmpeqw(XMMRegister dst, XMMRegister src);
18101812
void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
18111813
void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);

‎src/hotspot/cpu/x86/macroAssembler_x86.cpp

+11
Original file line numberDiff line numberDiff line change
@@ -9257,6 +9257,17 @@ void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral sr
92579257
}
92589258
}
92599259

9260+
// Masked 64-bit vector add with an AddressLiteral memory source:
// dst = nds + [src] under opmask `mask` (merge controls merging vs zeroing).
// If src is not RIP-reachable the address is materialized in rscratch first,
// so callers must pass a real scratch register in that case (asserted below).
void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    // Directly addressable: encode the literal as a memory operand.
    Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
  } else {
    // Too far for RIP-relative addressing: load the address, then use it.
    lea(rscratch, src);
    Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
  }
}
9270+
92609271
void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
92619272
assert(rscratch != noreg || always_reachable(src), "missing");
92629273

‎src/hotspot/cpu/x86/macroAssembler_x86.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -1788,6 +1788,9 @@ class MacroAssembler: public Assembler {
17881788
using Assembler::evpandq;
17891789
void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
17901790

1791+
using Assembler::evpaddq;
1792+
void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1793+
17911794
using Assembler::evporq;
17921795
void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
17931796

‎src/hotspot/cpu/x86/stubGenerator_x86_64.hpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,8 @@ class StubGenerator: public StubCodeGenerator {
364364

365365
// Utility routine for increase 128bit counter (iv in CTR mode)
366366
void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block);
367-
367+
void ev_add128(XMMRegister xmmdst, XMMRegister xmmsrc1, XMMRegister xmmsrc2,
368+
int vector_len, KRegister ktmp, Register rscratch = noreg);
368369
void generate_aes_stubs();
369370

370371

‎src/hotspot/cpu/x86/stubGenerator_x86_64_aes.cpp

+46-22
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,16 @@ static address counter_mask_linc32_addr() {
121121
return (address)COUNTER_MASK_LINC32;
122122
}
123123

124+
ATTRIBUTE_ALIGNED(64) uint64_t COUNTER_MASK_ONES[] = {
125+
0x0000000000000000UL, 0x0000000000000001UL,
126+
0x0000000000000000UL, 0x0000000000000001UL,
127+
0x0000000000000000UL, 0x0000000000000001UL,
128+
0x0000000000000000UL, 0x0000000000000001UL,
129+
};
130+
static address counter_mask_ones_addr() {
131+
return (address)COUNTER_MASK_ONES;
132+
}
133+
124134
ATTRIBUTE_ALIGNED(64) static const uint64_t GHASH_POLYNOMIAL_REDUCTION[] = {
125135
0x00000001C2000000UL, 0xC200000000000000UL,
126136
0x00000001C2000000UL, 0xC200000000000000UL,
@@ -1623,6 +1633,17 @@ void StubGenerator::ev_load_key(XMMRegister xmmdst, Register key, int offset, Re
16231633
__ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
16241634
}
16251635

1636+
// Add 128-bit integers in xmmsrc1 to xmmsrc2, then place the result in xmmdst.
// Clobbers ktmp and rscratch.
// Used by aesctr_encrypt.
void StubGenerator::ev_add128(XMMRegister xmmdst, XMMRegister xmmsrc1, XMMRegister xmmsrc2,
                              int vector_len, KRegister ktmp, Register rscratch) {
  __ vpaddq(xmmdst, xmmsrc1, xmmsrc2, vector_len);        // lane-wise 64-bit add; carries out of
                                                          // the low quadword are lost here
  __ evpcmpuq(ktmp, xmmdst, xmmsrc2, __ lt, vector_len);  // unsigned dst < src2 in a lane <=> that
                                                          // lane's add wrapped (carry pending)
  __ kshiftlbl(ktmp, ktmp, 1);                            // shift mask so a low-quadword overflow
                                                          // bit selects its high quadword
  __ evpaddq(xmmdst, ktmp, xmmdst, ExternalAddress(counter_mask_ones_addr()), /*merge*/true,
             vector_len, rscratch);                       // masked +1 into the high quadword:
                                                          // propagates the carry
}
16261647

16271648
// AES-ECB Encrypt Operation
16281649
void StubGenerator::aesecb_encrypt(Register src_addr, Register dest_addr, Register key, Register len) {
@@ -2046,7 +2067,6 @@ void StubGenerator::aesecb_decrypt(Register src_addr, Register dest_addr, Regist
20462067
}
20472068

20482069

2049-
20502070
// AES Counter Mode using VAES instructions
20512071
void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
20522072
Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) {
@@ -2104,14 +2124,17 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
21042124
// The counter is incremented after each block i.e. 16 bytes is processed;
21052125
// each zmm register has 4 counter values as its MSB
21062126
// the counters are incremented in parallel
2107-
__ vpaddd(xmm8, xmm8, ExternalAddress(counter_mask_linc0_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2108-
__ vpaddd(xmm9, xmm8, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2109-
__ vpaddd(xmm10, xmm9, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2110-
__ vpaddd(xmm11, xmm10, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2111-
__ vpaddd(xmm12, xmm11, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2112-
__ vpaddd(xmm13, xmm12, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2113-
__ vpaddd(xmm14, xmm13, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2114-
__ vpaddd(xmm15, xmm14, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2127+
2128+
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc0_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2129+
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2130+
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2131+
ev_add128(xmm9, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2132+
ev_add128(xmm10, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2133+
ev_add128(xmm11, xmm10, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2134+
ev_add128(xmm12, xmm11, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2135+
ev_add128(xmm13, xmm12, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2136+
ev_add128(xmm14, xmm13, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2137+
ev_add128(xmm15, xmm14, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
21152138

21162139
// load linc32 mask in zmm register.linc32 increments counter by 32
21172140
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc32_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
@@ -2159,21 +2182,21 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
21592182
// This is followed by incrementing counter values in zmm8-zmm15.
21602183
// Since we will be processing 32 blocks at a time, the counter is incremented by 32.
21612184
roundEnc(xmm21, 7);
2162-
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
2185+
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
21632186
roundEnc(xmm22, 7);
2164-
__ vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
2187+
ev_add128(xmm9, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
21652188
roundEnc(xmm23, 7);
2166-
__ vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit);
2189+
ev_add128(xmm10, xmm10, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
21672190
roundEnc(xmm24, 7);
2168-
__ vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit);
2191+
ev_add128(xmm11, xmm11, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
21692192
roundEnc(xmm25, 7);
2170-
__ vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit);
2193+
ev_add128(xmm12, xmm12, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
21712194
roundEnc(xmm26, 7);
2172-
__ vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit);
2195+
ev_add128(xmm13, xmm13, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
21732196
roundEnc(xmm27, 7);
2174-
__ vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit);
2197+
ev_add128(xmm14, xmm14, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
21752198
roundEnc(xmm28, 7);
2176-
__ vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit);
2199+
ev_add128(xmm15, xmm15, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
21772200
roundEnc(xmm29, 7);
21782201

21792202
__ cmpl(rounds, 52);
@@ -2251,8 +2274,8 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
22512274
__ vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
22522275
__ evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
22532276
// Increment counter values by 16
2254-
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
2255-
__ vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
2277+
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2278+
ev_add128(xmm9, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
22562279
// AES encode rounds
22572280
roundEnc(xmm21, 3);
22582281
roundEnc(xmm22, 3);
@@ -2319,7 +2342,7 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
23192342
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
23202343
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
23212344
// increment counter by 8
2322-
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
2345+
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
23232346
// AES encode
23242347
roundEnc(xmm21, 1);
23252348
roundEnc(xmm22, 1);
@@ -2376,8 +2399,9 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
23762399
// XOR counter with first roundkey
23772400
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
23782401
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
2402+
23792403
// Increment counter
2380-
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
2404+
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
23812405
__ vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit);
23822406
__ vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit);
23832407
__ vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit);
@@ -2427,7 +2451,7 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
24272451
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit);
24282452
__ vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit);
24292453
// Increment counter by 1
2430-
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit);
2454+
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_128bit, /*ktmp*/k1, r15 /*rscratch*/);
24312455
__ vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit);
24322456
__ vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit);
24332457
__ vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit);

0 commit comments

Comments
 (0)
Please sign in to comment.