Skip to content

Commit a48f596

Browse files
author
Hamlin Li
committedFeb 27, 2024
8322179: RISC-V: Implement SHA-1 intrinsic
Reviewed-by: tonyp, fyang
1 parent 5daf63b commit a48f596

File tree

4 files changed

+401
-30
lines changed

4 files changed

+401
-30
lines changed
 

‎src/hotspot/cpu/riscv/stubGenerator_riscv.cpp

+347
Original file line numberDiff line numberDiff line change
@@ -4809,6 +4809,348 @@ class StubGenerator: public StubCodeGenerator {
48094809
return (address) start;
48104810
}
48114811

4812+
4813+
// ------------------------ SHA-1 intrinsic ------------------------
4814+
4815+
// K't =
4816+
// 5a827999, 0 <= t <= 19
4817+
// 6ed9eba1, 20 <= t <= 39
4818+
// 8f1bbcdc, 40 <= t <= 59
4819+
// ca62c1d6, 60 <= t <= 79
4820+
void sha1_prepare_k(Register cur_k, int round) {
  // Emits the load of the SHA-1 round constant K't into cur_k.
  // The constant only changes at rounds 0, 20, 40 and 60, so no code is
  // emitted for any other round.
  assert(round >= 0 && round < 80, "must be");

  static const int64_t round_constants[] = {
    0x5a827999,   //  0 <= t <= 19
    0x6ed9eba1,   // 20 <= t <= 39
    0x8f1bbcdc,   // 40 <= t <= 59
    0xca62c1d6    // 60 <= t <= 79
  };

  const bool starts_new_group = (round % 20) == 0;
  if (!starts_new_group) {
    return;
  }

  __ mv(cur_k, round_constants[round / 20]);
}
4828+
4829+
// W't =
4830+
// M't, 0 <= t <= 15
4831+
// ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
4832+
void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
  // Emits the code that produces the message-schedule word W't for the given
  // round in cur_w. ws[] is an array of 9 registers; each register carries two
  // 32-bit schedule words (one in the upper half, one in the lower half).
  // Uses t0 and t1 as scratch registers.
  //
  // NOTE: for odd rounds >= 16 this function permutes the entries of ws[] at
  // code-generation time, i.e. the caller's array is modified.
  assert(round >= 0 && round < 80, "must be");

  const bool is_even_round = (round % 2) == 0;

  if (round < 16) {
    // Rounds 0..15 take W't straight from the input block. One 8-byte load
    // fetches two words:
    //   ws[0]: upper half = w0,  lower half = w1
    //   ws[1]: upper half = w2,  lower half = w3
    //   ...
    //   ws[7]: upper half = w14, lower half = w15
    const int pair = round / 2;
    if (is_even_round) {
      __ ld(ws[pair], Address(buf, pair * 8));
      // SHA-1 is defined on big-endian words, so reverse the bytes.
      __ revb(ws[pair], ws[pair]);
      // the even word of the pair sits in the upper half.
      __ srli(cur_w, ws[pair], 32);
    } else {
      // the odd word of the pair is the lower half of the register loaded
      // in the preceding even round.
      __ mv(cur_w, ws[pair]);
    }
    return;
  }

  // Rounds 16..79:
  //   W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16)
  // The 16 most recent words are always addressed as w0..w15 relative to the
  // current layout of ws[], so the new word is "w16" on even rounds and
  // "w17" on odd rounds.

  if (is_even_round) {
    // new word is w16 = ROTL'1(w13 ^ w8 ^ w2 ^ w0)
    __ srli(t1, ws[4], 32);             // w8:  upper half of ws[4]
    __ xorr(t0, ws[6], t1);             // w13: lower half of ws[6]

    __ srli(t1, ws[1], 32);             // w2:  upper half of ws[1]
    __ srli(cur_w, ws[0], 32);          // w0:  upper half of ws[0]
    __ xorr(cur_w, cur_w, t1);

    __ xorr(cur_w, cur_w, t0);
    __ rolw_imm(cur_w, cur_w, 1, t0);

    // Park the new word in the upper half of the auxiliary register ws[8].
    // The valid words are now at:
    //   w0:       lower half of ws[0]
    //   w1 ~ w14: ws[1] ~ ws[7]
    //   w15:      upper half of ws[8]
    __ slli(ws[8], cur_w, 32);
    return;
  }

  // Odd round: new word is w17 = ROTL'1(w14 ^ w9 ^ w3 ^ w1)
  __ srli(t1, ws[7], 32);               // w14: upper half of ws[7]
  __ xorr(t0, t1, ws[4]);               // w9:  lower half of ws[4]

  __ xorr(cur_w, ws[0], ws[1]);         // w1 ^ w3: lower halves of ws[0], ws[1]

  __ xorr(cur_w, cur_w, t0);
  __ rolw_imm(cur_w, cur_w, 1, t0);

  // Merge the new word into the lower half of ws[8].
  __ zero_extend(cur_w, cur_w, 32);
  __ orr(ws[8], ws[8], cur_w);

  // Rotate the register array by one position so that the 16 live words
  // start at ws[0] again (w0 ~ w15 in ws[0] ~ ws[7]); the register that
  // held the two oldest words becomes the new auxiliary ws[8].
  Register recycled = ws[0];
  for (int i = 1; i <= 8; i++) {
    ws[i - 1] = ws[i];
  }
  ws[8] = recycled;
}
4900+
4901+
// f't(x, y, z) =
4902+
// Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19
4903+
// Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39
4904+
// Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59
4905+
// Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79
4906+
void sha1_f(Register dst, Register x, Register y, Register z, int round) {
  // Emits the round function f't(x, y, z) into dst. Which function is used
  // depends only on the round number. t0 (and t1 for Maj) are clobbered.
  assert(round >= 0 && round < 80, "must be");
  assert_different_registers(dst, x, y, z, t0, t1);

  const bool use_ch  = round < 20;
  const bool use_maj = round >= 40 && round < 60;

  if (use_ch) {
    // Ch(x, y, z) = (x & y) ^ (~x & z)
    __ andr(t0, x, y);
    __ andn(dst, z, x);
    __ xorr(dst, dst, t0);
    return;
  }

  if (use_maj) {
    // Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z)
    __ andr(t0, x, y);
    __ andr(t1, x, z);
    __ andr(dst, y, z);
    __ xorr(dst, dst, t0);
    __ xorr(dst, dst, t1);
    return;
  }

  // Parity(x, y, z) = x ^ y ^ z, for rounds 20..39 and 60..79
  __ xorr(dst, x, y);
  __ xorr(dst, dst, z);
}
4928+
4929+
// T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
4930+
// e = d
4931+
// d = c
4932+
// c = ROTL'30(b)
4933+
// b = a
4934+
// a = T
4935+
void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
                        Register cur_k, Register cur_w, Register tmp, int round) {
  // Emits one SHA-1 round:
  //   T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
  //   e = d; d = c; c = ROTL'30(b); b = a; a = T
  //
  // On entry cur_k holds K't and cur_w holds W't. cur_w and e are used as
  // temporaries; tmp receives the result of f't. t0 and t1 are clobbered.
  assert(round >= 0 && round < 80, "must be");
  // t0 is handed to rolw_imm as scratch, and sha1_f() writes t0 and (in the
  // Maj rounds) t1 while a, e, cur_k and cur_w are still live, so none of the
  // working registers may alias either scratch register.
  assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0, t1);

  // cur_w will be recalculated at the beginning of each round,
  // so, we can reuse it as a temp register here.
  Register tmp2 = cur_w;

  // reuse e as a temporary register, as we will mv new value into it later
  Register tmp3 = e;

  // tmp3 = e + K't + W't
  __ add(tmp2, cur_k, tmp2);
  __ add(tmp3, tmp3, tmp2);
  // tmp2 = ROTL'5(a)
  __ rolw_imm(tmp2, a, 5, t0);

  // tmp = f't(b, c, d)
  sha1_f(tmp, b, c, d, round);

  // tmp2 = T
  __ add(tmp2, tmp2, tmp);
  __ add(tmp2, tmp2, tmp3);

  // e = d
  // d = c
  // c = ROTL'30(b)
  // b = a
  // a = T
  __ mv(e, d);
  __ mv(d, c);

  __ rolw_imm(c, b, 30);
  __ mv(b, a);
  __ mv(a, tmp2);
}
4969+
4970+
// H(i)0 = a + H(i-1)0
4971+
// H(i)1 = b + H(i-1)1
4972+
// H(i)2 = c + H(i-1)2
4973+
// H(i)3 = d + H(i-1)3
4974+
// H(i)4 = e + H(i-1)4
4975+
void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
                            Register prev_ab, Register prev_cd, Register prev_e) {
  // Emits the additions that fold the saved previous state into the working
  // variables a..e. prev_ab and prev_cd each carry two saved words: the low
  // half is added to a (resp. c), the high half to b (resp. d).
  // prev_ab and prev_cd are shifted in place, so their contents are
  // destroyed by the emitted code.
  assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);

  // a += low(prev_ab); b += high(prev_ab)
  __ add(a, a, prev_ab);
  __ srli(prev_ab, prev_ab, 32);
  __ add(b, b, prev_ab);

  // c += low(prev_cd); d += high(prev_cd)
  __ add(c, c, prev_cd);
  __ srli(prev_cd, prev_cd, 32);
  __ add(d, d, prev_cd);

  // e += prev_e
  __ add(e, e, prev_e);
}
4989+
4990+
void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
                              Register prev_ab, Register prev_cd, Register prev_e) {
  // Emits code that saves the working variables a..e into three registers:
  //   prev_ab = (b << 32) | low 32 bits of a
  //   prev_cd = (d << 32) | low 32 bits of c
  //   prev_e  = e
  // t0 is used as scratch.
  assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);

  // pack a and b
  __ slli(t0, b, 32);
  __ zero_extend(prev_ab, a, 32);
  __ orr(prev_ab, prev_ab, t0);

  // pack c and d
  __ slli(t0, d, 32);
  __ zero_extend(prev_cd, c, 32);
  __ orr(prev_cd, prev_cd, t0);

  // e gets a register of its own
  __ mv(prev_e, e);
}
5004+
5005+
// Intrinsic for:
5006+
// void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5007+
// void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5008+
//
5009+
// Arguments:
5010+
//
5011+
// Inputs:
5012+
// c_rarg0: byte[] src array + offset
5013+
// c_rarg1: int[] SHA.state
5014+
// - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5015+
// c_rarg2: int offset
5016+
// c_rarg3: int limit
5017+
//
5018+
// Outputs:
5019+
// - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5020+
// c_rarg0: int offset, when (multi_block == true)
5021+
//
5022+
address generate_sha1_implCompress(bool multi_block, const char *name) {
  // Generates the SHA-1 compression stub. With multi_block == false the stub
  // processes a single 64-byte block; with multi_block == true it loops over
  // blocks until the limit is passed and returns the updated offset in
  // c_rarg0. Returns the entry address of the generated stub.
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);

  address entry = __ pc();
  __ enter();

  // x18 - x27 are used as working registers below and are saved/restored
  // around the stub body; x9 is only used (and saved) in the multi-block case.
  RegSet callee_saved = RegSet::range(x18, x27);
  if (multi_block) {
    callee_saved += RegSet::of(x9);
  }
  __ push_reg(callee_saved, sp);

  // ---- register assignment (emits no code) ----
  //
  // incoming arguments, c_rarg0 - c_rarg3: x10 - x13
  Register buf    = c_rarg0;
  Register state  = c_rarg1;
  Register offset = c_rarg2;
  Register limit  = c_rarg3;
  // original start point of the array (multi-block case only)
  Register src    = x9;

  // [args-reg]:  x14 - x17
  // [temp-reg]:  x28 - x31
  // [saved-reg]: x18 - x27

  // working variables, initialised from h0/1/2/3/4
  const Register a = x14, b = x15, c = x16, d = x17, e = x28;

  // Message schedule w0 ... w15: two adjacent w's share one register, one
  // in the upper 32 bits and the other in the lower 32 bits. Depending on
  // the round (even or odd), the words reside either in
  //   ws[0] ~ ws[7], where
  //     w0  at upper 32 bits of ws[0],
  //     w1  at lower 32 bits of ws[0],
  //     ...
  //     w14 at upper 32 bits of ws[7],
  //     w15 at lower 32 bits of ws[7],
  // or in
  //   w0:       lower 32 bits of ws[0]
  //   w1 ~ w14: ws[1] ~ ws[7]
  //   w15:      upper 32 bits of ws[8]
  Register ws[9] = {x29, x30, x31, x18,
                    x19, x20, x21, x22,
                    x23}; // auxiliary register for calculating w's value

  // current K't value
  const Register cur_k = x24;
  // current W't value
  const Register cur_w = x25;
  // a, b, c, d, e as they were before the 80 rounds of the current block
  const Register prev_ab = x26, prev_cd = x27;
  const Register prev_e = offset; // reuse offset/c_rarg2

  // ---- code emission ----

  if (multi_block) {
    // limit becomes an end address relative to buf, and src the address
    // that corresponds to offset 0.
    __ sub(limit, limit, offset);
    __ add(limit, limit, buf);
    __ sub(src, buf, offset);
  }

  // Load the five state words into a, b, c, d, e.
  //
  // To keep the number of memory operations low, the 4-byte values a/b and
  // c/d are read in pairs with a single ld each and then split into two
  // registers.
  //
  // The core SHA-1 algorithm works on 32-bit words, so the code below does
  // not care about the upper 32 bits of a/b/c/d/e. That allows leaving
  // whatever is in the upper 32 bits of a/c/e in place instead of spending
  // extra instructions on clearing them.
  __ ld(a, Address(state, 0));
  __ srli(b, a, 32);
  __ ld(c, Address(state, 8));
  __ srli(d, c, 32);
  __ lw(e, Address(state, 16));

  Label L_sha1_loop;
  if (multi_block) {
    __ BIND(L_sha1_loop);
  }

  // remember the state the block starts with
  sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);

  // the 80 rounds are fully unrolled at code-generation time
  for (int t = 0; t < 80; t++) {
    // prepare K't value
    sha1_prepare_k(cur_k, t);

    // prepare W't value
    sha1_prepare_w(cur_w, ws, buf, t);

    // one round process
    sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, t);
  }

  // compute the intermediate hash value
  sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);

  if (multi_block) {
    // advance to the next block and continue while limit >= buf
    int64_t block_bytes = 16 * 4;
    __ addi(buf, buf, block_bytes);

    // NOTE(review): the trailing 'true' presumably selects the far-branch
    // form because of the size of the unrolled loop body - confirm against
    // MacroAssembler::bge.
    __ bge(limit, buf, L_sha1_loop, true);
  }

  // Store the state back: re-pack a/b and c/d into 8-byte values.
  __ zero_extend(a, a, 32);
  __ slli(b, b, 32);
  __ orr(a, a, b);
  __ sd(a, Address(state, 0));
  __ zero_extend(c, c, 32);
  __ slli(d, d, 32);
  __ orr(c, c, d);
  __ sd(c, Address(state, 8));
  __ sw(e, Address(state, 16));

  // return offset
  if (multi_block) {
    __ sub(c_rarg0, buf, src);
  }

  __ pop_reg(callee_saved, sp);

  __ leave();
  __ ret();

  return (address) entry;
}
5151+
5152+
5153+
48125154
#ifdef COMPILER2
48135155

48145156
static const int64_t right_2_bits = right_n_bits(2);
@@ -5273,6 +5615,11 @@ static const int64_t right_3_bits = right_n_bits(3);
52735615
StubRoutines::_chacha20Block = generate_chacha20Block();
52745616
}
52755617

5618+
if (UseSHA1Intrinsics) {
5619+
StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
5620+
StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
5621+
}
5622+
52765623
#endif // COMPILER2_OR_JVMCI
52775624
}
52785625

‎src/hotspot/cpu/riscv/stubRoutines_riscv.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ enum platform_dependent_constants {
3939
// simply increase sizes if too small (assembler will crash if too small)
4040
_initial_stubs_code_size = 10000,
4141
_continuation_stubs_code_size = 2000,
42-
_compiler_stubs_code_size = 15000 ZGC_ONLY(+5000),
42+
_compiler_stubs_code_size = 25000 ZGC_ONLY(+5000),
4343
_final_stubs_code_size = 20000 ZGC_ONLY(+10000)
4444
};
4545

‎src/hotspot/cpu/riscv/vm_version_riscv.cpp

+51-28
Original file line numberDiff line numberDiff line change
@@ -149,16 +149,6 @@ void VM_Version::initialize() {
149149
FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
150150
}
151151

152-
if (UseSHA1Intrinsics) {
153-
warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
154-
FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
155-
}
156-
157-
if (UseSHA3Intrinsics) {
158-
warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU.");
159-
FLAG_SET_DEFAULT(UseSHA3Intrinsics, false);
160-
}
161-
162152
if (UseCRC32Intrinsics) {
163153
warning("CRC32 intrinsics are not available on this CPU.");
164154
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
@@ -260,11 +250,8 @@ void VM_Version::initialize() {
260250
// NOTE: Make sure codes dependent on UseRVV are put after c2_initialize(),
261251
// as there are extra checks inside it which could disable UseRVV
262252
// in some situations.
263-
if (UseZvkn && !UseRVV) {
264-
FLAG_SET_DEFAULT(UseZvkn, false);
265-
warning("Cannot enable Zvkn on cpu without RVV support.");
266-
}
267253

254+
// ChaCha20
268255
if (UseRVV) {
269256
if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
270257
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, true);
@@ -276,29 +263,65 @@ void VM_Version::initialize() {
276263
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
277264
}
278265

279-
if (!UseZvkn && UseSHA) {
280-
warning("SHA instructions are not available on this CPU");
281-
FLAG_SET_DEFAULT(UseSHA, false);
282-
} else if (UseZvkn && FLAG_IS_DEFAULT(UseSHA)) {
266+
// SHA's
267+
if (FLAG_IS_DEFAULT(UseSHA)) {
283268
FLAG_SET_DEFAULT(UseSHA, true);
284269
}
285270

286-
if (!UseSHA) {
271+
// SHA-1, no RVV required though.
272+
if (UseSHA) {
273+
if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) {
274+
FLAG_SET_DEFAULT(UseSHA1Intrinsics, true);
275+
}
276+
} else if (UseSHA1Intrinsics) {
277+
warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
278+
FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
279+
}
280+
281+
// UseZvkn (depends on RVV) and SHA-2.
282+
if (UseZvkn && !UseRVV) {
283+
FLAG_SET_DEFAULT(UseZvkn, false);
284+
warning("Cannot enable Zvkn on cpu without RVV support.");
285+
}
286+
// SHA-2, depends on Zvkn.
287+
if (UseSHA) {
288+
if (UseZvkn) {
289+
if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
290+
FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
291+
}
292+
if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
293+
FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
294+
}
295+
} else {
296+
if (UseSHA256Intrinsics) {
297+
warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, UseZvkn needed.");
298+
FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
299+
}
300+
if (UseSHA512Intrinsics) {
301+
warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, UseZvkn needed.");
302+
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
303+
}
304+
}
305+
} else {
287306
if (UseSHA256Intrinsics) {
288-
warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, UseZvkn needed.");
307+
warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, as UseSHA disabled.");
289308
FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
290309
}
291310
if (UseSHA512Intrinsics) {
292-
warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, UseZvkn needed.");
311+
warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, as UseSHA disabled.");
293312
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
294313
}
295-
} else {
296-
if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
297-
FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
298-
}
299-
if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
300-
FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
301-
}
314+
}
315+
316+
// SHA-3
317+
if (UseSHA3Intrinsics) {
318+
warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU.");
319+
FLAG_SET_DEFAULT(UseSHA3Intrinsics, false);
320+
}
321+
322+
// UseSHA
323+
if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA3Intrinsics || UseSHA512Intrinsics)) {
324+
FLAG_SET_DEFAULT(UseSHA, false);
302325
}
303326
}
304327

‎test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ public class IntrinsicPredicates {
6969

7070
public static final BooleanSupplier SHA1_INSTRUCTION_AVAILABLE
7171
= new OrPredicate(new CPUSpecificPredicate("aarch64.*", new String[] { "sha1" }, null),
72-
new OrPredicate(new CPUSpecificPredicate("riscv64.*", new String[] { "sha1" }, null),
72+
// SHA-1 intrinsic is implemented with scalar instructions on riscv64
73+
new OrPredicate(new CPUSpecificPredicate("riscv64.*", null, null),
7374
new OrPredicate(new CPUSpecificPredicate("s390.*", new String[] { "sha1" }, null),
7475
// x86 variants
7576
new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "sha" }, null),

0 commit comments

Comments
 (0)
Please sign in to comment.