@@ -4809,6 +4809,348 @@ class StubGenerator: public StubCodeGenerator {
4809
4809
return (address) start;
4810
4810
}
4811
4811
4812
+
4813
+ // ------------------------ SHA-1 intrinsic ------------------------
4814
+
4815
+ // K't =
4816
+ // 5a827999, 0 <= t <= 19
4817
+ // 6ed9eba1, 20 <= t <= 39
4818
+ // 8f1bbcdc, 40 <= t <= 59
4819
+ // ca62c1d6, 60 <= t <= 79
4820
+ void sha1_prepare_k (Register cur_k, int round) {
4821
+ assert (round >= 0 && round < 80 , " must be" );
4822
+
4823
+ static const int64_t ks[] = {0x5a827999 , 0x6ed9eba1 , 0x8f1bbcdc , 0xca62c1d6 };
4824
+ if ((round % 20 ) == 0 ) {
4825
+ __ mv (cur_k, ks[round /20 ]);
4826
+ }
4827
+ }
4828
+
4829
+ // W't =
4830
+ // M't, 0 <= t <= 15
4831
+ // ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
4832
+ void sha1_prepare_w (Register cur_w, Register ws[], Register buf, int round) {
4833
+ assert (round >= 0 && round < 80 , " must be" );
4834
+
4835
+ if (round < 16 ) {
4836
+ // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
4837
+ // in ws[0], high part contains W't-0, low part contains W't-1,
4838
+ // in ws[1], high part contains W't-2, low part contains W't-3,
4839
+ // ...
4840
+ // in ws[7], high part contains W't-14, low part contains W't-15.
4841
+
4842
+ if ((round % 2 ) == 0 ) {
4843
+ __ ld (ws[round /2 ], Address (buf, (round /2 ) * 8 ));
4844
+ // reverse bytes, as SHA-1 is defined in big-endian.
4845
+ __ revb (ws[round /2 ], ws[round /2 ]);
4846
+ __ srli (cur_w, ws[round /2 ], 32 );
4847
+ } else {
4848
+ __ mv (cur_w, ws[round /2 ]);
4849
+ }
4850
+
4851
+ return ;
4852
+ }
4853
+
4854
+ if ((round % 2 ) == 0 ) {
4855
+ int idx = 16 ;
4856
+ // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
4857
+ __ srli (t1, ws[(idx-8 )/2 ], 32 );
4858
+ __ xorr (t0, ws[(idx-3 )/2 ], t1);
4859
+
4860
+ __ srli (t1, ws[(idx-14 )/2 ], 32 );
4861
+ __ srli (cur_w, ws[(idx-16 )/2 ], 32 );
4862
+ __ xorr (cur_w, cur_w, t1);
4863
+
4864
+ __ xorr (cur_w, cur_w, t0);
4865
+ __ rolw_imm (cur_w, cur_w, 1 , t0);
4866
+
4867
+ // copy the cur_w value to ws[8].
4868
+ // now, valid w't values are at:
4869
+ // w0: ws[0]'s lower 32 bits
4870
+ // w1 ~ w14: ws[1] ~ ws[7]
4871
+ // w15: ws[8]'s higher 32 bits
4872
+ __ slli (ws[idx/2 ], cur_w, 32 );
4873
+
4874
+ return ;
4875
+ }
4876
+
4877
+ int idx = 17 ;
4878
+ // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
4879
+ __ srli (t1, ws[(idx-3 )/2 ], 32 );
4880
+ __ xorr (t0, t1, ws[(idx-8 )/2 ]);
4881
+
4882
+ __ xorr (cur_w, ws[(idx-16 )/2 ], ws[(idx-14 )/2 ]);
4883
+
4884
+ __ xorr (cur_w, cur_w, t0);
4885
+ __ rolw_imm (cur_w, cur_w, 1 , t0);
4886
+
4887
+ // copy the cur_w value to ws[8]
4888
+ __ zero_extend (cur_w, cur_w, 32 );
4889
+ __ orr (ws[idx/2 ], ws[idx/2 ], cur_w);
4890
+
4891
+ // shift the w't registers, so they start from ws[0] again.
4892
+ // now, valid w't values are at:
4893
+ // w0 ~ w15: ws[0] ~ ws[7]
4894
+ Register ws_0 = ws[0 ];
4895
+ for (int i = 0 ; i < 16 /2 ; i++) {
4896
+ ws[i] = ws[i+1 ];
4897
+ }
4898
+ ws[8 ] = ws_0;
4899
+ }
4900
+
4901
+ // f't(x, y, z) =
4902
+ // Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19
4903
+ // Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39
4904
+ // Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59
4905
+ // Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79
4906
+ void sha1_f (Register dst, Register x, Register y, Register z, int round) {
4907
+ assert (round >= 0 && round < 80 , " must be" );
4908
+ assert_different_registers (dst, x, y, z, t0, t1);
4909
+
4910
+ if (round < 20 ) {
4911
+ // (x & y) ^ (~x & z)
4912
+ __ andr (t0, x, y);
4913
+ __ andn (dst, z, x);
4914
+ __ xorr (dst, dst, t0);
4915
+ } else if (round >= 40 && round < 60 ) {
4916
+ // (x & y) ^ (x & z) ^ (y & z)
4917
+ __ andr (t0, x, y);
4918
+ __ andr (t1, x, z);
4919
+ __ andr (dst, y, z);
4920
+ __ xorr (dst, dst, t0);
4921
+ __ xorr (dst, dst, t1);
4922
+ } else {
4923
+ // x ^ y ^ z
4924
+ __ xorr (dst, x, y);
4925
+ __ xorr (dst, dst, z);
4926
+ }
4927
+ }
4928
+
4929
+ // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
4930
+ // e = d
4931
+ // d = c
4932
+ // c = ROTL'30(b)
4933
+ // b = a
4934
+ // a = T
4935
+ void sha1_process_round (Register a, Register b, Register c, Register d, Register e,
4936
+ Register cur_k, Register cur_w, Register tmp, int round) {
4937
+ assert (round >= 0 && round < 80 , " must be" );
4938
+ assert_different_registers (a, b, c, d, e, cur_w, cur_k, tmp, t0);
4939
+
4940
+ // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
4941
+
4942
+ // cur_w will be recalculated at the beginning of each round,
4943
+ // so, we can reuse it as a temp register here.
4944
+ Register tmp2 = cur_w;
4945
+
4946
+ // reuse e as a temporary register, as we will mv new value into it later
4947
+ Register tmp3 = e;
4948
+ __ add (tmp2, cur_k, tmp2);
4949
+ __ add (tmp3, tmp3, tmp2);
4950
+ __ rolw_imm (tmp2, a, 5 , t0);
4951
+
4952
+ sha1_f (tmp, b, c, d, round );
4953
+
4954
+ __ add (tmp2, tmp2, tmp);
4955
+ __ add (tmp2, tmp2, tmp3);
4956
+
4957
+ // e = d
4958
+ // d = c
4959
+ // c = ROTL'30(b)
4960
+ // b = a
4961
+ // a = T
4962
+ __ mv (e, d);
4963
+ __ mv (d, c);
4964
+
4965
+ __ rolw_imm (c, b, 30 );
4966
+ __ mv (b, a);
4967
+ __ mv (a, tmp2);
4968
+ }
4969
+
4970
+ // H(i)0 = a + H(i-1)0
4971
+ // H(i)1 = b + H(i-1)1
4972
+ // H(i)2 = c + H(i-1)2
4973
+ // H(i)3 = d + H(i-1)3
4974
+ // H(i)4 = e + H(i-1)4
4975
+ void sha1_calculate_im_hash (Register a, Register b, Register c, Register d, Register e,
4976
+ Register prev_ab, Register prev_cd, Register prev_e) {
4977
+ assert_different_registers (a, b, c, d, e, prev_ab, prev_cd, prev_e);
4978
+
4979
+ __ add (a, a, prev_ab);
4980
+ __ srli (prev_ab, prev_ab, 32 );
4981
+ __ add (b, b, prev_ab);
4982
+
4983
+ __ add (c, c, prev_cd);
4984
+ __ srli (prev_cd, prev_cd, 32 );
4985
+ __ add (d, d, prev_cd);
4986
+
4987
+ __ add (e, e, prev_e);
4988
+ }
4989
+
4990
+ void sha1_preserve_prev_abcde (Register a, Register b, Register c, Register d, Register e,
4991
+ Register prev_ab, Register prev_cd, Register prev_e) {
4992
+ assert_different_registers (a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
4993
+
4994
+ __ slli (t0, b, 32 );
4995
+ __ zero_extend (prev_ab, a, 32 );
4996
+ __ orr (prev_ab, prev_ab, t0);
4997
+
4998
+ __ slli (t0, d, 32 );
4999
+ __ zero_extend (prev_cd, c, 32 );
5000
+ __ orr (prev_cd, prev_cd, t0);
5001
+
5002
+ __ mv (prev_e, e);
5003
+ }
5004
+
5005
+ // Intrinsic for:
5006
+ // void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5007
+ // void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5008
+ //
5009
+ // Arguments:
5010
+ //
5011
+ // Inputs:
5012
+ // c_rarg0: byte[] src array + offset
5013
+ // c_rarg1: int[] SHA.state
5014
+ // - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5015
+ // c_rarg2: int offset
5016
+ // c_rarg3: int limit
5017
+ //
5018
+ // Outputs:
5019
+ // - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5020
+ // c_rarg0: int offset, when (multi_block == true)
5021
+ //
5022
+ address generate_sha1_implCompress (bool multi_block, const char *name) {
5023
+ __ align (CodeEntryAlignment);
5024
+ StubCodeMark mark (this , " StubRoutines" , name);
5025
+
5026
+ address start = __ pc ();
5027
+ __ enter ();
5028
+
5029
+ RegSet saved_regs = RegSet::range (x18, x27);
5030
+ if (multi_block) {
5031
+ // use x9 as src below.
5032
+ saved_regs += RegSet::of (x9);
5033
+ }
5034
+ __ push_reg (saved_regs, sp);
5035
+
5036
+ // c_rarg0 - c_rarg3: x10 - x13
5037
+ Register buf = c_rarg0;
5038
+ Register state = c_rarg1;
5039
+ Register offset = c_rarg2;
5040
+ Register limit = c_rarg3;
5041
+ // use src to contain the original start point of the array.
5042
+ Register src = x9;
5043
+
5044
+ if (multi_block) {
5045
+ __ sub (limit, limit, offset);
5046
+ __ add (limit, limit, buf);
5047
+ __ sub (src, buf, offset);
5048
+ }
5049
+
5050
+ // [args-reg]: x14 - x17
5051
+ // [temp-reg]: x28 - x31
5052
+ // [saved-reg]: x18 - x27
5053
+
5054
+ // h0/1/2/3/4
5055
+ const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5056
+ // w0, w1, ... w15
5057
+ // put two adjecent w's in one register:
5058
+ // one at high word part, another at low word part
5059
+ // at different round (even or odd), w't value reside in different items in ws[].
5060
+ // w0 ~ w15, either reside in
5061
+ // ws[0] ~ ws[7], where
5062
+ // w0 at higher 32 bits of ws[0],
5063
+ // w1 at lower 32 bits of ws[0],
5064
+ // ...
5065
+ // w14 at higher 32 bits of ws[7],
5066
+ // w15 at lower 32 bits of ws[7].
5067
+ // or, reside in
5068
+ // w0: ws[0]'s lower 32 bits
5069
+ // w1 ~ w14: ws[1] ~ ws[7]
5070
+ // w15: ws[8]'s higher 32 bits
5071
+ Register ws[9 ] = {x29, x30, x31, x18,
5072
+ x19, x20, x21, x22,
5073
+ x23}; // auxiliary register for calculating w's value
5074
+ // current k't's value
5075
+ const Register cur_k = x24;
5076
+ // current w't's value
5077
+ const Register cur_w = x25;
5078
+ // values of a, b, c, d, e in the previous round
5079
+ const Register prev_ab = x26, prev_cd = x27;
5080
+ const Register prev_e = offset; // reuse offset/c_rarg2
5081
+
5082
+ // load 5 words state into a, b, c, d, e.
5083
+ //
5084
+ // To minimize the number of memory operations, we apply following
5085
+ // optimization: read the states (a/b/c/d) of 4-byte values in pairs,
5086
+ // with a single ld, and split them into 2 registers.
5087
+ //
5088
+ // And, as the core algorithm of SHA-1 works on 32-bits words, so
5089
+ // in the following code, it does not care about the content of
5090
+ // higher 32-bits in a/b/c/d/e. Based on this observation,
5091
+ // we can apply further optimization, which is to just ignore the
5092
+ // higher 32-bits in a/c/e, rather than set the higher
5093
+ // 32-bits of a/c/e to zero explicitly with extra instructions.
5094
+ __ ld (a, Address (state, 0 ));
5095
+ __ srli (b, a, 32 );
5096
+ __ ld (c, Address (state, 8 ));
5097
+ __ srli (d, c, 32 );
5098
+ __ lw (e, Address (state, 16 ));
5099
+
5100
+ Label L_sha1_loop;
5101
+ if (multi_block) {
5102
+ __ BIND (L_sha1_loop);
5103
+ }
5104
+
5105
+ sha1_preserve_prev_abcde (a, b, c, d, e, prev_ab, prev_cd, prev_e);
5106
+
5107
+ for (int round = 0 ; round < 80 ; round ++) {
5108
+ // prepare K't value
5109
+ sha1_prepare_k (cur_k, round );
5110
+
5111
+ // prepare W't value
5112
+ sha1_prepare_w (cur_w, ws, buf, round );
5113
+
5114
+ // one round process
5115
+ sha1_process_round (a, b, c, d, e, cur_k, cur_w, t2, round );
5116
+ }
5117
+
5118
+ // compute the intermediate hash value
5119
+ sha1_calculate_im_hash (a, b, c, d, e, prev_ab, prev_cd, prev_e);
5120
+
5121
+ if (multi_block) {
5122
+ int64_t block_bytes = 16 * 4 ;
5123
+ __ addi (buf, buf, block_bytes);
5124
+
5125
+ __ bge (limit, buf, L_sha1_loop, true );
5126
+ }
5127
+
5128
+ // store back the state.
5129
+ __ zero_extend (a, a, 32 );
5130
+ __ slli (b, b, 32 );
5131
+ __ orr (a, a, b);
5132
+ __ sd (a, Address (state, 0 ));
5133
+ __ zero_extend (c, c, 32 );
5134
+ __ slli (d, d, 32 );
5135
+ __ orr (c, c, d);
5136
+ __ sd (c, Address (state, 8 ));
5137
+ __ sw (e, Address (state, 16 ));
5138
+
5139
+ // return offset
5140
+ if (multi_block) {
5141
+ __ sub (c_rarg0, buf, src);
5142
+ }
5143
+
5144
+ __ pop_reg (saved_regs, sp);
5145
+
5146
+ __ leave ();
5147
+ __ ret ();
5148
+
5149
+ return (address) start;
5150
+ }
5151
+
5152
+
5153
+
4812
5154
#ifdef COMPILER2
4813
5155
4814
5156
static const int64_t right_2_bits = right_n_bits(2 );
@@ -5273,6 +5615,11 @@ static const int64_t right_3_bits = right_n_bits(3);
5273
5615
StubRoutines::_chacha20Block = generate_chacha20Block ();
5274
5616
}
5275
5617
5618
+ if (UseSHA1Intrinsics) {
5619
+ StubRoutines::_sha1_implCompress = generate_sha1_implCompress (false , " sha1_implCompress" );
5620
+ StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress (true , " sha1_implCompressMB" );
5621
+ }
5622
+
5276
5623
#endif // COMPILER2_OR_JVMCI
5277
5624
}
5278
5625
0 commit comments