openjdk · Feb 27, 2024
diff --git a/‎src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
+347 b/‎src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
+347
diff --git a/‎src/hotspot/cpu/riscv/stubRoutines_riscv.hpp
+1-1 b/‎src/hotspot/cpu/riscv/stubRoutines_riscv.hpp
+1-1
diff --git a/‎src/hotspot/cpu/riscv/vm_version_riscv.cpp
+51-28 b/‎src/hotspot/cpu/riscv/vm_version_riscv.cpp
+51-28
diff --git a/‎test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java
+2-1 b/‎test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java
+2-1
@@ -4809,6 +4809,348 @@ class StubGenerator: public StubCodeGenerator {
     return (address) start;
   }
 
+
+  // ------------------------ SHA-1 intrinsic ------------------------
+
+  // K't =
+  //    5a827999, 0  <= t <= 19
+  //    6ed9eba1, 20 <= t <= 39
+  //    8f1bbcdc, 40 <= t <= 59
+  //    ca62c1d6, 60 <= t <= 79
+  // Emits the load of the round constant K't into cur_k.
+  //
+  //   cur_k - register that receives the constant
+  //   round - t, 0 <= t < 80
+  //
+  // Code is emitted only for the first round of each 20-round group; for all
+  // other rounds nothing is emitted, so the caller is expected to leave cur_k
+  // untouched between groups.
+  void sha1_prepare_k(Register cur_k, int round) {
+    assert(round >= 0 && round < 80, "must be");
+
+    const int rounds_per_group = 20;
+    if ((round % rounds_per_group) != 0) {
+      // not the start of a group: no code to emit.
+      return;
+    }
+
+    static const int64_t k_table[] = {
+      0x5a827999,   // 0  <= t <= 19
+      0x6ed9eba1,   // 20 <= t <= 39
+      0x8f1bbcdc,   // 40 <= t <= 59
+      0xca62c1d6    // 60 <= t <= 79
+    };
+    __ mv(cur_k, k_table[round / rounds_per_group]);
+  }
+
+  // W't =
+  //    M't,                                      0 <=  t <= 15
+  //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
+  // Emits the code that produces W't for the given round.
+  //
+  //   cur_w - receives W't; only its low 32 bits are meaningful (in the odd
+  //           rounds below 16 the upper half still holds the neighbouring word)
+  //   ws    - 9 registers forming the sliding window of the message schedule,
+  //           two 32-bit words per register. The array itself is permuted here
+  //           (at code generation time) after every odd round >= 17.
+  //   buf   - address of the 64-byte block being processed
+  //   round - t, 0 <= t < 80
+  //
+  // t0 and t1 are used as scratch registers in rounds >= 16.
+  void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
+    assert(round >= 0 && round < 80, "must be");
+
+    const bool even_round = (round % 2) == 0;
+
+    if (round < 16) {
+      // W't = M't. Each 8-byte load fills one ws[] register with two words:
+      //   ws[0] = { W'0  (high), W'1  (low) }
+      //   ws[1] = { W'2  (high), W'3  (low) }
+      //   ...
+      //   ws[7] = { W'14 (high), W'15 (low) }
+      // The even round does the load, the following odd round reuses it.
+      const int slot = round / 2;
+      if (even_round) {
+        __ ld(ws[slot], Address(buf, slot * 8));
+        // SHA-1 is defined in big-endian, so swap the bytes of the loaded
+        // doubleword; afterwards the first word of the pair is in the upper half.
+        __ revb(ws[slot], ws[slot]);
+        __ srli(cur_w, ws[slot], 32);
+      } else {
+        __ mv(cur_w, ws[slot]);
+      }
+      return;
+    }
+
+    // index of the auxiliary register that collects the newly computed pair.
+    const int spare = 8;
+
+    if (even_round) {
+      // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16)
+      // On entry the window starts at the upper half of ws[0]:
+      //   W't-16: ws[0] high    W't-14: ws[1] high
+      //   W't-8:  ws[4] high    W't-3:  ws[6] low
+      __ srli(t1, ws[4], 32);
+      __ xorr(t0, ws[6], t1);
+
+      __ srli(t1, ws[1], 32);
+      __ srli(cur_w, ws[0], 32);
+      __ xorr(cur_w, cur_w, t1);
+
+      __ xorr(cur_w, cur_w, t0);
+      __ rolw_imm(cur_w, cur_w, 1, t0);
+
+      // park the new word in the upper half of the spare register; from now
+      // on the window runs from the lower half of ws[0] up to the upper half
+      // of ws[8].
+      __ slli(ws[spare], cur_w, 32);
+      return;
+    }
+
+    // Odd round, W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16)
+    // The window now starts at the lower half of ws[0]:
+    //   W't-16: ws[0] low     W't-14: ws[1] low
+    //   W't-8:  ws[4] low     W't-3:  ws[7] high
+    __ srli(t1, ws[7], 32);
+    __ xorr(t0, t1, ws[4]);
+
+    __ xorr(cur_w, ws[0], ws[1]);
+
+    __ xorr(cur_w, cur_w, t0);
+    __ rolw_imm(cur_w, cur_w, 1, t0);
+
+    // complete the pair in the spare register with the new word (lower half).
+    __ zero_extend(cur_w, cur_w, 32);
+    __ orr(ws[spare], ws[spare], cur_w);
+
+    // Both words of the old ws[0] are no longer needed: rotate the register
+    // array by one so that the window occupies ws[0] ~ ws[7] again and the
+    // old ws[0] becomes the new spare register.
+    const Register recycled = ws[0];
+    int i = 0;
+    while (i < spare) {
+      ws[i] = ws[i + 1];
+      i++;
+    }
+    ws[spare] = recycled;
+  }
+
+  // f't(x, y, z) =
+  //    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
+  //    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
+  //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
+  //    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
+  // Emits the code that computes f't(x, y, z) into dst.
+  //
+  //   dst     - receives the result
+  //   x, y, z - inputs, left unmodified
+  //   round   - t, 0 <= t < 80; selects Ch, Parity or Maj
+  //
+  // t0 is clobbered by Ch and Maj, t1 by Maj only.
+  void sha1_f(Register dst, Register x, Register y, Register z, int round) {
+    assert(round >= 0 && round < 80, "must be");
+    assert_different_registers(dst, x, y, z, t0, t1);
+
+    if (round < 20) {
+      // Ch: (x & y) ^ (~x & z)
+      __ andr(t0, x, y);
+      __ andn(dst, z, x);
+      __ xorr(dst, dst, t0);
+      return;
+    }
+
+    if (round < 40 || round >= 60) {
+      // Parity: x ^ y ^ z
+      __ xorr(dst, x, y);
+      __ xorr(dst, dst, z);
+      return;
+    }
+
+    // Maj: (x & y) ^ (x & z) ^ (y & z)
+    __ andr(t0, x, y);
+    __ andr(t1, x, z);
+    __ andr(dst, y, z);
+    __ xorr(dst, dst, t0);
+    __ xorr(dst, dst, t1);
+  }
+
+  // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
+  // e = d
+  // d = c
+  // c = ROTL'30(b)
+  // b = a
+  // a = T
+  // Emits the code for round t of the SHA-1 compression function.
+  //
+  //   a, b, c, d, e - working variables, updated in place:
+  //                   e <- d, d <- c, c <- ROTL'30(b), b <- a, a <- T
+  //   cur_k         - K't, only read
+  //   cur_w         - W't on entry; clobbered, it is reused to accumulate T
+  //   tmp           - scratch register, receives f't(b, c, d)
+  //   round         - t, 0 <= t < 80
+  //
+  // t0 is clobbered as well, and so is t1 in the rounds where sha1_f uses it.
+  void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
+                          Register cur_k, Register cur_w, Register tmp, int round) {
+    assert(round >= 0 && round < 80, "must be");
+    // t0 is handed to rolw_imm as scratch and written by sha1_f; t1 is written
+    // by sha1_f in the Maj rounds. a, e, cur_k and cur_w are all still live
+    // across the sha1_f call, so none of the operands may alias t0 or t1.
+    assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0, t1);
+
+    // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
+
+    // cur_w will be recalculated at the beginning of each round,
+    // so, we can reuse it as a temp register here.
+    Register tmp2 = cur_w;
+
+    // reuse e as a temporary register, as we will mv new value into it later
+    Register tmp3 = e;
+    __ add(tmp2, cur_k, tmp2);      // tmp2 = K't + W't
+    __ add(tmp3, tmp3, tmp2);       // tmp3 = e + K't + W't
+    __ rolw_imm(tmp2, a, 5, t0);    // tmp2 = ROTL'5(a)
+
+    sha1_f(tmp, b, c, d, round);    // tmp  = f't(b, c, d)
+
+    __ add(tmp2, tmp2, tmp);
+    __ add(tmp2, tmp2, tmp3);       // tmp2 = T
+
+    // e = d
+    // d = c
+    // c = ROTL'30(b)
+    // b = a
+    // a = T
+    __ mv(e, d);
+    __ mv(d, c);
+
+    __ rolw_imm(c, b, 30);
+    __ mv(b, a);
+    __ mv(a, tmp2);
+  }
+
+  // H(i)0 = a + H(i-1)0
+  // H(i)1 = b + H(i-1)1
+  // H(i)2 = c + H(i-1)2
+  // H(i)3 = d + H(i-1)3
+  // H(i)4 = e + H(i-1)4
+  // Emits the code that adds the saved hash words to the working variables.
+  //
+  //   a, b, c, d, e    - working variables; each receives its sum
+  //   prev_ab, prev_cd - saved words packed in pairs: the word for a (c) in
+  //                      the lower half, the word for b (d) in the upper half.
+  //                      Both registers are shifted right by 32 here, i.e.
+  //                      they are clobbered.
+  //   prev_e           - saved word for e, only read
+  //
+  // Only the low 32 bits of the sums are expected to be used by the caller.
+  void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
+                              Register prev_ab, Register prev_cd, Register prev_e) {
+    assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
+
+    Register low_words[]  = {a, c};
+    Register high_words[] = {b, d};
+    Register packed[]     = {prev_ab, prev_cd};
+
+    for (int i = 0; i < 2; i++) {
+      // the lower half of the pair goes to the first variable ...
+      __ add(low_words[i], low_words[i], packed[i]);
+      // ... then bring the upper half down for the second one.
+      __ srli(packed[i], packed[i], 32);
+      __ add(high_words[i], high_words[i], packed[i]);
+    }
+
+    __ add(e, e, prev_e);
+  }
+
+  // Emits the code that saves the working variables so that they can be added
+  // back by sha1_calculate_im_hash.
+  //
+  //   a, b, c, d, e    - working variables, left unmodified
+  //   prev_ab          - receives zero-extended a in the lower half and b in
+  //                      the upper half
+  //   prev_cd          - same packing for c and d
+  //   prev_e           - receives a copy of e
+  //
+  // t0 is clobbered.
+  void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
+                                Register prev_ab, Register prev_cd, Register prev_e) {
+    assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
+
+    Register low_words[]  = {a, c};
+    Register high_words[] = {b, d};
+    Register packed[]     = {prev_ab, prev_cd};
+
+    for (int i = 0; i < 2; i++) {
+      // packed = (high << 32) | zero_extend_32(low)
+      __ slli(t0, high_words[i], 32);
+      __ zero_extend(packed[i], low_words[i], 32);
+      __ orr(packed[i], packed[i], t0);
+    }
+
+    __ mv(prev_e, e);
+  }
+
+  // Intrinsic for:
+  //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
+  //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
+  //
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0: byte[]  src array + offset
+  //   c_rarg1: int[]   SHA.state
+  //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
+  //   c_rarg2: int     offset
+  //   c_rarg3: int     limit
+  //
+  // Outputs:
+  //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
+  //   c_rarg0: int offset, when (multi_block == true)
+  //
+  address generate_sha1_implCompress(bool multi_block, const char *name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    // Stub layout: 80 unrolled rounds per 64-byte block; with multi_block the per-block code is wrapped in a loop.
+    address start = __ pc();
+    __ enter();
+
+    RegSet saved_regs = RegSet::range(x18, x27);  // written below: ws[3] ~ ws[8], cur_k, cur_w, prev_ab, prev_cd
+    if (multi_block) {
+      // x9 is used as src below (multi-block only), so it has to be saved as well.
+      saved_regs += RegSet::of(x9);
+    }
+    __ push_reg(saved_regs, sp);
+
+    // c_rarg0 - c_rarg3: x10 - x13
+    Register buf    = c_rarg0;
+    Register state  = c_rarg1;
+    Register offset = c_rarg2;
+    Register limit  = c_rarg3;
+    // src = buf - offset (set below, multi-block only); the returned offset is computed as buf - src.
+    Register src    = x9;
+
+    if (multi_block) {
+      __ sub(limit, limit, offset);   // limit = limit - offset
+      __ add(limit, limit, buf);      // limit = buf + (limit - offset), comparable with buf in the loop condition
+      __ sub(src, buf, offset);       // src   = buf - offset
+    }
+
+    // [args-reg]:  x14 - x17
+    // [temp-reg]:  x28 - x31
+    // [saved-reg]: x18 - x27
+
+    // h0/1/2/3/4
+    const Register a = x14, b = x15, c = x16, d = x17, e = x28;
+    // w0, w1, ... w15
+    // put two adjacent w's in one register:
+    //    one at high word part, another at low word part
+    // at different round (even or odd), w't value reside in different items in ws[].
+    // w0 ~ w15, either reside in
+    //    ws[0] ~ ws[7], where
+    //      w0 at higher 32 bits of ws[0],
+    //      w1 at lower 32 bits of ws[0],
+    //      ...
+    //      w14 at higher 32 bits of ws[7],
+    //      w15 at lower 32 bits of ws[7].
+    // or, reside in
+    //    w0:       ws[0]'s lower 32 bits
+    //    w1 ~ w14: ws[1] ~ ws[7]
+    //    w15:      ws[8]'s higher 32 bits
+    Register ws[9] = {x29, x30, x31, x18,
+                      x19, x20, x21, x22,
+                      x23}; // auxiliary register for calculating w's value
+    // current k't's value
+    const Register cur_k = x24;
+    // current w't's value
+    const Register cur_w = x25;
+    // values of a, b, c, d, e saved before the 80 rounds of the current block (a/b and c/d packed in pairs)
+    const Register prev_ab = x26, prev_cd = x27;
+    const Register prev_e = offset; // reuse offset/c_rarg2; offset is only read in the multi-block setup above
+
+    // load 5 words state into a, b, c, d, e.
+    //
+    // To minimize the number of memory operations, we apply following
+    // optimization: read the states (a/b/c/d) of 4-byte values in pairs,
+    // with a single ld, and split them into 2 registers.
+    //
+    // And, as the core algorithm of SHA-1 works on 32-bits words, so
+    // in the following code, it does not care about the content of
+    // higher 32-bits in a/b/c/d/e. Based on this observation,
+    // we can apply further optimization, which is to just ignore the
+    // higher 32-bits in a/c/e, rather than set the higher
+    // 32-bits of a/c/e to zero explicitly with extra instructions.
+    __ ld(a, Address(state, 0));
+    __ srli(b, a, 32);
+    __ ld(c, Address(state, 8));
+    __ srli(d, c, 32);
+    __ lw(e, Address(state, 16));
+
+    Label L_sha1_loop;
+    if (multi_block) {
+      __ BIND(L_sha1_loop);  // start of the per-block code, branched back to below
+    }
+
+    sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
+
+    for (int round = 0; round < 80; round++) {  // runs at code generation time: the rounds are emitted unrolled
+      // prepare K't value
+      sha1_prepare_k(cur_k, round);
+
+      // prepare W't value
+      sha1_prepare_w(cur_w, ws, buf, round);
+
+      // one round process
+      sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
+    }
+
+    // compute the intermediate hash value
+    sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
+
+    if (multi_block) {
+      int64_t block_bytes = 16 * 4;  // one block = 16 words of 4 bytes
+      __ addi(buf, buf, block_bytes);
+
+      __ bge(limit, buf, L_sha1_loop, true);  // next block while buf <= limit; NOTE(review): last argument presumably selects a far branch - confirm
+    }
+
+    // store back the state.
+    __ zero_extend(a, a, 32);
+    __ slli(b, b, 32);
+    __ orr(a, a, b);  // a = (b << 32) | low 32 bits of a, same pairing as loaded above
+    __ sd(a, Address(state, 0));
+    __ zero_extend(c, c, 32);
+    __ slli(d, d, 32);
+    __ orr(c, c, d);  // c = (d << 32) | low 32 bits of c
+    __ sd(c, Address(state, 8));
+    __ sw(e, Address(state, 16));
+
+    // return the updated offset: buf - src = original offset + number of bytes consumed
+    if (multi_block) {
+      __ sub(c_rarg0, buf, src);
+    }
+
+    __ pop_reg(saved_regs, sp);
+
+    __ leave();
+    __ ret();
+
+    return (address) start;
+  }
+
+
+
 #ifdef COMPILER2
 
 static const int64_t right_2_bits = right_n_bits(2);
@@ -5273,6 +5615,11 @@ static const int64_t right_3_bits = right_n_bits(3);
       StubRoutines::_chacha20Block = generate_chacha20Block();
     }
 
+    if (UseSHA1Intrinsics) {
+      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false, "sha1_implCompress");
+      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true, "sha1_implCompressMB");
+    }
+
 #endif // COMPILER2_OR_JVMCI
   }
 
 
@@ -39,7 +39,7 @@ enum platform_dependent_constants {
   // simply increase sizes if too small (assembler will crash if too small)
   _initial_stubs_code_size      = 10000,
   _continuation_stubs_code_size =  2000,
-  _compiler_stubs_code_size     = 15000 ZGC_ONLY(+5000),
+  _compiler_stubs_code_size     = 25000 ZGC_ONLY(+5000),
   _final_stubs_code_size        = 20000 ZGC_ONLY(+10000)
 };
 
 
@@ -149,16 +149,6 @@ void VM_Version::initialize() {
     FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
   }
 
-  if (UseSHA1Intrinsics) {
-    warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
-    FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
-  }
-
-  if (UseSHA3Intrinsics) {
-    warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU.");
-    FLAG_SET_DEFAULT(UseSHA3Intrinsics, false);
-  }
-
   if (UseCRC32Intrinsics) {
     warning("CRC32 intrinsics are not available on this CPU.");
     FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
@@ -260,11 +250,8 @@ void VM_Version::initialize() {
   // NOTE: Make sure codes dependent on UseRVV are put after c2_initialize(),
   //       as there are extra checks inside it which could disable UseRVV
   //       in some situations.
-  if (UseZvkn && !UseRVV) {
-    FLAG_SET_DEFAULT(UseZvkn, false);
-    warning("Cannot enable Zvkn on cpu without RVV support.");
-  }
 
+  // ChaCha20
   if (UseRVV) {
     if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
       FLAG_SET_DEFAULT(UseChaCha20Intrinsics, true);
@@ -276,29 +263,65 @@ void VM_Version::initialize() {
     FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
   }
 
-  if (!UseZvkn && UseSHA) {
-    warning("SHA instructions are not available on this CPU");
-    FLAG_SET_DEFAULT(UseSHA, false);
-  } else if (UseZvkn && FLAG_IS_DEFAULT(UseSHA)) {
+  // SHA's
+  if (FLAG_IS_DEFAULT(UseSHA)) {
     FLAG_SET_DEFAULT(UseSHA, true);
   }
 
-  if (!UseSHA) {
+  // SHA-1, no RVV required though.
+  if (UseSHA) {
+    if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) {
+      FLAG_SET_DEFAULT(UseSHA1Intrinsics, true);
+    }
+  } else if (UseSHA1Intrinsics) {
+    warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
+  }
+
+  // UseZvkn (depends on RVV) and SHA-2.
+  if (UseZvkn && !UseRVV) {
+    FLAG_SET_DEFAULT(UseZvkn, false);
+    warning("Cannot enable Zvkn on cpu without RVV support.");
+  }
+  // SHA-2, depends on Zvkn.
+  if (UseSHA) {
+    if (UseZvkn) {
+      if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
+        FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
+      }
+      if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
+        FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
+      }
+    } else {
+      if (UseSHA256Intrinsics) {
+        warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, UseZvkn needed.");
+        FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
+      }
+      if (UseSHA512Intrinsics) {
+        warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, UseZvkn needed.");
+        FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
+      }
+    }
+  } else {
     if (UseSHA256Intrinsics) {
-      warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, UseZvkn needed.");
+      warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, as UseSHA disabled.");
       FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
     }
     if (UseSHA512Intrinsics) {
-      warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, UseZvkn needed.");
+      warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, as UseSHA disabled.");
       FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
     }
-  } else {
-    if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
-       FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
-    }
-    if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
-      FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
-    }
+  }
+
+  // SHA-3
+  if (UseSHA3Intrinsics) {
+    warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UseSHA3Intrinsics, false);
+  }
+
+  // UseSHA
+  if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA3Intrinsics || UseSHA512Intrinsics)) {
+    FLAG_SET_DEFAULT(UseSHA, false);
   }
 }
 
 
@@ -69,7 +69,8 @@ public class IntrinsicPredicates {
 
     public static final BooleanSupplier SHA1_INSTRUCTION_AVAILABLE
             = new OrPredicate(new CPUSpecificPredicate("aarch64.*", new String[] { "sha1" }, null),
-              new OrPredicate(new CPUSpecificPredicate("riscv64.*", new String[] { "sha1" }, null),
+              // SHA-1 intrinsic is implemented with scalar instructions on riscv64
+              new OrPredicate(new CPUSpecificPredicate("riscv64.*", null, null),
               new OrPredicate(new CPUSpecificPredicate("s390.*",    new String[] { "sha1" }, null),
               // x86 variants
               new OrPredicate(new CPUSpecificPredicate("amd64.*",   new String[] { "sha" },  null),