8321003: RISC-V: C2 MulReductionVI

Hamlin Li · Hamlin Li · commit 1b6281d98cf0 · 2025-02-21T10:25:50.000Z
8321004: RISC-V: C2 MulReductionVL

Reviewed-by: fyang, rehn
diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
@@ -2954,6 +2954,71 @@ void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
   vmv_x_s(dst, tmp);
 }
 
+// Multiply-reduces the integral lanes of src2 and folds in the scalar src1:
+//   dst = src1 * src2[0] * src2[1] * ... * src2[vector_length - 1]
+// If vm is not Assembler::unmasked, lanes that are not selected by the mask in v0
+// contribute the multiplicative identity (1) instead of their value.
+//
+// Each round slides the upper half of the live lanes down onto the lower half and
+// multiplies the halves element-wise, halving vl until one lane is left. This
+// requires vector_length to be a power of two.
+//
+// vtmp1 and vtmp2 are clobbered. t0 is used as scratch (it is vsetvli_helper's
+// default tmp, and it carries slide offsets that do not fit an immediate).
+void C2_MacroAssembler::reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2,
+                                              VectorRegister vtmp1, VectorRegister vtmp2,
+                                              BasicType bt, uint vector_length, VectorMask vm) {
+  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
+  assert(vector_length >= 2 && (vector_length & (vector_length - 1)) == 0,
+         "vector length must be a power of two");
+
+  // vslidedown.vi encodes only a 5-bit unsigned offset. Larger offsets (e.g. 32, for
+  // a 64-lane byte vector) have to be passed in a scalar register instead.
+  auto slide_down = [&](VectorRegister to, VectorRegister from, uint offset) {
+    if (offset < 32) {
+      vslidedown_vi(to, from, offset);
+    } else {
+      mv(t0, offset);
+      vslidedown_vx(to, from, t0);
+    }
+  };
+
+  vsetvli_helper(bt, vector_length);
+
+  // Lanes to reduce: src2 itself, or a copy with the masked-off lanes set to 1.
+  VectorRegister lanes = src2;
+  if (vm != Assembler::unmasked) {
+    // This behaviour is consistent with spec requirements of vector API, for `reduceLanes`:
+    //  If no elements are selected, an operation-specific identity value is returned.
+    //    If the operation is MUL, then the identity value is one.
+    vmv_v_i(vtmp1, 1);
+    vmerge_vvm(vtmp2, vtmp1, src2); // vm == v0
+    lanes = vtmp2;
+  }
+
+  // First round reads from `lanes` and leaves the partial products in vtmp1.
+  vector_length /= 2;
+  slide_down(vtmp1, lanes, vector_length);
+  vsetvli_helper(bt, vector_length);
+  vmul_vv(vtmp1, vtmp1, lanes);
+
+  // Remaining rounds keep folding vtmp1 onto itself, using vtmp2 as scratch.
+  while (vector_length > 1) {
+    vector_length /= 2;
+    slide_down(vtmp2, vtmp1, vector_length);
+    vsetvli_helper(bt, vector_length);
+    vmul_vv(vtmp1, vtmp1, vtmp2);
+  }
+
+  // Move lane 0 to dst and fold in the scalar input; mulw is used for T_INT.
+  vmv_x_s(dst, vtmp1);
+  if (bt == T_INT) {
+    mulw(dst, dst, src1);
+  } else {
+    mul(dst, dst, src1);
+  }
+}
+
 // Set vl and vtype for full and partial vector operations.
 // (vma = mu, vta = tu, vill = false)
 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
@@ -239,6 +239,10 @@
                         int opc, BasicType bt, uint vector_length,
                         VectorMask vm = Assembler::unmasked);
 
+  void reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2, // dst = src1 * product of the lanes of src2
+                             VectorRegister vtmp1, VectorRegister vtmp2, BasicType bt, // vtmp1/vtmp2 are clobbered as scratch
+                             uint vector_length, VectorMask vm = Assembler::unmasked); // when masked, unselected lanes count as 1
+
   void vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul = Assembler::m1, Register tmp = t0);
 
   void compare_integral_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, int cond,
diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad
@@ -2,6 +2,7 @@
 // Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 // Copyright (c) 2020, 2023, Arm Limited. All rights reserved.
 // Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
+// Copyright (c) 2023, 2025, Rivos Inc. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -99,6 +100,13 @@ source %{
           return false;
         }
         break;
+      case Op_MulReductionVI:
+      case Op_MulReductionVL:
+        // When vlen < 4, our log2(vlen)-step implementation brings no performance gain.
+        if (vlen < 4) {
+          return false;
+        }
+        break;
       default:
         break;
     }
@@ -2427,6 +2434,67 @@ instruct vreduce_minD_masked(fRegD dst, fRegD src1, vReg src2, vRegMask_V0 v0, v
   ins_pipe(pipe_slow);
 %}
 
+
+// ------------------------------ Vector reduction mul -------------------------
+
+instruct reduce_mulI(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
+                     vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVI isrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_mulI $dst, $isrc, $vsrc\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg), // writes dst before its last read of isrc, hence TEMP_DEF
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg), // scratch vectors, clobbered
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc)); // vm defaults to unmasked
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulI_masked(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
+                            vRegMask_V0 v0, vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVI (Binary isrc vsrc) v0));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_mulI_masked $dst, $isrc, $vsrc, $v0\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg), // writes dst before its last read of isrc, hence TEMP_DEF
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg), // scratch vectors, clobbered
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc),
+                             Assembler::v0_t); // masked: lanes not selected by v0 count as 1
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc,
+                     vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVL isrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_mulL $dst, $isrc, $vsrc\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg), // writes dst before its last read of isrc, hence TEMP_DEF
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg), // scratch vectors, clobbered
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc)); // vm defaults to unmasked
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulL_masked(iRegLNoSp dst, iRegL isrc, vReg vsrc,
+                            vRegMask_V0 v0, vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVL (Binary isrc vsrc) v0));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_mulL_masked $dst, $isrc, $vsrc, $v0\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg), // writes dst before its last read of isrc, hence TEMP_DEF
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg), // scratch vectors, clobbered
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc),
+                             Assembler::v0_t); // masked: lanes not selected by v0 count as 1
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // vector replicate
 
 instruct replicate(vReg dst, iRegIorL2I src) %{
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java b/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -85,6 +85,10 @@ public static void prodReductionInit(int[] a, int[] b) {
     @IR(applyIfCPUFeature = {"sse4.1", "true"},
         applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
         counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
+    @IR(applyIfPlatform = {"riscv64", "true"},
+        applyIfCPUFeature = {"rvv", "true"},
+        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
+        counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
     public static int prodReductionImplement(int[] a, int[] b, int total) {
         for (int i = 0; i < a.length; i++) {
             total *= a[i] + b[i];
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java
@@ -219,6 +219,10 @@ public static int xorReductionImplement(
     @IR(applyIfCPUFeature = {"sse4.1", "true"},
         applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
         counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
+    @IR(applyIfPlatform = {"riscv64", "true"},
+        applyIfCPUFeature = {"rvv", "true"},
+        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
+        counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
     public static int mulReductionImplement(
             int[] a,
             int[] b,
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java
@@ -226,6 +226,10 @@ public static long xorReductionImplement(
         applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
         applyIfPlatform = {"64-bit", "true"},
         counts = {IRNode.MUL_REDUCTION_VL, ">= 1", IRNode.MUL_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop
+    @IR(applyIfPlatform = {"riscv64", "true"},
+        applyIfCPUFeature = {"rvv", "true"},
+        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
+        counts = {IRNode.MUL_REDUCTION_VL, ">= 1", IRNode.MUL_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop
     public static long mulReductionImplement(
             long[] a,
             long[] b,