diff --git a/src/hotspot/cpu/aarch64/c2_globals_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_globals_aarch64.hpp
index 34e6e688abbca..e57dab7d1edac 100644
--- a/src/hotspot/cpu/aarch64/c2_globals_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/c2_globals_aarch64.hpp
@@ -66,6 +66,7 @@ define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoRegScheduling,            false);
 define_pd_global(bool, SuperWordLoopUnrollAnalysis,  true);
+define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 8);
 define_pd_global(bool, IdealizeClearArrayNode,       true);
 
 define_pd_global(intx, ReservedCodeCacheSize,        48*M);
diff --git a/src/hotspot/cpu/arm/c2_globals_arm.hpp b/src/hotspot/cpu/arm/c2_globals_arm.hpp
index 57ed8f11c08b1..a44a8f649aee1 100644
--- a/src/hotspot/cpu/arm/c2_globals_arm.hpp
+++ b/src/hotspot/cpu/arm/c2_globals_arm.hpp
@@ -64,6 +64,7 @@ define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoScheduling,               true);
 define_pd_global(bool, OptoRegScheduling,            false);
 define_pd_global(bool, SuperWordLoopUnrollAnalysis,  false);
+define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
 define_pd_global(bool, IdealizeClearArrayNode,       true);
 
 #ifdef _LP64
diff --git a/src/hotspot/cpu/ppc/c2_globals_ppc.hpp b/src/hotspot/cpu/ppc/c2_globals_ppc.hpp
index 00a92ff6b6251..f45faa21f01ba 100644
--- a/src/hotspot/cpu/ppc/c2_globals_ppc.hpp
+++ b/src/hotspot/cpu/ppc/c2_globals_ppc.hpp
@@ -59,6 +59,7 @@ define_pd_global(bool, UseCISCSpill,                 false);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoRegScheduling,            false);
 define_pd_global(bool, SuperWordLoopUnrollAnalysis,  true);
+define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
 // GL:
 // Detected a problem with unscaled compressed oops and
 // narrow_oop_use_complex_address() == false.
diff --git a/src/hotspot/cpu/riscv/c2_globals_riscv.hpp b/src/hotspot/cpu/riscv/c2_globals_riscv.hpp
index 53a41665f4b8e..e9947f9888a92 100644
--- a/src/hotspot/cpu/riscv/c2_globals_riscv.hpp
+++ b/src/hotspot/cpu/riscv/c2_globals_riscv.hpp
@@ -66,6 +66,7 @@ define_pd_global(bool, OptoScheduling,               true);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoRegScheduling,            false);
 define_pd_global(bool, SuperWordLoopUnrollAnalysis,  true);
+define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
 define_pd_global(bool, IdealizeClearArrayNode,       true);
 
 define_pd_global(intx, ReservedCodeCacheSize,        48*M);
diff --git a/src/hotspot/cpu/s390/c2_globals_s390.hpp b/src/hotspot/cpu/s390/c2_globals_s390.hpp
index 1de38f100f627..7f780ca63a017 100644
--- a/src/hotspot/cpu/s390/c2_globals_s390.hpp
+++ b/src/hotspot/cpu/s390/c2_globals_s390.hpp
@@ -61,6 +61,7 @@ define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoRegScheduling,            false);
 define_pd_global(bool, SuperWordLoopUnrollAnalysis,  true);
+define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
 // On s390x, we can clear the array with a single instruction,
 // so don't idealize it.
 define_pd_global(bool, IdealizeClearArrayNode,       false);
diff --git a/src/hotspot/cpu/x86/c2_globals_x86.hpp b/src/hotspot/cpu/x86/c2_globals_x86.hpp
index f7315011e6b19..084dde217e490 100644
--- a/src/hotspot/cpu/x86/c2_globals_x86.hpp
+++ b/src/hotspot/cpu/x86/c2_globals_x86.hpp
@@ -76,6 +76,7 @@ define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoRegScheduling,            true);
 define_pd_global(bool, SuperWordLoopUnrollAnalysis,  true);
+define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
 define_pd_global(bool, IdealizeClearArrayNode,       true);
 
 define_pd_global(uintx, ReservedCodeCacheSize,       48*M);
diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp
index 45a067a830ba6..d4b55ec2d8d12 100644
--- a/src/hotspot/share/opto/c2_globals.hpp
+++ b/src/hotspot/share/opto/c2_globals.hpp
@@ -355,6 +355,12 @@
   product(bool, SuperWordReductions, true,                                  \
           "Enable reductions support in superword.")                        \
                                                                             \
+  product_pd(uint, SuperWordStoreToLoadForwardingFailureDetection, DIAGNOSTIC, \
+          "if >0, auto-vectorization detects possible store-to-load "       \
+          "forwarding failures. The number specifies over how many "        \
+          "loop iterations this detection spans.")                          \
+          range(0, 4096)                                                    \
+                                                                            \
   product(bool, UseCMoveUnconditionally, false,                             \
           "Use CMove (scalar and vector) ignoring profitability test.")     \
                                                                             \
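Note on usage: the flag is declared DIAGNOSTIC, so on a release build it must be unlocked before it can be changed; a plausible invocation (illustrative, not part of this patch) is:

    java -XX:+UnlockDiagnosticVMOptions -XX:SuperWordStoreToLoadForwardingFailureDetection=0 ...

A value of 0 disables the detection entirely (see the early return in vtransform.cpp below); the accepted range is 0 to 4096, and the platform defaults above are 16, except 8 on aarch64.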
diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index 20c8dfbff1776..8000e4fd39ed0 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -1868,6 +1868,7 @@ bool SuperWord::schedule_and_apply() const {
   }
 
   if (!vtransform.schedule()) { return false; }
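+  // Reject vectorization if we predict that it would introduce store-to-load-forwarding failures.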
+  if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
   vtransform.apply();
   return true;
 }
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index fc4eaccff5ce5..4d152189625ea 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -31,7 +31,7 @@
 #include "opto/vectorization.hpp"
 
 #ifndef PRODUCT
-static void print_con_or_idx(const Node* n) {
+void VPointer::print_con_or_idx(const Node* n) {
   if (n == nullptr) {
     tty->print("(   0)");
   } else if (n->is_ConI()) {
@@ -1369,12 +1369,12 @@ void VPointer::print() const {
   tty->print("adr: %4d, ", _adr != nullptr ? _adr->_idx : 0);
 
   tty->print(" base");
-  print_con_or_idx(_base);
+  VPointer::print_con_or_idx(_base);
 
   tty->print(" + offset(%4d)", _offset);
 
   tty->print(" + invar");
-  print_con_or_idx(_invar);
+  VPointer::print_con_or_idx(_invar);
 
   tty->print_cr(" + scale(%4d) * iv]", _scale);
 }
@@ -2168,15 +2168,15 @@ void AlignmentSolver::trace_start_solve() const {
 
     // iv = init + pre_iter * pre_stride + main_iter * main_stride
     tty->print("  iv = init");
-    print_con_or_idx(_init_node);
+    VPointer::print_con_or_idx(_init_node);
     tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)",
                   _pre_stride, _main_stride);
 
     // adr = base + offset + invar + scale * iv
     tty->print("  adr = base");
-    print_con_or_idx(_base);
+    VPointer::print_con_or_idx(_base);
     tty->print(" + offset(%d) + invar", _offset);
-    print_con_or_idx(_invar);
+    VPointer::print_con_or_idx(_invar);
     tty->print_cr(" + scale(%d) * iv", _scale);
   }
 }
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index b084edd44b339..98aa3336dedf2 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -870,6 +870,7 @@ class VPointer : public ArenaObj {
   static int cmp_for_sort(const VPointer** p1, const VPointer** p2);
 
   NOT_PRODUCT( void print() const; )
+  NOT_PRODUCT( static void print_con_or_idx(const Node* n); )
 
 #ifndef PRODUCT
   class Tracer {
diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp
index 7c7aca3b90e7c..d09a4c899f685 100644
--- a/src/hotspot/share/opto/vtransform.cpp
+++ b/src/hotspot/share/opto/vtransform.cpp
@@ -144,6 +144,274 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const {
 }
 #endif
 
+// We use two comparisons, because a subtraction could underflow.
+#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \
+  if (a < b) { return -1; }                 \
+  if (a > b) { return  1; }
+
+// Helper-class for VTransformGraph::has_store_to_load_forwarding_failure.
+// It represents a memory region: [ptr, ptr + memory_size)
+class VMemoryRegion : public StackObj {
+private:
+  Node* _base;        // ptr = base + offset + invar + scale * iv
+  int _scale;
+  Node* _invar;
+  int _offset;
+  uint _memory_size;
+  bool _is_load;      // load or store?
+  uint _schedule_order;
+
+public:
+  VMemoryRegion() {} // empty constructor for GrowableArray
+  VMemoryRegion(const VPointer& vpointer, int iv_offset, int vector_length, uint schedule_order) :
+    _base(vpointer.base()),
+    _scale(vpointer.scale_in_bytes()),
+    _invar(vpointer.invar()),
+    _offset(vpointer.offset_in_bytes() + _scale * iv_offset),
+    _memory_size(vpointer.memory_size() * vector_length),
+    _is_load(vpointer.mem()->is_Load()),
+    _schedule_order(schedule_order) {}
+
+  Node* base()          const { return _base; }
+  int scale()           const { return _scale; }
+  Node* invar()         const { return _invar; }
+  int offset()          const { return _offset; }
+  uint memory_size()    const { return _memory_size; }
+  bool is_load()        const { return _is_load; }
+  uint schedule_order() const { return _schedule_order; }
+
+  static int cmp_for_sort_by_group(VMemoryRegion* r1, VMemoryRegion* r2) {
+    RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->base()->_idx, r2->base()->_idx);
+    RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->scale(),      r2->scale());
+    int r1_invar_idx = r1->invar() == nullptr ? 0 : r1->invar()->_idx;
+    int r2_invar_idx = r2->invar() == nullptr ? 0 : r2->invar()->_idx;
+    RETURN_CMP_VALUE_IF_NOT_EQUAL(r1_invar_idx,      r2_invar_idx);
+    return 0; // equal
+  }
+
+  static int cmp_for_sort(VMemoryRegion* r1, VMemoryRegion* r2) {
+    int cmp_group = cmp_for_sort_by_group(r1, r2);
+    if (cmp_group != 0) { return cmp_group; }
+
+    RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->offset(),     r2->offset());
+    return 0; // equal
+  }
+
+  enum Aliasing { DIFFERENT_GROUP, BEFORE, EXACT_OVERLAP, PARTIAL_OVERLAP, AFTER };
+
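+  // Classify how this memory region relates to 'other'. For example (illustration), within
+  // one group, a store covering [offset 0, offset 8) and a load covering [offset 4, offset 12)
+  // neither match exactly nor are disjoint -> PARTIAL_OVERLAP.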
+  Aliasing aliasing(VMemoryRegion& other) {
+    VMemoryRegion* p1 = this;
+    VMemoryRegion* p2 = &other;
+    if (cmp_for_sort_by_group(p1, p2) != 0) { return DIFFERENT_GROUP; }
+
+    jlong offset1 = p1->offset();
+    jlong offset2 = p2->offset();
+    jlong memory_size1 = p1->memory_size();
+    jlong memory_size2 = p2->memory_size();
+
+    if (offset1 >= offset2 + memory_size2) { return AFTER; }
+    if (offset2 >= offset1 + memory_size1) { return BEFORE; }
+    if (offset1 == offset2 && memory_size1 == memory_size2) { return EXACT_OVERLAP; }
+    return PARTIAL_OVERLAP;
+  }
+
+#ifndef PRODUCT
+  void print() const {
+    tty->print("VMemoryRegion[%s %dbytes, schedule_order(%4d), base",
+               _is_load ? "load " : "store", _memory_size, _schedule_order);
+    VPointer::print_con_or_idx(_base);
+    tty->print(" + offset(%4d)", _offset);
+    tty->print(" + invar");
+    VPointer::print_con_or_idx(_invar);
+    tty->print_cr(" + scale(%4d) * iv]", _scale);
+  }
+#endif
+};
+
+// Store-to-load-forwarding is a CPU memory optimization, where a load can directly fetch
+// its value from the store-buffer, rather than from the L1 cache. This is many CPU cycles
+// faster. However, this optimization comes with some restrictions, depending on the CPU.
+// Generally, store-to-load-forwarding works if the load and store memory regions match
+// exactly (same start and width). Partial overlaps are generally problematic, though
+// some CPUs can handle a subset of these cases. We conservatively assume that
+// all such partial overlaps lead to a store-to-load-forwarding failure, which means the
+// load has to stall until the store goes from the store-buffer into the L1 cache, incurring
+// a penalty of many CPU cycles.
+//
+// Example (with "iteration distance" 2):
+//   for (int i = 10; i < SIZE; i++) {
+//       aI[i] = aI[i - 2] + 1;
+//   }
+//
+//   load_4_bytes( ptr +  -8)
+//   store_4_bytes(ptr +   0)    *
+//   load_4_bytes( ptr +  -4)    |
+//   store_4_bytes(ptr +   4)    | *
+//   load_4_bytes( ptr +   0)  <-+ |
+//   store_4_bytes(ptr +   8)      |
+//   load_4_bytes( ptr +   4)  <---+
+//   store_4_bytes(ptr +  12)
+//   ...
+//
+//   In the scalar loop, we can forward the stores from 2 iterations back.
+//
+// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 2
+// example. This gives us this machine code:
+//   load_8_bytes( ptr +  -8)
+//   store_8_bytes(ptr +   0) |
+//   load_8_bytes( ptr +   0) v
+//   store_8_bytes(ptr +   8)   |
+//   load_8_bytes( ptr +   8)   v
+//   store_8_bytes(ptr +  16)
+//   ...
+//
+//   We packed 2 iterations, and the stores can perfectly forward to the loads of
+//   the next 2 iterations.
+//
+// Example (with "iteration distance" 3):
+//   for (int i = 10; i < SIZE; i++) {
+//       aI[i] = aI[i - 3] + 1;
+//   }
+//
+//   load_4_bytes( ptr + -12)
+//   store_4_bytes(ptr +   0)    *
+//   load_4_bytes( ptr +  -8)    |
+//   store_4_bytes(ptr +   4)    |
+//   load_4_bytes( ptr +  -4)    |
+//   store_4_bytes(ptr +   8)    |
+//   load_4_bytes( ptr +   0)  <-+
+//   store_4_bytes(ptr +  12)
+//   ...
+//
+//   In the scalar loop, we can forward the stores from 3 iterations back.
+//
+// Unfortunately, vectorization can introduce such store-to-load-forwarding failures.
+// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 3
+// example. This gives us this machine code:
+//   load_8_bytes( ptr + -12)
+//   store_8_bytes(ptr +   0)  |   |
+//   load_8_bytes( ptr +  -4)  x   |
+//   store_8_bytes(ptr +   8)     ||
+//   load_8_bytes( ptr +   4)     xx  <-- partial overlap with 2 stores
+//   store_8_bytes(ptr +  16)
+//   ...
+//
+// We see that eventually all loads are dependent on earlier stores, but the values cannot
+// be forwarded because there is some partial overlap.
+//
+// Preferably, we would have some latency-based cost-model that accounts for such forwarding
+// failures, and decide if vectorization with forwarding failures is still profitable. For
+// now we go with a simpler heuristic: we simply forbid vectorization if we can PROVE that
+// there will be a forwarding failure. This approach has at least 2 possible weaknesses:
+//
+//  (1) There may be forwarding failures in cases where we cannot prove it.
+//      Example:
+//        for (int i = 10; i < SIZE; i++) {
+//            bI[i] = aI[i - 3] + 1;
+//        }
+//
+//      We do not know if aI and bI refer to the same array or not. However, it is reasonable
+//      to assume that two different array references most likely refer to different arrays
+//      (i.e. no aliasing), in which case there are no forwarding failures.
+//  (2) There could be some loops where vectorization introduces forwarding failures, and thus
+//      the latency of the loop body is high, but this does not matter because it is dominated
+//      by other latency/throughput based costs in the loop body.
+//
+// Performance measurements with the JMH benchmark VectorStoreToLoadForwarding.java have indicated
+// that there is some iteration threshold: if the failure happens between a store and load that
+// have an iteration distance below this threshold, the latency is the limiting factor, and we
+// should not vectorize to avoid the latency penalty of store-to-load-forwarding failures. If
+// the iteration distance is larger than this threshold, the throughput is the limiting factor,
+// and we should vectorize in these cases to improve throughput.
+//
+bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const {
+  if (SuperWordStoreToLoadForwardingFailureDetection == 0) { return false; }
+
+  // Collect all pointers for scalar and vector loads/stores.
+  ResourceMark rm;
+  GrowableArray<VMemoryRegion> memory_regions;
+
+  // To detect store-to-load-forwarding failures at the iteration threshold or below, we
+  // simulate a super-unrolling to reach at least SuperWordStoreToLoadForwardingFailureDetection
+  // iterations. This is a heuristic, and we are not trying to be very precise
+  // with the iteration distance. If we have already unrolled more than the iteration
+  // threshold, i.e. if "SuperWordStoreToLoadForwardingFailureDetection < unrolled_count",
+  // then we simply check if there are any store-to-load-forwarding failures in the unrolled
+  // loop body, which may be at a larger distance than the desired threshold. We cannot do any
+  // more fine-grained analysis, because the unrolling has lost the information about the
+  // iteration distance.
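+  // For example (illustration): with SuperWordStoreToLoadForwardingFailureDetection=16 and an
+  // unrolled_count of 4, we simulate MAX2(1, 16 / 4) = 4 copies of the already 4x unrolled
+  // loop body, i.e. roughly 16 original iterations.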
+  int simulated_unrolling_count = SuperWordStoreToLoadForwardingFailureDetection;
+  int unrolled_count = vloop_analyzer.vloop().cl()->unrolled_count();
+  uint simulated_super_unrolling_count = MAX2(1, simulated_unrolling_count / unrolled_count);
+  int iv_stride = vloop_analyzer.vloop().iv_stride();
+  int schedule_order = 0;
+  for (uint k = 0; k < simulated_super_unrolling_count; k++) {
+    int iv_offset = k * iv_stride; // virtual super-unrolling
+    for (int i = 0; i < _schedule.length(); i++) {
+      VTransformNode* vtn = _schedule.at(i);
+      if (vtn->is_load_or_store_in_loop()) {
+        const VPointer& p = vtn->vpointer(vloop_analyzer);
+        if (p.valid()) {
+          VTransformVectorNode* vector = vtn->isa_Vector();
+          uint vector_length = vector != nullptr ? vector->nodes().length() : 1;
+          memory_regions.push(VMemoryRegion(p, iv_offset, vector_length, schedule_order++));
+        }
+      }
+    }
+  }
+
+  // Sort the pointers by group (same base, scale and invar), and then by offset.
+  memory_regions.sort(VMemoryRegion::cmp_for_sort);
+
+#ifndef PRODUCT
+  if (_trace._verbose) {
+    tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:");
+    tty->print_cr("  simulated_unrolling_count = %d", simulated_unrolling_count);
+    tty->print_cr("  simulated_super_unrolling_count = %d", simulated_super_unrolling_count);
+    for (int i = 0; i < memory_regions.length(); i++) {
+      VMemoryRegion& region = memory_regions.at(i);
+      region.print();
+    }
+  }
+#endif
+
+  // For all pairs of pointers in the same group, check if they have a partial overlap.
+  for (int i = 0; i < memory_regions.length(); i++) {
+    VMemoryRegion& region1 = memory_regions.at(i);
+
+    for (int j = i + 1; j < memory_regions.length(); j++) {
+      VMemoryRegion& region2 = memory_regions.at(j);
+
+      const VMemoryRegion::Aliasing aliasing = region1.aliasing(region2);
+      if (aliasing == VMemoryRegion::Aliasing::DIFFERENT_GROUP ||
+          aliasing == VMemoryRegion::Aliasing::BEFORE) {
+        break; // We have reached the next group, or all remaining regions lie fully after region1.
+      } else if (aliasing == VMemoryRegion::Aliasing::EXACT_OVERLAP) {
+        continue;
+      } else {
+        assert(aliasing == VMemoryRegion::Aliasing::PARTIAL_OVERLAP, "no other case can happen");
+        if ((region1.is_load() && !region2.is_load() && region1.schedule_order() > region2.schedule_order()) ||
+            (!region1.is_load() && region2.is_load() && region1.schedule_order() < region2.schedule_order())) {
+          // We predict that this leads to a store-to-load-forwarding failure penalty.
+#ifndef PRODUCT
+          if (_trace._rejections) {
+            tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:");
+            tty->print_cr("  Partial overlap of store->load. We predict that this leads to");
+            tty->print_cr("  a store-to-load-forwarding failure penalty which makes");
+            tty->print_cr("  vectorization unprofitable. These are the two pointers:");
+            region1.print();
+            region2.print();
+          }
+#endif
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
 Node* VTransformNode::find_transformed_input(int i, const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
   Node* n = vnode_idx_to_transformed_node.at(in(i)->_idx);
   assert(n != nullptr, "must find input IR node");
diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp
index ee298e7fe723f..8ceca318f4ae1 100644
--- a/src/hotspot/share/opto/vtransform.hpp
+++ b/src/hotspot/share/opto/vtransform.hpp
@@ -66,6 +66,8 @@ class VTransformVectorNode;
 class VTransformElementWiseVectorNode;
 class VTransformBoolVectorNode;
 class VTransformReductionVectorNode;
+class VTransformLoadVectorNode;
+class VTransformStoreVectorNode;
 
 // Result from VTransformNode::apply
 class VTransformApplyResult {
@@ -157,6 +159,7 @@ class VTransformGraph : public StackObj {
   const GrowableArray<VTransformNode*>& vtnodes() const { return _vtnodes; }
 
   bool schedule();
+  bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
   void apply_memops_reordering_with_schedule() const;
   void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
 
@@ -221,6 +224,7 @@ class VTransform : public StackObj {
   VTransformGraph& graph() { return _graph; }
 
   bool schedule() { return _graph.schedule(); }
+  bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); }
   void apply();
 
 private:
@@ -310,6 +314,11 @@ class VTransformNode : public ArenaObj {
   virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() { return nullptr; }
   virtual VTransformBoolVectorNode* isa_BoolVector() { return nullptr; }
   virtual VTransformReductionVectorNode* isa_ReductionVector() { return nullptr; }
+  virtual VTransformLoadVectorNode* isa_LoadVector() { return nullptr; }
+  virtual VTransformStoreVectorNode* isa_StoreVector() { return nullptr; }
+
+  virtual bool is_load_or_store_in_loop() const { return false; }
+  virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const { ShouldNotReachHere(); }
 
   virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
                                       const GrowableArray<Node*>& vnode_idx_to_transformed_node) const = 0;
@@ -333,6 +342,8 @@ class VTransformScalarNode : public VTransformNode {
     VTransformNode(vtransform, n->req()), _node(n) {}
   Node* node() const { return _node; }
   virtual VTransformScalarNode* isa_Scalar() override { return this; }
+  virtual bool is_load_or_store_in_loop() const override { return _node->is_Load() || _node->is_Store(); }
+  virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(node()->as_Mem()); }
   virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
                                       const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
   NOT_PRODUCT(virtual const char* name() const override { return "Scalar"; };)
@@ -347,6 +358,7 @@ class VTransformInputScalarNode : public VTransformScalarNode {
   VTransformInputScalarNode(VTransform& vtransform, Node* n) :
     VTransformScalarNode(vtransform, n) {}
   virtual VTransformInputScalarNode* isa_InputScalar() override { return this; }
+  virtual bool is_load_or_store_in_loop() const override { return false; }
   NOT_PRODUCT(virtual const char* name() const override { return "InputScalar"; };)
 };
 
@@ -472,6 +484,9 @@ class VTransformLoadVectorNode : public VTransformVectorNode {
   VTransformLoadVectorNode(VTransform& vtransform, uint number_of_nodes) :
     VTransformVectorNode(vtransform, 3, number_of_nodes) {}
   LoadNode::ControlDependency control_dependency() const;
+  virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; }
+  virtual bool is_load_or_store_in_loop() const override { return true; }
+  virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); }
   virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
                                       const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
   NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };)
@@ -482,6 +497,9 @@ class VTransformStoreVectorNode : public VTransformVectorNode {
   // req = 4 -> [ctrl, mem, adr, val]
   VTransformStoreVectorNode(VTransform& vtransform, uint number_of_nodes) :
     VTransformVectorNode(vtransform, 4, number_of_nodes) {}
+  virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; }
+  virtual bool is_load_or_store_in_loop() const override { return true; }
+  virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); }
   virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
                                       const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
   NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };)
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java
index efd328dc5cce9..60d753ee75f6b 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java
@@ -168,6 +168,9 @@ public TestAlignVector() {
         tests.put("test14aB",    () -> { return test14aB(aB.clone()); });
         tests.put("test14bB",    () -> { return test14bB(aB.clone()); });
         tests.put("test14cB",    () -> { return test14cB(aB.clone()); });
+        tests.put("test14dB",    () -> { return test14dB(aB.clone()); });
+        tests.put("test14eB",    () -> { return test14eB(aB.clone()); });
+        tests.put("test14fB",    () -> { return test14fB(aB.clone()); });
 
         tests.put("test15aB",    () -> { return test15aB(aB.clone()); });
         tests.put("test15bB",    () -> { return test15bB(aB.clone()); });
@@ -239,6 +242,9 @@ public TestAlignVector() {
                  "test14aB",
                  "test14bB",
                  "test14cB",
+                 "test14dB",
+                 "test14eB",
+                 "test14fB",
                  "test15aB",
                  "test15bB",
                  "test15cB",
@@ -1128,9 +1134,9 @@ static Object[] test13bBSIL(byte[] a, short[] b, int[] c, long[] d) {
     }
 
     @Test
-    @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
-                  IRNode.ADD_VB, "> 0",
-                  IRNode.STORE_VECTOR, "> 0"},
+    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
+                  IRNode.ADD_VB, "= 0",
+                  IRNode.STORE_VECTOR, "= 0"},
         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIfPlatform = {"64-bit", "true"},
         applyIf = {"AlignVector", "false"})
@@ -1143,6 +1149,9 @@ static Object[] test13bBSIL(byte[] a, short[] b, int[] c, long[] d) {
     static Object[] test14aB(byte[] a) {
         // non-power-of-2 stride
         for (int i = 0; i < RANGE-20; i+=9) {
+            // Since the stride is shorter than the vector length, there is always a
+            // partial overlap of loads with previous stores. This leads to
+            // store-to-load-forwarding failures -> vectorization not profitable.
             a[i+0]++;
             a[i+1]++;
             a[i+2]++;
@@ -1164,9 +1173,9 @@ static Object[] test14aB(byte[] a) {
     }
 
     @Test
-    @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
-                  IRNode.ADD_VB, "> 0",
-                  IRNode.STORE_VECTOR, "> 0"},
+    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
+                  IRNode.ADD_VB, "= 0",
+                  IRNode.STORE_VECTOR, "= 0"},
         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIfPlatform = {"64-bit", "true"},
         applyIf = {"AlignVector", "false"})
@@ -1179,6 +1188,9 @@ static Object[] test14aB(byte[] a) {
     static Object[] test14bB(byte[] a) {
         // non-power-of-2 stride
         for (int i = 0; i < RANGE-20; i+=3) {
+            // Since the stride is shorter than the vector length, there is always a
+            // partial overlap of loads with previous stores. This leads to
+            // store-to-load-forwarding failures -> vectorization not profitable.
             a[i+0]++;
             a[i+1]++;
             a[i+2]++;
@@ -1200,9 +1212,9 @@ static Object[] test14bB(byte[] a) {
     }
 
     @Test
-    @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
-                  IRNode.ADD_VB, "> 0",
-                  IRNode.STORE_VECTOR, "> 0"},
+    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
+                  IRNode.ADD_VB, "= 0",
+                  IRNode.STORE_VECTOR, "= 0"},
         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIfPlatform = {"64-bit", "true"},
         applyIf = {"AlignVector", "false"})
@@ -1215,6 +1227,9 @@ static Object[] test14bB(byte[] a) {
     static Object[] test14cB(byte[] a) {
         // non-power-of-2 stride
         for (int i = 0; i < RANGE-20; i+=5) {
+            // Since the stride is shorter than the vector length, there is always a
+            // partial overlap of loads with previous stores. This leads to
+            // store-to-load-forwarding failures -> vectorization not profitable.
             a[i+0]++;
             a[i+1]++;
             a[i+2]++;
@@ -1235,6 +1250,90 @@ static Object[] test14cB(byte[] a) {
         return new Object[]{ a };
     }
 
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
+                  IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
+                  IRNode.STORE_VECTOR,                                           "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
+        applyIfPlatform = {"64-bit", "true"},
+        applyIf = {"AlignVector", "false"})
+    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
+                  IRNode.ADD_VB, "= 0",
+                  IRNode.STORE_VECTOR, "= 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
+        applyIfPlatform = {"64-bit", "true"},
+        applyIf = {"AlignVector", "true"})
+    static Object[] test14dB(byte[] a) {
+        // non-power-of-2 stride
+        for (int i = 0; i < RANGE-20; i+=9) {
+            a[i+0]++;
+            a[i+1]++;
+            a[i+2]++;
+            a[i+3]++;
+            a[i+4]++;
+            a[i+5]++;
+            a[i+6]++;
+            a[i+7]++;
+        }
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
+                  IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
+                  IRNode.STORE_VECTOR,                                           "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
+        applyIfPlatform = {"64-bit", "true"},
+        applyIf = {"AlignVector", "false"})
+    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
+                  IRNode.ADD_VB, "= 0",
+                  IRNode.STORE_VECTOR, "= 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
+        applyIfPlatform = {"64-bit", "true"},
+        applyIf = {"AlignVector", "true"})
+    static Object[] test14eB(byte[] a) {
+        // non-power-of-2 stride
+        for (int i = 0; i < RANGE-32; i+=11) {
+            a[i+0]++;
+            a[i+1]++;
+            a[i+2]++;
+            a[i+3]++;
+            a[i+4]++;
+            a[i+5]++;
+            a[i+6]++;
+            a[i+7]++;
+        }
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
+                  IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
+                  IRNode.STORE_VECTOR,                                           "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
+        applyIfPlatform = {"64-bit", "true"},
+        applyIf = {"AlignVector", "false"})
+    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
+                  IRNode.ADD_VB, "= 0",
+                  IRNode.STORE_VECTOR, "= 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
+        applyIfPlatform = {"64-bit", "true"},
+        applyIf = {"AlignVector", "true"})
+    static Object[] test14fB(byte[] a) {
+        // non-power-of-2 stride
+        for (int i = 0; i < RANGE-40; i+=12) {
+            a[i+0]++;
+            a[i+1]++;
+            a[i+2]++;
+            a[i+3]++;
+            a[i+4]++;
+            a[i+5]++;
+            a[i+6]++;
+            a[i+7]++;
+        }
+        return new Object[]{ a };
+    }
+
     @Test
     // IR rules difficult because of modulo wrapping with offset after peeling.
     static Object[] test15aB(byte[] a) {
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestCyclicDependency.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestCyclicDependency.java
index 3849f1b05cf27..7c6b7c92c379d 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/TestCyclicDependency.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestCyclicDependency.java
@@ -24,7 +24,7 @@
 
 /*
  * @test
- * @bug 8298935
+ * @bug 8298935 8334431
  * @summary Writing forward on array creates cyclic dependency
  *          which leads to wrong result, when ignored.
  * @library /test/lib /
@@ -55,15 +55,30 @@ public class TestCyclicDependency {
     float[] goldF6a = new float[RANGE];
     int[] goldI6b = new int[RANGE];
     float[] goldF6b = new float[RANGE];
-    int[] goldI7 = new int[RANGE];
-    float[] goldF7 = new float[RANGE];
-    int[] goldI8 = new int[RANGE];
-    float[] goldF8 = new float[RANGE];
+    int[] goldI7a = new int[RANGE];
+    float[] goldF7a = new float[RANGE];
+    int[] goldI7b = new int[RANGE];
+    float[] goldF7b = new float[RANGE];
+    float[] goldF7b_2 = new float[RANGE];
+    int[] goldI7c = new int[RANGE];
+    float[] goldF7c = new float[RANGE];
+    int[] goldI8a = new int[RANGE];
+    float[] goldF8a = new float[RANGE];
+    int[] goldI8b = new int[RANGE];
+    int[] goldI8b_2 = new int[RANGE];
+    float[] goldF8b = new float[RANGE];
+    int[] goldI8c = new int[RANGE];
+    float[] goldF8c = new float[RANGE];
     int[] goldI9 = new int[RANGE];
     float[] goldF9 = new float[RANGE];
 
     public static void main(String args[]) {
-        TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*");
+        TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*",
+                                   "-XX:+IgnoreUnrecognizedVMOptions", "-XX:-AlignVector", "-XX:-VerifyAlignVector");
+        TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*",
+                                   "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+AlignVector", "-XX:-VerifyAlignVector");
+        TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*",
+                                   "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+AlignVector", "-XX:+VerifyAlignVector");
     }
 
     TestCyclicDependency() {
@@ -95,12 +110,24 @@ public static void main(String args[]) {
         // test6b
         init(goldI6b, goldF6b);
         test6b(goldI6b, goldF6b);
-        // test7
-        init(goldI7, goldF7);
-        test7(goldI7, goldF7);
-        // test8
-        init(goldI8, goldF8);
-        test8(goldI8, goldF8);
+        // test7a
+        init(goldI7a, goldF7a);
+        test7a(goldI7a, goldF7a);
+        // test7b
+        init(goldI7b, goldF7b, goldF7b_2);
+        test7b(goldI7b, goldF7b, goldF7b_2);
+        // test7c
+        init(goldI7c, goldF7c);
+        test7c(goldI7c, goldF7c, goldF7c);
+        // test8a
+        init(goldI8a, goldF8a);
+        test8a(goldI8a, goldF8a);
+        // test8b
+        init(goldI8b, goldI8b_2, goldF8b);
+        test8b(goldI8b, goldI8b_2, goldF8b);
+        // test8c
+        init(goldI8c, goldF8c);
+        test8c(goldI8c, goldI8c, goldF8c);
         // test9
         init(goldI9, goldF9);
         test9(goldI9, goldF9);
@@ -205,26 +232,74 @@ public void runTest6b() {
         verifyF("test6b", dataF, goldF6b);
     }
 
-    @Run(test = "test7")
+    @Run(test = "test7a")
     @Warmup(100)
-    public void runTest7() {
+    public void runTest7a() {
         int[] dataI = new int[RANGE];
         float[] dataF = new float[RANGE];
         init(dataI, dataF);
-        test7(dataI, dataF);
-        verifyI("test7", dataI, goldI7);
-        verifyF("test7", dataF, goldF7);
+        test7a(dataI, dataF);
+        verifyI("test7a", dataI, goldI7a);
+        verifyF("test7a", dataF, goldF7a);
     }
 
-    @Run(test = "test8")
+    @Run(test = "test7b")
     @Warmup(100)
-    public void runTest8() {
+    public void runTest7b() {
+        int[] dataI = new int[RANGE];
+        float[] dataF = new float[RANGE];
+        float[] dataF_2 = new float[RANGE];
+        init(dataI, dataF, dataF_2);
+        test7b(dataI, dataF, dataF_2);
+        verifyI("test7b", dataI, goldI7b);
+        verifyF("test7b", dataF, goldF7b);
+        verifyF("test7b", dataF_2, goldF7b_2);
+    }
+
+    @Run(test = "test7c")
+    @Warmup(100)
+    public void runTest7c() {
+        int[] dataI = new int[RANGE];
+        float[] dataF = new float[RANGE];
+        init(dataI, dataF);
+        test7c(dataI, dataF, dataF);
+        verifyI("test7c", dataI, goldI7c);
+        verifyF("test7c", dataF, goldF7c);
+    }
+
+    @Run(test = "test8a")
+    @Warmup(100)
+    public void runTest8a() {
+        int[] dataI = new int[RANGE];
+        float[] dataF = new float[RANGE];
+        init(dataI, dataF);
+        test8a(dataI, dataF);
+        verifyI("test8a", dataI, goldI8a);
+        verifyF("test8a", dataF, goldF8a);
+    }
+
+    @Run(test = "test8b")
+    @Warmup(100)
+    public void runTest8b() {
+        int[] dataI = new int[RANGE];
+        int[] dataI_2 = new int[RANGE];
+        float[] dataF = new float[RANGE];
+        init(dataI, dataI_2, dataF);
+        test8b(dataI, dataI_2, dataF);
+        verifyI("test8b", dataI, goldI8b);
+        verifyI("test8b", dataI_2, goldI8b_2);
+        verifyF("test8b", dataF, goldF8b);
+    }
+
+    @Run(test = "test8c")
+    @Warmup(100)
+    public void runTest8c() {
         int[] dataI = new int[RANGE];
         float[] dataF = new float[RANGE];
         init(dataI, dataF);
-        test8(dataI, dataF);
-        verifyI("test8", dataI, goldI8);
-        verifyF("test8", dataF, goldF8);
+        test8c(dataI, dataI, dataF);
+        verifyI("test8c", dataI, goldI8c);
+        verifyF("test8c", dataF, goldF8c);
     }
 
     @Run(test = "test9")
@@ -328,34 +403,156 @@ static void test6b(int[] dataI, float[] dataF) {
     }
 
     @Test
-    @IR(counts = {IRNode.ADD_VI, "> 0"},
+    @IR(counts = {IRNode.ADD_VI, "= 0",
+                  IRNode.ADD_VF, "= 0"},
         applyIf = {"AlignVector", "false"},
         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
+    @IR(counts = {IRNode.ADD_VI, "> 0",
+                  IRNode.ADD_VF, "= 0"},
+        applyIf = {"AlignVector", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
     // Some aarch64 machines have AlignVector == true, like ThunderX2
-    static void test7(int[] dataI, float[] dataF) {
+    static void test7a(int[] dataI, float[] dataF) {
         for (int i = 0; i < RANGE - 32; i++) {
             // write forward 32 -> more than vector size -> can vectorize
-            // write forward 3 -> cannot vectorize
-            // separate types should make decision separately if they vectorize or not
             int v = dataI[i];
             dataI[i + 32] = v + 5;
+            // write forward 3:
+            //   AlignVector=true -> cannot vectorize because load and store cannot both be aligned
+            //   AlignVector=false -> could vectorize, but would get 2-element vectors where
+            //                        store-to-load-forwarding fails, because we have store-load
+            //                        dependencies that have partial overlap.
+            //                        -> all vectorization cancelled.
             float f = dataF[i];
             dataF[i + 3] = f + 3.5f;
         }
     }
 
     @Test
-    @IR(counts = {IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
+    @IR(counts = {IRNode.ADD_VI, "> 0",
+                  IRNode.ADD_VF, IRNode.VECTOR_SIZE + "2", "> 0"},
+        applyIf = {"AlignVector", "false"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
+    @IR(counts = {IRNode.ADD_VI, "> 0",
+                  IRNode.ADD_VF, "= 0"},
+        applyIf = {"AlignVector", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
+    // Some aarch64 machines have AlignVector == true, like ThunderX2
+    static void test7b(int[] dataI, float[] dataF, float[] dataF_2) {
+        for (int i = 0; i < RANGE - 32; i++) {
+            // write forward 32 -> more than vector size -> can vectorize
+            int v = dataI[i];
+            dataI[i + 32] = v + 5;
+            // write forward 3 to different array reference:
+            //   AlignVector=true -> cannot vectorize because load and store cannot both be aligned
+            //   AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding
+            //                        failure. But we can only have 2-element vectors in case
+            //                        the two float-arrays reference the same array.
+            //                        Note: at runtime the float-arrays are always different.
+            float f = dataF[i];
+            dataF_2[i + 3] = f + 3.5f;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.ADD_VI, "> 0",
+                  IRNode.ADD_VF, IRNode.VECTOR_SIZE + "2", "> 0"},
         applyIf = {"AlignVector", "false"},
         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
+    @IR(counts = {IRNode.ADD_VI, "> 0",
+                  IRNode.ADD_VF, "= 0"},
+        applyIf = {"AlignVector", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
     // Some aarch64 machines have AlignVector == true, like ThunderX2
-    static void test8(int[] dataI, float[] dataF) {
+    static void test7c(int[] dataI, float[] dataF, float[] dataF_2) {
         for (int i = 0; i < RANGE - 32; i++) {
             // write forward 32 -> more than vector size -> can vectorize
-            // write forward 3 -> cannot vectorize
-            // separate types should make decision separately if they vectorize or not
+            int v = dataI[i];
+            dataI[i + 32] = v + 5;
+            // write forward 3 to different array reference:
+            //   AlignVector=true -> cannot vectorize because load and store cannot both be aligned
+            //   AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding
+            //                        failure. But we can only have 2-element vectors in case
+            //                        the two float-arrays reference the same array.
+            //                        Note: at runtime the float-arrays are always the same.
+            float f = dataF[i];
+            dataF_2[i + 3] = f + 3.5f;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.ADD_VI, "= 0",
+                  IRNode.ADD_VF, "= 0"},
+        applyIf = {"AlignVector", "false"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
+    @IR(counts = {IRNode.ADD_VI, "= 0",
+                  IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
+        applyIf = {"AlignVector", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
+    // Some aarch64 machines have AlignVector == true, like ThunderX2
+    static void test8a(int[] dataI, float[] dataF) {
+        for (int i = 0; i < RANGE - 32; i++) {
+            // write forward 3:
+            //   AlignVector=true -> cannot vectorize because load and store cannot both be aligned
+            //   AlignVector=false -> could vectorize, but would get 2-element vectors where
+            //                        store-to-load-forwarding fails, because we have store-load
+            //                        dependencies that have partial overlap.
+            //                        -> all vectorization cancelled.
             int v = dataI[i];
             dataI[i + 3] = v + 5;
+            // write forward 32 -> more than vector size -> can vectorize
+            float f = dataF[i];
+            dataF[i + 32] = f + 3.5f;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.ADD_VI, IRNode.VECTOR_SIZE + "2", "> 0",
+                  IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
+        applyIf = {"AlignVector", "false"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
+    @IR(counts = {IRNode.ADD_VI, "= 0",
+                  IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
+        applyIf = {"AlignVector", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
+    // Some aarch64 machines have AlignVector == true, like ThunderX2
+    static void test8b(int[] dataI, int[] dataI_2, float[] dataF) {
+        for (int i = 0; i < RANGE - 32; i++) {
+            // write forward 3 to different array reference:
+            //   AlignVector=true -> cannot vectorize because load and store cannot both be aligned
+            //   AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding
+            //                        failure. But we can only have 2-element vectors in case
+            //                        the two int-arrays reference the same array.
+            //                        Note: at runtime the int-arrays are always different.
+            int v = dataI[i];
+            dataI_2[i + 3] = v + 5;
+            // write forward 32 -> more than vector size -> can vectorize
+            float f = dataF[i];
+            dataF[i + 32] = f + 3.5f;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.ADD_VI, IRNode.VECTOR_SIZE + "2", "> 0",
+                  IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
+        applyIf = {"AlignVector", "false"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
+    @IR(counts = {IRNode.ADD_VI, "= 0",
+                  IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
+        applyIf = {"AlignVector", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
+    // Some aarch64 machines have AlignVector == true, like ThunderX2
+    static void test8c(int[] dataI, int[] dataI_2, float[] dataF) {
+        for (int i = 0; i < RANGE - 32; i++) {
+            // write forward 3 to different array reference:
+            //   AlignVector=true -> cannot vectorize because load and store cannot both be aligned
+            //   AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding
+            //                        failure. But we can only have 2-element vectors in case
+            //                        the two int-arrays reference the same array.
+            //                        Note: at runtime the int-arrays are always the same.
+            int v = dataI[i];
+            dataI_2[i + 3] = v + 5;
+            // write forward 32 -> more than vector size -> can vectorize
             float f = dataF[i];
             dataF[i + 32] = f + 3.5f;
         }
@@ -380,6 +577,22 @@ public static void init(int[] dataI, float[] dataF) {
         }
     }
 
+    public static void init(int[] dataI, float[] dataF, float[] dataF_2) {
+        for (int j = 0; j < RANGE; j++) {
+            dataI[j] = j;
+            dataF[j] = j * 0.5f;
+            dataF_2[j] = j * 0.3f;
+        }
+    }
+
+    public static void init(int[] dataI, int[] dataI_2, float[] dataF) {
+        for (int j = 0; j < RANGE; j++) {
+            dataI[j] = j;
+            dataI_2[j] = 3*j - 42;
+            dataF[j] = j * 0.5f;
+        }
+    }
+
     static void verifyI(String name, int[] data, int[] gold) {
         for (int i = 0; i < RANGE; i++) {
             if (data[i] != gold[i]) {
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java
index 8e5ac88a27da4..cfa19ce385a80 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java
@@ -643,6 +643,12 @@ static List<Integer> getOffsets() {
         return new ArrayList<Integer>(set);
     }
 
+    enum ExpectVectorization {
+        ALWAYS,    // -> positive "count" IR rule
+        UNKNOWN,   // -> disable IR rule
+        NEVER      // -> negative "failOn" IR rule
+    };
+
     static record TestDefinition (int id, Type type, int offset) {
 
         /*
@@ -656,18 +662,22 @@ String generate() {
             String aliasingComment;
             String secondArgument;
             String loadFrom;
+            boolean isSingleArray;
             switch (RANDOM.nextInt(3)) {
             case 0: // a[i + offset] = a[i]
+                isSingleArray = true;
                 aliasingComment = "single-array";
                 secondArgument = "a";
                 loadFrom = "a";
                 break;
             case 1: // a[i + offset] = b[i], but a and b alias, i.e. at runtime a == b.
+                isSingleArray = false;
                 aliasingComment = "aliasing";
                 secondArgument = "a";
                 loadFrom = "b";
                 break;
             case 2: // a[i + offset] = b[i], and a and b do not alias, i.e. at runtime a != b.
+                isSingleArray = false;
                 aliasingComment = "non-aliasing";
                 secondArgument = "b";
                 loadFrom = "b";
@@ -712,7 +722,7 @@ String generate() {
                    type.name, id, type.name,
                    id, id, id, id, secondArgument, id,
                    // IR rules
-                   generateIRRules(),
+                   generateIRRules(isSingleArray),
                    // test
                    id, type.name, type.name,
                    start, end,
@@ -726,7 +736,7 @@ String generate() {
          * expect depends on AlignVector and MaxVectorSize, as well as the byteOffset between the load and
          * store.
          */
-        String generateIRRules() {
+        String generateIRRules(boolean isSingleArray) {
             StringBuilder builder = new StringBuilder();
 
             for (CPUMinVectorWidth cm : getCPUMinVectorWidth(type.name)) {
@@ -744,29 +754,75 @@ String generateIRRules() {
                 // power of two.
                 int infinity = 256; // No vector size is ever larger than this.
                 int maxVectorWidth = infinity; // no constraint by default
+                int log2 = 31 - Integer.numberOfLeadingZeros(offset);
+                int floorPow2Offset = 1 << log2;
                 if (0 < byteOffset && byteOffset < maxVectorWidth) {
-                    int log2 = 31 - Integer.numberOfLeadingZeros(offset);
-                    int floorPow2 = 1 << log2;
-                    maxVectorWidth = Math.min(maxVectorWidth, floorPow2 * type.size);
-                    builder.append("    // Vectors must have at most " + floorPow2 +
+                    maxVectorWidth = Math.min(maxVectorWidth, floorPow2Offset * type.size);
+                    builder.append("    // Vectors must have at most " + floorPow2Offset +
                                    " elements: maxVectorWidth = " + maxVectorWidth +
                                    " to avoid cyclic dependency.\n");
                 }
 
+                ExpectVectorization expectVectorization = ExpectVectorization.ALWAYS;
+                if (isSingleArray && 0 < offset && offset < 64) {
+                    // In a store-forward case where the iteration distance is below a certain threshold, and where
+                    // there is some partial overlap between the expected vector store and some vector load in a later
+                    // iteration, we reject vectorization to avoid the latency penalties of store-to-load
+                    // forwarding failures. We only detect these failures in single-array cases.
+                    //
+                    // Note: we currently never detect store-to-load-forwarding failures beyond 64 iterations,
+                    //       and so if the offset >= 64, we always expect vectorization.
+                    //
+                    // The condition for partial overlap:
+                    //   offset % #elements != 0
+                    //
+                    // But we do not know #elements exactly, only a range from min/maxVectorWidth.
+
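+                    // For example (illustration): for an int array (type.size = 4) with offset = 3
+                    // and minVectorWidth = 8, we get minElements = 2 and 3 % 2 != 0, i.e. a partial
+                    // overlap for every possible vector length.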
+                    int maxElements = maxVectorWidth / type.size;
+                    int minElements = minVectorWidth / type.size;
+                    boolean sometimesPartialOverlap = offset % maxElements != 0;
+                    // If offset % minElements != 0, then the same holds for any larger (power-of-2) vector length.
+                    boolean alwaysPartialOverlap = offset % minElements != 0;
+
+                    if (alwaysPartialOverlap) {
+                        // It is a little tricky to know the exact threshold. On all platforms and in all
+                        // unrolling cases, it is between 8 and 64. Hence, we have these 3 cases:
+                        if (offset <= 8) {
+                            builder.append("    // We always detect store-to-load-forwarding failures -> never vectorize.\n");
+                            expectVectorization = ExpectVectorization.NEVER;
+                        } else if (offset <= 64) {
+                            builder.append("    // Unknown whether we detect store-to-load-forwarding failures -> maybe disable IR rules.\n");
+                            expectVectorization = ExpectVectorization.UNKNOWN;
+                        } else {
+                            // offset > 64  -> offset too large, expect no store-to-load-failure detection
+                            throw new RuntimeException("impossible");
+                        }
+                    } else if (sometimesPartialOverlap && !alwaysPartialOverlap) {
+                        builder.append("    // Partial overlap condition true: sometimes but not always -> maybe disable IR rules.\n");
+                        expectVectorization = ExpectVectorization.UNKNOWN;
+                    } else {
+                        builder.append("    // Partial overlap never happens -> expect vectorization.\n");
+                        expectVectorization = ExpectVectorization.ALWAYS;
+                    }
+                }
+
                 // Rule 1: No strict alignment: -XX:-AlignVector
+                ExpectVectorization expectVectorization1 = expectVectorization;
                 IRRule r1 = new IRRule(type, type.irNode, applyIfCPUFeature);
                 r1.addApplyIf("\"AlignVector\", \"false\"");
                 r1.addApplyIf("\"MaxVectorSize\", \">=" + minVectorWidth + "\"");
 
                 if (maxVectorWidth < minVectorWidth) {
                     builder.append("    // maxVectorWidth < minVectorWidth -> expect no vectorization.\n");
-                    r1.setNegative();
+                    expectVectorization1 = ExpectVectorization.NEVER;
                 } else if (maxVectorWidth < infinity) {
                     r1.setSize("min(" + (maxVectorWidth / type.size) + ",max_" + type.name + ")");
                 }
+                r1.setExpectVectorization(expectVectorization1);
                 r1.generate(builder);
 
                 // Rule 2: strict alignment: -XX:+AlignVector
+                ExpectVectorization expectVectorization2 = expectVectorization;
                 IRRule r2 = new IRRule(type, type.irNode, applyIfCPUFeature);
                 r2.addApplyIf("\"AlignVector\", \"true\"");
                 r2.addApplyIf("\"MaxVectorSize\", \">=" + minVectorWidth + "\"");
@@ -791,18 +847,23 @@ String generateIRRules() {
                     builder.append("    // byteOffset % awMax == 0   -> always trivially aligned\n");
                 } else if (byteOffset % awMin != 0) {
                     builder.append("    // byteOffset % awMin != 0   -> can never align -> expect no vectorization.\n");
-                    r2.setNegative();
+                    expectVectorization2 = ExpectVectorization.NEVER;
                 } else {
-                    builder.append("    // Alignment unknown -> disable IR rule.\n");
-                    r2.disable();
+                    if (expectVectorization2 != ExpectVectorization.NEVER) {
+                        builder.append("    // Alignment unknown -> disable IR rule.\n");
+                        expectVectorization2 = ExpectVectorization.UNKNOWN;
+                    } else {
+                        builder.append("    // Alignment unknown -> but already proved no vectorization above.\n");
+                    }
                 }
 
                 if (maxVectorWidth < minVectorWidth) {
                     builder.append("    // Not at least 2 elements or 4 bytes -> expect no vectorization.\n");
-                    r2.setNegative();
+                    expectVectorization2 = ExpectVectorization.NEVER;
                 } else if (maxVectorWidth < infinity) {
                     r2.setSize("min(" + (maxVectorWidth / type.size) + ",max_" + type.name + ")");
                 }
+                r2.setExpectVectorization(expectVectorization2);
                 r2.generate(builder);
             }
             return builder.toString();
@@ -846,12 +907,12 @@ void setSize(String size) {
             this.size = size;
         }
 
-        void setNegative() {
-            this.isPositiveRule = false;
-        }
-
-        void disable() {
-            this.isEnabled = false;
+        void setExpectVectorization(ExpectVectorization expectVectorization) {
+            switch(expectVectorization) {
+                case ExpectVectorization.NEVER   -> { this.isPositiveRule = false; }
+                case ExpectVectorization.UNKNOWN -> { this.isEnabled = false; }
+                case ExpectVectorization.ALWAYS  -> {}
+            }
         }
 
         void addApplyIf(String constraint) {
diff --git a/test/hotspot/jtreg/compiler/vectorization/runner/LoopCombinedOpTest.java b/test/hotspot/jtreg/compiler/vectorization/runner/LoopCombinedOpTest.java
index 16d04102082b3..8a0715eadfe88 100644
--- a/test/hotspot/jtreg/compiler/vectorization/runner/LoopCombinedOpTest.java
+++ b/test/hotspot/jtreg/compiler/vectorization/runner/LoopCombinedOpTest.java
@@ -138,8 +138,11 @@ public int[] multipleOpsWithMultipleConstants() {
     }
 
     @Test
-    @IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
+    @IR(applyIfCPUFeatureOr = {"asimd", "true", "sse4.1", "true"},
         counts = {IRNode.STORE_VECTOR, ">0"})
+    // With sse2, the MulI does not vectorize. This means we have vectorized stores
+    // to res1, but scalar loads from res1. The store-to-load-forwarding failure
+    // detection catches this and rejects vectorization.
     public int[] multipleStores() {
         int[] res1 = new int[SIZE];
         int[] res2 = new int[SIZE];
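For illustration only, a hypothetical loop of the shape the comment above describes (this is not the actual multipleStores body, and the array names other than res1/res2 are made up): the store to res1 can be vectorized even on sse2, but a 32-bit integer vector multiply needs sse4.1, so the load of res1 that feeds the MulI stays scalar, and the store-to-load-forwarding failure detection then rejects vectorization.

    // Hypothetical sketch: res1 is written with (potentially) vector stores and
    // read back by a MulI that sse2 cannot vectorize, i.e. with scalar loads.
    for (int i = 0; i < SIZE; i++) {
        res1[i] = a[i] + b[i];     // store to res1: vectorizable even on sse2
        res2[i] = res1[i] * c[i];  // MulI needs sse4.1 -> scalar load from res1 on sse2
    }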
diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorStoreToLoadForwarding.java b/test/micro/org/openjdk/bench/vm/compiler/VectorStoreToLoadForwarding.java
new file mode 100644
index 0000000000000..ac8940ec67510
--- /dev/null
+++ b/test/micro/org/openjdk/bench/vm/compiler/VectorStoreToLoadForwarding.java
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.*;
+
+import java.lang.invoke.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Random;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+@Warmup(iterations = 2, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 3, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 1)
+public abstract class VectorStoreToLoadForwarding {
+    @Param({"10000"})
+    public int SIZE;
+
+    @Param({  "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",   "8",   "9",
+             "10",  "11",  "12",  "13",  "14",  "15",  "16",  "17",  "18",  "19",
+             "20",  "21",  "22",  "23",  "24",  "25",  "26",  "27",  "28",  "29",
+             "30",  "31",  "32",  "33",  "34",  "35",  "36",  "37",  "38",  "39",
+             "40",  "41",  "42",  "43",  "44",  "45",  "46",  "47",  "48",  "49",
+             "50",  "51",  "52",  "53",  "54",  "55",  "56",  "57",  "58",  "59",
+             "60",  "61",  "62",  "63",  "64",  "65",  "66",  "67",  "68",  "69",
+             "70",  "71",  "72",  "73",  "74",  "75",  "76",  "77",  "78",  "79",
+             "80",  "81",  "82",  "83",  "84",  "85",  "86",  "87",  "88",  "89",
+             "90",  "91",  "92",  "93",  "94",  "95",  "96",  "97",  "98",  "99",
+            "100", "101", "102", "103", "104", "105", "106", "107", "108", "109",
+            "110", "111", "112", "113", "114", "115", "116", "117", "118", "119",
+            "120", "121", "122", "123", "124", "125", "126", "127", "128", "129"})
+    public int OFFSET;
+
+    // MutableCallSite trick: lets the JIT treat OFFSET as a compile-time constant in the benchmark loops.
+    static final MutableCallSite MUTABLE_CONSTANT = new MutableCallSite(MethodType.methodType(int.class));
+    static final MethodHandle MUTABLE_CONSTANT_HANDLE = MUTABLE_CONSTANT.dynamicInvoker();
+
+    public int START = 1000;
+
+    private byte[] aB;
+    private short[] aS;
+    private int[] aI;
+    private long[] aL;
+
+    @Param("0")
+    private int seed;
+    private Random r = new Random(seed);
+
+    @Setup
+    public void init() throws Throwable {
+        aB = new byte[SIZE];
+        aS = new short[SIZE];
+        aI = new int[SIZE];
+        aL = new long[SIZE];
+
+        for (int i = START; i < SIZE; i++) {
+            aB[i] = (byte)r.nextInt();
+            aS[i] = (short)r.nextInt();
+            aI[i] = r.nextInt();
+            aL[i] = r.nextLong();
+        }
+
+        MethodHandle constant = MethodHandles.constant(int.class, OFFSET);
+        MUTABLE_CONSTANT.setTarget(constant);
+    }
+
+    @CompilerControl(CompilerControl.Mode.INLINE)
+    private int offset_con() throws Throwable {
+        return (int) MUTABLE_CONSTANT_HANDLE.invokeExact();
+    }
+
+    @Benchmark
+    public void bytes() throws Throwable {
+        int offset = offset_con();
+        for (int i = START; i < SIZE; i++) {
+            aB[i] = (byte)(aB[i - offset] + 1);
+        }
+    }
+
+    @Benchmark
+    public void shorts() throws Throwable {
+        int offset = offset_con();
+        for (int i = START; i < SIZE; i++) {
+            aS[i] = (short)(aS[i - offset] + 1);
+        }
+    }
+
+    @Benchmark
+    public void ints() throws Throwable {
+        int offset = offset_con();
+        for (int i = START; i < SIZE; i++) {
+            aI[i] = aI[i - offset] + 1;
+        }
+    }
+
+    @Benchmark
+    public void longs() throws Throwable {
+        int offset = offset_con();
+        for (int i = START; i < SIZE; i++) {
+            aL[i] = (long)(aL[i - offset] + 1);
+        }
+    }
+
+    @Fork(value = 1, jvmArgs = {
+        "-XX:+UseSuperWord"
+    })
+    public static class Default extends VectorStoreToLoadForwarding {}
+
+    @Fork(value = 1, jvmArgs = {
+        "-XX:-UseSuperWord"
+    })
+    public static class NoVectorization extends VectorStoreToLoadForwarding {}
+
+    @Fork(value = 1, jvmArgs = {
+        "-XX:+UseSuperWord", "-XX:+UnlockDiagnosticVMOptions", "-XX:SuperWordStoreToLoadForwardingFailureDetection=0"
+    })
+    public static class NoStoreToLoadForwardFailureDetection extends VectorStoreToLoadForwarding {}
+}
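The offset_con() above relies on the MutableCallSite pattern so that the JIT can fold OFFSET into the compiled loop as a constant. A minimal standalone demonstration of that pattern, independent of the benchmark (class and method names here are illustrative):

    import java.lang.invoke.MethodHandle;
    import java.lang.invoke.MethodHandles;
    import java.lang.invoke.MethodType;
    import java.lang.invoke.MutableCallSite;

    public class ConstantFoldedParameterSketch {
        // A call site whose ()int target can be swapped in once the value is known.
        static final MutableCallSite SITE = new MutableCallSite(MethodType.methodType(int.class));
        static final MethodHandle INVOKER = SITE.dynamicInvoker();

        static int constantValue() throws Throwable {
            // After setTarget below, compiled code can treat this as the constant itself.
            return (int) INVOKER.invokeExact();
        }

        public static void main(String[] args) throws Throwable {
            // Bind the value once; subsequently compiled code sees it as a constant.
            SITE.setTarget(MethodHandles.constant(int.class, 42));
            System.out.println(constantValue()); // prints 42
        }
    }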