about summary refs log tree commit diff
diff options
context:
space:
mode:
author Andrei Elovikov <andrei.elovikov@sifive.com> 2026-03-20 13:43:04 -0700
committer Andrei Elovikov <andrei.elovikov@sifive.com> 2026-03-23 11:49:01 -0700
commit 4974e0d5ce2d5b93da3e3b5e76e2ee290f18aa06 (patch)
tree dd7c2a9733c254f28eb35abbf93264c70547b764
parent aa4e85a2ecfff002f0505c1ed15a4ec80999c41b (diff)
downloadllvm-users/eas/laa-detect-waw-hazard-1.tar.gz
llvm-users/eas/laa-detect-waw-hazard-1.tar.bz2
llvm-users/eas/laa-detect-waw-hazard-1.zip
[LAA] Detect cross-iteration WAW when writing to the same pointer
branch: users/eas/laa-detect-waw-hazard-1
Fixes https://github.com/llvm/llvm-project/issues/187402.
-rw-r--r-- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 51
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/multiple_stores_to_same_addr.ll | 53
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll | 40
3 files changed, 97 insertions, 47 deletions
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 5f4f305506d4..202665ff6bde 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -872,7 +872,7 @@ public:
/// Goes over all memory accesses, checks whether a RT check is needed
/// and builds sets of dependent accesses.
- void buildDependenceSets();
+ void buildDependenceSets(const MemoryDepChecker &DepChecker);
/// Initial processing of memory accesses determined that we need to
/// perform dependency checking.
@@ -1520,7 +1520,16 @@ bool AccessAnalysis::canCheckPtrAtRT(
return CanDoRTIfNeeded;
}
-void AccessAnalysis::buildDependenceSets() {
+static bool isInvariant(Value *V, const Loop *TheLoop, ScalarEvolution *SE) {
+ if (TheLoop->isLoopInvariant(V))
+ return true;
+ if (!SE->isSCEVable(V->getType()))
+ return false;
+ const SCEV *S = SE->getSCEV(V);
+ return SE->isLoopInvariant(S, TheLoop);
+}
+
+void AccessAnalysis::buildDependenceSets(const MemoryDepChecker &DepChecker) {
// We process the set twice: first we process read-write pointers, last we
// process read-only pointers. This allows us to skip dependence tests for
// read-only pointers.
@@ -1602,7 +1611,31 @@ void AccessAnalysis::buildDependenceSets() {
// this is a read only check other writes for conflicts (but only if
// there is no other write to the ptr - this is an optimization to
// catch "a[i] = a[i] + " without having to do a dependence check).
- if ((IsWrite || IsReadOnlyPtr) && AliasSetHasWrite) {
+ //
+ // If there are multiple writes into the same pointer we need to make
+ // sure that there are no cross-iteration dependencies between those
+ // writes to avoid the following scenario:
+ //
+ // code:
+ // if (RT_COND0) *p = x;
+ // if (RT_COND1) *p = y;
+ //
+ // execution:
+ // Iter0 | Iter1
+ // no store | *p = 2
+ // *p = 1 | no store
+ //
+ // Scalar loop would leave `*p == 2`, yet two vectorized scatters
+ // would result in `*p == 1` which is wrong.
+ //
+ // NOTE: Known invariant stores are handled separately in both this
+ // file and LoopVectorizationLegality to support the case when
+ // reduction wasn't completely transformed into SSA form.
+ bool MultipleNonInvariantStoresToPtrExist =
+ DepChecker.getOrderForAccess(Ptr, true).size() > 1 &&
+ !::isInvariant(Ptr, TheLoop, PSE.getSE());
+ if ((IsWrite || IsReadOnlyPtr) &&
+ (AliasSetHasWrite || MultipleNonInvariantStoresToPtrExist)) {
CheckDeps.push_back(Access);
IsRTCheckAnalysisNeeded = true;
}
@@ -2775,14 +2808,14 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
// If we write (or read-write) to a single destination and there are no other
// reads in this loop then is it safe to vectorize: the vectorized stores
// preserve ordering via replication or order-preserving @llvm.masked.scatter.
- if (NumReadWrites == 1 && NumReads == 0) {
+ if (NumReadWrites == 1 && NumReads == 0 && Stores.size() == 1) {
LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
return true;
}
// Build dependence sets and check whether we need a runtime pointer bounds
// check.
- Accesses.buildDependenceSets();
+ Accesses.buildDependenceSets(getDepChecker());
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
@@ -2955,13 +2988,7 @@ LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
}
bool LoopAccessInfo::isInvariant(Value *V) const {
- auto *SE = PSE->getSE();
- if (TheLoop->isLoopInvariant(V))
- return true;
- if (!SE->isSCEVable(V->getType()))
- return false;
- const SCEV *S = SE->getSCEV(V);
- return SE->isLoopInvariant(S, TheLoop);
+ return ::isInvariant(V, TheLoop, PSE->getSE());
}
/// If \p Ptr is a GEP, which has a loop-variant operand, return that operand.
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/multiple_stores_to_same_addr.ll b/llvm/test/Analysis/LoopAccessAnalysis/multiple_stores_to_same_addr.ll
index 0a71d4a3c63c..bdfee12db528 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/multiple_stores_to_same_addr.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/multiple_stores_to_same_addr.ll
@@ -6,8 +6,13 @@
define void @waw_no_mask(ptr %p, i64 %stride, i64 %n) {
; CHECK-LABEL: 'waw_no_mask'
; CHECK-NEXT: header:
-; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
+; CHECK-NEXT: IndirectUnsafe:
+; CHECK-NEXT: store i64 %iv, ptr %gep, align 4 ->
+; CHECK-NEXT: store i64 %iv.next, ptr %gep, align 4
+; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -40,8 +45,13 @@ exit:
define void @waw_mask(ptr %p, i64 %stride, i64 %n, i64 %n0, i64 %n1) {
; CHECK-LABEL: 'waw_mask'
; CHECK-NEXT: header:
-; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
+; CHECK-NEXT: IndirectUnsafe:
+; CHECK-NEXT: store i64 %iv, ptr %gep, align 4 ->
+; CHECK-NEXT: store i64 %iv.next, ptr %gep, align 4
+; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -87,8 +97,13 @@ exit:
define void @waw_no_mask_unknown_stride(ptr %p, i64 %stride, i64 %n) {
; CHECK-LABEL: 'waw_no_mask_unknown_stride'
; CHECK-NEXT: header:
-; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
+; CHECK-NEXT: IndirectUnsafe:
+; CHECK-NEXT: store i64 %iv, ptr %gep, align 4 ->
+; CHECK-NEXT: store i64 %iv.next, ptr %gep, align 4
+; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -121,8 +136,13 @@ exit:
define void @waw_mask_unknown_stride(ptr %p, i64 %stride, i64 %n0, i64 %n1) {
; CHECK-LABEL: 'waw_mask_unknown_stride'
; CHECK-NEXT: header:
-; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
+; CHECK-NEXT: IndirectUnsafe:
+; CHECK-NEXT: store i64 %iv, ptr %gep, align 4 ->
+; CHECK-NEXT: store i64 %iv.next, ptr %gep, align 4
+; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -168,8 +188,13 @@ exit:
define void @no_cross_iter_dependency(ptr %p, i8 %a, i64 %n, i64 %n0, i64 %n1) {
; CHECK-LABEL: 'no_cross_iter_dependency'
; CHECK-NEXT: header:
-; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
+; CHECK-NEXT: IndirectUnsafe:
+; CHECK-NEXT: store i64 %iv, ptr %gep, align 4 ->
+; CHECK-NEXT: store i64 %iv.next, ptr %gep, align 4
+; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -219,6 +244,10 @@ define void @const_stride(ptr %p, i64 %n, i64 %n0, i64 %n1) {
; CHECK-NEXT: header:
; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: store i64 %iv, ptr %gep, align 4 ->
+; CHECK-NEXT: store i64 %iv.next, ptr %gep, align 4
+; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -295,8 +324,13 @@ exit:
define void @indirect_no_mask(ptr noalias %p, i64 %n) {
; CHECK-LABEL: 'indirect_no_mask'
; CHECK-NEXT: header:
-; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
+; CHECK-NEXT: IndirectUnsafe:
+; CHECK-NEXT: store i64 %iv, ptr %gep, align 4 ->
+; CHECK-NEXT: store i64 %iv.next, ptr %gep, align 4
+; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -328,8 +362,13 @@ exit:
define void @indirect_mask(ptr noalias %p, i64 %n, i64 %n0, i64 %n1) {
; CHECK-LABEL: 'indirect_mask'
; CHECK-NEXT: header:
-; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
+; CHECK-NEXT: IndirectUnsafe:
+; CHECK-NEXT: store i64 %iv, ptr %gep, align 4 ->
+; CHECK-NEXT: store i64 %iv.next, ptr %gep, align 4
+; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
index fabab210fb85..3ea068440ce2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
@@ -162,35 +162,19 @@ exit:
define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2, ptr %p3, i64 %N) {
; CHECK-LABEL: @store_to_addr_generated_from_invariant_addr(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P0:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP4]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P1:%.*]], <vscale x 2 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[TMP5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P1:%.*]], i64 [[IV]]
+; CHECK-NEXT: store ptr [[P0:%.*]], ptr [[ARRAYIDX11]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[P2:%.*]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P3:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP8]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x ptr> align 1 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
-; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK-NEXT: store i32 0, ptr [[TMP8]], align 4
+; CHECK-NEXT: store i32 0, ptr [[TMP8]], align 4
+; CHECK-NEXT: store i8 0, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -250,7 +234,7 @@ define i8 @mixed_gather_scatters(ptr %A, ptr %B, ptr %C) #0 {
; RVA23-NEXT: [[TMP14]] = call <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> [[TMP13]], <vscale x 2 x i8> [[VEC_PHI]], i32 [[TMP0]])
; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP0]]
; RVA23-NEXT: [[TMP15:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; RVA23-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; RVA23-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; RVA23: middle.block:
; RVA23-NEXT: [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8> [[TMP14]])
; RVA23-NEXT: br label [[EXIT:%.*]]
@@ -291,7 +275,7 @@ define i8 @mixed_gather_scatters(ptr %A, ptr %B, ptr %C) #0 {
; RVA23ZVL1024B-NEXT: [[TMP14]] = call <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1> splat (i1 true), <vscale x 1 x i8> [[TMP13]], <vscale x 1 x i8> [[VEC_PHI]], i32 [[TMP0]])
; RVA23ZVL1024B-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP0]]
; RVA23ZVL1024B-NEXT: [[TMP15:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; RVA23ZVL1024B-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; RVA23ZVL1024B-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; RVA23ZVL1024B: middle.block:
; RVA23ZVL1024B-NEXT: [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.nxv1i8(<vscale x 1 x i8> [[TMP14]])
; RVA23ZVL1024B-NEXT: br label [[EXIT:%.*]]