85 files changed, 6087 insertions, 1368 deletions
diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-cast.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-cast.ll
new file mode 100644
index 0000000..e43d00d
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-cast.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN:     -da-enable-monotonicity-check 2>&1 | FileCheck %s
+
+; int8_t offset = start;
+; for (int i = 0; i < 100; i++, offset += step)
+;   a[sext(offset)] = 0;
+;
+define void @sext_nsw(ptr %a, i8 %start, i8 %step) {
+; CHECK-LABEL: 'sext_nsw'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {(sext i8 %start to i64),+,(sext i8 %step to i64)}<nsw><%loop>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - none!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %offset = phi i8 [ %start, %entry ], [ %offset.next, %loop ]
+  %offset.sext = sext i8 %offset to i64
+  %idx = getelementptr i8, ptr %a, i64 %offset.sext
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %offset.next = add nsw i8 %offset, %step
+  %exitcond = icmp eq i64 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; The addition for `%offset.next` can wrap, so we cannot prove monotonicity.
+;
+; int8_t offset = start;
+; for (int i = 0; i < 100; i++, offset += step)
+;   a[sext(offset)] = 0;
+;
+define void @sext_may_wrap(ptr %a, i8 %start, i8 %step) {
+; CHECK-LABEL: 'sext_may_wrap'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: (sext i8 {%start,+,%step}<%loop> to i64)
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: (sext i8 {%start,+,%step}<%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %offset = phi i8 [ %start, %entry ], [ %offset.next, %loop ]
+  %offset.sext = sext i8 %offset to i64
+  %idx = getelementptr i8, ptr %a, i64 %offset.sext
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %offset.next = add i8 %offset, %step
+  %exitcond = icmp eq i64 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; for (int8_t i = 0; i < 100; i++)
+;   a[zext(offset)] = 0;
+;
+define void @zext_pos(ptr %a) {
+; CHECK-LABEL: 'zext_pos'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {0,+,1}<nuw><nsw><%loop>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - none!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i8 [ 0, %entry ], [ %i.inc, %loop ]
+  %offset.zext = zext nneg i8 %i to i64
+  %idx = getelementptr i8, ptr %a, i64 %offset.zext
+  store i8 0, ptr %idx
+  %i.inc = add nsw i8 %i, 1
+  %exitcond = icmp eq i8 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; The zero-extened value of `offset` is no longer monotonic. In fact, the
+; values of `offset` in each iteration are:
+;
+;    iteration |   0 | 1 | 2 | ...
+; -------------|-----|---|---|---------
+;       offset |  -1 | 0 | 1 | ...
+; zext(offset) | 255 | 0 | 1 | ...
+;
+;
+; for (int8_t i = -1; i < 100; i++)
+;   a[zext(offset)] = 0;
+;
+define void @zext_cross_zero(ptr %a) {
+; CHECK-LABEL: 'zext_cross_zero'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: (zext i8 {-1,+,1}<nsw><%loop> to i64)
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: (zext i8 {-1,+,1}<nsw><%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i8 [ -1, %entry ], [ %i.inc, %loop ]
+  %offset.zext = zext nneg i8 %i to i64
+  %idx = getelementptr i8, ptr %a, i64 %offset.zext
+  store i8 0, ptr %idx
+  %i.inc = add nsw i8 %i, 1
+  %exitcond = icmp eq i8 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; In principle, we can prove that `zext(offset)` is monotonic since we know
+; that `offset` is non-negative.
+;
+; int8_t offset = 0;
+; for (int i = 0; i < 100; i++, offset += step)
+;   a[zext(offset)] = 0;
+;
+define void @zext_nneg_nsw(ptr %a, i8 %step) {
+; CHECK-LABEL: 'zext_nneg_nsw'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: (zext i8 {0,+,%step}<nsw><%loop> to i64)
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: (zext i8 {0,+,%step}<nsw><%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %offset = phi i8 [ 0, %entry ], [ %offset.next, %loop ]
+  %offset.zext = zext nneg i8 %offset to i64
+  %idx = getelementptr i8, ptr %a, i64 %offset.zext
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %offset.next = add nsw i8 %offset, %step
+  %exitcond = icmp eq i64 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; SCEV handles `i & 1` as an i1 addrec. Ensure that the monotonicity analysis
+; properly analyzes it.
+;
+; for (i = 0; i < 100; i++)
+;  a[i & 1] = 0;
+;
+define void @offset_truncated_to_i1(ptr %a) {
+; CHECK-LABEL: 'offset_truncated_to_i1'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: (zext i1 {false,+,true}<%loop> to i64)
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: (zext i1 {false,+,true}<%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %and = and i64 %i, 1
+  %idx = getelementptr inbounds i8, ptr %a, i64 %and
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-delinearize.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-delinearize.ll
new file mode 100644
index 0000000..71ea4e9
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-delinearize.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN:     -da-enable-monotonicity-check 2>&1 | FileCheck %s
+
+; The offset SCEV will be delinearized into a 2D array access, like as follows:
+;
+; - Outer subscript: {0,+,1}<nuw><nsw><%loop.i.header>
+; - Inner subscript: {0,+,1}<nuw><nsw><%loop.j.header>
+;
+; These subscripts are both monotonic, but we also need to check the
+; monotonicity of the original addrec.
+;
+; char A[...][32];
+; for (i = 0; i < 1ll << 62; i++)
+;   for (j = 0; j < 32; j++)
+;     if (i < (1ll << 57))
+;       A[i][j] = 0;
+;
+define void @linearized_offset_wrap(ptr %a) {
+; CHECK-LABEL: 'linearized_offset_wrap'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %gep, align 1
+; CHECK-NEXT:      Expr: {{\{\{}}0,+,32}<%loop.i.header>,+,1}<nw><%loop.j.header>
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: {{\{\{}}0,+,32}<%loop.i.header>,+,1}<nw><%loop.j.header>
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %gep, align 1 --> Dst: store i8 0, ptr %gep, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  br label %loop.i.header
+
+loop.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+  br label %loop.j.header
+
+loop.j.header:
+  %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j.latch ]
+  %cond = icmp slt i64 %i, 144115188075855872  ; 2^57
+  br i1 %cond, label %if.then, label %loop.j.latch
+
+if.then:
+  %gep = getelementptr inbounds [32 x i8], ptr %a, i64 %i, i64 %j
+  store i8 0, ptr %gep
+  br label %loop.j.latch
+
+loop.j.latch:
+  %j.inc = add nuw nsw i64 %j, 1
+  %ec.j = icmp eq i64 %j.inc, 32
+  br i1 %ec.j, label %loop.i.latch, label %loop.j.header
+
+loop.i.latch:
+  %i.inc = add nuw nsw i64 %i, 1
+  %ec.i = icmp eq i64 %i.inc, 4611686018427387904  ; 2^62
+  br i1 %ec.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-invariant.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-invariant.ll
new file mode 100644
index 0000000..e5b6ddb
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-invariant.ll
@@ -0,0 +1,150 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN:     -da-enable-monotonicity-check 2>&1 | FileCheck %s
+
+; for (int i = 0; i < n; i++)
+;   a[x] = 0;
+define void @single_loop_invariant(ptr %a, i64 %x, i64 %n) {
+; CHECK-LABEL: 'single_loop_invariant'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: %x
+; CHECK-NEXT:      Monotonicity: Invariant
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - consistent output [S]!
+;
+entry:
+  %guard = icmp sgt i64 %n, 0
+  br i1 %guard, label %loop, label %exit
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %idx = getelementptr inbounds i8, ptr %a, i64 %x
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.inc, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; for (int i = 0; i < n; i++)
+;   a[(i % 2 == 0 ? x : y)] = 0;
+define void @single_loop_variant(ptr %a, i64 %x, i64 %y, i64 %n) {
+; CHECK-LABEL: 'single_loop_variant'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: %offset
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: %offset
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  %guard = icmp sgt i64 %n, 0
+  br i1 %guard, label %loop, label %exit
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %offset = phi i64 [ %x, %entry ], [ %offset.next, %loop ]
+  %offset.next = phi i64 [ %y, %entry ], [ %offset, %loop ]
+  %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.inc, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; for (int i = 0; i < n; i++)
+;   for (int j = 0; j < m; j++)
+;     a[x + i] = 0;
+define void @invariant_plus_monotonic0(ptr %a, i64 %x, i64 %n, i64 %m) {
+; CHECK-LABEL: 'invariant_plus_monotonic0'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {%x,+,1}<nsw><%loop.i.header>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - consistent output [0 S]!
+;
+entry:
+  %guard.i = icmp sgt i64 %n, 0
+  br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+  %offset = phi i64 [ %x, %entry ], [ %offset.inc, %loop.i.latch ]
+  br label %loop.j.preheader
+
+loop.j.preheader:
+  %gurard.j = icmp sgt i64 %m, 0
+  br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+  %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+  %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  %j.inc = add nuw nsw i64 %j, 1
+  %exitcond.j = icmp eq i64 %j.inc, %m
+  br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+  %i.inc = add nsw i64 %i, 1
+  %offset.inc = add nsw i64 %offset, 1
+  %exitcond.i = icmp eq i64 %i.inc, %n
+  br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
+
+; for (int i = 0; i < n; i++)
+;   for (int j = 0; j < m; j++)
+;     a[x + j] = 0;
+define void @invariant_plus_monotonic1(ptr %a, i64 %x, i64 %n, i64 %m) {
+; CHECK-LABEL: 'invariant_plus_monotonic1'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {%x,+,1}<nsw><%loop.j>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - consistent output [S 0]!
+;
+entry:
+  %guard.i = icmp sgt i64 %n, 0
+  br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+  br label %loop.j.preheader
+
+loop.j.preheader:
+  %gurard.j = icmp sgt i64 %m, 0
+  br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+  %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+  %offset = phi i64 [ %x, %loop.j.preheader ], [ %offset.inc, %loop.j ]
+  %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  %j.inc = add nuw nsw i64 %j, 1
+  %offset.inc = add nsw i64 %offset, 1
+  %exitcond.j = icmp eq i64 %j.inc, %m
+  br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+  %i.inc = add nsw i64 %i, 1
+  %exitcond.i = icmp eq i64 %i.inc, %n
+  br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll
new file mode 100644
index 0000000..7411dc9
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll
@@ -0,0 +1,519 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN:     -da-enable-monotonicity-check 2>&1 | FileCheck %s
+
+; for (int i = 0; i < n; i++)
+;   a[i] = 0;
+;
+define void @single_loop_nsw(ptr %a, i64 %n) {
+; CHECK-LABEL: 'single_loop_nsw'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {0,+,1}<nuw><nsw><%loop>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - none!
+;
+entry:
+  %guard = icmp sgt i64 %n, 0
+  br i1 %guard, label %loop, label %exit
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %idx = getelementptr inbounds i8, ptr %a, i64 %i
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.inc, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; The purpose of the variable `begin` is to avoid violating the size limitation
+; of the allocated object in LLVM IR, which would cause UB.
+;
+; for (unsigned long long i = begin; i < end; i++)
+;   a[i] = 0;
+;
+define void @single_loop_nuw(ptr %a, i64 %begin, i64 %end) {
+; CHECK-LABEL: 'single_loop_nuw'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {%begin,+,1}<nuw><%loop>
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: {%begin,+,1}<nuw><%loop>
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  %guard = icmp ult i64 %begin, %end
+  br i1 %guard, label %loop, label %exit
+
+loop:
+  %i = phi i64 [ %begin, %entry ], [ %i.inc, %loop ]
+  %idx = getelementptr i8, ptr %a, i64 %i
+  store i8 0, ptr %idx
+  %i.inc = add nuw i64 %i, 1
+  %exitcond = icmp eq i64 %i.inc, %end
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; for (int i = 0; i < n; i++)
+;   for (int j = 0; j < m; j++)
+;     a[i + j] = 0;
+;
+define void @nested_loop_nsw0(ptr %a, i64 %n, i64 %m) {
+; CHECK-LABEL: 'nested_loop_nsw0'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nuw><nsw><%loop.j>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - output [* *]!
+;
+entry:
+  %guard.i = icmp sgt i64 %n, 0
+  br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+  br label %loop.j.preheader
+
+loop.j.preheader:
+  %gurard.j = icmp sgt i64 %m, 0
+  br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+  %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+  %offset = add nsw i64 %i, %j
+  %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  %j.inc = add nsw i64 %j, 1
+  %exitcond.j = icmp eq i64 %j.inc, %m
+  br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+  %i.inc = add nsw i64 %i, 1
+  %exitcond.i = icmp eq i64 %i.inc, %n
+  br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
+
+; for (int i = n - 1; i >= 0; i--)
+;   for (int j = 0; j < m; j++)
+;     a[i + j] = 0;
+;
+define void @nested_loop_nsw1(ptr %a, i64 %n, i64 %m) {
+; CHECK-LABEL: 'nested_loop_nsw1'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {{\{\{}}(-1 + %n),+,-1}<nsw><%loop.i.header>,+,1}<nsw><%loop.j>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - output [* *]!
+;
+entry:
+  %guard.i = icmp sgt i64 %n, 0
+  br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+  %i = phi i64 [ %n, %entry ], [ %i.dec, %loop.i.latch ]
+  %i.dec = add nsw i64 %i, -1
+  br label %loop.j.preheader
+
+loop.j.preheader:
+  %gurard.j = icmp sgt i64 %m, 0
+  br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+  %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+  %offset = add nsw i64 %i.dec, %j
+  %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  %j.inc = add nsw i64 %j, 1
+  %exitcond.j = icmp eq i64 %j.inc, %m
+  br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+  %exitcond.i = icmp eq i64 %i.dec, 0
+  br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
+
+; for (int i = 0; i < n; i--)
+;   for (int j = 0; j < m; j++)
+;     a[i - j] = 0;
+;
+define void @nested_loop_nsw2(ptr %a, i64 %n, i64 %m) {
+; CHECK-LABEL: 'nested_loop_nsw2'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,-1}<nsw><%loop.j>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - output [* *]!
+;
+entry:
+  %guard.i = icmp sgt i64 %n, 0
+  br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+  br label %loop.j.preheader
+
+loop.j.preheader:
+  %gurard.j = icmp sgt i64 %m, 0
+  br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+  %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+  %offset = sub nsw i64 %i, %j
+  %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  %j.inc = add nsw i64 %j, 1
+  %exitcond.j = icmp eq i64 %j.inc, %m
+  br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+  %i.inc = add nsw i64 %i, 1
+  %exitcond.i = icmp eq i64 %i.inc, %n
+  br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
+
+; for (int i = begin0; i < end0; i++)
+;   for (int j = begin1; j < end1; j++) {
+;     unsigned long long offset = (unsigned long long)i + (unsigned long long)j;
+;     a[offset] = 0;
+;   }
+;
+define void @nested_loop_nuw(ptr %a, i64 %begin0, i64 %end0, i64 %begin1, i64 %end1) {
+; CHECK-LABEL: 'nested_loop_nuw'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {{\{\{}}(%begin0 + %begin1),+,1}<nw><%loop.i.header>,+,1}<nw><%loop.j>
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: {{\{\{}}(%begin0 + %begin1),+,1}<nw><%loop.i.header>,+,1}<nw><%loop.j>
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  %guard.i.0 = icmp slt i64 0, %begin0
+  %guard.i.1 = icmp slt i64 %begin0, %end0
+  %guard.i.2 = icmp slt i64 0, %end0
+  %and.i.0 = and i1 %guard.i.0, %guard.i.1
+  %and.i.1 = and i1 %and.i.0, %guard.i.2
+  br i1 %and.i.1, label %loop.i.header, label %exit
+
+loop.i.header:
+  %i = phi i64 [ %begin0, %entry ], [ %i.inc, %loop.i.latch ]
+  br label %loop.j.preheader
+
+loop.j.preheader:
+  %guard.j.0 = icmp slt i64 0, %begin1
+  %guard.j.1 = icmp slt i64 %begin1, %end1
+  %guard.j.2 = icmp slt i64 0, %end1
+  %and.j.0 = and i1 %guard.j.0, %guard.j.1
+  %and.j.1 = and i1 %and.j.0, %guard.j.2
+  br i1 %and.j.1, label %loop.j, label %loop.i.latch
+
+loop.j:
+  %j = phi i64 [ %begin1, %loop.j.preheader ], [ %j.inc, %loop.j ]
+  %offset = add nuw i64 %i, %j
+  %idx = getelementptr i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  %j.inc = add nsw i64 %j, 1
+  %exitcond.j = icmp eq i64 %j.inc, %end1
+  br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+  %i.inc = add nsw i64 %i, 1
+  %exitcond.i = icmp eq i64 %i.inc, %end0
+  br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
+
+; for (int i = 0; i < n; i++)
+;   for (int j = 0; j < m; j++)
+;     a[i + step*j] = 0;
+;
+define void @nested_loop_step(ptr %a, i64 %n, i64 %m, i64 %step) {
+; CHECK-LABEL: 'nested_loop_step'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,%step}<nsw><%loop.j>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - output [* *]!
+;
+entry:
+  %guard.i = icmp sgt i64 %n, 0
+  br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+  br label %loop.j.preheader
+
+loop.j.preheader:
+  %gurard.j = icmp sgt i64 %m, 0
+  br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+  %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+  %offset.j = phi i64 [ 0, %loop.j.preheader ], [ %offset.j.next, %loop.j ]
+  %offset = add nsw i64 %i, %offset.j
+  %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  %j.inc = add nsw i64 %j, 1
+  %offset.j.next = add nsw i64 %offset.j, %step
+  %exitcond.j = icmp eq i64 %j.inc, %m
+  br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+  %i.inc = add nsw i64 %i, 1
+  %exitcond.i = icmp eq i64 %i.inc, %n
+  br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
+
+; The value of step reccurence is not invariant with respect to the outer most
+; loop (the i-loop).
+;
+; offset_i = 0;
+; for (int i = 0; i < 100; i++) {
+;   for (int j = 0; j < 100; j++)
+;     a[offset_i + j] = 0;
+;   offset_i += (i % 2 == 0) ? 0 : 3;
+; }
+;
+define void @step_is_variant(ptr %a) {
+; CHECK-LABEL: 'step_is_variant'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {%offset.i,+,1}<nuw><nsw><%loop.j>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  br label %loop.i.header
+
+loop.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+  %offset.i = phi i64 [ 0, %entry ], [ %offset.i.next, %loop.i.latch ]
+  %step.i.0 = phi i64 [ 0, %entry ], [ %step.i.1, %loop.i.latch ]
+  %step.i.1 = phi i64 [ 3, %entry ], [ %step.i.0, %loop.i.latch ]
+  br label %loop.j
+
+loop.j:
+  %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j ]
+  %offset = add nsw i64 %offset.i, %j
+  %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  %j.inc = add nsw i64 %j, 1
+  %exitcond.j = icmp eq i64 %j.inc, 100
+  br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+  %i.inc = add nsw i64 %i, 1
+  %offset.i.next = add nsw i64 %offset.i, %step.i.0
+  %exitcond.i = icmp eq i64 %i.inc, 100
+  br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
+
+; The AddRec doesn't have nsw flag for the j-loop, since the store may not be
+; executed.
+;
+; for (int i = 0; i < n; i++)
+;   for (int j = 0; j < m; j++)
+;     if (cond)
+;       a[i + j] = 0;
+;
+define void @conditional_store0(ptr %a, i64 %n, i64 %m) {
+; CHECK-LABEL: 'conditional_store0'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nw><%loop.j.header>
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nw><%loop.j.header>
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  %guard.i = icmp sgt i64 %n, 0
+  br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+  br label %loop.j.preheader
+
+loop.j.preheader:
+  %gurard.j = icmp sgt i64 %m, 0
+  br i1 %gurard.j, label %loop.j.header, label %loop.i.latch
+
+loop.j.header:
+  %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j.latch ]
+  %offset = add nsw i64 %i, %j
+  %cond = freeze i1 poison
+  br i1 %cond, label %if.then, label %loop.j.latch
+
+if.then:
+  %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  br label %loop.j.latch
+
+loop.j.latch:
+  %j.inc = add nsw i64 %j, 1
+  %exitcond.j = icmp eq i64 %j.inc, %m
+  br i1 %exitcond.j, label %loop.i.latch, label %loop.j.header
+
+loop.i.latch:
+  %i.inc = add nsw i64 %i, 1
+  %exitcond.i = icmp eq i64 %i.inc, %n
+  br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
+
+; Similar to the @conditional_store0, but the definition of the `%offset` is
+; different from it and we can infer `nsw` in this case.
+;
+; for (int i = 0; i < n; i++)
+;   for (int j = 0; j < m; j++)
+;     if (cond)
+;       a[i + j] = 0;
+;
+define void @conditional_store1(ptr %a, i64 %n, i64 %m) {
+; CHECK-LABEL: 'conditional_store1'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:      Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nuw><nsw><%loop.j.header>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT:    da analyze - output [* *]!
+;
+entry:
+  %guard.i = icmp sgt i64 %n, 0
+  br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+  br label %loop.j.preheader
+
+loop.j.preheader:
+  %gurard.j = icmp sgt i64 %m, 0
+  br i1 %gurard.j, label %loop.j.header, label %loop.i.latch
+
+loop.j.header:
+  %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j.latch ]
+  %offset = phi i64 [ %i, %loop.j.preheader ], [ %offset.next, %loop.j.latch ]
+  %cond = freeze i1 poison
+  br i1 %cond, label %if.then, label %loop.j.latch
+
+if.then:
+  %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+  store i8 0, ptr %idx
+  br label %loop.j.latch
+
+loop.j.latch:
+  %j.inc = add nsw i64 %j, 1
+  %offset.next = add nsw i64 %offset, 1
+  %exitcond.j = icmp eq i64 %j.inc, %m
+  br i1 %exitcond.j, label %loop.i.latch, label %loop.j.header
+
+loop.i.latch:
+  %i.inc = add nsw i64 %i, 1
+  %exitcond.i = icmp eq i64 %i.inc, %n
+  br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
+
+; In the following case, the computation `offset = offset_i + j` will not wrap,
+; but `offset_i += 1024` will wrap both in a signed sense and an unsigned
+; sense. We cannot prove the monotonicity in this case.
+;
+; offset_i = (1ULL << 63) - 256;
+; for (i = 0; i < (1ULL << 62); i++, offset_i += 1024)
+;   for (j = 0; j < 32; j++) {
+;     offset = offset_i + j;
+;
+;     // The value of `offset` is positive in a signed sense.
+;     if (offset < (1ULL << 63))
+;       a[offset] = 0;
+;   }
+;
+define void @outer_loop_may_wrap(ptr %a) {
+; CHECK-LABEL: 'outer_loop_may_wrap'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 0, ptr %gep, align 1
+; CHECK-NEXT:      Expr: {{\{\{}}9223372036854775552,+,1024}<%loop.i.header>,+,1}<nuw><nsw><%loop.j.header>
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: {9223372036854775552,+,1024}<%loop.i.header>
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 0, ptr %gep, align 1 --> Dst: store i8 0, ptr %gep, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+entry:
+  br label %loop.i.header
+
+loop.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+  %subscript.i = phi i64 [ 9223372036854775552, %entry ], [ %subscript.i.next, %loop.i.latch ]  ; The initial value is 2^63 - 256
+  br label %loop.j.header
+
+loop.j.header:
+  %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j.latch ]
+  %subscript = phi i64 [ %subscript.i, %loop.i.header ], [ %subscript.next, %loop.j.latch ]
+  %cond = icmp sge i64 %subscript, 0
+  br i1 %cond, label %if.then, label %loop.j.latch
+
+if.then:
+  %gep = getelementptr inbounds i8, ptr %a, i64 %subscript
+  store i8 0, ptr %gep
+  br label %loop.j.latch
+
+loop.j.latch:
+  %j.inc = add nuw nsw i64 %j, 1
+  %subscript.next = add nuw nsw i64 %subscript, 1
+  %ec.j = icmp eq i64 %j.inc, 32
+  br i1 %ec.j, label %loop.i.latch, label %loop.j.header
+
+loop.i.latch:
+  %i.inc = add nuw nsw i64 %i, 1
+  %subscript.i.next = add i64 %subscript.i, 1024
+  %ec.i = icmp eq i64 %i.inc, 4611686018427387904  ; 2^62
+  br i1 %ec.i, label %exit, label %loop.i.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/DependenceAnalysis/non-monotonic.ll b/llvm/test/Analysis/DependenceAnalysis/non-monotonic.ll
new file mode 100644
index 0000000..6247336
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/non-monotonic.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN:     -da-enable-monotonicity-check 2>&1 | FileCheck %s
+; RUN: opt < %s -disable-output -passes="print<da>" 2>&1 | FileCheck %s -check-prefix=DISABLE-CHECK
+
+;
+; for (i = 0; i < (1ULL << 60); i++) {
+;   A[i] = 1;
+;
+;   unsigned long long offset = i * 32 + (1ULL << 62);
+;   // offset is positive when interpreted as a signed value.
+;   // To prevent violating the size limitation for an allocated object.
+;   if (offset < (1ULL << 63))
+;     A[offset] = 2;
+; }
+;
+; -----------------------------------------------------------------------------
+;
+; There is a dependency between the two stores. To detect it, we need to check
+; the monotonicity and bail out the analysis since `offset` is not monotonic.
+;
+;  memory location  | first store (A[i]) | second store (A[offset])
+; ------------------|--------------------|----------------------------
+;  A[0]             | i = 0              | i = 2^59 - 2^57
+;  A[2^60 - 32]     | i = 2^60 - 32      | i = 2^59 - 2^57 + 2^55 - 1
+;
+define void @f(ptr %A) {
+; CHECK-LABEL: 'f'
+; CHECK-NEXT:  Monotonicity check:
+; CHECK-NEXT:    Inst: store i8 1, ptr %idx.0, align 1
+; CHECK-NEXT:      Expr: {0,+,1}<nuw><nsw><%loop.header>
+; CHECK-NEXT:      Monotonicity: MultivariateSignedMonotonic
+; CHECK-NEXT:    Inst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT:      Expr: {4611686018427387904,+,32}<%loop.header>
+; CHECK-NEXT:      Monotonicity: Unknown
+; CHECK-NEXT:      Reason: {4611686018427387904,+,32}<%loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT:  Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1
+; CHECK-NEXT:    da analyze - none!
+; CHECK-NEXT:  Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT:    da analyze - confused!
+; CHECK-NEXT:  Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT:    da analyze - confused!
+;
+; DISABLE-CHECK-LABEL: 'f'
+; DISABLE-CHECK-NEXT:  Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1
+; DISABLE-CHECK-NEXT:    da analyze - none!
+; DISABLE-CHECK-NEXT:  Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; DISABLE-CHECK-NEXT:    da analyze - none!
+; DISABLE-CHECK-NEXT:  Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; DISABLE-CHECK-NEXT:    da analyze - none!
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %loop.latch ]
+  %idx.0 = getelementptr inbounds i8, ptr %A, i64 %i
+  store i8 1, ptr %idx.0
+  %offset.tmp = mul i64 %i, 32
+  %offset = add i64 %offset.tmp, 4611686018427387904 ; 1ULL << 62
+  %if.cond = icmp sge i64 %offset, 0
+  br i1 %if.cond, label %if.then, label %loop.latch
+
+if.then:
+  %idx.1 = getelementptr inbounds i8, ptr %A, i64 %offset
+  store i8 2, ptr %idx.1
+  br label %loop.latch
+
+loop.latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %exit.cond = icmp eq i64 %i.next, 1152921504606846976 ; 1ULL << 60
+  br i1 %exit.cond, label %exit, label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll
index 3c3748a..1964fca 100644
--- a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll
+++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll
@@ -104,3 +104,56 @@ exit:
 }
 
 declare void @clobber()
+
+
+declare void @clobber.i32(i32)
+
+define void @test_guards_across_loops(i32 %N) {
+; CHECK-LABEL: 'test_guards_across_loops'
+; CHECK-NEXT:  Classifying expressions for: @test_guards_across_loops
+; CHECK-NEXT:    %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1 ]
+; CHECK-NEXT:    --> {0,+,1}<%loop.1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop.1: Computable }
+; CHECK-NEXT:    %iv.1.next = add i32 %iv.1, 1
+; CHECK-NEXT:    --> {1,+,1}<%loop.1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop.1: Computable }
+; CHECK-NEXT:    %iv.2 = phi i32 [ 0, %loop.1 ], [ %iv.2.next, %loop.2 ]
+; CHECK-NEXT:    --> {0,+,1}<nuw><%loop.2> U: full-set S: full-set Exits: (1 + %N) LoopDispositions: { %loop.2: Computable }
+; CHECK-NEXT:    %iv.2.next = add i32 %iv.2, 1
+; CHECK-NEXT:    --> {1,+,1}<nw><%loop.2> U: full-set S: full-set Exits: (2 + %N) LoopDispositions: { %loop.2: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @test_guards_across_loops
+; CHECK-NEXT:  Loop %loop.2: backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw>
+; CHECK-NEXT:  Loop %loop.2: constant max backedge-taken count is i64 4294967296
+; CHECK-NEXT:  Loop %loop.2: symbolic max backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw>
+; CHECK-NEXT:  Loop %loop.2: Trip multiple is 1
+; CHECK-NEXT:  Loop %loop.1: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop.1: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop.1: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT:  Loop %loop.1: Predicated backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw>
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%loop.1> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %loop.1: Predicated constant max backedge-taken count is i64 4294967296
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%loop.1> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %loop.1: Predicated symbolic max backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw>
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%loop.1> Added Flags: <nusw>
+;
+entry:
+  br label %loop.1
+
+loop.1:
+  %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1 ]
+  call void @clobber.i32(i32 %iv.1)
+  %ec.1 = icmp ugt i32 %iv.1, %N
+  %iv.1.next = add i32 %iv.1, 1
+  br i1 %ec.1, label %loop.2, label %loop.1
+
+loop.2:
+  %iv.2 = phi i32  [ 0, %loop.1 ], [ %iv.2.next, %loop.2 ]
+  call void @clobber.i32(i32 %iv.2)
+  %ec.2 = icmp ugt i32 %iv.2, %N
+  %iv.2.next = add i32 %iv.2, 1
+  br i1 %ec.2, label %exit, label %loop.2
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
index a477465c..f35c48b 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
@@ -717,5 +717,46 @@ exit:
   ret void
 }
 
+define void @test_urem_non_constant(ptr %dst, i32 %a, i32 %b) {
+; CHECK-LABEL: 'test_urem_non_constant'
+; CHECK-NEXT:  Classifying expressions for: @test_urem_non_constant
+; CHECK-NEXT:    %rem = urem i32 %a, %b
+; CHECK-NEXT:    --> ((-1 * (%a /u %b) * %b) + %a) U: full-set S: full-set
+; CHECK-NEXT:    %and.0 = and i1 %pre.0, %pre.1
+; CHECK-NEXT:    --> (%pre.1 umin %pre.0) U: full-set S: full-set
+; CHECK-NEXT:    %and.1 = and i1 %and.0, %pre.2
+; CHECK-NEXT:    --> (%pre.1 umin %pre.2 umin %pre.0) U: full-set S: full-set
+; CHECK-NEXT:    %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    --> {0,+,%b}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %b
+; CHECK-NEXT:    --> ((sext i32 %b to i64) + %dst) U: full-set S: full-set Exits: ((sext i32 %b to i64) + %dst) LoopDispositions: { %loop: Invariant }
+; CHECK-NEXT:    %iv.next = add i32 %iv, %b
+; CHECK-NEXT:    --> {%b,+,%b}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @test_urem_non_constant
+; CHECK-NEXT:  Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  %rem = urem i32 %a, %b
+  %pre.0 = icmp eq i32 %rem, 0
+  %pre.1 = icmp ne i32 %a, 0
+  %pre.2 = icmp ne i32 %b, 0
+  %and.0 = and i1 %pre.0, %pre.1
+  %and.1 = and i1 %and.0, %pre.2
+  br i1 %and.1, label %loop, label %exit
+
+loop:
+  %iv = phi i32 [ 0, %entry], [ %iv.next, %loop ]
+  %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %b
+  store i8 0, ptr %gep.dst
+  %iv.next = add i32 %iv, %b
+  %ec = icmp ne i32 %iv.next, %a
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
 declare void @llvm.assume(i1)
 declare void @llvm.experimental.guard(i1, ...)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index b215c51..0933e67 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -1371,11 +1371,10 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
 ; CHECK-SD-NEXT:    lsr x9, x0, #16
 ; CHECK-SD-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-SD-NEXT:    dup v4.8h, w0
-; CHECK-SD-NEXT:    dup v1.8h, w9
-; CHECK-SD-NEXT:    fmov s3, w9
-; CHECK-SD-NEXT:    sqneg v2.8h, v1.8h
-; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
-; CHECK-SD-NEXT:    tbl v1.16b, { v2.16b, v3.16b }, v1.16b
+; CHECK-SD-NEXT:    ldr q3, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT:    dup v2.8h, w9
+; CHECK-SD-NEXT:    sqneg v1.8h, v2.8h
+; CHECK-SD-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v3.16b
 ; CHECK-SD-NEXT:    rev32 v2.8h, v0.8h
 ; CHECK-SD-NEXT:    sqdmull v3.4s, v0.4h, v4.4h
 ; CHECK-SD-NEXT:    sqdmull2 v0.4s, v0.8h, v4.8h
diff --git a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
index 284d624..f34d3ed 100644
--- a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
+++ b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
@@ -1,16 +1,12 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN:    | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN:    | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN:    | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN:    | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
-# RUN:    | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
-# RUN:    | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcz-gpr32,-zcz-gpr64" %s \
+# RUN:    | FileCheck --check-prefix=CHECK-NOZCZ-GPR32-NOZCZ-GPR64 %s
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcz-gpr32,-zcz-gpr64" %s \
+# RUN:    | FileCheck --check-prefix=CHECK-ZCZ-GPR32-NOZCZ-GPR64 %s
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcz-gpr32,+zcz-gpr64" %s \
+# RUN:    | FileCheck --check-prefix=CHECK-NOZCZ-GPR32-ZCZ-GPR64 %s
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcz-gpr32,+zcz-gpr64" %s \
+# RUN:    | FileCheck --check-prefix=CHECK-ZCZ-GPR32-ZCZ-GPR64 %s
 
 --- |
   define void @f0(i64 noundef %x) { ret void }
@@ -24,41 +20,29 @@ liveins:
 body:             |
   bb.0:
     liveins: $x0, $lr
-    ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
-    ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
-    ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{  $}}
-    ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr
-    ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+    ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f0
+    ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+    ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: {{  $}}
+    ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: $w0 = ORRWrr $wzr, $wzr
+    ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
     ;
-    ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
-    ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
-    ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{  $}}
-    ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr
-    ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+    ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f0
+    ; CHECK-ZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+    ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: {{  $}}
+    ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: $w0 = MOVZWi 0, 0
+    ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
     ;
-    ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
-    ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
-    ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{  $}}
-    ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, undef $xzr, implicit $wzr
-    ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+    ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-LABEL: name: f0
+    ; CHECK-NOZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+    ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: {{  $}}
+    ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: $w0 = ORRWrr $wzr, $wzr
+    ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
     ;
-    ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
-    ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
-    ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{  $}}
-    ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr
-    ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
-    ;
-    ; CHECK-NO-ZCM-ZCZ-LABEL: name: f0
-    ; CHECK-NO-ZCM-ZCZ: liveins: $x0, $lr
-    ; CHECK-NO-ZCM-ZCZ-NEXT: {{  $}}
-    ; CHECK-NO-ZCM-ZCZ-NEXT: $w0 = MOVZWi 0, 0
-    ; CHECK-NO-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
-    ;
-    ; CHECK-ZCM-ZCZ-LABEL: name: f0
-    ; CHECK-ZCM-ZCZ: liveins: $x0, $lr
-    ; CHECK-ZCM-ZCZ-NEXT: {{  $}}
-    ; CHECK-ZCM-ZCZ-NEXT: $w0 = MOVZWi 0, 0
-    ; CHECK-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+    ; CHECK-ZCZ-GPR32-ZCZ-GPR64-LABEL: name: f0
+    ; CHECK-ZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+    ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: {{  $}}
+    ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: $w0 = MOVZWi 0, 0
+    ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
     $w0 = COPY $wzr
     BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
 ...
@@ -69,41 +53,29 @@ liveins:
 body:             |
   bb.0:
     liveins: $x0, $lr
-    ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
-    ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
-    ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{  $}}
-    ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
-    ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
-    ;
-    ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
-    ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
-    ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{  $}}
-    ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
-    ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
-    ;
-    ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
-    ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
-    ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{  $}}
-    ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
-    ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+    ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f1
+    ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+    ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: {{  $}}
+    ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: $x0 = ORRXrr $xzr, $xzr
+    ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
     ;
-    ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
-    ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
-    ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{  $}}
-    ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
-    ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+    ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f1
+    ; CHECK-ZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+    ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: {{  $}}
+    ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: $x0 = ORRXrr $xzr, $xzr
+    ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
     ;
-    ; CHECK-NO-ZCM-ZCZ-LABEL: name: f1
-    ; CHECK-NO-ZCM-ZCZ: liveins: $x0, $lr
-    ; CHECK-NO-ZCM-ZCZ-NEXT: {{  $}}
-    ; CHECK-NO-ZCM-ZCZ-NEXT: $x0 = MOVZXi 0, 0
-    ; CHECK-NO-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+    ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-LABEL: name: f1
+    ; CHECK-NOZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+    ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: {{  $}}
+    ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: $x0 = MOVZXi 0, 0
+    ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
     ;
-    ; CHECK-ZCM-ZCZ-LABEL: name: f1
-    ; CHECK-ZCM-ZCZ: liveins: $x0, $lr
-    ; CHECK-ZCM-ZCZ-NEXT: {{  $}}
-    ; CHECK-ZCM-ZCZ-NEXT: $x0 = MOVZXi 0, 0
-    ; CHECK-ZCM-ZCZ-NEXT:BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+    ; CHECK-ZCZ-GPR32-ZCZ-GPR64-LABEL: name: f1
+    ; CHECK-ZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+    ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: {{  $}}
+    ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: $x0 = MOVZXi 0, 0
+    ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
     $x0 = COPY $xzr
     BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
 ...
diff --git a/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
index 97a7741..849323f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
@@ -421,3 +421,83 @@ for.body51:                                       ; preds = %is_sbox.exit155
   unreachable
 }
 declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, ptr nocapture) nounwind ssp
+
+; CHECK-LABEL: fold_imm1_csinc_32:
+; CHECK:      cmp w0, w1
+; CHECK-NEXT: csinc w0, w2, wzr, ge
+; CHECK-NEXT: ret
+define i32 @fold_imm1_csinc_32(i32 %x, i32 %y, i32 %n) nounwind ssp {
+entry:
+  %cmp = icmp slt i32 %x, %y
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  br label %exit
+
+if.else:
+  br label %exit
+
+exit:
+  %result = phi i32 [ 1, %if.then ], [ %n, %if.else ]
+  ret i32 %result
+}
+
+; CHECK-LABEL: fold_imm1_csinc_64:
+; CHECK:      cmp x0, x1
+; CHECK-NEXT: csinc x0, x2, xzr, ge
+; CHECK-NEXT: ret
+define i64 @fold_imm1_csinc_64(i64 %x, i64 %y, i64 %n) nounwind ssp {
+entry:
+  %cmp = icmp slt i64 %x, %y
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  br label %exit
+
+if.else:
+  br label %exit
+
+exit:
+  %result = phi i64 [ 1, %if.then ], [ %n, %if.else ]
+  ret i64 %result
+}
+
+; CHECK-LABEL: fold_imm1_cset_32:
+; CHECK:      cmp w0, w1
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+define i32 @fold_imm1_cset_32(i32 %x, i32 %y) nounwind ssp {
+entry:
+  %cmp = icmp slt i32 %x, %y
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  br label %exit
+
+if.else:
+  br label %exit
+
+exit:
+  %result = phi i32 [ 1, %if.then ], [ 0, %if.else ]
+  ret i32 %result
+}
+
+; CHECK-LABEL: fold_imm1_cset_64:
+; CHECK:      cmp x0, x1
+; CHECK-NEXT: cset x0, lt
+; CHECK-NEXT: ret
+define i64 @fold_imm1_cset_64(i64 %x, i64 %y) nounwind ssp {
+entry:
+  %cmp = icmp slt i64 %x, %y
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  br label %exit
+
+if.else:
+  br label %exit
+
+exit:
+  %result = phi i64 [ 1, %if.then ], [ 0, %if.else ]
+  ret i64 %result
+}
diff --git a/llvm/test/CodeGen/AArch64/peephole-csel.ll b/llvm/test/CodeGen/AArch64/peephole-csel.ll
index 868b9f1..b0852580 100644
--- a/llvm/test/CodeGen/AArch64/peephole-csel.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-csel.ll
@@ -5,10 +5,9 @@ define void @peephole_csel(ptr %dst, i1 %0, i1 %cmp) {
 ; CHECK-LABEL: peephole_csel:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    tst w2, #0x1
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    tst w1, #0x1
-; CHECK-NEXT:    csel x8, x8, x9, eq
+; CHECK-NEXT:    csinc x8, x8, xzr, ne
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/rax1.ll b/llvm/test/CodeGen/AArch64/rax1.ll
index f679e90..4bb551f 100644
--- a/llvm/test/CodeGen/AArch64/rax1.ll
+++ b/llvm/test/CodeGen/AArch64/rax1.ll
@@ -1,22 +1,44 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefixes=SHA3,SHA3-SD %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefixes=NOSHA3,NOSHA3-SD %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s -global-isel | FileCheck --check-prefixes=SHA3,SHA3-GI %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s -global-isel | FileCheck --check-prefixes=NOSHA3,NOSHA3-GI %s
 
 define <2 x i64> @rax1(<2 x i64> %x, <2 x i64> %y) {
-; SHA3-LABEL: rax1:
-; SHA3:       // %bb.0:
-; SHA3-NEXT:    rax1 v0.2d, v0.2d, v1.2d
-; SHA3-NEXT:    ret
+; SHA3-SD-LABEL: rax1:
+; SHA3-SD:       // %bb.0:
+; SHA3-SD-NEXT:    rax1 v0.2d, v0.2d, v1.2d
+; SHA3-SD-NEXT:    ret
 ;
-; NOSHA3-LABEL: rax1:
-; NOSHA3:       // %bb.0:
-; NOSHA3-NEXT:    add v2.2d, v1.2d, v1.2d
-; NOSHA3-NEXT:    usra v2.2d, v1.2d, #63
-; NOSHA3-NEXT:    eor v0.16b, v0.16b, v2.16b
-; NOSHA3-NEXT:    ret
+; NOSHA3-SD-LABEL: rax1:
+; NOSHA3-SD:       // %bb.0:
+; NOSHA3-SD-NEXT:    add v2.2d, v1.2d, v1.2d
+; NOSHA3-SD-NEXT:    usra v2.2d, v1.2d, #63
+; NOSHA3-SD-NEXT:    eor v0.16b, v0.16b, v2.16b
+; NOSHA3-SD-NEXT:    ret
+;
+; SHA3-GI-LABEL: rax1:
+; SHA3-GI:       // %bb.0:
+; SHA3-GI-NEXT:    shl v2.2d, v1.2d, #1
+; SHA3-GI-NEXT:    ushr v1.2d, v1.2d, #63
+; SHA3-GI-NEXT:    orr v1.16b, v2.16b, v1.16b
+; SHA3-GI-NEXT:    eor v0.16b, v0.16b, v1.16b
+; SHA3-GI-NEXT:    ret
+;
+; NOSHA3-GI-LABEL: rax1:
+; NOSHA3-GI:       // %bb.0:
+; NOSHA3-GI-NEXT:    shl v2.2d, v1.2d, #1
+; NOSHA3-GI-NEXT:    ushr v1.2d, v1.2d, #63
+; NOSHA3-GI-NEXT:    orr v1.16b, v2.16b, v1.16b
+; NOSHA3-GI-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NOSHA3-GI-NEXT:    ret
     %a = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %y, <2 x i64> %y, <2 x i64> <i64 1, i64 1>)
     %b = xor <2 x i64> %x, %a
     ret <2 x i64> %b
 }
 
 declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; NOSHA3: {{.*}}
+; SHA3: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 002c03aa..e86f747 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -551,7 +551,9 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
   ;
   ; GFX10CU-LABEL: name: workgroup_one_as_release
   ; GFX10CU: bb.0.entry:
+  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_one_as_release
@@ -562,6 +564,8 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
   ;
   ; GFX11CU-LABEL: name: workgroup_one_as_release
   ; GFX11CU: bb.0.entry:
+  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup-one-as") release
@@ -587,7 +591,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
   ;
   ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
   ; GFX10CU: bb.0.entry:
+  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
@@ -599,6 +605,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
   ;
   ; GFX11CU-LABEL: name: workgroup_one_as_acq_rel
   ; GFX11CU: bb.0.entry:
+  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup-one-as") acq_rel
@@ -624,7 +632,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
   ;
   ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
   ; GFX10CU: bb.0.entry:
+  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
@@ -636,6 +646,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
   ;
   ; GFX11CU-LABEL: name: workgroup_one_as_seq_cst
   ; GFX11CU: bb.0.entry:
+  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup-one-as") seq_cst
@@ -1305,8 +1317,9 @@ define amdgpu_kernel void @workgroup_release() #0 {
   ;
   ; GFX10CU-LABEL: name: workgroup_release
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_release
@@ -1317,7 +1330,8 @@ define amdgpu_kernel void @workgroup_release() #0 {
   ;
   ; GFX11CU-LABEL: name: workgroup_release
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 64519
+  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup") release
@@ -1345,8 +1359,9 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
   ;
   ; GFX10CU-LABEL: name: workgroup_acq_rel
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_acq_rel
@@ -1358,7 +1373,8 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
   ;
   ; GFX11CU-LABEL: name: workgroup_acq_rel
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 64519
+  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup") acq_rel
@@ -1386,8 +1402,9 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
   ;
   ; GFX10CU-LABEL: name: workgroup_seq_cst
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_seq_cst
@@ -1399,7 +1416,8 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
   ;
   ; GFX11CU-LABEL: name: workgroup_seq_cst
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 64519
+  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 53b2542..142290a 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -3611,10 +3611,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc
-; SDAG-NEXT:    s_waitcnt vmcnt(1)
-; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
 ; SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v6
 ; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -3627,12 +3627,12 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
-; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3652,6 +3652,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    buffer_store_short v4, off, s[16:19], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3671,6 +3672,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GISEL-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    buffer_store_short v2, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index b91963f..d23509b 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -150,7 +150,6 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
 ; GFX10CU-NEXT:    buffer_load_dword v0, s[8:11], 0 offen lds
 ; GFX10CU-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX10CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10CU-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10CU-NEXT:    s_barrier
 ; GFX10CU-NEXT:    ds_read_b32 v0, v0
 ; GFX10CU-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
index 5d03dfb..352af04 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
@@ -218,3 +218,174 @@ main_body:
   ret void
 }
 
+define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_mov_b32 m0, s0
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:256 lds
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile:
+; GFX942:       ; %bb.0: ; %main_body
+; GFX942-NEXT:    s_mov_b32 m0, s0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:256
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    s_mov_b32 m0, s0
+; GFX10-NEXT:    global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:256 lds
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile:
+; GFX942-GISEL:       ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:256
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: buffer_load_lds_dword_volatile:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s5
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: buffer_load_lds_dword_volatile:
+; GFX942:       ; %bb.0: ; %main_body
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_mov_b32 m0, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: buffer_load_lds_dword_volatile:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_mov_b32 m0, s5
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: buffer_load_lds_dword_volatile:
+; GFX942-GISEL:       ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+main_body:
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_nontemporal:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v0, s[0:1] glc slc lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_nontemporal:
+; GFX942:       ; %bb.0: ; %main_body
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v0, s[0:1] nt
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_nontemporal:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v0, s[0:1] slc lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    global_load_lds_dword v0, s[0:1] nt
+; GFX942-GISEL-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_nontemporal(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s5
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX942:       ; %bb.0: ; %main_body
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_mov_b32 m0, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_mov_b32 m0, s5
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT:    s_endpgm
+main_body:
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+  ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
index f0204bd..1dcd032 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
@@ -110,3 +110,43 @@ main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
   ret void
 }
+
+define amdgpu_ps float @buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_volatile:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 glc lds
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 offset:256 lds
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 offset:512 lds
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ds_read_b32 v0, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 256, i32 0)
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 512, i32 0)
+  %res = load float, ptr addrspace(3) %lds
+  ret float %res
+}
+
+define amdgpu_ps float @buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_nontemporal:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 glc slc lds
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ds_read_b32 v0, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0), !nontemporal !0
+  %res = load float, ptr addrspace(3) %lds
+  ret float %res
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index b5d741b..a979999 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -80,25 +80,29 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(p
 ; PREGFX10-LABEL: buffer_load_volatile:
 ; PREGFX10:       ; %bb.0: ; %main_body
 ; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
 ; PREGFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
 ; PREGFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc
 ; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
 ; PREGFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: buffer_load_volatile:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
 ; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: buffer_load_volatile:
 ; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index e8b8d05..e8eccb0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -147,14 +147,13 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
 ; GFX8-OPT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8-OPT-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-OPT-NEXT:    s_barrier
-; GFX8-OPT-NEXT:    v_add_u32_e32 v1, vcc, v1, v1
-; GFX8-OPT-NEXT:    s_nop 1
-; GFX8-OPT-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX8-OPT-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
-; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-OPT-NEXT:    v_add_u32_e32 v4, vcc, v1, v1
+; GFX8-OPT-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-OPT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-OPT-NEXT:    v_mov_b32_dpp v2, v4 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX8-OPT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-OPT-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-OPT-NEXT:    s_barrier
 ; GFX8-OPT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-OPT-NEXT:    s_endpgm
 ;
@@ -194,14 +193,14 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    ds_read_b32 v1, v0
-; GFX10-NEXT:    s_barrier
-; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v1
-; GFX10-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v1, v1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX10-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-NEXT:    s_barrier
+; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -213,15 +212,15 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffc, v0
 ; GFX11-NEXT:    ds_load_b32 v1, v0
-; GFX11-NEXT:    s_barrier
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, v2, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, v1, v1
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX11-NEXT:    s_barrier
+; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
index 97db15b..1d1d3e4 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
@@ -107,6 +107,7 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s13, s[4:5], 0x30
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
@@ -120,7 +121,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
 ; GFX12-NEXT:    s_mov_b32 s3, s12
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen
 ; GFX12-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 9dac239..1e4b633 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -354,6 +354,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX9-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[10:11]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x30
 ; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
 ; GFX9-SDAG-NEXT:    s_mov_b32 s5, s10
@@ -365,8 +366,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX9-SDAG-NEXT:    s_mov_b32 s3, s10
 ; GFX9-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[10:11]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -384,6 +385,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX9-GISEL-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-GISEL-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
 ; GFX9-GISEL-NEXT:    s_load_dword s7, s[8:9], 0x30
 ; GFX9-GISEL-NEXT:    s_mov_b32 s4, s11
@@ -395,8 +397,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX9-GISEL-NEXT:    s_mov_b32 s10, s3
 ; GFX9-GISEL-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX942-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -414,6 +416,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX942-SDAG-NEXT:    s_or_b64 s[8:9], s[2:3], s[12:13]
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-SDAG-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    s_load_dword s13, s[4:5], 0x30
 ; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
 ; GFX942-SDAG-NEXT:    s_mov_b32 s5, s12
@@ -425,8 +428,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX942-SDAG-NEXT:    s_mov_b32 s3, s12
 ; GFX942-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[12:13]
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -444,6 +447,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX942-GISEL-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-GISEL-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
 ; GFX942-GISEL-NEXT:    s_load_dword s9, s[4:5], 0x30
 ; GFX942-GISEL-NEXT:    s_mov_b32 s4, s7
@@ -455,8 +459,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX942-GISEL-NEXT:    s_mov_b32 s6, s3
 ; GFX942-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -475,6 +479,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX10-SDAG-NEXT:    s_mov_b32 s11, s2
 ; GFX10-SDAG-NEXT:    s_or_b64 s[4:5], s[12:13], s[10:11]
 ; GFX10-SDAG-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_clause 0x1
 ; GFX10-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x30
 ; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
@@ -488,8 +493,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX10-SDAG-NEXT:    s_mov_b32 s2, s1
 ; GFX10-SDAG-NEXT:    s_mov_b32 s3, s10
 ; GFX10-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[10:11]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX10-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -508,6 +513,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX10-GISEL-NEXT:    s_mov_b32 s6, s3
 ; GFX10-GISEL-NEXT:    s_or_b64 s[2:3], s[6:7], s[4:5]
 ; GFX10-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_clause 0x1
 ; GFX10-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
@@ -519,8 +525,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX10-GISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; GFX10-GISEL-NEXT:    s_mov_b32 s6, s3
 ; GFX10-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -541,6 +547,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX11-SDAG-NEXT:    buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_clause 0x1
 ; GFX11-SDAG-NEXT:    s_load_b32 s13, s[4:5], 0x30
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
@@ -554,8 +561,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX11-SDAG-NEXT:    s_mov_b32 s3, s12
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -576,6 +583,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
 ; GFX11-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_clause 0x1
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
 ; GFX11-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x30
@@ -588,8 +596,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX11-GISEL-NEXT:    s_mov_b32 s8, s3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -610,6 +618,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX12-SDAG-NEXT:    buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s13, s[4:5], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
@@ -623,8 +632,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX12-SDAG-NEXT:    s_mov_b32 s3, s12
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -645,6 +654,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
 ; GFX12-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
 ; GFX12-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x30
@@ -657,8 +667,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX12-GISEL-NEXT:    s_mov_b32 s8, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %val = load volatile i32, ptr addrspace(7) %in, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
index 516c3946..282a7ae 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
@@ -15,7 +15,6 @@ define amdgpu_kernel void @test_s_barrier() {
 ;
 ; GFX10-CU-LABEL: test_s_barrier:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-CU-NEXT:    s_barrier
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -26,7 +25,6 @@ define amdgpu_kernel void @test_s_barrier() {
 ;
 ; GFX11-CU-LABEL: test_s_barrier:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX11-CU-NEXT:    s_barrier
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -38,7 +36,6 @@ define amdgpu_kernel void @test_s_barrier() {
 ;
 ; GFX12-CU-LABEL: test_s_barrier:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_wait_alu 0xffe3
 ; GFX12-CU-NEXT:    s_barrier_signal -1
 ; GFX12-CU-NEXT:    s_barrier_wait -1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -63,8 +60,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
 ;
 ; GFX10-CU-LABEL: test_s_barrier_workgroup_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_barrier
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -77,8 +74,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
 ;
 ; GFX11-CU-LABEL: test_s_barrier_workgroup_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_barrier
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -94,8 +91,10 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
 ;
 ; GFX12-CU-LABEL: test_s_barrier_workgroup_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    s_wait_alu 0xffe3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_barrier_signal -1
 ; GFX12-CU-NEXT:    s_barrier_wait -1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -125,7 +124,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
 ; GFX10-CU:       ; %bb.0: ; %entry
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-CU-NEXT:    s_barrier
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -140,7 +138,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
 ; GFX11-CU:       ; %bb.0: ; %entry
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX11-CU-NEXT:    s_barrier
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -160,7 +157,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
 ; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT:    s_wait_alu 0xffe3
 ; GFX12-CU-NEXT:    s_barrier_signal -1
 ; GFX12-CU-NEXT:    s_barrier_wait -1
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 6a76f43..7efbff9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -107,6 +107,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_release_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
@@ -139,6 +141,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_release_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_release_fence:
@@ -151,6 +155,10 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_release_fence:
@@ -181,6 +189,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_acq_rel_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
@@ -216,6 +226,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_acq_rel_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
@@ -229,6 +241,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_acq_rel_fence:
@@ -259,6 +275,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_seq_cst_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
@@ -294,6 +312,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_seq_cst_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
@@ -307,6 +327,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_seq_cst_fence:
@@ -412,6 +436,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_one_as_release_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
@@ -444,6 +470,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_one_as_release_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
@@ -456,6 +484,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_one_as_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_one_as_release_fence:
@@ -486,6 +518,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
@@ -521,6 +555,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
@@ -534,6 +570,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
@@ -564,6 +604,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
@@ -599,6 +641,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
@@ -612,6 +656,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index d288bfc..1cca64a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1093,7 +1093,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_release_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
@@ -1129,7 +1130,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_release_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_release_fence:
@@ -1142,7 +1144,10 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_release_fence:
@@ -1175,7 +1180,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_acq_rel_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
@@ -1214,7 +1220,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_acq_rel_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
@@ -1228,7 +1235,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_acq_rel_fence:
@@ -1261,7 +1271,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_seq_cst_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
@@ -1300,7 +1311,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_seq_cst_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
@@ -1314,7 +1326,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_seq_cst_fence:
@@ -1420,6 +1435,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_one_as_release_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
@@ -1452,6 +1469,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_one_as_release_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
@@ -1464,6 +1483,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_one_as_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_one_as_release_fence:
@@ -1494,6 +1517,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1529,6 +1554,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1542,6 +1569,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1572,6 +1603,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
@@ -1607,6 +1640,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
@@ -1620,6 +1655,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index d277441..2afa577 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -1072,7 +1072,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1109,7 +1110,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1136,7 +1138,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 3826953..d384aec 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -656,12 +656,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -765,12 +765,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -800,12 +800,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1193,7 +1195,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1278,7 +1281,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1305,7 +1309,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1372,7 +1379,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1457,7 +1465,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1484,7 +1493,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1891,7 +1903,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1976,7 +1989,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -2003,7 +2017,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -2074,9 +2091,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2170,9 +2189,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2200,9 +2221,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2273,9 +2297,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2369,9 +2395,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2399,9 +2427,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2697,12 +2728,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -2813,12 +2844,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -2850,12 +2881,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -2935,12 +2968,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -3051,12 +3084,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -3088,12 +3121,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -3731,7 +3766,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -3854,7 +3890,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -3889,7 +3926,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -4007,9 +4047,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4141,9 +4183,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4179,9 +4223,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4299,9 +4346,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4433,9 +4482,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4471,9 +4522,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -5137,9 +5191,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5271,9 +5327,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5309,9 +5367,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5429,9 +5490,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5563,9 +5626,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5601,9 +5666,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5721,9 +5789,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -5855,9 +5925,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -5893,9 +5965,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -6013,9 +6088,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6147,9 +6224,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6185,9 +6264,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6923,7 +7005,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
@@ -7070,7 +7153,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
@@ -7113,7 +7197,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
@@ -7245,12 +7332,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -7399,12 +7486,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -7444,12 +7531,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -7577,12 +7666,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -7731,12 +7820,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -7776,12 +7865,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -8535,12 +8626,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -8689,12 +8780,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -8734,12 +8825,14 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -8867,12 +8960,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -9021,12 +9114,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -9066,12 +9159,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -9199,12 +9294,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -9353,12 +9448,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -9398,12 +9493,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -9531,7 +9628,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9685,7 +9783,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9730,7 +9829,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9863,7 +9965,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -10017,7 +10120,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -10062,7 +10166,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -10195,12 +10302,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -10349,12 +10456,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -10394,12 +10501,14 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -10527,12 +10636,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -10681,12 +10790,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -10726,12 +10835,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -10859,12 +10970,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -11013,12 +11124,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -11058,12 +11169,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -11732,10 +11845,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -11834,10 +11950,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -11868,10 +11987,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -12258,6 +12382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -12339,6 +12465,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -12365,6 +12493,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -12430,6 +12562,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -12511,6 +12645,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -12537,6 +12673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -12933,6 +13073,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -13014,6 +13156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -13040,6 +13184,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -13107,7 +13255,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13194,7 +13345,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13222,7 +13376,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13290,7 +13449,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13377,7 +13539,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13405,7 +13570,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13696,10 +13866,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -13805,10 +13978,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -13841,10 +14017,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -13923,10 +14104,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -14032,10 +14216,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -14068,10 +14255,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -14699,6 +14891,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -14818,6 +15012,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -14852,6 +15048,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -14966,7 +15166,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15091,7 +15294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15127,7 +15333,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15242,7 +15453,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15367,7 +15581,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15403,7 +15620,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -16046,7 +16268,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16171,7 +16396,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16207,7 +16435,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16322,7 +16555,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16447,7 +16683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16483,7 +16722,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16598,7 +16842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16723,7 +16970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16759,7 +17009,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16874,6 +17129,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -16999,6 +17256,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -17035,6 +17294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -17150,6 +17413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -17275,6 +17540,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -17311,6 +17578,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -17426,7 +17697,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17551,7 +17825,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17587,7 +17864,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17702,7 +17984,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17827,7 +18112,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17863,7 +18151,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17978,7 +18271,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18103,7 +18399,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18139,7 +18438,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18870,6 +19174,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
@@ -19013,6 +19319,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
@@ -19055,6 +19363,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
@@ -19185,10 +19497,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -19332,10 +19647,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -19376,10 +19694,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -19506,10 +19829,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -19653,10 +19979,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -19697,10 +20026,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -20445,10 +20779,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -20592,10 +20929,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -20636,10 +20976,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -20766,10 +21111,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -20913,10 +21261,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -20957,10 +21308,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -21087,10 +21443,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -21234,10 +21593,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -21278,10 +21640,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -21408,6 +21775,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
@@ -21555,6 +21924,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
@@ -21599,6 +21970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
@@ -21729,6 +22104,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
@@ -21876,6 +22253,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
@@ -21920,6 +22299,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
@@ -22050,10 +22433,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -22197,10 +22583,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -22241,10 +22630,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -22371,10 +22765,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -22518,10 +22915,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -22562,10 +22962,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -22692,10 +23097,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -22839,10 +23247,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -22883,10 +23294,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 3bf5ed8..c326edf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -959,7 +959,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1001,7 +1002,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1026,7 +1028,10 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index b755c5d..868b438 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -667,7 +667,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -763,7 +764,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -790,7 +792,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
@@ -1204,7 +1209,8 @@ define amdgpu_kernel void @global_workgroup_release_store(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1290,7 +1296,8 @@ define amdgpu_kernel void @global_workgroup_release_store(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1315,7 +1322,10 @@ define amdgpu_kernel void @global_workgroup_release_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1391,7 +1401,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1477,7 +1488,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1502,7 +1514,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1918,7 +1933,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -2003,7 +2019,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -2028,7 +2045,10 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -2105,8 +2125,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2196,8 +2218,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2223,8 +2247,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2301,8 +2329,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2392,8 +2422,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2419,8 +2451,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2705,7 +2741,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -2807,7 +2844,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -2837,7 +2875,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -2926,7 +2967,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -3028,7 +3070,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -3058,7 +3101,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -3644,7 +3690,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -3758,7 +3805,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -3791,7 +3839,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -3900,8 +3951,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4020,8 +4073,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4055,8 +4110,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4165,8 +4224,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4285,8 +4346,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4320,8 +4383,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4920,8 +4987,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5040,8 +5109,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5075,8 +5146,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5185,8 +5260,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5305,8 +5382,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5340,8 +5419,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5450,8 +5533,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5570,8 +5655,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5605,8 +5692,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5715,7 +5806,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -5835,7 +5927,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -5870,7 +5963,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -5980,7 +6076,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -6100,7 +6197,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -6135,7 +6233,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -6245,8 +6346,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6365,8 +6468,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6400,8 +6505,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6510,8 +6619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6630,8 +6741,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6665,8 +6778,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6775,8 +6892,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6895,8 +7014,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6930,8 +7051,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -7588,7 +7713,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -7717,7 +7843,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -7754,7 +7881,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -7877,7 +8007,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -8009,7 +8140,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -8047,7 +8179,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -8170,7 +8305,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -8302,7 +8438,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -8340,7 +8477,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -9009,7 +9149,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -9141,7 +9282,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -9179,7 +9321,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -9302,7 +9447,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -9434,7 +9580,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -9472,7 +9619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -9595,7 +9745,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -9727,7 +9878,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -9765,7 +9917,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -9888,7 +10043,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -10020,7 +10176,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -10058,7 +10215,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -10181,7 +10341,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -10313,7 +10474,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -10351,7 +10513,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -10474,7 +10639,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -10606,7 +10772,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -10644,7 +10811,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -10767,7 +10937,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -10899,7 +11070,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -10937,7 +11109,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -11060,7 +11235,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -11192,7 +11368,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -11230,7 +11407,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -11914,7 +12094,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -12009,7 +12190,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -12036,6 +12218,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
@@ -12447,6 +12633,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -12529,6 +12717,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -12553,6 +12743,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -12626,6 +12820,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -12708,6 +12904,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -12732,6 +12930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -13145,6 +13347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -13226,6 +13430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -13250,6 +13456,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -13324,7 +13534,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13411,7 +13624,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13437,7 +13653,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13512,7 +13733,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13599,7 +13823,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13625,7 +13852,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13908,6 +14140,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -14006,6 +14240,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -14035,6 +14271,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -14121,6 +14361,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -14219,6 +14461,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -14248,6 +14492,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -14831,6 +15079,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -14941,6 +15191,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -14973,6 +15225,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -15079,7 +15335,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15195,7 +15454,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15229,7 +15491,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15336,7 +15603,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15452,7 +15722,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15486,7 +15759,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -16083,7 +16361,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16199,7 +16480,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16233,7 +16517,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16340,7 +16629,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16456,7 +16748,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16490,7 +16785,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16597,7 +16897,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16713,7 +17016,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16747,7 +17053,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16854,6 +17165,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -16970,6 +17283,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -17004,6 +17319,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -17111,6 +17430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -17227,6 +17548,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -17261,6 +17584,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -17368,7 +17695,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17484,7 +17814,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17518,7 +17851,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17625,7 +17963,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17741,7 +18082,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17775,7 +18119,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17882,7 +18231,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -17998,7 +18350,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18032,7 +18387,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18687,6 +19047,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -18812,6 +19174,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -18848,6 +19212,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -18968,6 +19336,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -19096,6 +19466,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -19133,6 +19505,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -19253,6 +19629,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -19381,6 +19759,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -19418,6 +19798,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -20084,6 +20468,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -20212,6 +20598,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -20249,6 +20637,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -20369,6 +20761,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -20497,6 +20891,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -20534,6 +20930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -20654,6 +21054,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -20782,6 +21184,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -20819,6 +21223,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -20939,6 +21347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -21067,6 +21477,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -21104,6 +21516,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -21224,6 +21640,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -21352,6 +21770,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -21389,6 +21809,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -21509,6 +21933,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -21637,6 +22063,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -21674,6 +22102,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -21794,6 +22226,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -21922,6 +22356,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -21959,6 +22395,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -22079,6 +22519,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
@@ -22207,6 +22649,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -22244,6 +22688,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
new file mode 100644
index 0000000..0b40c81
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
@@ -0,0 +1,542 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -global-isel=0 < %s | FileCheck --check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=0 < %s | FileCheck --check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=1 < %s | FileCheck --check-prefixes=GFX942-GISEL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -global-isel=0 < %s | FileCheck --check-prefixes=GFX10 %s
+
+define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc slc lds
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off nt
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off nt
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off slc lds
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_volatile:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_volatile:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_volatile:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc slc lds
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off nt
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off nt
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off slc lds
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_volatile:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s5
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_volatile:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_mov_b32 m0, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_volatile:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_mov_b32 m0, s5
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s5
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_mov_b32 m0, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_mov_b32 m0, s5
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_volatile_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s5
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_mov_b32 m0, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_mov_b32 m0, s5
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b32 m0, s4
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 m0, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 m0, s4
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b32 m0, s4
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 m0, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 m0, s4
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 0), !nontemporal !0
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b32 m0, s4
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 m0, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 m0, s4
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648), !nontemporal !0
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+  ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 986b48b..712109d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
 ; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_agent_release_store(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_agent_release_store(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_agent_release_store(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
index 8926893..6d1e4e6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
 ; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_cluster_release_store(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_cluster_release_store(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_cluster_release_store(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 81bbe0a..577d2ca 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_system_seq_cst_load(
 ; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_system_seq_cst_load(
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_system_seq_cst_load(
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_system_release_store(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_system_release_store(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_system_release_store(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_system_seq_cst_store(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_system_seq_cst_store(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_system_seq_cst_store(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 980141a..d686e7a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -819,7 +819,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -854,7 +855,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -879,7 +881,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 6a233a2..ab4d783 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
 ; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_workgroup_release_store(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_workgroup_release_store(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_workgroup_release_store(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
 ;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
new file mode 100644
index 0000000..93f7bcc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes=postmisched -o - %s | FileCheck %s
+
+# Ensure WMMA operations stay before the final atomic fence and barrier group.
+# This allows the latency of the WMMA operations to be hidden by barrier wait.
+---
+name: test
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32
+
+    ; CHECK-LABEL: name: test
+    ; CHECK: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: ATOMIC_FENCE 5, 2
+    ; CHECK-NEXT: S_BARRIER
+    ; CHECK-NEXT: ATOMIC_FENCE 4, 2
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def $vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def $vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def $vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit killed $vgpr36, implicit $exec, implicit killed $vgpr37, implicit killed $vgpr38 {
+    ; CHECK-NEXT:   $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 killed $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 killed $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT:   $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 killed $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc
+    ; CHECK-NEXT: $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec
+    ; CHECK-NEXT: $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec
+    ; CHECK-NEXT: S_CMP_LT_U32 killed $sgpr0, killed $sgpr12, implicit-def $scc
+    ; CHECK-NEXT: $sgpr0 = S_MOV_B32 killed $sgpr1
+    ; CHECK-NEXT: early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec
+    ; CHECK-NEXT: early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec
+    ; CHECK-NEXT: early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec
+    ; CHECK-NEXT: early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec
+    ; CHECK-NEXT: early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec
+    ; CHECK-NEXT: early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec
+    ; CHECK-NEXT: early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec
+    ; CHECK-NEXT: early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec
+    ; CHECK-NEXT: ATOMIC_FENCE 5, 2
+    ; CHECK-NEXT: S_BARRIER
+    ; CHECK-NEXT: ATOMIC_FENCE 4, 2
+    ATOMIC_FENCE 5, 2
+    S_BARRIER
+    ATOMIC_FENCE 4, 2
+    BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def $vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def $vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def $vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit $vgpr36, implicit $exec, implicit $vgpr37, implicit $vgpr38 {
+      $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3)
+      $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3)
+    }
+    $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc
+    $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec
+    $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec
+    S_CMP_LT_U32 killed $sgpr0, $sgpr12, implicit-def $scc
+    early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec
+    early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec
+    early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec
+    early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec
+    early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec
+    early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec
+    early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec
+    early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec
+    $sgpr0 = S_MOV_B32 killed $sgpr1
+    ATOMIC_FENCE 5, 2
+    S_BARRIER
+    ATOMIC_FENCE 4, 2
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index f3cb5a7..30f5277 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -26,17 +26,17 @@ define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
 ; GFX9-LABEL: barrier_vmcnt_global:
 ; GFX9:         s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v1, s[0:1]
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    global_load_dword v3, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v[0:1], v3, off
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -369,10 +369,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
 ; GFX8-NEXT:    flat_load_dword v3, v[2:3]
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_barrier
 ; GFX8-NEXT:    flat_store_dword v[0:1], v3
 ; GFX8-NEXT:    s_endpgm
@@ -393,10 +392,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
 ; GFX9-NEXT:    flat_load_dword v3, v[2:3]
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_barrier
 ; GFX9-NEXT:    flat_store_dword v[0:1], v3
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/BPF/BTF/variant-part.ll b/llvm/test/CodeGen/BPF/BTF/variant-part.ll
new file mode 100644
index 0000000..1071e61
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/variant-part.ll
@@ -0,0 +1,87 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; Source:
+;   #![no_std]
+;   #![no_main]
+;
+;   pub enum MyEnum {
+;       First { a: u32, b: i32 },
+;       Second(u32),
+;   }
+;
+;   #[unsafe(no_mangle)]
+;   pub static X: MyEnum = MyEnum::First { a: 54, b: -23 };
+;
+;   #[cfg(not(test))]
+;   #[panic_handler]
+;   fn panic(_info: &core::panic::PanicInfo) -> ! {
+;       loop {}
+;   }
+; Compilation flag:
+;   cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc
+;   llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o variant-part.bc
+;   llvm-dis variant-part.bc -o variant-part.ll
+
+; ModuleID = 'variant-part.bc'
+source_filename = "c0znihgkvro8hs0n88fgrtg6x"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel"
+
+@X = constant [12 x i8] c"\00\00\00\006\00\00\00\E9\FF\FF\FF", align 4, !dbg !0
+
+!llvm.module.flags = !{!22, !23, !24, !25}
+!llvm.ident = !{!26}
+!llvm.dbg.cu = !{!27}
+
+; CHECK-BTF:      [1] STRUCT 'MyEnum' size=12 vlen=1
+; CHECK-BTF-NEXT:         '(anon)' type_id=3 bits_offset=0
+; CHECK-BTF-NEXT: [2] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none)
+; CHECK-BTF-NEXT: [3] UNION '(anon)' size=12 vlen=3
+; CHECK-BTF-NEXT:         '(anon)' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT:         'First' type_id=4 bits_offset=0
+; CHECK-BTF-NEXT:         'Second' type_id=6 bits_offset=0
+; CHECK-BTF-NEXT: [4] STRUCT 'First' size=12 vlen=2
+; CHECK-BTF-NEXT:         'a' type_id=2 bits_offset=32
+; CHECK-BTF-NEXT:         'b' type_id=5 bits_offset=64
+; CHECK-BTF-NEXT: [5] INT 'i32' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [6] STRUCT 'Second' size=12 vlen=1
+; CHECK-BTF-NEXT:         '__0' type_id=2 bits_offset=32
+; CHECK-BTF-NEXT: [7] VAR 'X' type_id=1, linkage=global
+; CHECK-BTF-NEXT: [8] DATASEC '.rodata' size=0 vlen=1
+; CHECK-BTF-NEXT:         type_id=7 offset=0 size=12
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 10, type: !4, isLocal: false, isDefinition: true, align: 32)
+!2 = !DINamespace(name: "variant_part", scope: null)
+!3 = !DIFile(filename: "variant-part/src/main.rs", directory: "/tmp/variant-part", checksumkind: CSK_MD5, checksum: "b94cd53886ea8f14cbc116b36bc7dd36")
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyEnum", scope: !2, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !6, templateParams: !16, identifier: "faba668fd9f71e9b7cf3b9ac5e8b93cb")
+!5 = !DIFile(filename: "<unknown>", directory: "")
+!6 = !{!7}
+!7 = !DICompositeType(tag: DW_TAG_variant_part, scope: !4, file: !5, size: 96, align: 32, elements: !8, templateParams: !16, identifier: "e4aee046fc86d111657622fdcb8c42f7", discriminator: !21)
+!8 = !{!9, !17}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "First", scope: !7, file: !5, baseType: !10, size: 96, align: 32, extraData: i32 0)
+!10 = !DICompositeType(tag: DW_TAG_structure_type, name: "First", scope: !4, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !11, templateParams: !16, identifier: "cc7748c842e275452db4205b190c8ff7")
+!11 = !{!12, !14}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !10, file: !5, baseType: !13, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+!13 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !10, file: !5, baseType: !15, size: 32, align: 32, offset: 64, flags: DIFlagPublic)
+!15 = !DIBasicType(name: "i32", size: 32, encoding: DW_ATE_signed)
+!16 = !{}
+!17 = !DIDerivedType(tag: DW_TAG_member, name: "Second", scope: !7, file: !5, baseType: !18, size: 96, align: 32, extraData: i32 1)
+!18 = !DICompositeType(tag: DW_TAG_structure_type, name: "Second", scope: !4, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !19, templateParams: !16, identifier: "a2094b1381f3082d504fbd0903aa7c06")
+!19 = !{!20}
+!20 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !18, file: !5, baseType: !13, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+!21 = !DIDerivedType(tag: DW_TAG_member, scope: !4, file: !5, baseType: !13, size: 32, align: 32, flags: DIFlagArtificial)
+!22 = !{i32 8, !"PIC Level", i32 2}
+!23 = !{i32 7, !"PIE Level", i32 2}
+!24 = !{i32 7, !"Dwarf Version", i32 4}
+!25 = !{i32 2, !"Debug Info Version", i32 3}
+!26 = !{!"rustc version 1.91.0-nightly (160e7623e 2025-08-26)"}
+!27 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !28, producer: "clang LLVM (rustc version 1.91.0-nightly (160e7623e 2025-08-26))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !29, splitDebugInlining: false, nameTableKind: None)
+!28 = !DIFile(filename: "variant-part/src/main.rs/@/c0znihgkvro8hs0n88fgrtg6x", directory: "/tmp/variant-part")
+!29 = !{!0}
diff --git a/llvm/test/CodeGen/Hexagon/insert-big.ll b/llvm/test/CodeGen/Hexagon/insert-big.ll
new file mode 100644
index 0000000..8735a66
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/insert-big.ll
@@ -0,0 +1,47 @@
+; Check that llc does not abort, which happened due to incorrect MIR.
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=1 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=2 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=3 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=4 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=5 < %s
+
+; Look for this symptom, in case llc does not check invalid IR.
+; CHECK-NOT: insert(%14,%5,#5,#5)
+
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=1 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=2 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=3 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=4 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=5 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+define i32 @f(i32 %0, i32 %1, i32 %2) {
+entry:
+  switch i32 %0, label %common.ret1 [
+    i32 8907, label %3
+    i32 4115, label %6
+  ]
+
+common.ret1:
+  %common.ret1.op = phi i32 [ %5, %3 ], [ %526, %6 ], [ 0, %entry ]
+  ret i32 %common.ret1.op
+
+3:
+  %4 = shl i32 %2, 5
+  %5 = and i32 %4, 992
+  br label %common.ret1
+
+6:
+  %7 = shl i32 %0, 10
+  %8 = and i32 %7, 7168
+  %9 = shl i32 %0, 5
+  %10 = and i32 %9, 992
+  %11 = or i32 %10, %8
+  %12 = and i32 %0, 1
+  %13 = or i32 %11, %12
+  %14 = shl i32 %1, 1
+  %15 = and i32 %14, 2031616
+  %526 = or i32 %13, %15
+  br label %common.ret1
+}
diff --git a/llvm/test/CodeGen/Hexagon/qfp-conv.ll b/llvm/test/CodeGen/Hexagon/qfp-conv.ll
new file mode 100644
index 0000000..d2d393e
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-conv.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=hexagon -mattr=+hvxv68,+hvx,+hvx-length128b < %s | FileCheck %s
+
+; Test that the Qfloat optimization pass doesn't crash due to an invalid
+; instructions.
+
+; CHECK: v{{[0-9]+}}.hf = v{{[0-9]:[0-9]}}.qf32
+
+define void @test(
+    <32 x i32>* %optr,
+    <64 x i32> %in64,
+    <32 x i32> %va,
+    <32 x i32> %vb
+) local_unnamed_addr #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %optr.068 = phi <32 x i32>* [ %optr, %entry ], [ %incdec.ptr6, %for.body ]
+  %0 = tail call <32 x i32> @llvm.hexagon.V6.vconv.hf.qf32.128B(<64 x i32> %in64) #2
+  %1 = tail call <32 x i32> @llvm.hexagon.V6.vdealh.128B(<32 x i32> %0) #2
+  %2 = tail call <128 x i1> @llvm.hexagon.V6.vgth.128B(<32 x i32> %va, <32 x i32> %1) #2
+  %3 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<128 x i1> %2, <32 x i32> %va, <32 x i32> %vb) #2
+  %4 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %3, <32 x i32> %vb) #2
+  %5 = tail call <32 x i32> @llvm.hexagon.V6.vpackhub.sat.128B(<32 x i32> %va, <32 x i32> %4) #2
+  store <32 x i32> %5, <32 x i32>* %optr.068, align 1
+  %incdec.ptr6 = getelementptr inbounds <32 x i32>, <32 x i32>* %optr.068, i32 1
+  br label %for.body
+}
+
+declare <32 x i32> @llvm.hexagon.V6.vdealh.128B(<32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vconv.hf.qf32.128B(<64 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32>, <32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vpackhub.sat.128B(<32 x i32>, <32 x i32>) #1
+declare <128 x i1> @llvm.hexagon.V6.vgth.128B(<32 x i32>, <32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vmux.128B(<128 x i1>, <32 x i32>, <32 x i32>) #1
diff --git a/llvm/test/CodeGen/Hexagon/qfp-enabled.ll b/llvm/test/CodeGen/Hexagon/qfp-enabled.ll
new file mode 100644
index 0000000..a5cc5fa
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-enabled.ll
@@ -0,0 +1,19 @@
+; Tests if the flag to disable qfp optimizer pass works or not.
+
+; RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b \
+; RUN: < %s -o -| FileCheck %s --check-prefix=ENABLED
+; RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b \
+; RUN: -disable-qfp-opt < %s -o -| FileCheck %s --check-prefix=DISABLED
+
+define dso_local <32 x i32> @conv1_qf32(<32 x i32> noundef %input1, <32 x i32> noundef %input2) local_unnamed_addr {
+entry:
+; DISABLED: [[V2:v[0-9]+]].qf32 = vadd(v0.sf,v1.sf)
+; DISABLED: [[V3:v[0-9]+]].sf = [[V2]].qf32
+; DISABLED: qf32 = vadd(v0.sf,[[V3]].sf)
+; ENABLED: [[V4:v[0-9]+]].qf32 = vadd(v0.sf,v1.sf)
+; ENABLED: qf32 = vadd([[V4]].qf32,v0.sf)
+  %0 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.128B(<32 x i32> %input1, <32 x i32> %input2)
+  %1 = tail call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %0)
+  %2 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.128B(<32 x i32> %input1, <32 x i32> %1)
+  ret <32 x i32> %2
+}
diff --git a/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir b/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir
new file mode 100644
index 0000000..d8dde7d
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir
@@ -0,0 +1,95 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer -run-pass machineverifier %s -o - | FileCheck %s
+
+# Test that the killed RegState from DefMI operands are removed
+# killed RegState should be set for MI operands
+# CHECK-LABEL: name: qfpAdd
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]]
+# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32 killed %[[REG1]], killed %[[REG2]]
+# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG3:([0-9]+)]]
+# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG4:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32 killed %[[REG3]], killed %[[REG4]]
+
+---
+name: qfpAdd
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:intregs = COPY $r3
+    %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %6:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+    %7:hvxvr = V6_vL32Ub_ai %3:intregs, 0
+    %8:hvxvr = V6_vconv_sf_qf32 killed %4:hvxvr
+    %9:hvxvr = V6_vconv_sf_qf32 killed %5:hvxvr
+    %10:hvxvr = V6_vadd_sf %8:hvxvr, %9:hvxvr
+    %11:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr
+    %12:hvxvr = V6_vconv_sf_qf32 killed %7:hvxvr
+    %13:hvxvr = V6_vadd_sf killed %11:hvxvr, killed %12:hvxvr
+...
+
+
+# Test that the killed RegState from DefMI operands are removed
+# CHECK-LABEL: name: qfpAddMix
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG1]], %{{[0-9]+}}
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG2]], %{{[0-9]+}}
+
+---
+name: qfpAddMix
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+    %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+    %7:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr
+    %8:hvxvr = V6_vadd_sf %3:hvxvr, %7:hvxvr
+    %9:hvxvr = V6_vmpy_qf32_sf %4, %5
+    %10:hvxvr = V6_vconv_sf_qf32 killed %9:hvxvr
+    %11:hvxvr = V6_vadd_sf %3:hvxvr, killed %10:hvxvr
+...
+
+
+# Test that we do generate V6_vsub_qf32_mix for the below test.
+# V6_vsub_qf32_mix only allowes qf32 as first operand. In the test qf32
+# is passed as first operand. So, V6_vsub_qf32_mix must be generated.
+# CHECK-LABEL: name: qfpAddSwapMix
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG1]], %{{[0-9]+}}
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG2]], %{{[0-9]+}}
+
+---
+name: qfpAddSwapMix
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+    %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+    %7:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr
+    %8:hvxvr = V6_vadd_sf %7:hvxvr, %3:hvxvr
+    %9:hvxvr = V6_vmpy_qf32_sf %4, %5
+    %10:hvxvr = V6_vconv_sf_qf32 killed %9:hvxvr
+    %11:hvxvr = V6_vadd_sf killed %10:hvxvr, %3:hvxvr
+...
diff --git a/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir b/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir
new file mode 100644
index 0000000..1d78203
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir
@@ -0,0 +1,33 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+# CHECK: V6_vshuffvdd
+# CHECK: V6_vadd_sf
+# CHECK: V6_vadd_qf32_mix{{.*}}vsub_lo
+# CHECK: V6_vadd_qf32_mix{{.*}}vsub_hi
+
+---
+name:            qfp_subreg_fix
+alignment:       16
+tracksRegLiveness: true
+
+body:             |
+  bb.0:
+    %10:intregs = IMPLICIT_DEF
+    %9:hvxvr = V6_vL32Ub_ai %10, 0 :: (load (s1024) from `ptr undef`, align 4)
+    %11:intregs = A2_tfrsi 15360
+    %12:hvxvr = V6_lvsplath %11
+    %13:hvxwr = V6_vmpy_qf32_hf %9, %12
+    %15:hvxvr = V6_vconv_sf_qf32 %13.vsub_lo
+    %17:hvxvr = V6_vconv_sf_qf32 %13.vsub_hi
+    %18:intregslow8 = A2_tfrsi -4
+    %19:hvxwr = V6_vshuffvdd %17, %15, %18
+    %21:hvxvr = V6_vadd_sf %19.vsub_hi, %19.vsub_hi
+    %22:hvxvr = V6_vconv_sf_qf32 %21
+    %24:hvxvr = V6_vadd_sf %19.vsub_lo, %19.vsub_lo
+    %25:hvxvr = V6_vconv_sf_qf32 %24
+    %26:hvxvr = V6_vadd_sf %25, %19.vsub_lo
+    %27:hvxvr = V6_vconv_sf_qf32 %26
+    %28:hvxvr = V6_vadd_sf %22, %19.vsub_hi
+    %29:hvxvr = V6_vconv_sf_qf32 %28
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
new file mode 100644
index 0000000..c16370c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
@@ -0,0 +1,21 @@
+; Tests if generated vadd instruction takes in qf32
+; type as first parameter instead of a sf type without
+; any conversion instruction of type sf = qf32
+
+; RUN: llc -mtriple=hexagon < %s -o - | FileCheck %s
+
+; CHECK: [[V2:v[0-9]+]] = vxor([[V2]],[[V2]])
+; CHECK: [[V0:v[0-9]+]].qf32 = vmpy([[V0]].sf,[[V2]].sf)
+; CHECK: [[V1:v[0-9]+]].qf32 = vmpy([[V1]].sf,[[V2]].sf)
+; CHECK: [[V4:v[0-9]+]].qf32 = vadd([[V0]].qf32,[[V2]].sf)
+; CHECK: [[V5:v[0-9]+]].qf32 = vadd([[V1]].qf32,[[V2]].sf)
+
+define void @_Z19compute_ripple_geluIDF16_EviPT_PKS0_(ptr %out_ptr, <64 x float> %conv14.ripple.vectorized) #0 {
+entry:
+  %mul16.ripple.vectorized = fmul <64 x float> %conv14.ripple.vectorized, zeroinitializer
+  %conv17.ripple.vectorized = fptrunc <64 x float> %mul16.ripple.vectorized to <64 x half>
+  store <64 x half> %conv17.ripple.vectorized, ptr %out_ptr, align 2
+  ret void
+}
+
+attributes #0 = { "target-features"="+hvx-length128b,+hvxv75,+v75,-long-calls,-small-data" }
diff --git a/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir b/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir
new file mode 100644
index 0000000..9a9e938
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir
@@ -0,0 +1,79 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+
+# Test that the operands are swapped for Add if the second operand
+# is a qf32 to sf conversion. V6_vadd_qf32_mix supports first operand
+# as qf32.
+# CHECK-LABEL: name: qfpAddMix
+# CHECK: %[[REG:([0-9]+)]]:hvxvr = V6_vmpy_qf32_sf
+# CHECK: V6_vadd_qf32_mix %[[REG]]
+
+---
+name: qfpAddMix
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+    %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+    %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+    %8:hvxvr = V6_vadd_sf %3:hvxvr, %7:hvxvr
+...
+
+
+# Test that we do not generate V6_vsub_qf32_mix for the below test.
+# V6_vsub_qf32_mix only allowes qf32 as first operand. In the test qf32
+# is passed as second operand. As sub is not commutative, we should not
+# generate the mix instruction.
+# CHECK-LABEL: name: qfpSubNoMix
+# CHECK-NOT: V6_vsub_qf32_mix
+
+---
+name: qfpSubNoMix
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+    %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+    %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+    %8:hvxvr = V6_vsub_sf %3:hvxvr, %7:hvxvr
+...
+
+
+# Test that we do generate V6_vsub_qf32_mix for the below test.
+# V6_vsub_qf32_mix only allowes qf32 as first operand. In the test qf32
+# is passed as first operand. So, V6_vsub_qf32_mix must be generated.
+# CHECK-LABEL: name: qfpSubMix
+# CHECK: V6_vsub_qf32_mix
+
+---
+name: qfpSubMix
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+    %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+    %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+    %8:hvxvr = V6_vsub_sf %7:hvxvr, %3:hvxvr
+...
diff --git a/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir b/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir
new file mode 100644
index 0000000..f0b1d3c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir
@@ -0,0 +1,23 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+# CHECK-LABEL: name: qfpAdd32
+# CHECK: V6_vd0
+# CHECK-NEXT: V6_vL32Ub_ai
+# CHECK-NEXT: V6_vadd_sf
+# CHECK-NEXT: V6_vconv_sf_qf32
+# CHECK-NEXT: V6_vS32Ub_ai
+---
+name: qfpAdd32
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %3:hvxvr = V6_vd0
+    %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %5:hvxvr = V6_vadd_sf %3:hvxvr, %4:hvxvr
+    %6:hvxvr = V6_vconv_sf_qf32 %5:hvxvr
+    V6_vS32Ub_ai %1:intregs, 0, %6:hvxvr
+...
diff --git a/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll b/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll
new file mode 100644
index 0000000..2d46da7
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: hexagon
+
+; This test was asserting because getVRegDef() was called on a register with
+; multiple defs.
+; Checks that the test does not assert and vsub is generated.
+; CHECK: vsub
+
+target triple = "hexagon"
+
+@v = common dso_local local_unnamed_addr global <32 x i32> zeroinitializer, align 128
+
+; Function Attrs: nounwind
+define dso_local void @hvx_twoSum(<32 x i32>* nocapture noundef writeonly %s2lo) local_unnamed_addr #0 {
+entry:
+  %0 = load <32 x i32>, <32 x i32>* @v, align 128
+  %call = tail call inreg <32 x i32> @MY_Vsf_equals_Vqf32(<32 x i32> noundef %0) #3
+  %1 = tail call <32 x i32> @llvm.hexagon.V6.vsub.sf.128B(<32 x i32> %call, <32 x i32> %call)
+  store <32 x i32> %1, <32 x i32>* @v, align 128
+  store <32 x i32> %1, <32 x i32>* %s2lo, align 128
+  ret void
+}
+
+declare dso_local inreg <32 x i32> @MY_Vsf_equals_Vqf32(<32 x i32> noundef) local_unnamed_addr #1
+
+; Function Attrs: nofree nosync nounwind readnone
+declare <32 x i32> @llvm.hexagon.V6.vsub.sf.128B(<32 x i32>, <32 x i32>) #2
+
+attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv73" "target-features"="+hvx-length128b,+hvxv73,+v73,-long-calls" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv73" "target-features"="+hvx-length128b,+hvxv73,+v73,-long-calls" }
+attributes #2 = { nofree nosync nounwind readnone }
+attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll
index dcb70f8..f9527ef 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll
@@ -45,6 +45,32 @@ define i32 @bclr_i32_no_mask(i32 %a, i32 %b) nounwind {
   ret i32 %and1
 }
 
+define i32 @bclr_i32_mask_multiple(i32 %a, i32 %b, i32 %shamt) nounwind {
+; RV32I-LABEL: bclr_i32_mask_multiple:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a3, 1
+; RV32I-NEXT:    sll a2, a3, a2
+; RV32I-NEXT:    not a3, a2
+; RV32I-NEXT:    and a0, a3, a0
+; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBS-LABEL: bclr_i32_mask_multiple:
+; RV32ZBS:       # %bb.0:
+; RV32ZBS-NEXT:    bclr a0, a0, a2
+; RV32ZBS-NEXT:    bset a1, a1, a2
+; RV32ZBS-NEXT:    add a0, a0, a1
+; RV32ZBS-NEXT:    ret
+  %shamt_masked = and i32 %shamt, 63
+  %shl = shl nuw i32 1, %shamt_masked
+  %neg = xor i32 %shl, -1
+  %and = and i32 %neg, %a
+  %or = or i32 %b, %shl
+  %c = add i32 %and, %or
+  ret i32 %c
+}
+
 define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: bclr_i64:
 ; RV32I:       # %bb.0:
@@ -301,17 +327,17 @@ define i64 @bext_i64(i64 %a, i64 %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a3, a2, 63
 ; CHECK-NEXT:    addi a4, a3, -32
-; CHECK-NEXT:    bltz a4, .LBB12_2
+; CHECK-NEXT:    bltz a4, .LBB13_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    srl a0, a1, a3
-; CHECK-NEXT:    j .LBB12_3
-; CHECK-NEXT:  .LBB12_2:
+; CHECK-NEXT:    j .LBB13_3
+; CHECK-NEXT:  .LBB13_2:
 ; CHECK-NEXT:    srl a0, a0, a2
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    not a2, a3
 ; CHECK-NEXT:    sll a1, a1, a2
 ; CHECK-NEXT:    or a0, a0, a1
-; CHECK-NEXT:  .LBB12_3:
+; CHECK-NEXT:  .LBB13_3:
 ; CHECK-NEXT:    andi a0, a0, 1
 ; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:    ret
@@ -789,17 +815,17 @@ define i64 @bset_trailing_ones_i64_mask(i64 %a) nounwind {
 ; CHECK-NEXT:    li a3, -1
 ; CHECK-NEXT:    addi a1, a2, -32
 ; CHECK-NEXT:    sll a0, a3, a0
-; CHECK-NEXT:    bltz a1, .LBB43_2
+; CHECK-NEXT:    bltz a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    sll a2, a3, a2
-; CHECK-NEXT:    j .LBB43_3
-; CHECK-NEXT:  .LBB43_2:
+; CHECK-NEXT:    j .LBB44_3
+; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    not a2, a2
 ; CHECK-NEXT:    lui a3, 524288
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    srl a2, a3, a2
 ; CHECK-NEXT:    or a2, a0, a2
-; CHECK-NEXT:  .LBB43_3:
+; CHECK-NEXT:  .LBB44_3:
 ; CHECK-NEXT:    srai a1, a1, 31
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    not a1, a2
@@ -817,17 +843,17 @@ define i64 @bset_trailing_ones_i64_no_mask(i64 %a) nounwind {
 ; CHECK-NEXT:    li a1, -1
 ; CHECK-NEXT:    addi a2, a0, -32
 ; CHECK-NEXT:    sll a1, a1, a0
-; CHECK-NEXT:    bltz a2, .LBB44_2
+; CHECK-NEXT:    bltz a2, .LBB45_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:    j .LBB44_3
-; CHECK-NEXT:  .LBB44_2:
+; CHECK-NEXT:    j .LBB45_3
+; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    not a0, a0
 ; CHECK-NEXT:    lui a3, 524288
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    srl a0, a3, a0
 ; CHECK-NEXT:    or a0, a1, a0
-; CHECK-NEXT:  .LBB44_3:
+; CHECK-NEXT:  .LBB45_3:
 ; CHECK-NEXT:    srai a2, a2, 31
 ; CHECK-NEXT:    and a2, a2, a1
 ; CHECK-NEXT:    not a1, a0
diff --git a/llvm/test/CodeGen/RISCV/rv64zbs.ll b/llvm/test/CodeGen/RISCV/rv64zbs.ll
index b4edcf6c..d42bc8e 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbs.ll
@@ -110,6 +110,32 @@ define i64 @bclr_i64_no_mask(i64 %a, i64 %b) nounwind {
   ret i64 %and1
 }
 
+define i64 @bclr_i64_mask_multiple(i64 %a, i64 %b, i64 %shamt) nounwind {
+; RV64I-LABEL: bclr_i64_mask_multiple:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a3, 1
+; RV64I-NEXT:    sll a2, a3, a2
+; RV64I-NEXT:    not a3, a2
+; RV64I-NEXT:    and a0, a3, a0
+; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBS-LABEL: bclr_i64_mask_multiple:
+; RV64ZBS:       # %bb.0:
+; RV64ZBS-NEXT:    bclr a0, a0, a2
+; RV64ZBS-NEXT:    bset a1, a1, a2
+; RV64ZBS-NEXT:    add a0, a0, a1
+; RV64ZBS-NEXT:    ret
+  %shamt_masked = and i64 %shamt, 63
+  %shl = shl nuw i64 1, %shamt_masked
+  %neg = xor i64 %shl, -1
+  %and = and i64 %neg, %a
+  %or = or i64 %b, %shl
+  %c = add i64 %and, %or
+  ret i64 %c
+}
+
 define signext i32 @bset_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: bset_i32:
 ; RV64I:       # %bb.0:
@@ -372,19 +398,19 @@ define void @bext_i32_trunc(i32 signext %0, i32 signext %1) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srlw a0, a0, a1
 ; RV64I-NEXT:    andi a0, a0, 1
-; RV64I-NEXT:    beqz a0, .LBB19_2
+; RV64I-NEXT:    beqz a0, .LBB20_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    ret
-; RV64I-NEXT:  .LBB19_2:
+; RV64I-NEXT:  .LBB20_2:
 ; RV64I-NEXT:    tail bar
 ;
 ; RV64ZBS-LABEL: bext_i32_trunc:
 ; RV64ZBS:       # %bb.0:
 ; RV64ZBS-NEXT:    bext a0, a0, a1
-; RV64ZBS-NEXT:    beqz a0, .LBB19_2
+; RV64ZBS-NEXT:    beqz a0, .LBB20_2
 ; RV64ZBS-NEXT:  # %bb.1:
 ; RV64ZBS-NEXT:    ret
-; RV64ZBS-NEXT:  .LBB19_2:
+; RV64ZBS-NEXT:  .LBB20_2:
 ; RV64ZBS-NEXT:    tail bar
   %3 = shl i32 1, %1
   %4 = and i32 %3, %0
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-14.ll b/llvm/test/CodeGen/SystemZ/int-conv-14.ll
index 98dc88f..baab5ac 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-14.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-14.ll
@@ -55,14 +55,15 @@ define i128 @f4(ptr %ptr) {
 }
 
 ; Truncation to i64.
-define i64 @f5(i128 %a) {
+define i64 @f5(i128 %a, i128 %b) {
 ; CHECK-LABEL: f5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl %v0, 0(%r2), 3
-; CHECK-NEXT:    vaq %v0, %v0, %v0
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vaq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvg %r2, %v0, 1
 ; CHECK-NEXT:    br %r14
-  %op = add i128 %a, %a
+  %op = add i128 %a, %b
   %res = trunc i128 %op to i64
   ret i64 %res
 }
@@ -134,15 +135,16 @@ define i128 @f10(ptr %ptr) {
 }
 
 ; Truncation to i32.
-define i32 @f11(i128 %a) {
+define i32 @f11(i128 %a, i128 %b) {
 ; CHECK-LABEL: f11:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl %v0, 0(%r2), 3
-; CHECK-NEXT:    vaq %v0, %v0, %v0
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vaq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvf %r2, %v0, 3
 ; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
 ; CHECK-NEXT:    br %r14
-  %op = add i128 %a, %a
+  %op = add i128 %a, %b
   %res = trunc i128 %op to i32
   ret i32 %res
 }
@@ -215,15 +217,16 @@ define i128 @f16(ptr %ptr) {
 }
 
 ; Truncation to i16.
-define i16 @f17(i128 %a) {
+define i16 @f17(i128 %a, i128 %b) {
 ; CHECK-LABEL: f17:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl %v0, 0(%r2), 3
-; CHECK-NEXT:    vaq %v0, %v0, %v0
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vaq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvf %r2, %v0, 3
 ; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
 ; CHECK-NEXT:    br %r14
-  %op = add i128 %a, %a
+  %op = add i128 %a, %b
   %res = trunc i128 %op to i16
   ret i16 %res
 }
@@ -296,15 +299,16 @@ define i128 @f22(ptr %ptr) {
 }
 
 ; Truncation to i8.
-define i8 @f23(i128 %a) {
+define i8 @f23(i128 %a, i128 %b) {
 ; CHECK-LABEL: f23:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl %v0, 0(%r2), 3
-; CHECK-NEXT:    vaq %v0, %v0, %v0
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vaq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvf %r2, %v0, 3
 ; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
 ; CHECK-NEXT:    br %r14
-  %op = add i128 %a, %a
+  %op = add i128 %a, %b
   %res = trunc i128 %op to i8
   ret i8 %res
 }
@@ -385,15 +389,16 @@ define i128 @f28(ptr %ptr) {
 }
 
 ; Truncation to i1.
-define i1 @f29(i128 %a) {
+define i1 @f29(i128 %a, i128 %b) {
 ; CHECK-LABEL: f29:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl %v0, 0(%r2), 3
-; CHECK-NEXT:    vaq %v0, %v0, %v0
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vaq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvf %r2, %v0, 3
 ; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
 ; CHECK-NEXT:    br %r14
-  %op = add i128 %a, %a
+  %op = add i128 %a, %b
   %res = trunc i128 %op to i1
   ret i1 %res
 }
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-15.ll b/llvm/test/CodeGen/SystemZ/int-conv-15.ll
index 0d8ee75..f2c9ee5 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-15.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-15.ll
@@ -55,14 +55,15 @@ define i128 @f4(ptr %ptr) {
 }
 
 ; Truncation to i64.
-define i64 @f5(i128 %a) {
+define i64 @f5(i128 %a, i128 %b) {
 ; CHECK-LABEL: f5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl %v0, 0(%r2), 3
-; CHECK-NEXT:    vaq %v0, %v0, %v0
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vaq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvg %r2, %v0, 1
 ; CHECK-NEXT:    br %r14
-  %op = add i128 %a, %a
+  %op = add i128 %a, %b
   %res = trunc i128 %op to i64
   ret i64 %res
 }
@@ -134,15 +135,16 @@ define i128 @f10(ptr %ptr) {
 }
 
 ; Truncation to i32.
-define i32 @f11(i128 %a) {
+define i32 @f11(i128 %a, i128 %b) {
 ; CHECK-LABEL: f11:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl %v0, 0(%r2), 3
-; CHECK-NEXT:    vaq %v0, %v0, %v0
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vaq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvf %r2, %v0, 3
 ; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
 ; CHECK-NEXT:    br %r14
-  %op = add i128 %a, %a
+  %op = add i128 %a, %b
   %res = trunc i128 %op to i32
   ret i32 %res
 }
@@ -215,15 +217,16 @@ define i128 @f16(ptr %ptr) {
 }
 
 ; Truncation to i16.
-define i16 @f17(i128 %a) {
+define i16 @f17(i128 %a, i128 %b) {
 ; CHECK-LABEL: f17:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl %v0, 0(%r2), 3
-; CHECK-NEXT:    vaq %v0, %v0, %v0
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vaq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvf %r2, %v0, 3
 ; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
 ; CHECK-NEXT:    br %r14
-  %op = add i128 %a, %a
+  %op = add i128 %a, %b
   %res = trunc i128 %op to i16
   ret i16 %res
 }
@@ -296,15 +299,16 @@ define i128 @f22(ptr %ptr) {
 }
 
 ; Truncation to i8.
-define i8 @f23(i128 %a) {
+define i8 @f23(i128 %a, i128 %b) {
 ; CHECK-LABEL: f23:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl %v0, 0(%r2), 3
-; CHECK-NEXT:    vaq %v0, %v0, %v0
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vaq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvf %r2, %v0, 3
 ; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
 ; CHECK-NEXT:    br %r14
-  %op = add i128 %a, %a
+  %op = add i128 %a, %b
   %res = trunc i128 %op to i8
   ret i8 %res
 }
@@ -383,15 +387,16 @@ define i128 @f28(ptr %ptr) {
 }
 
 ; Truncation to i1.
-define i1 @f29(i128 %a) {
+define i1 @f29(i128 %a, i128 %b) {
 ; CHECK-LABEL: f29:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl %v0, 0(%r2), 3
-; CHECK-NEXT:    vaq %v0, %v0, %v0
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vaq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvf %r2, %v0, 3
 ; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
 ; CHECK-NEXT:    br %r14
-  %op = add i128 %a, %a
+  %op = add i128 %a, %b
   %res = trunc i128 %op to i1
   ret i1 %res
 }
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
index 83d71cd..5293958 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
@@ -16,8 +16,7 @@
 # CHECK-DAG: Symbols: { _foo }, Dependencies: { (main, { _external_func }) }
 # CHECK-DAG: Symbols: { _baz }, Dependencies: { (main, { _foo }) }
 # CHECK: Simplified dependencies:
-# CHECK-DAG: Symbols: { _foo }, Dependencies: { (main, { _external_func }) }
-# CHECK-DAG: Symbols: { _baz }, Dependencies: { (main, { _external_func }) }
+# CHECK-DAG: Defs: { (main, [ _baz _foo ]) }, Deps: { (main, [ _external_func ]) }
 
         .section	__TEXT,__text,regular,pure_instructions
 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
new file mode 100644
index 0000000..1ddcd4b
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s
+
+; XFAIL: *
+
+; Forked from llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll
+; Manually minimized to show MSan leads to a compiler crash
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+define target("aarch64.svcount") @test_return_arg1(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1) nounwind {
+  ret target("aarch64.svcount") %arg1
+}
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
new file mode 100644
index 0000000..9caa89d
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s
+
+; XFAIL: *
+
+; Forked from llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+;
+; Test simple loads, stores and return.
+;
+define target("aarch64.svcount") @test_load(ptr %ptr) nounwind {
+  %res = load target("aarch64.svcount"), ptr %ptr
+  ret target("aarch64.svcount") %res
+}
+
+define void @test_store(ptr %ptr, target("aarch64.svcount") %val) nounwind {
+  store target("aarch64.svcount") %val, ptr %ptr
+  ret void
+}
+
+define target("aarch64.svcount") @test_alloca_store_reload(target("aarch64.svcount") %val) nounwind {
+  %ptr = alloca target("aarch64.svcount"), align 1
+  store target("aarch64.svcount") %val, ptr %ptr
+  %res = load target("aarch64.svcount"), ptr %ptr
+  ret target("aarch64.svcount") %res
+}
+
+;
+; Test passing as arguments (from perspective of callee)
+;
+
+define target("aarch64.svcount") @test_return_arg1(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1) nounwind {
+  ret target("aarch64.svcount") %arg1
+}
+
+define target("aarch64.svcount") @test_return_arg4(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1, target("aarch64.svcount") %arg2, target("aarch64.svcount") %arg3, target("aarch64.svcount") %arg4) nounwind {
+  ret target("aarch64.svcount") %arg4
+}
+
+;
+; Test passing as arguments (from perspective of caller)
+;
+
+declare void @take_svcount_1(target("aarch64.svcount") %arg)
+define void @test_pass_1arg(target("aarch64.svcount") %arg) nounwind {
+  call void @take_svcount_1(target("aarch64.svcount") %arg)
+  ret void
+}
+
+declare void @take_svcount_5(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1, target("aarch64.svcount") %arg2, target("aarch64.svcount") %arg3, target("aarch64.svcount") %arg4)
+define void @test_pass_5args(target("aarch64.svcount") %arg) nounwind {
+  call void @take_svcount_5(target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg)
+  ret void
+}
+
+define target("aarch64.svcount") @test_sel(target("aarch64.svcount") %x, target("aarch64.svcount") %y, i1 %cmp) sanitize_memory {
+  %x.y = select i1 %cmp, target("aarch64.svcount") %x, target("aarch64.svcount") %y
+  ret target("aarch64.svcount") %x.y
+}
+
+define target("aarch64.svcount") @test_sel_cc(target("aarch64.svcount") %x, target("aarch64.svcount") %y, i32 %k) sanitize_memory {
+  %cmp = icmp sgt i32 %k, 42
+  %x.y = select i1 %cmp, target("aarch64.svcount") %x, target("aarch64.svcount") %y
+  ret target("aarch64.svcount") %x.y
+}
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index 0c23e3b..7aaf3a0 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -73,6 +73,8 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 
 // All entries should be emitted in Libcall enum.
 // CHECK: #ifdef GET_RUNTIME_LIBCALL_ENUM
+// CHECK-NEXT: #undef GET_RUNTIME_LIBCALL_ENUM
+// CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm {
 // CHECK-NEXT: namespace RTLIB {
 // CHECK-NEXT: enum Libcall : unsigned short {
@@ -101,9 +103,12 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: constexpr size_t NumLibcallImpls = 9;
 // CHECK-NEXT: } // End namespace RTLIB
 // CHECK-NEXT: } // End namespace llvm
-// CHECK-NEXT: #endif
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_RUNTIME_LIBCALL_ENUM
 
 // CHECK: #ifdef GET_INIT_RUNTIME_LIBCALL_NAMES
+// CHECK-NEXT: #undef GET_INIT_RUNTIME_LIBCALL_NAMES
+// CHECK-EMPTY:
 // CHECK-EMPTY:
 // CHECK-NEXT: #ifdef __GNUC__
 // CHECK-NEXT: #pragma GCC diagnostic push
@@ -163,13 +168,18 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: };
 
 // CHECK: #ifdef GET_LOOKUP_LIBCALL_IMPL_NAME_BODY
+// CHECK-NEXT: #undef GET_LOOKUP_LIBCALL_IMPL_NAME_BODY
+// CHECK-EMPTY:
 // CHECK-NEXT: size_t Size = Name.size();
 // CHECK-NEXT: if (Size == 0 || Size > 9)
 // CHECK-NEXT:   return enum_seq(RTLIB::Unsupported, RTLIB::Unsupported);
 // CHECK-NEXT: return lookupLibcallImplNameImpl(Name);
-// CHECK-NEXT: #endif
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_LOOKUP_LIBCALL_IMPL_NAME_BODY
 
 // CHECK: #ifdef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
+// CHECK-NEXT: #undef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
+// CHECK-EMPTY:
 // CHECK-NEXT: static inline uint64_t hash(StringRef Str) {
 // CHECK-NEXT: return static_cast<uint32_t>(xxh3_64bits(Str));
 // CHECK-NEXT: }
diff --git a/llvm/test/Transforms/GVNSink/ptrtoaddr.ll b/llvm/test/Transforms/GVNSink/ptrtoaddr.ll
new file mode 100644
index 0000000..3a9d16e
--- /dev/null
+++ b/llvm/test/Transforms/GVNSink/ptrtoaddr.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=gvn-sink < %s | FileCheck %s
+
+define i64 @test(i1 %c, ptr %p, ptr %p2) {
+; CHECK-LABEL: define i64 @test(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT:    br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK:       [[IF]]:
+; CHECK-NEXT:    br label %[[JOIN:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    br label %[[JOIN]]
+; CHECK:       [[JOIN]]:
+; CHECK-NEXT:    [[P2_SINK:%.*]] = phi ptr [ [[P2]], %[[ELSE]] ], [ [[P]], %[[IF]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = ptrtoaddr ptr [[P2_SINK]] to i64
+; CHECK-NEXT:    ret i64 [[PHI]]
+;
+  br i1 %c, label %if, label %else
+
+if:
+  %p.addr = ptrtoaddr ptr %p to i64
+  br label %join
+
+else:
+  %p2.addr = ptrtoaddr ptr %p2 to i64
+  br label %join
+
+join:
+  %phi = phi i64 [ %p.addr, %if ], [ %p2.addr, %else ]
+  ret i64 %phi
+}
diff --git a/llvm/test/Transforms/Inline/ML/recursive.ll b/llvm/test/Transforms/Inline/ML/recursive.ll
new file mode 100644
index 0000000..2d9240a
--- /dev/null
+++ b/llvm/test/Transforms/Inline/ML/recursive.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6
+; REQUIRES: llvm_inliner_model_autogenerated
+; RUN: opt -S %s -o - -passes='inliner-ml-advisor-release' -ml-inliner-skip-policy=if-caller-not-cold | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-android29"
+
+define i32 @a_func(ptr %this, i32 %color_id, i1 %dark_mode) local_unnamed_addr {
+; CHECK-LABEL: define i32 @a_func(
+; CHECK-SAME: ptr [[THIS:%.*]], i32 [[COLOR_ID:%.*]], i1 [[DARK_MODE:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[DARK_MODE]], label %[[SW_BB97:.*]], label %[[COMMON_RET:.*]]
+; CHECK:       [[COMMON_RET]]:
+; CHECK-NEXT:    ret i32 0
+; CHECK:       [[SW_BB97]]:
+; CHECK-NEXT:    br label %[[COMMON_RET]]
+;
+entry:
+  br i1 %dark_mode, label %sw.bb97, label %common.ret
+
+common.ret:                                       ; preds = %sw.bb97, %entry
+  ret i32 0
+
+sw.bb97:                                          ; preds = %entry
+  %call.i = tail call i32 @a_func(ptr null, i32 0, i1 false)
+  br label %common.ret
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
+declare ptr @llvm.load.relative.i32(ptr %0, i32 %1) #0
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+;.
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
deleted file mode 100644
index ffaa8b1..0000000
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ /dev/null
@@ -1,163 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-; The ptrtoaddr folds are also valid for pointers that have external state.
-target datalayout = "pe1:64:64:64:32"
-
-@g = external global i8
-@g2 = external global i8
-
-@g.as1 = external addrspace(1) global i8
-@g2.as1 = external addrspace(1) global i8
-
-define i64 @ptrtoaddr_inttoptr_arg(i64 %a) {
-; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_arg(
-; CHECK-SAME: i64 [[A:%.*]]) {
-; CHECK-NEXT:    ret i64 [[A]]
-;
-  %toptr = inttoptr i64 %a to ptr
-  %toaddr = ptrtoaddr ptr %toptr to i64
-  ret i64 %toaddr
-}
-
-define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) {
-; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize(
-; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT:    ret i32 [[A]]
-;
-  %toptr = inttoptr i32 %a to ptr addrspace(1)
-  %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
-  ret i32 %toaddr
-}
-
-define i32 @ptrtoaddr_inttoptr() {
-; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr() {
-; CHECK-NEXT:    ret i32 -1
-;
-  ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i32 -1 to ptr addrspace(1)) to i32)
-}
-
-define i32 @ptrtoaddr_inttoptr_diff_size1() {
-; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() {
-; CHECK-NEXT:    ret i32 -1
-;
-  ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32)
-}
-
-define i32 @ptrtoaddr_inttoptr_diff_size2() {
-; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() {
-; CHECK-NEXT:    ret i32 65535
-;
-  ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32)
-}
-
-define i64 @ptrtoaddr_inttoptr_noas1() {
-; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas1() {
-; CHECK-NEXT:    ret i64 1
-;
-  ret i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 1) to i64)
-}
-
-define i64 @ptr2addr2_inttoptr_noas2() {
-; CHECK-LABEL: define i64 @ptr2addr2_inttoptr_noas2() {
-; CHECK-NEXT:    ret i64 123
-;
-  ret i64 ptrtoaddr (ptr inttoptr (i64 123 to ptr) to i64)
-}
-
-define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
-; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
-; CHECK-NEXT:    ret i64 4294967295
-;
-  ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64)
-}
-
-define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
-; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
-; CHECK-NEXT:    ret i64 -1
-;
-  ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64)
-}
-
-define i64 @ptrtoaddr_gep_null() {
-; CHECK-LABEL: define i64 @ptrtoaddr_gep_null() {
-; CHECK-NEXT:    ret i64 42
-;
-  ret i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 42) to i64)
-}
-
-define i32 @ptrtoaddr_gep_null_addrsize() {
-; CHECK-LABEL: define i32 @ptrtoaddr_gep_null_addrsize() {
-; CHECK-NEXT:    ret i32 42
-;
-  ret i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i32 42) to i32)
-}
-
-define i64 @ptrtoaddr_gep_sub() {
-; CHECK-LABEL: define i64 @ptrtoaddr_gep_sub() {
-; CHECK-NEXT:    ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
-;
-  ret i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 sub (i64 0, i64 ptrtoaddr (ptr @g2 to i64))) to i64)
-}
-
-define i32 @ptrtoaddr_gep_sub_addrsize() {
-; CHECK-LABEL: define i32 @ptrtoaddr_gep_sub_addrsize() {
-; CHECK-NEXT:    ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))
-;
-  ret i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 sub (i32 0, i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))) to i32)
-}
-
-; Don't fold inttoptr of ptrtoaddr away. inttoptr will pick a previously
-; exposed provenance, which is not necessarily that of @g (especially as
-; ptrtoaddr does not expose the provenance.)
-define ptr @inttoptr_of_ptrtoaddr() {
-; CHECK-LABEL: define ptr @inttoptr_of_ptrtoaddr() {
-; CHECK-NEXT:    ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr)
-;
-  ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr)
-}
-
-define i64 @ptrtoaddr_sub_consts_unrelated() {
-; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_unrelated() {
-; CHECK-NEXT:    ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
-;
-  ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
-}
-
-define i64 @ptrtoaddr_sub_consts_offset() {
-; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_offset() {
-; CHECK-NEXT:    ret i64 42
-;
-  ret i64 sub (i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 42) to i64), i64 ptrtoaddr (ptr @g to i64))
-}
-
-define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
-; CHECK-LABEL: define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
-; CHECK-NEXT:    ret i32 42
-;
-  ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 42) to i32), i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32))
-}
-
-define i64 @ptrtoaddr_sub_known_offset(ptr %p) {
-; CHECK-LABEL: define i64 @ptrtoaddr_sub_known_offset(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT:    ret i64 42
-;
-  %p2 = getelementptr inbounds i8, ptr %p, i64 42
-  %p.addr = ptrtoaddr ptr %p to i64
-  %p2.addr = ptrtoaddr ptr %p2 to i64
-  %sub = sub i64 %p2.addr, %p.addr
-  ret i64 %sub
-}
-
-define i32 @ptrtoaddr_sub_known_offset_addrsize(ptr addrspace(1) %p) {
-; CHECK-LABEL: define i32 @ptrtoaddr_sub_known_offset_addrsize(
-; CHECK-SAME: ptr addrspace(1) [[P:%.*]]) {
-; CHECK-NEXT:    ret i32 42
-;
-  %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i32 42
-  %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
-  %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
-  %sub = sub i32 %p2.addr, %p.addr
-  ret i32 %sub
-}
diff --git a/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll b/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll
new file mode 100644
index 0000000..d06b520
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll
@@ -0,0 +1,318 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+
+; The ptrtoaddr folds are also valid for pointers that have external state.
+target datalayout = "pe1:64:64:64:32"
+
+@g = external global i8
+@g2 = external global i8
+
+@g.as1 = external addrspace(1) global i8
+@g2.as1 = external addrspace(1) global i8
+
+define i64 @ptrtoaddr_inttoptr_arg(i64 %a) {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_arg(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT:    ret i64 [[A]]
+;
+  %toptr = inttoptr i64 %a to ptr
+  %toaddr = ptrtoaddr ptr %toptr to i64
+  ret i64 %toaddr
+}
+
+define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    ret i32 [[A]]
+;
+  %toptr = inttoptr i32 %a to ptr addrspace(1)
+  %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
+  ret i32 %toaddr
+}
+
+define i32 @ptrtoaddr_inttoptr() {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %toptr = inttoptr i32 -1 to ptr addrspace(1)
+  %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
+  ret i32 %toaddr
+}
+
+define i32 @ptrtoaddr_inttoptr_diff_size1() {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %toptr = inttoptr i64 -1 to ptr addrspace(1)
+  %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
+  ret i32 %toaddr
+}
+
+define i32 @ptrtoaddr_inttoptr_diff_size2() {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() {
+; CHECK-NEXT:    ret i32 65535
+;
+  %toptr = inttoptr i16 -1 to ptr addrspace(1)
+  %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
+  ret i32 %toaddr
+}
+
+define i64 @ptrtoaddr_inttoptr_noas1() {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas1() {
+; CHECK-NEXT:    ret i64 1
+;
+  %toptr = getelementptr i8, ptr null, i64 1
+  %toaddr = ptrtoaddr ptr %toptr to i64
+  ret i64 %toaddr
+}
+
+define i64 @ptr2addr2_inttoptr_noas2() {
+; CHECK-LABEL: define i64 @ptr2addr2_inttoptr_noas2() {
+; CHECK-NEXT:    ret i64 123
+;
+  %toptr = inttoptr i64 123 to ptr
+  %toaddr = ptrtoaddr ptr %toptr to i64
+  ret i64 %toaddr
+}
+
+define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
+; CHECK-NEXT:    ret i64 4294967295
+;
+  %toptr = inttoptr i32 -1 to ptr
+  %toaddr = ptrtoaddr ptr %toptr to i64
+  ret i64 %toaddr
+}
+
+define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %toptr = inttoptr i128 -1 to ptr
+  %toaddr = ptrtoaddr ptr %toptr to i64
+  ret i64 %toaddr
+}
+
+define i64 @ptrtoaddr_gep_null() {
+; CHECK-LABEL: define i64 @ptrtoaddr_gep_null() {
+; CHECK-NEXT:    ret i64 42
+;
+  %toaddr = ptrtoaddr ptr getelementptr (i8, ptr null, i64 42) to i64
+  ret i64 %toaddr
+}
+
+define i32 @ptrtoaddr_gep_null_addrsize() {
+; CHECK-LABEL: define i32 @ptrtoaddr_gep_null_addrsize() {
+; CHECK-NEXT:    ret i32 42
+;
+  %toaddr = ptrtoaddr ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i32 42) to i32
+  ret i32 %toaddr
+}
+
+define i64 @ptrtoaddr_gep_sub() {
+; CHECK-LABEL: define i64 @ptrtoaddr_gep_sub() {
+; CHECK-NEXT:    ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
+;
+  %toaddr = ptrtoaddr ptr getelementptr (i8, ptr @g, i64 sub (i64 0, i64 ptrtoaddr (ptr @g2 to i64))) to i64
+  ret i64 %toaddr
+}
+
+define i32 @ptrtoaddr_gep_sub_addrsize() {
+; CHECK-LABEL: define i32 @ptrtoaddr_gep_sub_addrsize() {
+; CHECK-NEXT:    ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))
+;
+  %toaddr = ptrtoaddr ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 sub (i32 0, i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))) to i32
+  ret i32 %toaddr
+}
+
+; Don't fold inttoptr of ptrtoaddr away. inttoptr will pick a previously
+; exposed provenance, which is not necessarily that of @g (especially as
+; ptrtoaddr does not expose the provenance.)
+define ptr @inttoptr_of_ptrtoaddr() {
+; CHECK-LABEL: define ptr @inttoptr_of_ptrtoaddr() {
+; CHECK-NEXT:    ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr)
+;
+  %toptr = inttoptr i64 ptrtoaddr (ptr @g to i64) to ptr
+  ret ptr %toptr
+}
+
+define i64 @ptrtoaddr_sub_consts_unrelated() {
+; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_unrelated() {
+; CHECK-NEXT:    ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
+;
+  %sub = sub i64 ptrtoaddr (ptr @g to i64), ptrtoaddr (ptr @g2 to i64)
+  ret i64 %sub
+}
+
+define i64 @ptrtoaddr_sub_consts_offset() {
+; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_offset() {
+; CHECK-NEXT:    ret i64 42
+;
+  %sub = sub i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 42) to i64), ptrtoaddr (ptr @g to i64)
+  ret i64 %sub
+}
+
+define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
+; CHECK-LABEL: define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
+; CHECK-NEXT:    ret i32 42
+;
+  %sub = sub i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 42) to i32), ptrtoaddr (ptr addrspace(1) @g.as1 to i32)
+  ret i32 %sub
+}
+
+define i64 @ptrtoaddr_sub_known_offset(ptr %p) {
+; CHECK-LABEL: define i64 @ptrtoaddr_sub_known_offset(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    ret i64 42
+;
+  %p2 = getelementptr inbounds i8, ptr %p, i64 42
+  %p.addr = ptrtoaddr ptr %p to i64
+  %p2.addr = ptrtoaddr ptr %p2 to i64
+  %sub = sub i64 %p2.addr, %p.addr
+  ret i64 %sub
+}
+
+define i32 @ptrtoaddr_sub_known_offset_addrsize(ptr addrspace(1) %p) {
+; CHECK-LABEL: define i32 @ptrtoaddr_sub_known_offset_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]]) {
+; CHECK-NEXT:    ret i32 42
+;
+  %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i32 42
+  %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+  %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+  %sub = sub i32 %p2.addr, %p.addr
+  ret i32 %sub
+}
+
+define i64 @ptrtoaddr_of_ptradd_of_sub(i64 %x, ptr %p) {
+; CHECK-LABEL: define i64 @ptrtoaddr_of_ptradd_of_sub(
+; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    ret i64 [[X]]
+;
+  %p.addr = ptrtoaddr ptr %p to i64
+  %sub = sub i64 %x, %p.addr
+  %ptradd = getelementptr i8, ptr %p, i64 %sub
+  %ptradd.addr = ptrtoaddr ptr %ptradd to i64
+  ret i64 %ptradd.addr
+}
+
+define i32 @ptrtoaddr_of_ptradd_of_sub_addrsize(i32 %x, ptr addrspace(1) %p) {
+; CHECK-LABEL: define i32 @ptrtoaddr_of_ptradd_of_sub_addrsize(
+; CHECK-SAME: i32 [[X:%.*]], ptr addrspace(1) [[P:%.*]]) {
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+  %sub = sub i32 %x, %p.addr
+  %ptradd = getelementptr i8, ptr addrspace(1) %p, i32 %sub
+  %ptradd.addr = ptrtoaddr ptr addrspace(1) %ptradd to i32
+  ret i32 %ptradd.addr
+}
+
+define ptr @gep_of_sub_ptrtoaddr_unrelated_pointers(ptr %p, ptr %p2, i64 %x) {
+; CHECK-LABEL: define ptr @gep_of_sub_ptrtoaddr_unrelated_pointers(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT:    [[P2_ADDR:%.*]] = ptrtoaddr ptr [[P2]] to i64
+; CHECK-NEXT:    [[P_ADDR:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[P2_ADDR]], [[P_ADDR]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[P]], i64 [[SUB]]
+; CHECK-NEXT:    ret ptr [[GEP2]]
+;
+  %p2.addr = ptrtoaddr ptr %p2 to i64
+  %p.addr = ptrtoaddr ptr %p to i64
+  %sub = sub i64 %p2.addr, %p.addr
+  %gep2 = getelementptr i8, ptr %p, i64 %sub
+  ret ptr %gep2
+}
+
+define ptr @gep_of_sub_ptrtoaddr(ptr %p, i64 %x) {
+; CHECK-LABEL: define ptr @gep_of_sub_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]]
+; CHECK-NEXT:    ret ptr [[GEP1]]
+;
+  %gep1 = getelementptr i8, ptr %p, i64 %x
+  %gep1.addr = ptrtoaddr ptr %gep1 to i64
+  %p.addr = ptrtoaddr ptr %p to i64
+  %sub = sub i64 %gep1.addr, %p.addr
+  %gep2 = getelementptr i8, ptr %p, i64 %sub
+  ret ptr %gep2
+}
+
+define ptr addrspace(1) @gep_of_sub_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %x) {
+; CHECK-LABEL: define ptr addrspace(1) @gep_of_sub_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(1) [[P]], i32 [[X]]
+; CHECK-NEXT:    ret ptr addrspace(1) [[GEP1]]
+;
+  %gep1 = getelementptr i8, ptr addrspace(1) %p, i32 %x
+  %gep1.addr = ptrtoaddr ptr addrspace(1) %gep1 to i32
+  %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+  %sub = sub i32 %gep1.addr, %p.addr
+  %gep2 = getelementptr i8, ptr addrspace(1) %p, i32 %sub
+  ret ptr addrspace(1) %gep2
+}
+
+define ptr @gep_of_sub_ptrtoaddr_ashr(ptr %p, i64 %x) {
+; CHECK-LABEL: define ptr @gep_of_sub_ptrtoaddr_ashr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]]
+; CHECK-NEXT:    ret ptr [[GEP1]]
+;
+  %gep1 = getelementptr i8, ptr %p, i64 %x
+  %gep1.addr = ptrtoaddr ptr %gep1 to i64
+  %p.addr = ptrtoaddr ptr %p to i64
+  %sub = sub i64 %gep1.addr, %p.addr
+  %ashr = ashr i64 %sub, 1
+  %gep2 = getelementptr i16, ptr %p, i64 %ashr
+  ret ptr %gep2
+}
+
+define ptr addrspace(1) @gep_of_sub_ptrtoaddr_ashr_addrsize(ptr addrspace(1) %p, i32 %x) {
+; CHECK-LABEL: define ptr addrspace(1) @gep_of_sub_ptrtoaddr_ashr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(1) [[P]], i32 [[X]]
+; CHECK-NEXT:    ret ptr addrspace(1) [[GEP1]]
+;
+  %gep1 = getelementptr i8, ptr addrspace(1) %p, i32 %x
+  %gep1.addr = ptrtoaddr ptr addrspace(1) %gep1 to i32
+  %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+  %sub = sub i32 %gep1.addr, %p.addr
+  %sdiv = sdiv i32 %sub, 3
+  %gep2 = getelementptr [3 x i8], ptr addrspace(1) %p, i32 %sdiv
+  ret ptr addrspace(1) %gep2
+}
+
+; Not folding this to inttoptr(123), as this may have different provenance from
+; %p, and the use of ptrtoaddr implies that the provenance of %p may not be
+; exposed, such that inttoptr cannot recover it.
+define ptr @gep_gep_neg_ptrtoaddr(ptr %p) {
+; CHECK-LABEL: define ptr @gep_gep_neg_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 123
+; CHECK-NEXT:    [[P_ADDR:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT:    [[P_ADDR_NEG:%.*]] = sub i64 0, [[P_ADDR]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 [[P_ADDR_NEG]]
+; CHECK-NEXT:    ret ptr [[GEP2]]
+;
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 123
+  %p.addr = ptrtoaddr ptr %p to i64
+  %p.addr.neg = sub i64 0, %p.addr
+  %gep2 = getelementptr i8, ptr %gep1, i64 %p.addr.neg
+  ret ptr %gep2
+}
+
+define ptr @gep_gep_inv_ptrtoaddr(ptr %p) {
+; CHECK-LABEL: define ptr @gep_gep_inv_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 123
+; CHECK-NEXT:    [[P_ADDR:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT:    [[P_ADDR_INV:%.*]] = xor i64 [[P_ADDR]], -1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 [[P_ADDR_INV]]
+; CHECK-NEXT:    ret ptr [[GEP2]]
+;
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 123
+  %p.addr = ptrtoaddr ptr %p to i64
+  %p.addr.inv = xor i64 %p.addr, -1
+  %gep2 = getelementptr i8, ptr %gep1, i64 %p.addr.inv
+  ret ptr %gep2
+}
diff --git a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
index 2b5960e..90d0db2 100644
--- a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
@@ -56,7 +56,7 @@ if.then46:                                        ; preds = %for.body40
   br label %for.inc50
 
 for.inc50:                                        ; preds = %if.then46, %for.body40
-  %k.1 = phi i32 [ undef, %for.body40 ], [ %inc47, %if.then46 ]
+  %k.1 = phi i32 [ 0, %for.body40 ], [ %inc47, %if.then46 ]
   %step.1 = phi i32 [ %step.0121, %for.body40 ], [ %inc47, %if.then46 ]
   %indvars.iv.next124 = add i64 %indvars.iv123, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next124 to i32
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 1c2840f..70532ad 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -1117,24 +1117,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
 ; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVE1:       scalar.ph:
@@ -1143,42 +1143,42 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
 ; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
-; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD6:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
-; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = sext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = sext <2 x i32> [[WIDE_LOAD5]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = sext <2 x i32> [[WIDE_LOAD6]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP10]] = add <2 x i64> [[VEC_PHI2]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP11]] = add <2 x i64> [[VEC_PHI3]], [[TMP7]]
-; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP15]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI1]], <4 x i64> [[TMP5]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE8]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI2]], <4 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE9]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI3]], <4 x i64> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]]
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX7:%.*]] = add <2 x i64> [[TMP10]], [[BIN_RDX]]
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX8:%.*]] = add <2 x i64> [[TMP11]], [[BIN_RDX7]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX8]])
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX10:%.*]] = add <2 x i64> [[PARTIAL_REDUCE8]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX11:%.*]] = add <2 x i64> [[PARTIAL_REDUCE9]], [[BIN_RDX10]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX11]])
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVED:       scalar.ph:
@@ -1187,24 +1187,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
 ; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
+; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
 ; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
-; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
+; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-MAXBW-NEXT:    [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
-; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]])
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
index 20b5364..ebf4a4f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
@@ -9,9 +9,7 @@ define i32 @fn1() local_unnamed_addr #0 {
 ; We expect the backend to expand all reductions.
 ; CHECK: @llvm.vector.reduce
 entry:
-  %0 = load i32, ptr @b, align 4, !tbaa !1
-  %cmp40 = icmp sgt i32 %0, 0
-  br i1 %cmp40, label %for.body.lr.ph, label %for.end
+  br label %for.body.lr.ph
 
 for.body.lr.ph:                                   ; preds = %entry
   %1 = load ptr, ptr @a, align 8, !tbaa !5
@@ -21,8 +19,8 @@ for.body.lr.ph:                                   ; preds = %entry
 
 for.body:                                         ; preds = %for.body.lr.ph, %for.body
   %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %d.043 = phi i16 [ undef, %for.body.lr.ph ], [ %.sink28, %for.body ]
-  %c.042 = phi i16 [ undef, %for.body.lr.ph ], [ %c.0., %for.body ]
+  %d.043 = phi i16 [ 0, %for.body.lr.ph ], [ %.sink28, %for.body ]
+  %c.042 = phi i16 [ 0, %for.body.lr.ph ], [ %c.0., %for.body ]
   %arrayidx = getelementptr inbounds i16, ptr %1, i64 %indvars.iv
   %4 = load i16, ptr %arrayidx, align 2, !tbaa !7
   %cmp2 = icmp sgt i16 %c.042, %4
@@ -33,10 +31,8 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
   %cmp = icmp slt i64 %indvars.iv.next, %3
   br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.body, %entry
-  %c.0.lcssa = phi i16 [ undef, %entry ], [ %c.0., %for.body ]
-  %d.0.lcssa = phi i16 [ undef, %entry ], [ %.sink28, %for.body ]
-  %cmp26 = icmp sgt i16 %c.0.lcssa, %d.0.lcssa
+for.end:                                          ; preds = %for.body
+  %cmp26 = icmp sgt i16 %c.0., %.sink28
   %conv27 = zext i1 %cmp26 to i32
   ret i32 %conv27
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
index 44820e0..33ce300 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
@@ -18,7 +18,7 @@ define void @_Z1dv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[F_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD5:%.*]], [[FOR_COND_CLEANUP:%.*]] ]
-; CHECK-NEXT:    [[G_0:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ]
+; CHECK-NEXT:    [[G_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ]
 ; CHECK-NEXT:    [[CMP12:%.*]] = icmp ult i32 [[G_0]], 4
 ; CHECK-NEXT:    [[CONV:%.*]] = and i32 [[F_0]], 65535
 ; CHECK-NEXT:    br i1 [[CMP12]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP]]
@@ -50,7 +50,7 @@ entry:
 
 for.cond:                                         ; preds = %for.cond.cleanup, %entry
   %f.0 = phi i32 [ 0, %entry ], [ %add5, %for.cond.cleanup ]
-  %g.0 = phi i32 [ undef, %entry ], [ %g.1.lcssa, %for.cond.cleanup ]
+  %g.0 = phi i32 [ 0, %entry ], [ %g.1.lcssa, %for.cond.cleanup ]
   %cmp12 = icmp ult i32 %g.0, 4
   %conv = and i32 %f.0, 65535
   br i1 %cmp12, label %for.body.lr.ph, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index de804600..786a2aa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -728,8 +728,8 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
@@ -755,8 +755,8 @@ for.cond.cleanup:                                 ; preds = %for.body
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
-  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
+  %SumB.014 = phi float [ 0.0e+00, %entry ], [ %add3, %for.body ]
+  %SumA.013 = phi i32 [ 0, %entry ], [ %add, %for.body ]
   %a = getelementptr inbounds %struct.IntFloat, ptr %p, i64 %indvars.iv, i32 0
   %load1 = load i32, ptr %a, align 4
   %add = add nsw i32 %load1, %SumA.013
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
index 44a48a9..0f398a6 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -84,15 +84,14 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; CHECK: We can vectorize this loop!
 define i32 @redi(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
 entry:
-  %cmp5 = icmp eq i32 %N, 0
-  br i1 %cmp5, label %for.end, label %for.body.preheader
+  br label %for.body.preheader
 
 for.body.preheader:                               ; preds = %entry
   br label %for.body
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
   %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.07
   %0 = load i32, ptr %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.07
@@ -107,9 +106,8 @@ for.end.loopexit:                                 ; preds = %for.body
   %add.lcssa = phi i32 [ %add, %for.body ]
   br label %for.end
 
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
-  ret i32 %Red.0.lcssa
+for.end:                                          ; preds = %for.end.loopexit
+  ret i32 %add.lcssa
 }
 
 ; Floating-point loops need fast-math to be vectorizeable
@@ -121,15 +119,14 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; DARWIN: We can vectorize this loop!
 define float @redf(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
 entry:
-  %cmp5 = icmp eq i32 %N, 0
-  br i1 %cmp5, label %for.end, label %for.body.preheader
+  br label %for.body.preheader
 
 for.body.preheader:                               ; preds = %entry
   br label %for.body
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ 0.0e+00, %for.body.preheader ]
   %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.07
   %0 = load float, ptr %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.07
@@ -144,9 +141,8 @@ for.end.loopexit:                                 ; preds = %for.body
   %add.lcssa = phi float [ %add, %for.body ]
   br label %for.end
 
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
-  ret float %Red.0.lcssa
+for.end:                                          ; preds = %for.end.loopexit
+  ret float %add.lcssa
 }
 
 ; Make sure calls that turn into builtins are also covered
@@ -252,7 +248,7 @@ for.body.preheader:                               ; preds = %entry
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
   %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.07
   %0 = load i32, ptr %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.07
@@ -268,7 +264,7 @@ for.end.loopexit:                                 ; preds = %for.body
   br label %for.end
 
 for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  %Red.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ]
   ret i32 %Red.0.lcssa
 }
 
@@ -277,15 +273,14 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; CHECK: We can vectorize this loop!
 define float @redf_fast(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
 entry:
-  %cmp5 = icmp eq i32 %N, 0
-  br i1 %cmp5, label %for.end, label %for.body.preheader
+  br label %for.body.preheader
 
 for.body.preheader:                               ; preds = %entry
   br label %for.body
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ 0.0e+00, %for.body.preheader ]
   %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.07
   %0 = load float, ptr %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.07
@@ -300,9 +295,8 @@ for.end.loopexit:                                 ; preds = %for.body
   %add.lcssa = phi float [ %add, %for.body ]
   br label %for.end
 
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
-  ret float %Red.0.lcssa
+for.end:                                          ; preds = %for.end.loopexit
+  ret float %add.lcssa
 }
 
 ; Make sure calls that turn into builtins are also covered
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
index fe3504b..23609b1 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
@@ -249,7 +249,7 @@ define dso_local i32 @predicated_test(i32 noundef %0, ptr %glob) #0 {
   br label %7
 
 7:                                                ; preds = %5, %155
-  %8 = phi i32 [ %10, %155 ], [ undef, %5 ]
+  %8 = phi i32 [ %10, %155 ], [ 0, %5 ]
   %9 = phi i32 [ %156, %155 ], [ 0, %5 ]
   %10 = shl i32 %8, 4
   store i32 %10, ptr %6, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index c5319c6..f4c7c6f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -246,6 +246,57 @@ exit:
   ret void
 }
 
+; Test for https://github.com/llvm/llvm-project/issues/162688.
+define void @test_minbws_for_trunc(i32 %n, ptr noalias %p1, ptr noalias %p2) {
+; CHECK-LABEL: define void @test_minbws_for_trunc(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_EXT:%.*]] = sext i16 [[IV]] to i64
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[P1]], i64 [[IV_EXT]]
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[V1_TRUNC:%.*]] = trunc i32 [[V1]] to i16
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr [1 x [1 x i16]], ptr [[P2]], i64 [[IV_EXT]]
+; CHECK-NEXT:    store i16 [[V1_TRUNC]], ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[V1_TRUNC_I8:%.*]] = trunc i32 [[V1]] to i8
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[P2]], i64 [[IV_EXT]]
+; CHECK-NEXT:    store i8 [[V1_TRUNC_I8]], ptr [[GEP3]], align 1
+; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr [1 x i64], ptr [[P2]], i64 [[IV_EXT]]
+; CHECK-NEXT:    store i64 0, ptr [[GEP4]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 4
+; CHECK-NEXT:    [[IV_NEXT_EXT:%.*]] = sext i16 [[IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[IV_NEXT_EXT]], 1024
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.ext = sext i16 %iv to i64
+  %gep1 = getelementptr i32, ptr %p1, i64 %iv.ext
+  %v1 = load i32, ptr %gep1, align 4
+  %v1.trunc = trunc i32 %v1 to i16
+  %gep2 = getelementptr [1 x [1 x i16]], ptr %p2, i64 %iv.ext
+  store i16 %v1.trunc, ptr %gep2, align 2
+  %v1.trunc.i8 = trunc i32 %v1 to i8
+  %gep3 = getelementptr i8, ptr %p2, i64 %iv.ext
+  store i8 %v1.trunc.i8, ptr %gep3, align 1
+  %gep4 = getelementptr [1 x i64], ptr %p2, i64 %iv.ext
+  store i64 0, ptr %gep4, align 8
+  %iv.next = add i16 %iv, 4
+  %iv.next.ext = sext i16 %iv.next to i32
+  %cmp = icmp ne i32 %iv.next.ext, 1024
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
 attributes #0 = { "target-features"="+64bit,+v,+zvl256b" }
 attributes #1 = { "target-features"="+64bit,+v" }
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll
index 2ef9d4b..3718ad2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll
@@ -14,7 +14,7 @@ define x86_fp80 @test() {
 ; CHECK-NEXT:    br label [[FOR_BODY3_I_3:%.*]]
 ; CHECK:       for.body3.i.3:
 ; CHECK-NEXT:    [[N_ADDR_112_I_3:%.*]] = phi i64 [ [[DEC_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 24, [[FOO_EXIT:%.*]] ]
-; CHECK-NEXT:    [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ undef, [[FOO_EXIT]] ]
+; CHECK-NEXT:    [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 0xK00000000000000000000, [[FOO_EXIT]] ]
 ; CHECK-NEXT:    [[MUL_I_3]] = fmul x86_fp80 [[X_ADDR_111_I_3]], 0xK40008000000000000000
 ; CHECK-NEXT:    [[DEC_I_3]] = add nsw i64 [[N_ADDR_112_I_3]], -1
 ; CHECK-NEXT:    [[CMP2_I_3:%.*]] = icmp sgt i64 [[N_ADDR_112_I_3]], 1
@@ -28,7 +28,7 @@ foo.exit:
 
 for.body3.i.3:                                    ; preds = %for.body3.i.3, %foo.exit
   %n.addr.112.i.3 = phi i64 [ %dec.i.3, %for.body3.i.3 ], [ 24, %foo.exit ]
-  %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ undef, %foo.exit ]
+  %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ zeroinitializer, %foo.exit ]
   %mul.i.3 = fmul x86_fp80 %x.addr.111.i.3, 0xK40008000000000000000
   %dec.i.3 = add nsw i64 %n.addr.112.i.3, -1
   %cmp2.i.3 = icmp sgt i64 %n.addr.112.i.3, 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll
index df1c4f9..5321d69 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll
@@ -21,10 +21,10 @@ while.cond63.preheader.while.end76_crit_edge:
   ret void
 
 while.body:
-  %d2_fx.015 = phi double [ %sub52, %while.body ], [ undef, %entry ]
-  %d2_fy.014 = phi double [ %sub58, %while.body ], [ undef, %entry ]
-  %d3_fy.013 = phi double [ %div56, %while.body ], [ undef, %entry ]
-  %d3_fx.012 = phi double [ %div50, %while.body ], [ undef, %entry ]
+  %d2_fx.015 = phi double [ %sub52, %while.body ], [ 0.0e+00, %entry ]
+  %d2_fy.014 = phi double [ %sub58, %while.body ], [ 0.0e+00, %entry ]
+  %d3_fy.013 = phi double [ %div56, %while.body ], [ 0.0e+00, %entry ]
+  %d3_fx.012 = phi double [ %div50, %while.body ], [ 0.0e+00, %entry ]
   %div50 = fmul double %d3_fx.012, 1.250000e-01
   %sub52 = fsub double 0.000000e+00, %div50
   %div56 = fmul double %d3_fy.013, 1.250000e-01
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll
index 368361f..0f55d79 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll
@@ -15,7 +15,7 @@ for.cond.cleanup:
 
 for.body:
   %i.09 = phi i16 [ 0, %entry ], [ %add3, %for.body ]
-  %res.08 = phi x86_fp80 [ undef, %entry ], [ %3, %for.body ]
+  %res.08 = phi x86_fp80 [ zeroinitializer, %entry ], [ %3, %for.body ]
   %arrayidx = getelementptr inbounds x86_fp80, ptr %a, i16 %i.09
   %0 = load x86_fp80, ptr %arrayidx, align 1
   %add = or i16 %i.09, 1
diff --git a/llvm/test/Transforms/LoopVectorize/i8-induction.ll b/llvm/test/Transforms/LoopVectorize/i8-induction.ll
index 220fd64..712c75d 100644
--- a/llvm/test/Transforms/LoopVectorize/i8-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/i8-induction.ll
@@ -20,7 +20,7 @@ scalar.ph:
 
 for.body:
   %mul16 = phi i8 [ 0, %scalar.ph ], [ %mul, %for.body ]              ; <------- i8 induction var.
-  %c.015 = phi i8 [ undef, %scalar.ph ], [ %conv8, %for.body ]
+  %c.015 = phi i8 [ 0, %scalar.ph ], [ %conv8, %for.body ]
   %conv2 = sext i8 %c.015 to i32
   %tobool = icmp ne i8 %c.015, 0
   %.sink = select i1 %tobool, i8 %c.015, i8 %0
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index f7376a0..c164c4a 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -277,7 +277,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INEWCHUNKS_2_LCSSA:%.*]] = phi i32 [ [[INEWCHUNKS_2]], [[FOR_INC23]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[FOR_INC26]]
 ; UNROLL-NOSIMPLIFY:       for.inc26:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INEWCHUNKS_1_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY9]] ], [ [[INEWCHUNKS_2_LCSSA]], [[FOR_INC26_LOOPEXIT]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[INEWCHUNKS_1_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY9]] ], [ [[INEWCHUNKS_2_LCSSA]], [[FOR_INC26_LOOPEXIT]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    unreachable
 ;
 ; VEC-LABEL: @bug18724(
@@ -376,7 +376,7 @@ for.inc23:
   br i1 %cmp13, label %for.body14, label %for.inc26
 
 for.inc26:
-  %iNewChunks.1.lcssa = phi i32 [ undef, %for.body9 ], [ %iNewChunks.2, %for.inc23 ]
+  %iNewChunks.1.lcssa = phi i32 [ 0, %for.body9 ], [ %iNewChunks.2, %for.inc23 ]
   unreachable
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
index 9e75002..5cf99b8 100644
--- a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
@@ -53,8 +53,8 @@ thread-pre-split.loopexit:                        ; preds = %11, %.thread-pre-sp
   br i1 false, label %thread-pre-split._crit_edge, label %.lr.ph21
 
 .lr.ph21:                                         ; preds = %26, %thread-pre-split.loopexit, %thread-pre-split.preheader
-  %d.020 = phi ptr [ undef, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ]
-  %10 = phi i64 [ %28, %26 ], [ undef, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ]
+  %d.020 = phi ptr [ zeroinitializer, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ zeroinitializer, %thread-pre-split.preheader ]
+  %10 = phi i64 [ %28, %26 ], [ zeroinitializer, %thread-pre-split.loopexit ], [ zeroinitializer, %thread-pre-split.preheader ]
   br i1 %arg, label %11, label %22
 
 ; <label>:11                                      ; preds = %.lr.ph21
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 16357b3..8efe29a 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -791,8 +791,8 @@ define void @int_float_struct(ptr nocapture readonly %A) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -822,8 +822,8 @@ for.cond.cleanup:                                 ; preds = %for.body
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
-  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
+  %SumB.014 = phi float [ 0.0e+00, %entry ], [ %add3, %for.body ]
+  %SumA.013 = phi i32 [ 0, %entry ], [ %add, %for.body ]
   %a = getelementptr inbounds %struct.IntFloat, ptr %A, i64 %indvars.iv, i32 0
   %tmp = load i32, ptr %a, align 4
   %add = add nsw i32 %tmp, %SumA.013
diff --git a/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll b/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll
index 741ca55..1675a59 100644
--- a/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll
+++ b/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll
@@ -48,7 +48,7 @@ for.cond.cleanup.loopexit:
   br label %for.cond.cleanup, !dbg !33
 
 for.cond.cleanup:
-  %2 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ undef, %entry ], !dbg !33
+  %2 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ 0, %entry ], !dbg !33
   %sub = add nsw i32 %0, -5, !dbg !33
   %idxprom3 = sext i32 %sub to i64, !dbg !33
   %arrayidx4 = getelementptr inbounds i32, ptr %vla, i64 %idxprom3, !dbg !33
diff --git a/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll b/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
index 659dc62..2038633 100644
--- a/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
+++ b/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
@@ -22,8 +22,8 @@ entry:
   br label %for.body
 
 for.body:
-  %inc107 = phi i32 [ undef, %entry ], [ %inc10, %for.body ]
-  %inc6 = phi i32 [ %nf.promoted, %entry ], [ undef, %for.body ]
+  %inc107 = phi i32 [ 0, %entry ], [ %inc10, %for.body ]
+  %inc6 = phi i32 [ %nf.promoted, %entry ], [ 3, %for.body ]
   %add55 = phi i32 [ %n.promoted, %entry ], [ %add5, %for.body ]
   %.neg2 = sub i32 0, %inc6
   %add.neg = add i32 0, %add55
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index ec7fde8..964a257 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -1207,11 +1207,11 @@ for.end:
 define i32 @reduction_sum_multiuse(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-LABEL: define i32 @reduction_sum_multiuse(
 ; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
-; CHECK-NEXT:  [[_LR_PH1:.*]]:
+; CHECK-NEXT:  [[_LR_PH:.*]]:
 ; CHECK-NEXT:    br label %[[DOTLR_PH:.*]]
-; CHECK:       [[_LR_PH:.*:]]
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
+; CHECK:       [[_LR_PH1:.*:]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ]
+; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ]
 ; CHECK-NEXT:    [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[L3:%.*]] = load i32, ptr [[L2]], align 4
 ; CHECK-NEXT:    [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
@@ -1231,11 +1231,11 @@ define i32 @reduction_sum_multiuse(ptr noalias nocapture %A, ptr noalias nocaptu
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_sum_multiuse(
 ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
-; CHECK-INTERLEAVED-NEXT:  [[_LR_PH1:.*]]:
+; CHECK-INTERLEAVED-NEXT:  [[_LR_PH:.*]]:
 ; CHECK-INTERLEAVED-NEXT:    br label %[[DOTLR_PH:.*]]
-; CHECK-INTERLEAVED:       [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
-; CHECK-INTERLEAVED-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
+; CHECK-INTERLEAVED:       [[_LR_PH1:.*:]]
+; CHECK-INTERLEAVED-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ]
+; CHECK-INTERLEAVED-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
 ; CHECK-INTERLEAVED-NEXT:    [[L3:%.*]] = load i32, ptr [[L2]], align 4
 ; CHECK-INTERLEAVED-NEXT:    [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
@@ -1947,7 +1947,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], zeroinitializer
@@ -1966,7 +1966,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[FOR_BODY2:.*]]
 ; CHECK:       [[FOR_BODY2]]:
 ; CHECK-NEXT:    [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -2002,7 +2002,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h
 ; CHECK-INTERLEAVED-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-INTERLEAVED:       [[VECTOR_BODY]]:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
@@ -2033,7 +2033,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]]
 ; CHECK-INTERLEAVED:       [[SCALAR_PH]]:
 ; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-INTERLEAVED-NEXT:    br label %[[FOR_BODY2:.*]]
 ; CHECK-INTERLEAVED:       [[FOR_BODY2]]:
 ; CHECK-INTERLEAVED-NEXT:    [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -2063,7 +2063,7 @@ entry:
 
 for.body2:                                        ; preds = %entry, %for.inc5
   %a.117 = phi i32 [ %inc6, %for.inc5 ], [ 0, %entry ]
-  %g.016 = phi i32 [ %g.1, %for.inc5 ], [ undef, %entry ]
+  %g.016 = phi i32 [ %g.1, %for.inc5 ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i8, ptr %h, i32 %a.117
   %0 = load i8, ptr %arrayidx, align 1
   %tobool3.not = icmp eq i8 %0, 0
@@ -2100,7 +2100,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], zeroinitializer
@@ -2122,7 +2122,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[FOR_BODY2:.*]]
 ; CHECK:       [[FOR_BODY2]]:
 ; CHECK-NEXT:    [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -2159,7 +2159,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
 ; CHECK-INTERLEAVED-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-INTERLEAVED:       [[VECTOR_BODY]]:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
@@ -2196,7 +2196,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]]
 ; CHECK-INTERLEAVED:       [[SCALAR_PH]]:
 ; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-INTERLEAVED-NEXT:    br label %[[FOR_BODY2:.*]]
 ; CHECK-INTERLEAVED:       [[FOR_BODY2]]:
 ; CHECK-INTERLEAVED-NEXT:    [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -2227,7 +2227,7 @@ entry:
 
 for.body2:                                        ; preds = %entry, %for.inc5
   %a.117 = phi i32 [ %inc6, %for.inc5 ], [ 0, %entry ]
-  %g.016 = phi i32 [ %g.1, %for.inc5 ], [ undef, %entry ]
+  %g.016 = phi i32 [ %g.1, %for.inc5 ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i8, ptr %h, i32 %a.117
   %0 = load i8, ptr %arrayidx, align 1
   %tobool3.not = icmp eq i8 %0, 0
@@ -2263,7 +2263,7 @@ define i32 @predicated_or_dominates_reduction(ptr %b) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[PRED_LOAD_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
@@ -2340,7 +2340,7 @@ define i32 @predicated_or_dominates_reduction(ptr %b) {
 ; CHECK-INTERLEAVED-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-INTERLEAVED:       [[VECTOR_BODY]]:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE15:.*]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP94:%.*]], %[[PRED_LOAD_CONTINUE15]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP94:%.*]], %[[PRED_LOAD_CONTINUE15]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP98:%.*]], %[[PRED_LOAD_CONTINUE15]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
@@ -2480,7 +2480,7 @@ for.cond.cleanup:                                 ; preds = %for.inc
 
 for.body:                                         ; preds = %entry, %for.inc
   %g.09 = phi i32 [ 0, %entry ], [ %inc3, %for.inc ]
-  %a.08 = phi i32 [ undef, %entry ], [ %a.1, %for.inc ]
+  %a.08 = phi i32 [ 0, %entry ], [ %a.1, %for.inc ]
   %d = getelementptr inbounds [0 x %struct.e], ptr %b, i32 0, i32 %g.09, i32 1
   %0 = load i32, ptr %d, align 4
   %tobool.not = icmp eq i32 %0, 0
diff --git a/llvm/test/Transforms/LoopVectorize/reverse_iter.ll b/llvm/test/Transforms/LoopVectorize/reverse_iter.ll
index 43e916b..6576675 100644
--- a/llvm/test/Transforms/LoopVectorize/reverse_iter.ll
+++ b/llvm/test/Transforms/LoopVectorize/reverse_iter.ll
@@ -17,8 +17,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;CHECK:  <i32 0, i32 -1, i32 -2, i32 -3>
 ;CHECK: ret
 define i32 @foo(i32 %n, ptr nocapture %A) {
-  %1 = icmp sgt i32 %n, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
+  br label %.lr.ph
 
 .lr.ph:                                           ; preds = %0
   %2 = sext i32 %n to i64
@@ -26,7 +25,7 @@ define i32 @foo(i32 %n, ptr nocapture %A) {
 
 ; <label>:3                                       ; preds = %.lr.ph, %3
   %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ]
-  %sum.01 = phi i32 [ undef, %.lr.ph ], [ %9, %3 ]
+  %sum.01 = phi i32 [ 0, %.lr.ph ], [ %9, %3 ]
   %4 = trunc i64 %indvars.iv to i32
   %5 = shl nsw i32 %4, 1
   %6 = sext i32 %5 to i64
@@ -38,8 +37,7 @@ define i32 @foo(i32 %n, ptr nocapture %A) {
   %11 = icmp sgt i32 %10, 0
   br i1 %11, label %3, label %._crit_edge
 
-._crit_edge:                                      ; preds = %3, %0
-  %sum.0.lcssa = phi i32 [ undef, %0 ], [ %9, %3 ]
-  ret i32 %sum.0.lcssa
+._crit_edge:                                      ; preds = %3
+  ret i32 %9
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index 60da336..1216bc1 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -178,13 +178,14 @@ for.exit:
 define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF1-LABEL: define i32 @recurrence_2(
 ; CHECK-VF4UF1-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) {
-; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
-; CHECK-VF4UF1-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-VF4UF1-NEXT:    br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*:]]
+; CHECK-VF4UF1-NEXT:    br label %[[FOR_PREHEADER:.*]]
 ; CHECK-VF4UF1:       [[FOR_PREHEADER]]:
 ; CHECK-VF4UF1-NEXT:    [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1
 ; CHECK-VF4UF1-NEXT:    [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
-; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1
 ; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
 ; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
@@ -202,7 +203,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF1:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
 ; CHECK-VF4UF1-NEXT:    [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
 ; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
@@ -225,25 +226,25 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF1:       [[SCALAR_PH]]:
 ; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ]
 ; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
-; CHECK-VF4UF1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
 ; CHECK-VF4UF1-NEXT:    br label %[[SCALAR_BODY:.*]]
 ; CHECK-VF4UF1:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
-; CHECK-VF4UF1-NEXT:    [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF1-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK-VF4UF1-NEXT:    [[MINMAX_0_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
 ; CHECK-VF4UF1:       [[FOR_COND_CLEANUP]]:
-; CHECK-VF4UF1-NEXT:    [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; CHECK-VF4UF1-NEXT:    ret i32 [[MINMAX_0_LCSSA]]
 ; CHECK-VF4UF1:       [[SCALAR_BODY]]:
 ;
 ; CHECK-VF4UF2-LABEL: define i32 @recurrence_2(
 ; CHECK-VF4UF2-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) {
-; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
-; CHECK-VF4UF2-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-VF4UF2-NEXT:    br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*:]]
+; CHECK-VF4UF2-NEXT:    br label %[[FOR_PREHEADER:.*]]
 ; CHECK-VF4UF2:       [[FOR_PREHEADER]]:
 ; CHECK-VF4UF2-NEXT:    [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1
 ; CHECK-VF4UF2-NEXT:    [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
-; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1
 ; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3
 ; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
@@ -261,8 +262,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF2:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD2:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
 ; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 2
@@ -296,19 +297,17 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF2:       [[SCALAR_PH]]:
 ; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ]
 ; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
-; CHECK-VF4UF2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
 ; CHECK-VF4UF2-NEXT:    br label %[[SCALAR_BODY:.*]]
 ; CHECK-VF4UF2:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
-; CHECK-VF4UF2-NEXT:    [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF2-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK-VF4UF2-NEXT:    [[MINMAX_0_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
 ; CHECK-VF4UF2:       [[FOR_COND_CLEANUP]]:
-; CHECK-VF4UF2-NEXT:    [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; CHECK-VF4UF2-NEXT:    ret i32 [[MINMAX_0_LCSSA]]
 ; CHECK-VF4UF2:       [[SCALAR_BODY]]:
 ;
 entry:
-  %cmp27 = icmp sgt i32 %n, 0
-  br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
+  br label %for.preheader
 
 for.preheader:
   %arrayidx2.phi.trans.insert = getelementptr inbounds i32, ptr %a, i64 -1
@@ -320,13 +319,12 @@ for.cond.cleanup.loopexit:
   br label %for.cond.cleanup
 
 for.cond.cleanup:
-  %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
-  ret i32 %minmax.0.lcssa
+  ret i32 %minmax.0.cond.lcssa
 
 scalar.body:
   %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ]
   %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
-  %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
+  %minmax.028 = phi i32 [ 0, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
   %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
   %1 = load i32, ptr %arrayidx, align 4
   %sub3 = sub nsw i32 %1, %0
diff --git a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
index 58ad64d..8224d6b 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
@@ -16,7 +16,7 @@ entry:
   br label %for.cond
 
 for.cond:                                         ; preds = %for.cond, %entry
-  %i.0 = phi i32 [ undef, %entry ], [ %inc, %for.cond ]
+  %i.0 = phi i32 [ poison, %entry ], [ %inc, %for.cond ]
   %cmp = icmp slt i32 %i.0, 0
   %fsub = fsub double undef, undef
   %fadd = fadd double %fsub, 1.000000e+00
@@ -36,7 +36,7 @@ for.cond7.preheader.lr.ph:                        ; preds = %for.cond4.preheader
 for.cond7.preheader:                              ; preds = %for.cond7.preheader.lr.ph, %for.inc23
   %y.017 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %inc24, %for.inc23 ]
   %i.116 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %i.2.lcssa, %for.inc23 ]
-  %n.015 = phi i32 [ undef, %for.cond7.preheader.lr.ph ], [ %inc25, %for.inc23 ]
+  %n.015 = phi i32 [ poison, %for.cond7.preheader.lr.ph ], [ %inc25, %for.inc23 ]
   %1 = load i32, ptr @b, align 4, !tbaa !5
   %tobool11 = icmp eq i32 %1, 0
   br i1 %tobool11, label %for.inc23, label %for.body8.lr.ph
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
index f4d5a84..cd9eb72 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
@@ -23,7 +23,7 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = freeze i1 [[TMP4]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 undef
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 poison
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i64 [[RDX_SELECT]]
@@ -33,7 +33,7 @@ entry:
 
 loop:
   %iv = phi i32 [ 1, %entry ], [ %add, %loop ]
-  %red = phi i64 [ undef, %entry ], [ %select, %loop ]
+  %red = phi i64 [ poison, %entry ], [ %select, %loop ]
   %gep = getelementptr inbounds i32, ptr %src, i32 %iv
   %l = load i32, ptr %gep
   %c = icmp eq i32 %l, 1
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 358f1b0..71311db 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -102,7 +102,7 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[PREDPHI1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> undef
+; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> poison
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 2
@@ -139,11 +139,11 @@ loop.next.2:
   br label %loop.next.3
 
 loop.next.3:
-  %blend.1 = phi i64 [ undef, %loop.next ], [ %iv, %loop.next.2 ]
+  %blend.1 = phi i64 [ poison, %loop.next ], [ %iv, %loop.next.2 ]
   br label %loop.latch
 
 loop.latch:                                       ; preds = %loop.next, %loop.header
-  %blend = phi i64 [ undef, %loop.header ], [ %blend.1, %loop.next.3 ]
+  %blend = phi i64 [ poison, %loop.header ], [ %blend.1, %loop.next.3 ]
   %dst.ptr = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 %blend
   store i16 0, ptr %dst.ptr
   %iv.next = add nuw nsw i64 %iv, 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
index 3ee9cf8..ffdff21 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
@@ -124,18 +124,14 @@ define void @test_known_trip_count() {
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd <2 x double> [[WIDE_LOAD3_13]], [[WIDE_LOAD5_13]]
 ; CHECK-NEXT:    store <2 x double> [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 416), align 16
 ; CHECK-NEXT:    store <2 x double> [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 432), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @b, i64 448), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @b, i64 464), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 448), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 464), align 16
-; CHECK-NEXT:    [[TMP28:%.*]] = fadd <2 x double> [[WIDE_LOAD_14]], [[WIDE_LOAD4_14]]
-; CHECK-NEXT:    [[TMP29:%.*]] = fadd <2 x double> [[WIDE_LOAD3_14]], [[WIDE_LOAD5_14]]
-; CHECK-NEXT:    store <2 x double> [[TMP28]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 448), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP29]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 464), align 16
-; CHECK-NEXT:    [[TMP30:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @b, i64 480), align 16
-; CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @c, i64 480), align 16
+; CHECK-NEXT:    [[TMP30:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @b, i64 448), align 16
+; CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @c, i64 448), align 16
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP30]], [[TMP31]]
-; CHECK-NEXT:    store double [[ADD]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 480), align 16
+; CHECK-NEXT:    store double [[ADD]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 448), align 16
+; CHECK-NEXT:    [[TMP32:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @b, i64 456), align 8
+; CHECK-NEXT:    [[TMP33:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @c, i64 456), align 8
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd double [[TMP32]], [[TMP33]]
+; CHECK-NEXT:    store double [[ADD_1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 456), align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -143,7 +139,7 @@ entry:
 
 for.cond:
   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp slt i32 %i.0, 61
+  %cmp = icmp slt i32 %i.0, 58
   br i1 %cmp, label %for.body, label %exit
 
 for.body:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll
new file mode 100644
index 0000000..d16843c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define ptr @test(ptr %d) {
+; CHECK-LABEL: define ptr @test(
+; CHECK-SAME: ptr [[D:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[CMP4_2:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 0, 0
+; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 1, 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x i64> poison, i64 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x i64> [[TMP5]], i64 [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <6 x i64> [[TMP6]], i64 [[TMP4]], i32 4
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <6 x i64> [[TMP7]], <6 x i64> poison, <6 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 1, i64 1, i64 1, i64 0>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <6 x i64> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <6 x i64> [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <6 x i64> [[TMP9]], i32 2
+; CHECK-NEXT:    [[SCEVGEP42:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <6 x i64> [[TMP9]], i32 3
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <6 x i64> [[TMP9]], i32 4
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <6 x i64> [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP19]]
+; CHECK-NEXT:    ret ptr [[TMP20]]
+;
+entry:
+  %0 = load i8, ptr null, align 1
+  %cmp4.2 = icmp eq i8 %0, 0
+  %1 = select i1 %cmp4.2, i64 0, i64 0
+  %2 = shl i64 %1, 1
+  %3 = getelementptr i8, ptr %d, i64 %2
+  %4 = xor i64 0, 0
+  %5 = udiv i64 %4, 0
+  %6 = mul i64 %5, 6
+  %7 = getelementptr i8, ptr %d, i64 %6
+  %8 = shl i64 %1, 0
+  %scevgep42 = getelementptr i8, ptr %d, i64 %8
+  %9 = mul i64 %5, 1
+  %10 = getelementptr i8, ptr %d, i64 %9
+  %11 = udiv i64 1, 0
+  %12 = mul i64 %11, 1
+  %13 = getelementptr i8, ptr %d, i64 %12
+  %14 = mul i64 %11, 0
+  %15 = getelementptr i8, ptr %d, i64 %14
+  ret ptr %15
+}
diff --git a/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll b/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll
index c966bed..c7500cf 100644
--- a/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll
+++ b/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll
@@ -1,147 +1,234 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt < %s -S -passes=speculative-execution \
 ; RUN:   -spec-exec-max-speculation-cost 4 -spec-exec-max-not-hoisted 3 \
 ; RUN:   | FileCheck %s
 
-; CHECK-LABEL: @ifThen_bitcast(
-; CHECK: bitcast
-; CHECK: br i1 true
-define void @ifThen_bitcast() {
+define void @ifThen_bitcast(i32 %arg) {
+; CHECK-LABEL: define void @ifThen_bitcast(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = bitcast i32 [[ARG]] to float
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 
 a:
-  %x = bitcast i32 undef to float
+  %x = bitcast i32 %arg to float
   br label %b
 
 b:
   ret void
 }
 
-; CHECK-LABEL: @ifThen_ptrtoint(
-; CHECK: ptrtoint
-; CHECK: br i1 true
-define void @ifThen_ptrtoint() {
+define void @ifThen_ptrtoint(ptr %arg) {
+; CHECK-LABEL: define void @ifThen_ptrtoint(
+; CHECK-SAME: ptr [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = ptrtoint ptr [[ARG]] to i64
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 
 a:
-  %x = ptrtoint ptr undef to i64
+  %x = ptrtoint ptr %arg to i64
   br label %b
 
 b:
   ret void
 }
 
-; CHECK-LABEL: @ifThen_inttoptr(
-; CHECK: inttoptr
-; CHECK: br i1 true
-define void @ifThen_inttoptr() {
+define void @ifThen_ptrtoaddr(ptr %arg) {
+; CHECK-LABEL: define void @ifThen_ptrtoaddr(
+; CHECK-SAME: ptr [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = ptrtoaddr ptr [[ARG]] to i64
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 
 a:
-  %x = inttoptr i64 undef to ptr
+  %x = ptrtoaddr ptr %arg to i64
   br label %b
 
 b:
   ret void
 }
 
-; CHECK-LABEL: @ifThen_addrspacecast(
-; CHECK: addrspacecast
-; CHECK: br i1 true
-define void @ifThen_addrspacecast() {
+define void @ifThen_inttoptr(i64 %arg) {
+; CHECK-LABEL: define void @ifThen_inttoptr(
+; CHECK-SAME: i64 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = inttoptr i64 [[ARG]] to ptr
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
+  br i1 true, label %a, label %b
+
+a:
+  %x = inttoptr i64 %arg to ptr
+  br label %b
+
+b:
+  ret void
+}
+
+define void @ifThen_addrspacecast(ptr %arg) {
+; CHECK-LABEL: define void @ifThen_addrspacecast(
+; CHECK-SAME: ptr [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(1)
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 a:
-  %x = addrspacecast ptr undef to ptr addrspace(1)
+  %x = addrspacecast ptr %arg to ptr addrspace(1)
   br label %b
 
 b:
   ret void
 }
 
-; CHECK-LABEL: @ifThen_fptoui(
-; CHECK: fptoui
-; CHECK: br i1 true
-define void @ifThen_fptoui() {
+define void @ifThen_fptoui(float %arg) {
+; CHECK-LABEL: define void @ifThen_fptoui(
+; CHECK-SAME: float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = fptoui float [[ARG]] to i32
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 a:
-  %x = fptoui float undef to i32
+  %x = fptoui float %arg to i32
   br label %b
 
 b:
   ret void
 }
 
-; CHECK-LABEL: @ifThen_fptosi(
-; CHECK: fptosi
-; CHECK: br i1 true
-define void @ifThen_fptosi() {
+define void @ifThen_fptosi(float %arg) {
+; CHECK-LABEL: define void @ifThen_fptosi(
+; CHECK-SAME: float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = fptosi float [[ARG]] to i32
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 a:
-  %x = fptosi float undef to i32
+  %x = fptosi float %arg to i32
   br label %b
 
 b:
   ret void
 }
 
-; CHECK-LABEL: @ifThen_uitofp(
-; CHECK: uitofp
-; CHECK: br i1 true
-define void @ifThen_uitofp() {
+define void @ifThen_uitofp(i32 %arg) {
+; CHECK-LABEL: define void @ifThen_uitofp(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = uitofp i32 [[ARG]] to float
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 a:
-  %x = uitofp i32 undef to float
+  %x = uitofp i32 %arg to float
   br label %b
 
 b:
   ret void
 }
 
-; CHECK-LABEL: @ifThen_sitofp(
-; CHECK: sitofp
-; CHECK: br i1 true
-define void @ifThen_sitofp() {
+define void @ifThen_sitofp(i32 %arg) {
+; CHECK-LABEL: define void @ifThen_sitofp(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = sitofp i32 [[ARG]] to float
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 a:
-  %x = sitofp i32 undef to float
+  %x = sitofp i32 %arg to float
   br label %b
 
 b:
   ret void
 }
 
-; CHECK-LABEL: @ifThen_fpext(
-; CHECK: fpext
-; CHECK: br i1 true
-define void @ifThen_fpext() {
+define void @ifThen_fpext(float %arg) {
+; CHECK-LABEL: define void @ifThen_fpext(
+; CHECK-SAME: float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = fpext float [[ARG]] to double
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 a:
-  %x = fpext float undef to double
+  %x = fpext float %arg to double
   br label %b
 
 b:
   ret void
 }
 
-; CHECK-LABEL: @ifThen_fptrunc(
-; CHECK: fptrunc
-; CHECK: br i1 true
-define void @ifThen_fptrunc() {
+define void @ifThen_fptrunc(double %arg) {
+; CHECK-LABEL: define void @ifThen_fptrunc(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = fptrunc double [[ARG]] to float
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 a:
-  %x = fptrunc double undef to float
+  %x = fptrunc double %arg to float
   br label %b
 
 b:
   ret void
 }
 
-; CHECK-LABEL: @ifThen_trunc(
-; CHECK: trunc
-; CHECK: br i1 true
-define void @ifThen_trunc() {
+define void @ifThen_trunc(i32 %arg) {
+; CHECK-LABEL: define void @ifThen_trunc(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = trunc i32 [[ARG]] to i16
+; CHECK-NEXT:    br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br label %[[B]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %a, label %b
 a:
-  %x = trunc i32 undef to i16
+  %x = trunc i32 %arg to i16
   br label %b
 
 b:
diff --git a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml
index 42ef7f3..5f19a29 100644
--- a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml
+++ b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml
@@ -4,6 +4,7 @@
 
 # RUN: split-file %s %t
 # RUN: yaml2obj %t/merged_callsites.dSYM.yaml -o %t/merged_callsites.dSYM
+# The object file is manually tampered with such that the LLVM_stmt_seq of function_bad_stmt_seq is 0xFFFFFFFF.
 
 # RUN: llvm-gsymutil --num-threads=1 --convert=%t/merged_callsites.dSYM --merged-functions --callsites-yaml-file=%t/callsites.yaml -o %t/call_sites_dSYM.gsym
 # RUN: llvm-gsymutil --num-threads=1 --convert=%t/merged_callsites.dSYM --merged-functions --dwarf-callsites -o %t/dwarf_call_sites_dSYM.gsym
@@ -36,7 +37,13 @@
 # CHECK-MERGED-CALLSITES:      CallSites (by relative return offset):
 # CHECK-MERGED-CALLSITES-NEXT:   0x[[#%.4x,]] Flags[None] MatchRegex[function2_copy1]
 
-# CHECK-MERGED-CALLSITES:      FunctionInfo @ 0x[[#%x,MAIN:]]: [0x[[#%x,MAIN_START:]] - 0x[[#%x,MAIN_END:]]) "main"
+# If we don't do anything, a function with bad LLVM_stmt_seq won't have any call site filter
+# More importantly, the line number will be at the function definition.
+# CHECK-MERGED-CALLSITES:      FunctionInfo @ 0x[[#%x,BAD_STMT_SEQ:]]: [0x[[#%x,BAD_STMT_SEQ_START:]] - 0x[[#%x,BAD_STMT_SEQ_END:]]) "function_bad_stmt_seq"
+# CHECK-MERGED-CALLSITES-NEXT: LineTable:
+# CHECK-MERGED-CALLSITES-NEXT:   0x00000001000003b0 ./merged_funcs_test.cpp:65
+
+# CHECK-MERGED-CALLSITES-NEXT: FunctionInfo @ 0x[[#%x,MAIN:]]: [0x[[#%x,MAIN_START:]] - 0x[[#%x,MAIN_END:]]) "main"
 # CHECK-MERGED-CALLSITES:      CallSites (by relative return offset):
 # CHECK-MERGED-CALLSITES-NEXT:   0x[[#%.4x,]] Flags[None] MatchRegex[function1]
 # CHECK-MERGED-CALLSITES-NEXT:   0x[[#%.4x,]] Flags[None] MatchRegex[function2_copy2]
@@ -44,16 +51,17 @@
 # CHECK-MERGED-CALLSITES-NEXT:   0x[[#%.4x,]] Flags[None] MatchRegex[function3_copy2]
 # CHECK-MERGED-CALLSITES-NEXT:   0x[[#%.4x,]] Flags[None] MatchRegex[function2_copy1]
 
-
 ### Check that we can correctly resove merged functions using callstacks:
 ### Resolve two callstacks containing merged functions.
 ### We use the value obtained from `CallSites:[FILTER]` to pass to the next call to `llvm-gsymutil` via `--merged-functions-filter`.
 ### The callstacks resolve differently based on the merged functions filter.
-###     0x00000001000003d0  =>  0x000000010000037c  =>  0x000000010000035c  =>  0x0000000100000340
-###     0x00000001000003e8  =========================>  0x000000010000035c  =>  0x0000000100000340
+###     0x00000001000003d8  =>  0x000000010000037c  =>  0x000000010000035c  =>  0x0000000100000340
+###     0x00000001000003f0  =========================>  0x000000010000035c  =>  0x0000000100000340
+###
+### We added a new function, for main function, +8 for line numbers, +0x8 for addresses.
 
-# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x00000001000003d0 | FileCheck --check-prefix=CHECK-C1 %s
-# CHECK-C1:       0x00000001000003d0: main + 32 @ ./merged_funcs_test.cpp:63
+# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x00000001000003d8 | FileCheck --check-prefix=CHECK-C1 %s
+# CHECK-C1:       0x00000001000003d8: main + 32 @ ./merged_funcs_test.cpp:71
 # CHECK-C1-NEXT:      CallSites: function2_copy2
 
 # RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x000000010000037c --merged-functions-filter="function2_copy2" | FileCheck --check-prefix=CHECK-C2 %s
@@ -73,9 +81,9 @@
 ### ----------------------------------------------------------------------------------------------------------------------------------
 ### Resolve the 2nd call stack - the 2nd and 3rd addresses are the same but they resolve to a different function because of the filter
 
-# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --address=0x00000001000003e8 --merged-functions | FileCheck --check-prefix=CHECK-C5 %s
-# CHECK-C5:       Found 1 function at address 0x00000001000003e8:
-# CHECK-C5-NEXT:     0x00000001000003e8: main + 56 @ ./merged_funcs_test.cpp:64
+# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --address=0x00000001000003f0 --merged-functions | FileCheck --check-prefix=CHECK-C5 %s
+# CHECK-C5:       Found 1 function at address 0x00000001000003f0:
+# CHECK-C5-NEXT:     0x00000001000003f0: main + 56 @ ./merged_funcs_test.cpp:72
 # CHECK-C5-NEXT:        CallSites: function3_copy2
 
 # RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x000000010000035c --merged-functions-filter="function3_copy2" | FileCheck --check-prefix=CHECK-C6 %s
@@ -87,6 +95,9 @@
 # CHECK-C7:       Found 1 function at address 0x0000000100000340:
 # CHECK-C7-NEXT:     0x0000000100000340: function4_copy2 + 8 @ ./merged_funcs_test.cpp:14
 
+# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --address=0x00000001000003b4 --merged-functions | FileCheck --check-prefix=CHECK-BAD %s
+# CHECK-BAD:       Found 1 function at address 0x00000001000003b4:
+# CHECK-BAD-NEXT:     0x00000001000003b4: function_bad_stmt_seq + 4 @ ./merged_funcs_test.cpp:65
 
 #--- merged_funcs_test.cpp
 #define ATTRIB extern "C" __attribute__((noinline))
@@ -148,6 +159,14 @@ ATTRIB int function1(int a) {
     return result;
 }
 
+// Intentional multi-line function definition
+ATTRIB
+int function_bad_stmt_seq(
+    int a
+) {
+    return a + 1;
+}
+
 int main() {
     int sum = 0;
     sum += function1(1);
@@ -155,6 +174,7 @@ int main() {
     sum += function4_copy2(4);
     sum += function3_copy2(41);
     sum += function2_copy1(11);
+    sum += function_bad_stmt_seq(sum);
     return sum;
 }
 
@@ -229,7 +249,7 @@ FileHeader:
 LoadCommands:
   - cmd:             LC_UUID
     cmdsize:         24
-    uuid:            4C4C441A-5555-3144-A124-E395C0E7AA96
+    uuid:            4C4C44C9-5555-3144-A16C-82A90B2087B3
   - cmd:             LC_BUILD_VERSION
     cmdsize:         24
     platform:        1
@@ -239,9 +259,9 @@ LoadCommands:
   - cmd:             LC_SYMTAB
     cmdsize:         24
     symoff:          4096
-    nsyms:           10
-    stroff:          4256
-    strsize:         156
+    nsyms:           11
+    stroff:          4272
+    strsize:         179
   - cmd:             LC_SEGMENT_64
     cmdsize:         72
     segname:         __PAGEZERO
@@ -268,7 +288,7 @@ LoadCommands:
       - sectname:        __text
         segname:         __TEXT
         addr:            0x100000338
-        size:            208
+        size:            228
         offset:          0x0
         align:           2
         reloff:          0x0
@@ -277,7 +297,7 @@ LoadCommands:
         reserved1:       0x0
         reserved2:       0x0
         reserved3:       0x0
-        content:         CFFAEDFE0C000001000000000A00000008000000C005000000000000000000001B000000180000004C4C441A55553144A124E395C0E7AA9632000000180000000100000000000B0000000B00000000000200000018000000001000000A000000A01000009C00000019000000480000005F5F504147455A45524F00000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000019000000980000005F5F54455854000000000000000000000000000001000000
+        content:         CFFAEDFE0C000001000000000A00000008000000C005000000000000000000001B000000180000004C4C44C955553144A16C82A90B2087B332000000180000000100000000000B0000000B00000000000200000018000000001000000B000000B0100000B300000019000000480000005F5F504147455A45524F00000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000019000000980000005F5F544558540000000000000000000000000000010000000040000000000000000000000000000000000000
   - cmd:             LC_SEGMENT_64
     cmdsize:         152
     segname:         __DATA
@@ -308,7 +328,7 @@ LoadCommands:
     vmaddr:          4295000064
     vmsize:          4096
     fileoff:         4096
-    filesize:        316
+    filesize:        355
     maxprot:         1
     initprot:        1
     nsects:          0
@@ -319,7 +339,7 @@ LoadCommands:
     vmaddr:          4295004160
     vmsize:          4096
     fileoff:         8192
-    filesize:        3507
+    filesize:        3884
     maxprot:         7
     initprot:        3
     nsects:          11
@@ -328,7 +348,7 @@ LoadCommands:
       - sectname:        __debug_line
         segname:         __DWARF
         addr:            0x100009000
-        size:            323
+        size:            357
         offset:          0x2000
         align:           0
         reloff:          0x0
@@ -339,9 +359,9 @@ LoadCommands:
         reserved3:       0x0
       - sectname:        __debug_aranges
         segname:         __DWARF
-        addr:            0x100009143
+        addr:            0x100009165
         size:            48
-        offset:          0x2143
+        offset:          0x2165
         align:           0
         reloff:          0x0
         nreloc:          0
@@ -351,9 +371,9 @@ LoadCommands:
         reserved3:       0x0
       - sectname:        __debug_loc
         segname:         __DWARF
-        addr:            0x100009173
-        size:            1026
-        offset:          0x2173
+        addr:            0x100009195
+        size:            1083
+        offset:          0x2195
         align:           0
         reloff:          0x0
         nreloc:          0
@@ -361,12 +381,12 @@ LoadCommands:
         reserved1:       0x0
         reserved2:       0x0
         reserved3:       0x0
-        content:         00000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000000000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000058000000000000006400000000000000010050640000000000000078000000000000000400A301509F00000000000000000000000000000000680000000000000078000000000000000100500000000000000000000000000000000084000000000000009000000000000000030011009F90000000000000009C000000000000000100639C00000000000000A800000000000000010064B800000000000000C00000000000000001006300000000000000000000000000000000
+        content:         00000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000000000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000058000000000000006400000000000000010050640000000000000078000000000000000400A301509F00000000000000000000000000000000680000000000000078000000000000000100500000000000000000000000000000000078000000000000007C000000000000000100507C0000000000000080000000000000000400A301509F000000000000000000000000000000008C000000000000009800000000000000030011009F9800000000000000A400000000000000010063A400000000000000B000000000000000010064C000000000000000D40000000000000001006300000000000000000000000000000000
       - sectname:        __debug_info
         segname:         __DWARF
-        addr:            0x100009575
-        size:            923
-        offset:          0x2575
+        addr:            0x1000095D0
+        size:            988
+        offset:          0x25D0
         align:           0
         reloff:          0x0
         nreloc:          0
@@ -376,9 +396,9 @@ LoadCommands:
         reserved3:       0x0
       - sectname:        __debug_frame
         segname:         __DWARF
-        addr:            0x100009910
-        size:            272
-        offset:          0x2910
+        addr:            0x1000099AC
+        size:            296
+        offset:          0x29AC
         align:           0
         reloff:          0x0
         nreloc:          0
@@ -386,12 +406,12 @@ LoadCommands:
         reserved1:       0x0
         reserved2:       0x0
         reserved3:       0x0
-        content:         14000000FFFFFFFF0400080001781E0C1F000000000000001400000000000000380300000100000014000000000000001400000000000000380300000100000014000000000000001C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C0000000000000090030000010000002000000000000000480C1D109E019D022400000000000000B00300000100000058000000000000004C0C1D109E019D029303940400000000
+        content:         14000000FFFFFFFF0400080001781E0C1F000000000000001400000000000000380300000100000014000000000000001400000000000000380300000100000014000000000000001C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C0000000000000090030000010000002000000000000000480C1D109E019D021400000000000000B00300000100000008000000000000002400000000000000B80300000100000064000000000000004C0C1D109E019D029303940400000000
       - sectname:        __debug_abbrev
         segname:         __DWARF
-        addr:            0x100009A20
+        addr:            0x100009AD4
         size:            260
-        offset:          0x2A20
+        offset:          0x2AD4
         align:           0
         reloff:          0x0
         nreloc:          0
@@ -401,9 +421,9 @@ LoadCommands:
         reserved3:       0x0
       - sectname:        __debug_str
         segname:         __DWARF
-        addr:            0x100009B24
-        size:            188
-        offset:          0x2B24
+        addr:            0x100009BD8
+        size:            357
+        offset:          0x2BD8
         align:           0
         reloff:          0x0
         nreloc:          0
@@ -413,9 +433,9 @@ LoadCommands:
         reserved3:       0x0
       - sectname:        __apple_namespac
         segname:         __DWARF
-        addr:            0x100009BE0
+        addr:            0x100009D3D
         size:            36
-        offset:          0x2BE0
+        offset:          0x2D3D
         align:           0
         reloff:          0x0
         nreloc:          0
@@ -426,9 +446,9 @@ LoadCommands:
         content:         485341480100000001000000000000000C000000000000000100000001000600FFFFFFFF
       - sectname:        __apple_names
         segname:         __DWARF
-        addr:            0x100009C04
-        size:            316
-        offset:          0x2C04
+        addr:            0x100009D61
+        size:            344
+        offset:          0x2D61
         align:           0
         reloff:          0x0
         nreloc:          0
@@ -436,12 +456,12 @@ LoadCommands:
         reserved1:       0x0
         reserved2:       0x0
         reserved3:       0x0
-        content:         48534148010000000A0000000A0000000C0000000000000001000000010006000000000002000000030000000400000006000000FFFFFFFF0800000009000000FFFFFFFFFFFFFFFF88CB36CFF4B03BD389CB36CF0A452B694908311C0B452B694A08311CDC41AB586A7F9A7CAD7ED75898000000A8000000B8000000C8000000D8000000E8000000F80000000801000018010000280100008900000001000000BB010000000000001B000000010000002E0000000000000099000000010000003A020000000000002D000000010000004F000000000000005800000001000000E50000000000000048000000010000009A0000000000000068000000010000003901000000000000A900000001000000B902000000000000B3000000010000000D030000000000007800000002000000050200008402000000000000
+        content:         48534148010000000B0000000B0000000C0000000000000001000000010006000000000002000000FFFFFFFFFFFFFFFF03000000FFFFFFFFFFFFFFFF04000000080000000A000000FFFFFFFFAD7ED75888CB36CF89CB36CFF4B03BD34908311CDC41AB580A452B696A7F9A7C4A08311C0B452B69404C8F68A4000000B8000000C8000000D8000000E8000000F800000008010000180100002801000038010000480100000B010000020000000502000084020000000000001C01000001000000BB010000000000002C010000010000003A02000000000000AE000000010000002E00000000000000EB00000001000000E5000000000000003C01000001000000B902000000000000C0000000010000004F000000000000005C010000010000003A03000000000000FB000000010000003901000000000000DB000000010000009A0000000000000046010000010000000D03000000000000
       - sectname:        __apple_types
         segname:         __DWARF
-        addr:            0x100009D40
+        addr:            0x100009EB9
         size:            79
-        offset:          0x2D40
+        offset:          0x2EB9
         align:           0
         reloff:          0x0
         nreloc:          0
@@ -449,12 +469,12 @@ LoadCommands:
         reserved1:       0x0
         reserved2:       0x0
         reserved3:       0x0
-        content:         48534148010000000100000001000000180000000000000004000000010006000300050005000B0006000600000000003080880B38000000290000000100000048000000240000A4283A0C00000000
+        content:         48534148010000000100000001000000180000000000000004000000010006000300050005000B0006000600000000003080880B38000000BC0000000100000048000000240000A4283A0C00000000
       - sectname:        __apple_objc
         segname:         __DWARF
-        addr:            0x100009D8F
+        addr:            0x100009F08
         size:            36
-        offset:          0x2D8F
+        offset:          0x2F08
         align:           0
         reloff:          0x0
         nreloc:          0
@@ -469,7 +489,7 @@ LinkEditData:
       n_type:          0xF
       n_sect:          1
       n_desc:          0
-      n_value:         4294968240
+      n_value:         4294968248
     - n_strx:          8
       n_type:          0xF
       n_sect:          1
@@ -507,10 +527,15 @@ LinkEditData:
       n_value:         4294968208
     - n_strx:          121
       n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294968240
+    - n_strx:          144
+      n_type:          0xF
       n_sect:          2
       n_desc:          0
       n_value:         4294983680
-    - n_strx:          136
+    - n_strx:          159
       n_type:          0xF
       n_sect:          1
       n_desc:          16
@@ -526,11 +551,13 @@ LinkEditData:
     - _function2_copy1
     - _function2_copy2
     - _function1
+    - _function_bad_stmt_seq
     - _global_result
     - __mh_execute_header
 DWARF:
   debug_str:
     - ''
+    - 'Facebook clang version 15.82.1 (https://git.internal.tfbnw.net/repos/git/ro/osmeta/external/llvm-project 3c2ff7d3c6ef63fb9a83574cee1b376c969bd5ec)'
     - merged_funcs_test.cpp
     - '/'
     - .
@@ -547,6 +574,7 @@ DWARF:
     - function2_copy1
     - function2_copy2
     - function1
+    - function_bad_stmt_seq
     - main
     - sum
   debug_abbrev:
@@ -793,9 +821,9 @@ DWARF:
       AddressSize:     0x8
       Descriptors:
         - Address:         0x100000338
-          Length:          0xD0
+          Length:          0xE4
   debug_info:
-    - Length:          0x397
+    - Length:          0x3D8
       Version:         4
       AbbrevTableID:   0
       AbbrOffset:      0x0
@@ -803,31 +831,31 @@ DWARF:
       Entries:
         - AbbrCode:        0x1
           Values:
-            - Value:           0x0
-            - Value:           0x21
             - Value:           0x1
-            - Value:           0x17
+            - Value:           0x21
+            - Value:           0x94
+            - Value:           0xAA
             - Value:           0x0
-            - Value:           0x19
+            - Value:           0xAC
             - Value:           0x1
             - Value:           0x100000338
-            - Value:           0xD0
+            - Value:           0xE4
         - AbbrCode:        0x2
           Values:
-            - Value:           0x1B
+            - Value:           0xAE
             - Value:           0x43
             - Value:           0x1
             - Value:           0x1
             - Value:           0x2
             - Value:           0x9
-              BlockData:       [ 0x3, 0x0, 0x40, 0x0, 0x0, 0x1, 0x0, 0x0, 
+              BlockData:       [ 0x3, 0x0, 0x40, 0x0, 0x0, 0x1, 0x0, 0x0,
                                  0x0 ]
         - AbbrCode:        0x3
           Values:
             - Value:           0x48
         - AbbrCode:        0x4
           Values:
-            - Value:           0x29
+            - Value:           0xBC
             - Value:           0x5
             - Value:           0x4
         - AbbrCode:        0x5
@@ -839,7 +867,7 @@ DWARF:
             - Value:           0x1
               BlockData:       [ 0x6F ]
             - Value:           0x1
-            - Value:           0x2D
+            - Value:           0xC0
             - Value:           0x1
             - Value:           0x4
             - Value:           0x48
@@ -848,21 +876,21 @@ DWARF:
         - AbbrCode:        0x6
           Values:
             - Value:           0x0
-            - Value:           0x3D
+            - Value:           0xD0
             - Value:           0x1
             - Value:           0x4
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0x39
-            - Value:           0x3F
+            - Value:           0xD2
             - Value:           0x1
             - Value:           0x5
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0x5C
-            - Value:           0x41
+            - Value:           0xD4
             - Value:           0x1
             - Value:           0x6
             - Value:           0x48
@@ -876,7 +904,7 @@ DWARF:
             - Value:           0x1
               BlockData:       [ 0x6F ]
             - Value:           0x1
-            - Value:           0x48
+            - Value:           0xDB
             - Value:           0x1
             - Value:           0xB
             - Value:           0x48
@@ -885,21 +913,21 @@ DWARF:
         - AbbrCode:        0x6
           Values:
             - Value:           0x7F
-            - Value:           0x3D
+            - Value:           0xD0
             - Value:           0x1
             - Value:           0xB
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0xB8
-            - Value:           0x3F
+            - Value:           0xD2
             - Value:           0x1
             - Value:           0xC
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0xDB
-            - Value:           0x41
+            - Value:           0xD4
             - Value:           0x1
             - Value:           0xD
             - Value:           0x48
@@ -912,7 +940,7 @@ DWARF:
             - Value:           0x1
               BlockData:       [ 0x6D ]
             - Value:           0x1
-            - Value:           0x58
+            - Value:           0xEB
             - Value:           0x1
             - Value:           0x12
             - Value:           0x48
@@ -921,20 +949,20 @@ DWARF:
         - AbbrCode:        0x6
           Values:
             - Value:           0xFE
-            - Value:           0x3D
+            - Value:           0xD0
             - Value:           0x1
             - Value:           0x12
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0x137
-            - Value:           0x41
+            - Value:           0xD4
             - Value:           0x1
             - Value:           0x14
             - Value:           0x48
         - AbbrCode:        0x9
           Values:
-            - Value:           0x3F
+            - Value:           0xD2
             - Value:           0x1
             - Value:           0x13
             - Value:           0x48
@@ -951,7 +979,7 @@ DWARF:
             - Value:           0x1
               BlockData:       [ 0x6D ]
             - Value:           0x1
-            - Value:           0x68
+            - Value:           0xFB
             - Value:           0x1
             - Value:           0x19
             - Value:           0x48
@@ -960,20 +988,20 @@ DWARF:
         - AbbrCode:        0x6
           Values:
             - Value:           0x15A
-            - Value:           0x3D
+            - Value:           0xD0
             - Value:           0x1
             - Value:           0x19
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0x193
-            - Value:           0x41
+            - Value:           0xD4
             - Value:           0x1
             - Value:           0x1B
             - Value:           0x48
         - AbbrCode:        0x9
           Values:
-            - Value:           0x3F
+            - Value:           0xD2
             - Value:           0x1
             - Value:           0x1A
             - Value:           0x48
@@ -984,7 +1012,7 @@ DWARF:
         - AbbrCode:        0x0
         - AbbrCode:        0xB
           Values:
-            - Value:           0x78
+            - Value:           0x10B
             - Value:           0x1
             - Value:           0x20
             - Value:           0x48
@@ -993,19 +1021,19 @@ DWARF:
             - Value:           0x1
         - AbbrCode:        0xC
           Values:
-            - Value:           0x3D
+            - Value:           0xD0
             - Value:           0x1
             - Value:           0x20
             - Value:           0x48
         - AbbrCode:        0x9
           Values:
-            - Value:           0x41
+            - Value:           0xD4
             - Value:           0x1
             - Value:           0x22
             - Value:           0x48
         - AbbrCode:        0x9
           Values:
-            - Value:           0x3F
+            - Value:           0xD2
             - Value:           0x1
             - Value:           0x21
             - Value:           0x48
@@ -1018,7 +1046,7 @@ DWARF:
             - Value:           0x1
               BlockData:       [ 0x6D ]
             - Value:           0x1
-            - Value:           0x89
+            - Value:           0x11C
             - Value:           0x1
             - Value:           0x27
             - Value:           0x48
@@ -1027,21 +1055,21 @@ DWARF:
         - AbbrCode:        0x6
           Values:
             - Value:           0x1B6
-            - Value:           0x3D
+            - Value:           0xD0
             - Value:           0x1
             - Value:           0x27
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0x1EF
-            - Value:           0x3F
+            - Value:           0xD2
             - Value:           0x1
             - Value:           0x28
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0x214
-            - Value:           0x41
+            - Value:           0xD4
             - Value:           0x1
             - Value:           0x29
             - Value:           0x48
@@ -1075,7 +1103,7 @@ DWARF:
             - Value:           0x1
               BlockData:       [ 0x6D ]
             - Value:           0x1
-            - Value:           0x99
+            - Value:           0x12C
             - Value:           0x1
             - Value:           0x2E
             - Value:           0x48
@@ -1084,21 +1112,21 @@ DWARF:
         - AbbrCode:        0x6
           Values:
             - Value:           0x27F
-            - Value:           0x3D
+            - Value:           0xD0
             - Value:           0x1
             - Value:           0x2E
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0x2B8
-            - Value:           0x3F
+            - Value:           0xD2
             - Value:           0x1
             - Value:           0x2F
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0x2DD
-            - Value:           0x41
+            - Value:           0xD4
             - Value:           0x1
             - Value:           0x30
             - Value:           0x48
@@ -1132,7 +1160,7 @@ DWARF:
             - Value:           0x1
               BlockData:       [ 0x6D ]
             - Value:           0x1
-            - Value:           0xA9
+            - Value:           0x13C
             - Value:           0x1
             - Value:           0x35
             - Value:           0x48
@@ -1141,20 +1169,20 @@ DWARF:
         - AbbrCode:        0x6
           Values:
             - Value:           0x348
-            - Value:           0x3D
+            - Value:           0xD0
             - Value:           0x1
             - Value:           0x35
             - Value:           0x48
         - AbbrCode:        0x7
           Values:
             - Value:           0x381
-            - Value:           0x41
+            - Value:           0xD4
             - Value:           0x1
             - Value:           0x37
             - Value:           0x48
         - AbbrCode:        0x9
           Values:
-            - Value:           0x3F
+            - Value:           0xD2
             - Value:           0x1
             - Value:           0x36
             - Value:           0x48
@@ -1163,31 +1191,54 @@ DWARF:
             - Value:           0x1BB
             - Value:           0x1000003A0
         - AbbrCode:        0x0
-        - AbbrCode:        0x8
+        - AbbrCode:        0x5
           Values:
             - Value:           0x1000003B0
-            - Value:           0x58
-            - Value:           0x112
+            - Value:           0x8
+            - Value:           0x1
+            - Value:           0xFFFFFFFF
+            - Value:           0x1
+              BlockData:       [ 0x6F ]
+            - Value:           0x1
+            - Value:           0x146
+            - Value:           0x1
+            - Value:           0x3E
+            - Value:           0x48
+            - Value:           0x1
+            - Value:           0x1
+        - AbbrCode:        0x6
+          Values:
+            - Value:           0x3A4
+            - Value:           0xD0
+            - Value:           0x1
+            - Value:           0x3F
+            - Value:           0x48
+        - AbbrCode:        0x0
+        - AbbrCode:        0x8
+          Values:
+            - Value:           0x1000003B8
+            - Value:           0x64
+            - Value:           0x12B
             - Value:           0x1
               BlockData:       [ 0x6D ]
             - Value:           0x1
-            - Value:           0xB3
+            - Value:           0x15C
             - Value:           0x1
-            - Value:           0x3C
+            - Value:           0x44
             - Value:           0x48
             - Value:           0x1
             - Value:           0x1
         - AbbrCode:        0x7
           Values:
-            - Value:           0x3A4
-            - Value:           0xB8
+            - Value:           0x3DD
+            - Value:           0x161
             - Value:           0x1
-            - Value:           0x3D
+            - Value:           0x45
             - Value:           0x48
         - AbbrCode:        0x10
           Values:
             - Value:           0x2B9
-            - Value:           0x1000003C4
+            - Value:           0x1000003CC
         - AbbrCode:        0x11
           Values:
             - Value:           0x1
@@ -1198,7 +1249,7 @@ DWARF:
         - AbbrCode:        0x10
           Values:
             - Value:           0x23A
-            - Value:           0x1000003D0
+            - Value:           0x1000003D8
         - AbbrCode:        0x11
           Values:
             - Value:           0x1
@@ -1209,7 +1260,7 @@ DWARF:
         - AbbrCode:        0x10
           Values:
             - Value:           0x9A
-            - Value:           0x1000003DC
+            - Value:           0x1000003E4
         - AbbrCode:        0x11
           Values:
             - Value:           0x1
@@ -1220,7 +1271,7 @@ DWARF:
         - AbbrCode:        0x10
           Values:
             - Value:           0x139
-            - Value:           0x1000003E8
+            - Value:           0x1000003F0
         - AbbrCode:        0x11
           Values:
             - Value:           0x1
@@ -1231,7 +1282,7 @@ DWARF:
         - AbbrCode:        0x10
           Values:
             - Value:           0x1BB
-            - Value:           0x1000003F8
+            - Value:           0x100000400
         - AbbrCode:        0x11
           Values:
             - Value:           0x1
@@ -1239,10 +1290,21 @@ DWARF:
             - Value:           0x1
               BlockData:       [ 0x3B ]
         - AbbrCode:        0x0
+        - AbbrCode:        0x10
+          Values:
+            - Value:           0x30D
+            - Value:           0x10000040C
+        - AbbrCode:        0x11
+          Values:
+            - Value:           0x1
+              BlockData:       [ 0x50 ]
+            - Value:           0x2
+              BlockData:       [ 0x83, 0x0 ]
+        - AbbrCode:        0x0
         - AbbrCode:        0x0
         - AbbrCode:        0x0
   debug_line:
-    - Length:          319
+    - Length:          353
       Version:         4
       PrologueLength:  45
       MinInstLength:   1
@@ -1495,8 +1557,31 @@ DWARF:
           ExtLen:          9
           SubOpcode:       DW_LNE_set_address
           Data:            4294968240
+        - Opcode:          DW_LNS_set_column
+          Data:            14
+        - Opcode:          DW_LNS_set_prologue_end
+          Data:            0
         - Opcode:          DW_LNS_advance_line
-          SData:           59
+          SData:           64
+          Data:            0
+        - Opcode:          DW_LNS_copy
+          Data:            0
+        - Opcode:          DW_LNS_set_column
+          Data:            5
+        - Opcode:          DW_LNS_negate_stmt
+          Data:            0
+        - Opcode:          0x4A
+          Data:            0
+        - Opcode:          DW_LNS_extended_op
+          ExtLen:          1
+          SubOpcode:       DW_LNE_end_sequence
+          Data:            0
+        - Opcode:          DW_LNS_extended_op
+          ExtLen:          9
+          SubOpcode:       DW_LNE_set_address
+          Data:            4294968248
+        - Opcode:          DW_LNS_advance_line
+          SData:           67
           Data:            0
         - Opcode:          DW_LNS_copy
           Data:            0
@@ -1539,6 +1624,18 @@ DWARF:
         - Opcode:          0x82
           Data:            0
         - Opcode:          DW_LNS_set_column
+          Data:            12
+        - Opcode:          DW_LNS_negate_stmt
+          Data:            0
+        - Opcode:          0x4B
+          Data:            0
+        - Opcode:          DW_LNS_set_column
+          Data:            9
+        - Opcode:          DW_LNS_negate_stmt
+          Data:            0
+        - Opcode:          0x82
+          Data:            0
+        - Opcode:          DW_LNS_set_column
           Data:            5
         - Opcode:          DW_LNS_negate_stmt
           Data:            0