aboutsummaryrefslogtreecommitdiff
path: root/llvm/test
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test')
-rw-r--r--llvm/test/Analysis/DependenceAnalysis/monotonicity-cast.ll207
-rw-r--r--llvm/test/Analysis/DependenceAnalysis/monotonicity-delinearize.ll59
-rw-r--r--llvm/test/Analysis/DependenceAnalysis/monotonicity-invariant.ll150
-rw-r--r--llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll519
-rw-r--r--llvm/test/Analysis/DependenceAnalysis/non-monotonic.ll77
-rw-r--r--llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll53
-rw-r--r--llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll41
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll9
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir126
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll80
-rw-r--r--llvm/test/CodeGen/AArch64/peephole-csel.ll5
-rw-r--r--llvm/test/CodeGen/AArch64/rax1.ll46
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll171
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll40
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll35
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll57
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll786
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll632
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll542
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll330
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir83
-rw-r--r--llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll20
-rw-r--r--llvm/test/CodeGen/BPF/BTF/variant-part.ll87
-rw-r--r--llvm/test/CodeGen/Hexagon/insert-big.ll47
-rw-r--r--llvm/test/CodeGen/Hexagon/qfp-conv.ll35
-rw-r--r--llvm/test/CodeGen/Hexagon/qfp-enabled.ll19
-rw-r--r--llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir95
-rw-r--r--llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir33
-rw-r--r--llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll21
-rw-r--r--llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir79
-rw-r--r--llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir23
-rw-r--r--llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll32
-rw-r--r--llvm/test/CodeGen/RISCV/rv32zbs.ll50
-rw-r--r--llvm/test/CodeGen/RISCV/rv64zbs.ll34
-rw-r--r--llvm/test/CodeGen/SystemZ/int-conv-14.ll45
-rw-r--r--llvm/test/CodeGen/SystemZ/int-conv-15.ll45
-rw-r--r--llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s3
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll14
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll68
-rw-r--r--llvm/test/TableGen/RuntimeLibcallEmitter.td14
-rw-r--r--llvm/test/Transforms/GVNSink/ptrtoaddr.ll30
-rw-r--r--llvm/test/Transforms/Inline/ML/recursive.ll34
-rw-r--r--llvm/test/Transforms/InstCombine/ptrtoaddr.ll163
-rw-r--r--llvm/test/Transforms/InstSimplify/ptrtoaddr.ll318
-rw-r--r--llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll82
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll14
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll34
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll51
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/i8-induction.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/if-pred-stores.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/reduction-inloop.ll42
-rw-r--r--llvm/test/Transforms/LoopVectorize/reverse_iter.ll10
-rw-r--r--llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll46
-rw-r--r--llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/uniform-blend.ll6
-rw-r--r--llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll20
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll53
-rw-r--r--llvm/test/Transforms/SpeculativeExecution/spec-casts.ll197
-rw-r--r--llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml309
85 files changed, 6087 insertions, 1368 deletions
diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-cast.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-cast.ll
new file mode 100644
index 0000000..e43d00d
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-cast.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN: -da-enable-monotonicity-check 2>&1 | FileCheck %s
+
+; int8_t offset = start;
+; for (int i = 0; i < 100; i++, offset += step)
+; a[sext(offset)] = 0;
+;
+define void @sext_nsw(ptr %a, i8 %start, i8 %step) {
+; CHECK-LABEL: 'sext_nsw'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {(sext i8 %start to i64),+,(sext i8 %step to i64)}<nsw><%loop>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - none!
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+ %offset = phi i8 [ %start, %entry ], [ %offset.next, %loop ]
+ %offset.sext = sext i8 %offset to i64
+ %idx = getelementptr i8, ptr %a, i64 %offset.sext
+ store i8 0, ptr %idx
+ %i.inc = add nsw i64 %i, 1
+ %offset.next = add nsw i8 %offset, %step
+ %exitcond = icmp eq i64 %i.inc, 100
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; The addition for `%offset.next` can wrap, so we cannot prove monotonicity.
+;
+; int8_t offset = start;
+; for (int i = 0; i < 100; i++, offset += step)
+; a[sext(offset)] = 0;
+;
+define void @sext_may_wrap(ptr %a, i8 %start, i8 %step) {
+; CHECK-LABEL: 'sext_may_wrap'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: (sext i8 {%start,+,%step}<%loop> to i64)
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: (sext i8 {%start,+,%step}<%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+ %offset = phi i8 [ %start, %entry ], [ %offset.next, %loop ]
+ %offset.sext = sext i8 %offset to i64
+ %idx = getelementptr i8, ptr %a, i64 %offset.sext
+ store i8 0, ptr %idx
+ %i.inc = add nsw i64 %i, 1
+ %offset.next = add i8 %offset, %step
+ %exitcond = icmp eq i64 %i.inc, 100
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; for (int8_t i = 0; i < 100; i++)
+; a[zext(offset)] = 0;
+;
+define void @zext_pos(ptr %a) {
+; CHECK-LABEL: 'zext_pos'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {0,+,1}<nuw><nsw><%loop>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - none!
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i8 [ 0, %entry ], [ %i.inc, %loop ]
+ %offset.zext = zext nneg i8 %i to i64
+ %idx = getelementptr i8, ptr %a, i64 %offset.zext
+ store i8 0, ptr %idx
+ %i.inc = add nsw i8 %i, 1
+ %exitcond = icmp eq i8 %i.inc, 100
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; The zero-extened value of `offset` is no longer monotonic. In fact, the
+; values of `offset` in each iteration are:
+;
+; iteration | 0 | 1 | 2 | ...
+; -------------|-----|---|---|---------
+; offset | -1 | 0 | 1 | ...
+; zext(offset) | 255 | 0 | 1 | ...
+;
+;
+; for (int8_t i = -1; i < 100; i++)
+; a[zext(offset)] = 0;
+;
+define void @zext_cross_zero(ptr %a) {
+; CHECK-LABEL: 'zext_cross_zero'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: (zext i8 {-1,+,1}<nsw><%loop> to i64)
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: (zext i8 {-1,+,1}<nsw><%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i8 [ -1, %entry ], [ %i.inc, %loop ]
+ %offset.zext = zext nneg i8 %i to i64
+ %idx = getelementptr i8, ptr %a, i64 %offset.zext
+ store i8 0, ptr %idx
+ %i.inc = add nsw i8 %i, 1
+ %exitcond = icmp eq i8 %i.inc, 100
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; In principle, we can prove that `zext(offset)` is monotonic since we know
+; that `offset` is non-negative.
+;
+; int8_t offset = 0;
+; for (int i = 0; i < 100; i++, offset += step)
+; a[zext(offset)] = 0;
+;
+define void @zext_nneg_nsw(ptr %a, i8 %step) {
+; CHECK-LABEL: 'zext_nneg_nsw'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: (zext i8 {0,+,%step}<nsw><%loop> to i64)
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: (zext i8 {0,+,%step}<nsw><%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+ %offset = phi i8 [ 0, %entry ], [ %offset.next, %loop ]
+ %offset.zext = zext nneg i8 %offset to i64
+ %idx = getelementptr i8, ptr %a, i64 %offset.zext
+ store i8 0, ptr %idx
+ %i.inc = add nsw i64 %i, 1
+ %offset.next = add nsw i8 %offset, %step
+ %exitcond = icmp eq i64 %i.inc, 100
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; SCEV handles `i & 1` as an i1 addrec. Ensure that the monotonicity analysis
+; properly analyzes it.
+;
+; for (i = 0; i < 100; i++)
+; a[i & 1] = 0;
+;
+define void @offset_truncated_to_i1(ptr %a) {
+; CHECK-LABEL: 'offset_truncated_to_i1'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: (zext i1 {false,+,true}<%loop> to i64)
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: (zext i1 {false,+,true}<%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+ %and = and i64 %i, 1
+ %idx = getelementptr inbounds i8, ptr %a, i64 %and
+ store i8 0, ptr %idx
+ %i.inc = add nsw i64 %i, 1
+ %exitcond = icmp eq i64 %i.inc, 100
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-delinearize.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-delinearize.ll
new file mode 100644
index 0000000..71ea4e9
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-delinearize.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN: -da-enable-monotonicity-check 2>&1 | FileCheck %s
+
+; The offset SCEV will be delinearized into a 2D array access, like as follows:
+;
+; - Outer subscript: {0,+,1}<nuw><nsw><%loop.i.header>
+; - Inner subscript: {0,+,1}<nuw><nsw><%loop.j.header>
+;
+; These subscripts are both monotonic, but we also need to check the
+; monotonicity of the original addrec.
+;
+; char A[...][32];
+; for (i = 0; i < 1ll << 62; i++)
+; for (j = 0; j < 32; j++)
+; if (i < (1ll << 57))
+; A[i][j] = 0;
+;
+define void @linearized_offset_wrap(ptr %a) {
+; CHECK-LABEL: 'linearized_offset_wrap'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %gep, align 1
+; CHECK-NEXT: Expr: {{\{\{}}0,+,32}<%loop.i.header>,+,1}<nw><%loop.j.header>
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: {{\{\{}}0,+,32}<%loop.i.header>,+,1}<nw><%loop.j.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %gep, align 1 --> Dst: store i8 0, ptr %gep, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ br label %loop.i.header
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ br label %loop.j.header
+
+loop.j.header:
+ %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j.latch ]
+ %cond = icmp slt i64 %i, 144115188075855872 ; 2^57
+ br i1 %cond, label %if.then, label %loop.j.latch
+
+if.then:
+ %gep = getelementptr inbounds [32 x i8], ptr %a, i64 %i, i64 %j
+ store i8 0, ptr %gep
+ br label %loop.j.latch
+
+loop.j.latch:
+ %j.inc = add nuw nsw i64 %j, 1
+ %ec.j = icmp eq i64 %j.inc, 32
+ br i1 %ec.j, label %loop.i.latch, label %loop.j.header
+
+loop.i.latch:
+ %i.inc = add nuw nsw i64 %i, 1
+ %ec.i = icmp eq i64 %i.inc, 4611686018427387904 ; 2^62
+ br i1 %ec.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-invariant.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-invariant.ll
new file mode 100644
index 0000000..e5b6ddb
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-invariant.ll
@@ -0,0 +1,150 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN: -da-enable-monotonicity-check 2>&1 | FileCheck %s
+
+; for (int i = 0; i < n; i++)
+; a[x] = 0;
+define void @single_loop_invariant(ptr %a, i64 %x, i64 %n) {
+; CHECK-LABEL: 'single_loop_invariant'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: %x
+; CHECK-NEXT: Monotonicity: Invariant
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - consistent output [S]!
+;
+entry:
+ %guard = icmp sgt i64 %n, 0
+ br i1 %guard, label %loop, label %exit
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+ %idx = getelementptr inbounds i8, ptr %a, i64 %x
+ store i8 0, ptr %idx
+ %i.inc = add nsw i64 %i, 1
+ %exitcond = icmp eq i64 %i.inc, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; for (int i = 0; i < n; i++)
+; a[(i % 2 == 0 ? x : y)] = 0;
+define void @single_loop_variant(ptr %a, i64 %x, i64 %y, i64 %n) {
+; CHECK-LABEL: 'single_loop_variant'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: %offset
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: %offset
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ %guard = icmp sgt i64 %n, 0
+ br i1 %guard, label %loop, label %exit
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+ %offset = phi i64 [ %x, %entry ], [ %offset.next, %loop ]
+ %offset.next = phi i64 [ %y, %entry ], [ %offset, %loop ]
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ %i.inc = add nsw i64 %i, 1
+ %exitcond = icmp eq i64 %i.inc, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; for (int i = 0; i < n; i++)
+; for (int j = 0; j < m; j++)
+; a[x + i] = 0;
+define void @invariant_plus_monotonic0(ptr %a, i64 %x, i64 %n, i64 %m) {
+; CHECK-LABEL: 'invariant_plus_monotonic0'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {%x,+,1}<nsw><%loop.i.header>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - consistent output [0 S]!
+;
+entry:
+ %guard.i = icmp sgt i64 %n, 0
+ br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ %offset = phi i64 [ %x, %entry ], [ %offset.inc, %loop.i.latch ]
+ br label %loop.j.preheader
+
+loop.j.preheader:
+ %gurard.j = icmp sgt i64 %m, 0
+ br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+ %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ %j.inc = add nuw nsw i64 %j, 1
+ %exitcond.j = icmp eq i64 %j.inc, %m
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+ %i.inc = add nsw i64 %i, 1
+ %offset.inc = add nsw i64 %offset, 1
+ %exitcond.i = icmp eq i64 %i.inc, %n
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
+
+; for (int i = 0; i < n; i++)
+; for (int j = 0; j < m; j++)
+; a[x + j] = 0;
+define void @invariant_plus_monotonic1(ptr %a, i64 %x, i64 %n, i64 %m) {
+; CHECK-LABEL: 'invariant_plus_monotonic1'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {%x,+,1}<nsw><%loop.j>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - consistent output [S 0]!
+;
+entry:
+ %guard.i = icmp sgt i64 %n, 0
+ br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ br label %loop.j.preheader
+
+loop.j.preheader:
+ %gurard.j = icmp sgt i64 %m, 0
+ br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+ %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+ %offset = phi i64 [ %x, %loop.j.preheader ], [ %offset.inc, %loop.j ]
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ %j.inc = add nuw nsw i64 %j, 1
+ %offset.inc = add nsw i64 %offset, 1
+ %exitcond.j = icmp eq i64 %j.inc, %m
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+ %i.inc = add nsw i64 %i, 1
+ %exitcond.i = icmp eq i64 %i.inc, %n
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll
new file mode 100644
index 0000000..7411dc9
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll
@@ -0,0 +1,519 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN: -da-enable-monotonicity-check 2>&1 | FileCheck %s
+
+; for (int i = 0; i < n; i++)
+; a[i] = 0;
+;
+define void @single_loop_nsw(ptr %a, i64 %n) {
+; CHECK-LABEL: 'single_loop_nsw'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {0,+,1}<nuw><nsw><%loop>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - none!
+;
+entry:
+ %guard = icmp sgt i64 %n, 0
+ br i1 %guard, label %loop, label %exit
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+ %idx = getelementptr inbounds i8, ptr %a, i64 %i
+ store i8 0, ptr %idx
+ %i.inc = add nsw i64 %i, 1
+ %exitcond = icmp eq i64 %i.inc, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; The purpose of the variable `begin` is to avoid violating the size limitation
+; of the allocated object in LLVM IR, which would cause UB.
+;
+; for (unsigned long long i = begin; i < end; i++)
+; a[i] = 0;
+;
+define void @single_loop_nuw(ptr %a, i64 %begin, i64 %end) {
+; CHECK-LABEL: 'single_loop_nuw'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {%begin,+,1}<nuw><%loop>
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: {%begin,+,1}<nuw><%loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ %guard = icmp ult i64 %begin, %end
+ br i1 %guard, label %loop, label %exit
+
+loop:
+ %i = phi i64 [ %begin, %entry ], [ %i.inc, %loop ]
+ %idx = getelementptr i8, ptr %a, i64 %i
+ store i8 0, ptr %idx
+ %i.inc = add nuw i64 %i, 1
+ %exitcond = icmp eq i64 %i.inc, %end
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; for (int i = 0; i < n; i++)
+; for (int j = 0; j < m; j++)
+; a[i + j] = 0;
+;
+define void @nested_loop_nsw0(ptr %a, i64 %n, i64 %m) {
+; CHECK-LABEL: 'nested_loop_nsw0'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nuw><nsw><%loop.j>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - output [* *]!
+;
+entry:
+ %guard.i = icmp sgt i64 %n, 0
+ br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ br label %loop.j.preheader
+
+loop.j.preheader:
+ %gurard.j = icmp sgt i64 %m, 0
+ br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+ %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+ %offset = add nsw i64 %i, %j
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ %j.inc = add nsw i64 %j, 1
+ %exitcond.j = icmp eq i64 %j.inc, %m
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+ %i.inc = add nsw i64 %i, 1
+ %exitcond.i = icmp eq i64 %i.inc, %n
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
+
+; for (int i = n - 1; i >= 0; i--)
+; for (int j = 0; j < m; j++)
+; a[i + j] = 0;
+;
+define void @nested_loop_nsw1(ptr %a, i64 %n, i64 %m) {
+; CHECK-LABEL: 'nested_loop_nsw1'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {{\{\{}}(-1 + %n),+,-1}<nsw><%loop.i.header>,+,1}<nsw><%loop.j>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - output [* *]!
+;
+entry:
+ %guard.i = icmp sgt i64 %n, 0
+ br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+ %i = phi i64 [ %n, %entry ], [ %i.dec, %loop.i.latch ]
+ %i.dec = add nsw i64 %i, -1
+ br label %loop.j.preheader
+
+loop.j.preheader:
+ %gurard.j = icmp sgt i64 %m, 0
+ br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+ %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+ %offset = add nsw i64 %i.dec, %j
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ %j.inc = add nsw i64 %j, 1
+ %exitcond.j = icmp eq i64 %j.inc, %m
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+ %exitcond.i = icmp eq i64 %i.dec, 0
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
+
+; for (int i = 0; i < n; i--)
+; for (int j = 0; j < m; j++)
+; a[i - j] = 0;
+;
+define void @nested_loop_nsw2(ptr %a, i64 %n, i64 %m) {
+; CHECK-LABEL: 'nested_loop_nsw2'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,-1}<nsw><%loop.j>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - output [* *]!
+;
+entry:
+ %guard.i = icmp sgt i64 %n, 0
+ br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ br label %loop.j.preheader
+
+loop.j.preheader:
+ %gurard.j = icmp sgt i64 %m, 0
+ br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+ %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+ %offset = sub nsw i64 %i, %j
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ %j.inc = add nsw i64 %j, 1
+ %exitcond.j = icmp eq i64 %j.inc, %m
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+ %i.inc = add nsw i64 %i, 1
+ %exitcond.i = icmp eq i64 %i.inc, %n
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
+
+; for (int i = begin0; i < end0; i++)
+; for (int j = begin1; j < end1; j++) {
+; unsigned long long offset = (unsigned long long)i + (unsigned long long)j;
+; a[offset] = 0;
+; }
+;
+define void @nested_loop_nuw(ptr %a, i64 %begin0, i64 %end0, i64 %begin1, i64 %end1) {
+; CHECK-LABEL: 'nested_loop_nuw'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {{\{\{}}(%begin0 + %begin1),+,1}<nw><%loop.i.header>,+,1}<nw><%loop.j>
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: {{\{\{}}(%begin0 + %begin1),+,1}<nw><%loop.i.header>,+,1}<nw><%loop.j>
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ %guard.i.0 = icmp slt i64 0, %begin0
+ %guard.i.1 = icmp slt i64 %begin0, %end0
+ %guard.i.2 = icmp slt i64 0, %end0
+ %and.i.0 = and i1 %guard.i.0, %guard.i.1
+ %and.i.1 = and i1 %and.i.0, %guard.i.2
+ br i1 %and.i.1, label %loop.i.header, label %exit
+
+loop.i.header:
+ %i = phi i64 [ %begin0, %entry ], [ %i.inc, %loop.i.latch ]
+ br label %loop.j.preheader
+
+loop.j.preheader:
+ %guard.j.0 = icmp slt i64 0, %begin1
+ %guard.j.1 = icmp slt i64 %begin1, %end1
+ %guard.j.2 = icmp slt i64 0, %end1
+ %and.j.0 = and i1 %guard.j.0, %guard.j.1
+ %and.j.1 = and i1 %and.j.0, %guard.j.2
+ br i1 %and.j.1, label %loop.j, label %loop.i.latch
+
+loop.j:
+ %j = phi i64 [ %begin1, %loop.j.preheader ], [ %j.inc, %loop.j ]
+ %offset = add nuw i64 %i, %j
+ %idx = getelementptr i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ %j.inc = add nsw i64 %j, 1
+ %exitcond.j = icmp eq i64 %j.inc, %end1
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+ %i.inc = add nsw i64 %i, 1
+ %exitcond.i = icmp eq i64 %i.inc, %end0
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
+
+; for (int i = 0; i < n; i++)
+; for (int j = 0; j < m; j++)
+; a[i + step*j] = 0;
+;
+define void @nested_loop_step(ptr %a, i64 %n, i64 %m, i64 %step) {
+; CHECK-LABEL: 'nested_loop_step'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,%step}<nsw><%loop.j>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - output [* *]!
+;
+entry:
+ %guard.i = icmp sgt i64 %n, 0
+ br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ br label %loop.j.preheader
+
+loop.j.preheader:
+ %gurard.j = icmp sgt i64 %m, 0
+ br i1 %gurard.j, label %loop.j, label %loop.i.latch
+
+loop.j:
+ %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ]
+ %offset.j = phi i64 [ 0, %loop.j.preheader ], [ %offset.j.next, %loop.j ]
+ %offset = add nsw i64 %i, %offset.j
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ %j.inc = add nsw i64 %j, 1
+ %offset.j.next = add nsw i64 %offset.j, %step
+ %exitcond.j = icmp eq i64 %j.inc, %m
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+ %i.inc = add nsw i64 %i, 1
+ %exitcond.i = icmp eq i64 %i.inc, %n
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
+
+; The value of step reccurence is not invariant with respect to the outer most
+; loop (the i-loop).
+;
+; offset_i = 0;
+; for (int i = 0; i < 100; i++) {
+; for (int j = 0; j < 100; j++)
+; a[offset_i + j] = 0;
+; offset_i += (i % 2 == 0) ? 0 : 3;
+; }
+;
+define void @step_is_variant(ptr %a) {
+; CHECK-LABEL: 'step_is_variant'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {%offset.i,+,1}<nuw><nsw><%loop.j>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ br label %loop.i.header
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ %offset.i = phi i64 [ 0, %entry ], [ %offset.i.next, %loop.i.latch ]
+ %step.i.0 = phi i64 [ 0, %entry ], [ %step.i.1, %loop.i.latch ]
+ %step.i.1 = phi i64 [ 3, %entry ], [ %step.i.0, %loop.i.latch ]
+ br label %loop.j
+
+loop.j:
+ %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j ]
+ %offset = add nsw i64 %offset.i, %j
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ %j.inc = add nsw i64 %j, 1
+ %exitcond.j = icmp eq i64 %j.inc, 100
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+ %i.inc = add nsw i64 %i, 1
+ %offset.i.next = add nsw i64 %offset.i, %step.i.0
+ %exitcond.i = icmp eq i64 %i.inc, 100
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
+
+; The AddRec doesn't have nsw flag for the j-loop, since the store may not be
+; executed.
+;
+; for (int i = 0; i < n; i++)
+; for (int j = 0; j < m; j++)
+; if (cond)
+; a[i + j] = 0;
+;
+define void @conditional_store0(ptr %a, i64 %n, i64 %m) {
+; CHECK-LABEL: 'conditional_store0'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nw><%loop.j.header>
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nw><%loop.j.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ %guard.i = icmp sgt i64 %n, 0
+ br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ br label %loop.j.preheader
+
+loop.j.preheader:
+ %gurard.j = icmp sgt i64 %m, 0
+ br i1 %gurard.j, label %loop.j.header, label %loop.i.latch
+
+loop.j.header:
+ %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j.latch ]
+ %offset = add nsw i64 %i, %j
+ %cond = freeze i1 poison
+ br i1 %cond, label %if.then, label %loop.j.latch
+
+if.then:
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ br label %loop.j.latch
+
+loop.j.latch:
+ %j.inc = add nsw i64 %j, 1
+ %exitcond.j = icmp eq i64 %j.inc, %m
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j.header
+
+loop.i.latch:
+ %i.inc = add nsw i64 %i, 1
+ %exitcond.i = icmp eq i64 %i.inc, %n
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
+
+; Similar to the @conditional_store0, but the definition of the `%offset` is
+; different from it and we can infer `nsw` in this case.
+;
+; for (int i = 0; i < n; i++)
+; for (int j = 0; j < m; j++)
+; if (cond)
+; a[i + j] = 0;
+;
+define void @conditional_store1(ptr %a, i64 %n, i64 %m) {
+; CHECK-LABEL: 'conditional_store1'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nuw><nsw><%loop.j.header>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - output [* *]!
+;
+entry:
+ %guard.i = icmp sgt i64 %n, 0
+ br i1 %guard.i, label %loop.i.header, label %exit
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ br label %loop.j.preheader
+
+loop.j.preheader:
+ %gurard.j = icmp sgt i64 %m, 0
+ br i1 %gurard.j, label %loop.j.header, label %loop.i.latch
+
+loop.j.header:
+ %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j.latch ]
+ %offset = phi i64 [ %i, %loop.j.preheader ], [ %offset.next, %loop.j.latch ]
+ %cond = freeze i1 poison
+ br i1 %cond, label %if.then, label %loop.j.latch
+
+if.then:
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ br label %loop.j.latch
+
+loop.j.latch:
+ %j.inc = add nsw i64 %j, 1
+ %offset.next = add nsw i64 %offset, 1
+ %exitcond.j = icmp eq i64 %j.inc, %m
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j.header
+
+loop.i.latch:
+ %i.inc = add nsw i64 %i, 1
+ %exitcond.i = icmp eq i64 %i.inc, %n
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
+
+; In the following case, the computation `offset = offset_i + j` will not wrap,
+; but `offset_i += 1024` will wrap both in a signed sense and an unsigned
+; sense. We cannot prove the monotonicity in this case.
+;
+; offset_i = (1ULL << 63) - 256;
+; for (i = 0; i < (1ULL << 62); i++, offset_i += 1024)
+; for (j = 0; j < 32; j++) {
+; offset = offset_i + j;
+;
+; // The value of `offset` is positive in a signed sense.
+; if (offset < (1ULL << 63))
+; a[offset] = 0;
+; }
+;
+define void @outer_loop_may_wrap(ptr %a) {
+; CHECK-LABEL: 'outer_loop_may_wrap'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %gep, align 1
+; CHECK-NEXT: Expr: {{\{\{}}9223372036854775552,+,1024}<%loop.i.header>,+,1}<nuw><nsw><%loop.j.header>
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: {9223372036854775552,+,1024}<%loop.i.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %gep, align 1 --> Dst: store i8 0, ptr %gep, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ br label %loop.i.header
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ %subscript.i = phi i64 [ 9223372036854775552, %entry ], [ %subscript.i.next, %loop.i.latch ] ; The initial value is 2^63 - 256
+ br label %loop.j.header
+
+loop.j.header:
+ %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j.latch ]
+ %subscript = phi i64 [ %subscript.i, %loop.i.header ], [ %subscript.next, %loop.j.latch ]
+ %cond = icmp sge i64 %subscript, 0
+ br i1 %cond, label %if.then, label %loop.j.latch
+
+if.then:
+ %gep = getelementptr inbounds i8, ptr %a, i64 %subscript
+ store i8 0, ptr %gep
+ br label %loop.j.latch
+
+loop.j.latch:
+ %j.inc = add nuw nsw i64 %j, 1
+ %subscript.next = add nuw nsw i64 %subscript, 1
+ %ec.j = icmp eq i64 %j.inc, 32
+ br i1 %ec.j, label %loop.i.latch, label %loop.j.header
+
+loop.i.latch:
+ %i.inc = add nuw nsw i64 %i, 1
+ %subscript.i.next = add i64 %subscript.i, 1024
+ %ec.i = icmp eq i64 %i.inc, 4611686018427387904 ; 2^62
+ br i1 %ec.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Analysis/DependenceAnalysis/non-monotonic.ll b/llvm/test/Analysis/DependenceAnalysis/non-monotonic.ll
new file mode 100644
index 0000000..6247336
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/non-monotonic.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN: -da-enable-monotonicity-check 2>&1 | FileCheck %s
+; RUN: opt < %s -disable-output -passes="print<da>" 2>&1 | FileCheck %s -check-prefix=DISABLE-CHECK
+
+;
+; for (i = 0; i < (1ULL << 60); i++) {
+; A[i] = 1;
+;
+; unsigned long long offset = i * 32 + (1ULL << 62);
+; // offset is positive when interpreted as a signed value.
+; // To prevent violating the size limitation for an allocated object.
+; if (offset < (1ULL << 63))
+; A[offset] = 2;
+; }
+;
+; -----------------------------------------------------------------------------
+;
+; There is a dependency between the two stores. To detect it, we need to check
+; the monotonicity and bail out the analysis since `offset` is not monotonic.
+;
+; memory location | first store (A[i]) | second store (A[offset])
+; ------------------|--------------------|----------------------------
+; A[0] | i = 0 | i = 2^59 - 2^57
+; A[2^60 - 32] | i = 2^60 - 32 | i = 2^59 - 2^57 + 2^55 - 1
+;
+define void @f(ptr %A) {
+; CHECK-LABEL: 'f'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 1, ptr %idx.0, align 1
+; CHECK-NEXT: Expr: {0,+,1}<nuw><nsw><%loop.header>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-NEXT: Inst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: Expr: {4611686018427387904,+,32}<%loop.header>
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: {4611686018427387904,+,32}<%loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1
+; CHECK-NEXT: da analyze - none!
+; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: da analyze - confused!
+; CHECK-NEXT: Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+; DISABLE-CHECK-LABEL: 'f'
+; DISABLE-CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1
+; DISABLE-CHECK-NEXT: da analyze - none!
+; DISABLE-CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; DISABLE-CHECK-NEXT: da analyze - none!
+; DISABLE-CHECK-NEXT: Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; DISABLE-CHECK-NEXT: da analyze - none!
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %loop.latch ]
+ %idx.0 = getelementptr inbounds i8, ptr %A, i64 %i
+ store i8 1, ptr %idx.0
+ %offset.tmp = mul i64 %i, 32
+ %offset = add i64 %offset.tmp, 4611686018427387904 ; 1ULL << 62
+ %if.cond = icmp sge i64 %offset, 0
+ br i1 %if.cond, label %if.then, label %loop.latch
+
+if.then:
+ %idx.1 = getelementptr inbounds i8, ptr %A, i64 %offset
+ store i8 2, ptr %idx.1
+ br label %loop.latch
+
+loop.latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %exit.cond = icmp eq i64 %i.next, 1152921504606846976 ; 1ULL << 60
+ br i1 %exit.cond, label %exit, label %loop.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll
index 3c3748a..1964fca 100644
--- a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll
+++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll
@@ -104,3 +104,56 @@ exit:
}
declare void @clobber()
+
+
+declare void @clobber.i32(i32)
+
+define void @test_guards_across_loops(i32 %N) {
+; CHECK-LABEL: 'test_guards_across_loops'
+; CHECK-NEXT: Classifying expressions for: @test_guards_across_loops
+; CHECK-NEXT: %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1 ]
+; CHECK-NEXT: --> {0,+,1}<%loop.1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop.1: Computable }
+; CHECK-NEXT: %iv.1.next = add i32 %iv.1, 1
+; CHECK-NEXT: --> {1,+,1}<%loop.1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop.1: Computable }
+; CHECK-NEXT: %iv.2 = phi i32 [ 0, %loop.1 ], [ %iv.2.next, %loop.2 ]
+; CHECK-NEXT: --> {0,+,1}<nuw><%loop.2> U: full-set S: full-set Exits: (1 + %N) LoopDispositions: { %loop.2: Computable }
+; CHECK-NEXT: %iv.2.next = add i32 %iv.2, 1
+; CHECK-NEXT: --> {1,+,1}<nw><%loop.2> U: full-set S: full-set Exits: (2 + %N) LoopDispositions: { %loop.2: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test_guards_across_loops
+; CHECK-NEXT: Loop %loop.2: backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw>
+; CHECK-NEXT: Loop %loop.2: constant max backedge-taken count is i64 4294967296
+; CHECK-NEXT: Loop %loop.2: symbolic max backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw>
+; CHECK-NEXT: Loop %loop.2: Trip multiple is 1
+; CHECK-NEXT: Loop %loop.1: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop.1: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop.1: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %loop.1: Predicated backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw>
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,1}<%loop.1> Added Flags: <nusw>
+; CHECK-NEXT: Loop %loop.1: Predicated constant max backedge-taken count is i64 4294967296
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,1}<%loop.1> Added Flags: <nusw>
+; CHECK-NEXT: Loop %loop.1: Predicated symbolic max backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw>
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,1}<%loop.1> Added Flags: <nusw>
+;
+entry:
+ br label %loop.1
+
+loop.1:
+ %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1 ]
+ call void @clobber.i32(i32 %iv.1)
+ %ec.1 = icmp ugt i32 %iv.1, %N
+ %iv.1.next = add i32 %iv.1, 1
+ br i1 %ec.1, label %loop.2, label %loop.1
+
+loop.2:
+ %iv.2 = phi i32 [ 0, %loop.1 ], [ %iv.2.next, %loop.2 ]
+ call void @clobber.i32(i32 %iv.2)
+ %ec.2 = icmp ugt i32 %iv.2, %N
+ %iv.2.next = add i32 %iv.2, 1
+ br i1 %ec.2, label %exit, label %loop.2
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
index a477465c..f35c48b 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
@@ -717,5 +717,46 @@ exit:
ret void
}
+define void @test_urem_non_constant(ptr %dst, i32 %a, i32 %b) {
+; CHECK-LABEL: 'test_urem_non_constant'
+; CHECK-NEXT: Classifying expressions for: @test_urem_non_constant
+; CHECK-NEXT: %rem = urem i32 %a, %b
+; CHECK-NEXT: --> ((-1 * (%a /u %b) * %b) + %a) U: full-set S: full-set
+; CHECK-NEXT: %and.0 = and i1 %pre.0, %pre.1
+; CHECK-NEXT: --> (%pre.1 umin %pre.0) U: full-set S: full-set
+; CHECK-NEXT: %and.1 = and i1 %and.0, %pre.2
+; CHECK-NEXT: --> (%pre.1 umin %pre.2 umin %pre.0) U: full-set S: full-set
+; CHECK-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {0,+,%b}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %b
+; CHECK-NEXT: --> ((sext i32 %b to i64) + %dst) U: full-set S: full-set Exits: ((sext i32 %b to i64) + %dst) LoopDispositions: { %loop: Invariant }
+; CHECK-NEXT: %iv.next = add i32 %iv, %b
+; CHECK-NEXT: --> {%b,+,%b}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test_urem_non_constant
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+ %rem = urem i32 %a, %b
+ %pre.0 = icmp eq i32 %rem, 0
+ %pre.1 = icmp ne i32 %a, 0
+ %pre.2 = icmp ne i32 %b, 0
+ %and.0 = and i1 %pre.0, %pre.1
+ %and.1 = and i1 %and.0, %pre.2
+ br i1 %and.1, label %loop, label %exit
+
+loop:
+ %iv = phi i32 [ 0, %entry], [ %iv.next, %loop ]
+ %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %b
+ store i8 0, ptr %gep.dst
+ %iv.next = add i32 %iv, %b
+ %ec = icmp ne i32 %iv.next, %a
+ br i1 %ec, label %loop, label %exit
+
+exit:
+ ret void
+}
+
declare void @llvm.assume(i1)
declare void @llvm.experimental.guard(i1, ...)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index b215c51..0933e67 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -1371,11 +1371,10 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-SD-NEXT: lsr x9, x0, #16
; CHECK-SD-NEXT: adrp x8, .LCPI14_0
; CHECK-SD-NEXT: dup v4.8h, w0
-; CHECK-SD-NEXT: dup v1.8h, w9
-; CHECK-SD-NEXT: fmov s3, w9
-; CHECK-SD-NEXT: sqneg v2.8h, v1.8h
-; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
-; CHECK-SD-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b
+; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT: dup v2.8h, w9
+; CHECK-SD-NEXT: sqneg v1.8h, v2.8h
+; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
; CHECK-SD-NEXT: rev32 v2.8h, v0.8h
; CHECK-SD-NEXT: sqdmull v3.4s, v0.4h, v4.4h
; CHECK-SD-NEXT: sqdmull2 v0.4s, v0.8h, v4.8h
diff --git a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
index 284d624..f34d3ed 100644
--- a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
+++ b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
@@ -1,16 +1,12 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s
-# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
-# RUN: | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcz-gpr32,-zcz-gpr64" %s \
+# RUN: | FileCheck --check-prefix=CHECK-NOZCZ-GPR32-NOZCZ-GPR64 %s
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcz-gpr32,-zcz-gpr64" %s \
+# RUN: | FileCheck --check-prefix=CHECK-ZCZ-GPR32-NOZCZ-GPR64 %s
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcz-gpr32,+zcz-gpr64" %s \
+# RUN: | FileCheck --check-prefix=CHECK-NOZCZ-GPR32-ZCZ-GPR64 %s
+# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcz-gpr32,+zcz-gpr64" %s \
+# RUN: | FileCheck --check-prefix=CHECK-ZCZ-GPR32-ZCZ-GPR64 %s
--- |
define void @f0(i64 noundef %x) { ret void }
@@ -24,41 +20,29 @@ liveins:
body: |
bb.0:
liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f0
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: $w0 = ORRWrr $wzr, $wzr
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f0
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: $w0 = MOVZWi 0, 0
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, undef $xzr, implicit $wzr
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-LABEL: name: f0
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: $w0 = ORRWrr $wzr, $wzr
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f0
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
- ;
- ; CHECK-NO-ZCM-ZCZ-LABEL: name: f0
- ; CHECK-NO-ZCM-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-ZCZ-NEXT: $w0 = MOVZWi 0, 0
- ; CHECK-NO-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
- ;
- ; CHECK-ZCM-ZCZ-LABEL: name: f0
- ; CHECK-ZCM-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-ZCZ-NEXT: $w0 = MOVZWi 0, 0
- ; CHECK-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-LABEL: name: f0
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: $w0 = MOVZWi 0, 0
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
$w0 = COPY $wzr
BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
...
@@ -69,41 +53,29 @@ liveins:
body: |
bb.0:
liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
- ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
- ;
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
- ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
- ;
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
- ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f1
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: $x0 = ORRXrr $xzr, $xzr
+ ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f1
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr
- ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f1
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: $x0 = ORRXrr $xzr, $xzr
+ ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-NO-ZCM-ZCZ-LABEL: name: f1
- ; CHECK-NO-ZCM-ZCZ: liveins: $x0, $lr
- ; CHECK-NO-ZCM-ZCZ-NEXT: {{ $}}
- ; CHECK-NO-ZCM-ZCZ-NEXT: $x0 = MOVZXi 0, 0
- ; CHECK-NO-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-LABEL: name: f1
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: $x0 = MOVZXi 0, 0
+ ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
;
- ; CHECK-ZCM-ZCZ-LABEL: name: f1
- ; CHECK-ZCM-ZCZ: liveins: $x0, $lr
- ; CHECK-ZCM-ZCZ-NEXT: {{ $}}
- ; CHECK-ZCM-ZCZ-NEXT: $x0 = MOVZXi 0, 0
- ; CHECK-ZCM-ZCZ-NEXT:BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-LABEL: name: f1
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}}
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: $x0 = MOVZXi 0, 0
+ ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
$x0 = COPY $xzr
BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0
...
diff --git a/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
index 97a7741..849323f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
@@ -421,3 +421,83 @@ for.body51: ; preds = %is_sbox.exit155
unreachable
}
declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, ptr nocapture) nounwind ssp
+
+; CHECK-LABEL: fold_imm1_csinc_32:
+; CHECK: cmp w0, w1
+; CHECK-NEXT: csinc w0, w2, wzr, ge
+; CHECK-NEXT: ret
+define i32 @fold_imm1_csinc_32(i32 %x, i32 %y, i32 %n) nounwind ssp {
+entry:
+ %cmp = icmp slt i32 %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %exit
+
+if.else:
+ br label %exit
+
+exit:
+ %result = phi i32 [ 1, %if.then ], [ %n, %if.else ]
+ ret i32 %result
+}
+
+; CHECK-LABEL: fold_imm1_csinc_64:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csinc x0, x2, xzr, ge
+; CHECK-NEXT: ret
+define i64 @fold_imm1_csinc_64(i64 %x, i64 %y, i64 %n) nounwind ssp {
+entry:
+ %cmp = icmp slt i64 %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %exit
+
+if.else:
+ br label %exit
+
+exit:
+ %result = phi i64 [ 1, %if.then ], [ %n, %if.else ]
+ ret i64 %result
+}
+
+; CHECK-LABEL: fold_imm1_cset_32:
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+define i32 @fold_imm1_cset_32(i32 %x, i32 %y) nounwind ssp {
+entry:
+ %cmp = icmp slt i32 %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %exit
+
+if.else:
+ br label %exit
+
+exit:
+ %result = phi i32 [ 1, %if.then ], [ 0, %if.else ]
+ ret i32 %result
+}
+
+; CHECK-LABEL: fold_imm1_cset_64:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: cset x0, lt
+; CHECK-NEXT: ret
+define i64 @fold_imm1_cset_64(i64 %x, i64 %y) nounwind ssp {
+entry:
+ %cmp = icmp slt i64 %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %exit
+
+if.else:
+ br label %exit
+
+exit:
+ %result = phi i64 [ 1, %if.then ], [ 0, %if.else ]
+ ret i64 %result
+}
diff --git a/llvm/test/CodeGen/AArch64/peephole-csel.ll b/llvm/test/CodeGen/AArch64/peephole-csel.ll
index 868b9f1..b0852580 100644
--- a/llvm/test/CodeGen/AArch64/peephole-csel.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-csel.ll
@@ -5,10 +5,9 @@ define void @peephole_csel(ptr %dst, i1 %0, i1 %cmp) {
; CHECK-LABEL: peephole_csel:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: tst w2, #0x1
-; CHECK-NEXT: mov w8, #1 // =0x1
-; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: tst w1, #0x1
-; CHECK-NEXT: csel x8, x8, x9, eq
+; CHECK-NEXT: csinc x8, x8, xzr, ne
; CHECK-NEXT: str x8, [x0]
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/rax1.ll b/llvm/test/CodeGen/AArch64/rax1.ll
index f679e90..4bb551f 100644
--- a/llvm/test/CodeGen/AArch64/rax1.ll
+++ b/llvm/test/CodeGen/AArch64/rax1.ll
@@ -1,22 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefixes=SHA3,SHA3-SD %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefixes=NOSHA3,NOSHA3-SD %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s -global-isel | FileCheck --check-prefixes=SHA3,SHA3-GI %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s -global-isel | FileCheck --check-prefixes=NOSHA3,NOSHA3-GI %s
define <2 x i64> @rax1(<2 x i64> %x, <2 x i64> %y) {
-; SHA3-LABEL: rax1:
-; SHA3: // %bb.0:
-; SHA3-NEXT: rax1 v0.2d, v0.2d, v1.2d
-; SHA3-NEXT: ret
+; SHA3-SD-LABEL: rax1:
+; SHA3-SD: // %bb.0:
+; SHA3-SD-NEXT: rax1 v0.2d, v0.2d, v1.2d
+; SHA3-SD-NEXT: ret
;
-; NOSHA3-LABEL: rax1:
-; NOSHA3: // %bb.0:
-; NOSHA3-NEXT: add v2.2d, v1.2d, v1.2d
-; NOSHA3-NEXT: usra v2.2d, v1.2d, #63
-; NOSHA3-NEXT: eor v0.16b, v0.16b, v2.16b
-; NOSHA3-NEXT: ret
+; NOSHA3-SD-LABEL: rax1:
+; NOSHA3-SD: // %bb.0:
+; NOSHA3-SD-NEXT: add v2.2d, v1.2d, v1.2d
+; NOSHA3-SD-NEXT: usra v2.2d, v1.2d, #63
+; NOSHA3-SD-NEXT: eor v0.16b, v0.16b, v2.16b
+; NOSHA3-SD-NEXT: ret
+;
+; SHA3-GI-LABEL: rax1:
+; SHA3-GI: // %bb.0:
+; SHA3-GI-NEXT: shl v2.2d, v1.2d, #1
+; SHA3-GI-NEXT: ushr v1.2d, v1.2d, #63
+; SHA3-GI-NEXT: orr v1.16b, v2.16b, v1.16b
+; SHA3-GI-NEXT: eor v0.16b, v0.16b, v1.16b
+; SHA3-GI-NEXT: ret
+;
+; NOSHA3-GI-LABEL: rax1:
+; NOSHA3-GI: // %bb.0:
+; NOSHA3-GI-NEXT: shl v2.2d, v1.2d, #1
+; NOSHA3-GI-NEXT: ushr v1.2d, v1.2d, #63
+; NOSHA3-GI-NEXT: orr v1.16b, v2.16b, v1.16b
+; NOSHA3-GI-NEXT: eor v0.16b, v0.16b, v1.16b
+; NOSHA3-GI-NEXT: ret
%a = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %y, <2 x i64> %y, <2 x i64> <i64 1, i64 1>)
%b = xor <2 x i64> %x, %a
ret <2 x i64> %b
}
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; NOSHA3: {{.*}}
+; SHA3: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 002c03aa..e86f747 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -551,7 +551,9 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_release
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_release
@@ -562,6 +564,8 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_release
; GFX11CU: bb.0.entry:
+ ; GFX11CU-NEXT: S_WAITCNT_soft 1015
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") release
@@ -587,7 +591,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
@@ -599,6 +605,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_acq_rel
; GFX11CU: bb.0.entry:
+ ; GFX11CU-NEXT: S_WAITCNT_soft 1015
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") acq_rel
@@ -624,7 +632,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
@@ -636,6 +646,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_seq_cst
; GFX11CU: bb.0.entry:
+ ; GFX11CU-NEXT: S_WAITCNT_soft 1015
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") seq_cst
@@ -1305,8 +1317,9 @@ define amdgpu_kernel void @workgroup_release() #0 {
;
; GFX10CU-LABEL: name: workgroup_release
; GFX10CU: bb.0.entry:
- ; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_release
@@ -1317,7 +1330,8 @@ define amdgpu_kernel void @workgroup_release() #0 {
;
; GFX11CU-LABEL: name: workgroup_release
; GFX11CU: bb.0.entry:
- ; GFX11CU-NEXT: S_WAITCNT_soft 64519
+ ; GFX11CU-NEXT: S_WAITCNT_soft 7
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") release
@@ -1345,8 +1359,9 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
;
; GFX10CU-LABEL: name: workgroup_acq_rel
; GFX10CU: bb.0.entry:
- ; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_acq_rel
@@ -1358,7 +1373,8 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
;
; GFX11CU-LABEL: name: workgroup_acq_rel
; GFX11CU: bb.0.entry:
- ; GFX11CU-NEXT: S_WAITCNT_soft 64519
+ ; GFX11CU-NEXT: S_WAITCNT_soft 7
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") acq_rel
@@ -1386,8 +1402,9 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
;
; GFX10CU-LABEL: name: workgroup_seq_cst
; GFX10CU: bb.0.entry:
- ; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
+ ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_seq_cst
@@ -1399,7 +1416,8 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
;
; GFX11CU-LABEL: name: workgroup_seq_cst
; GFX11CU: bb.0.entry:
- ; GFX11CU-NEXT: S_WAITCNT_soft 64519
+ ; GFX11CU-NEXT: S_WAITCNT_soft 7
+ ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 53b2542..142290a 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -3611,10 +3611,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
+; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc
-; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6
; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3627,12 +3627,12 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
-; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3652,6 +3652,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -3671,6 +3672,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index b91963f..d23509b 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -150,7 +150,6 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX10CU-NEXT: v_mov_b32_e32 v0, s13
; GFX10CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10CU-NEXT: s_barrier
; GFX10CU-NEXT: ds_read_b32 v0, v0
; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
index 5d03dfb..352af04 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
@@ -218,3 +218,174 @@ main_body:
ret void
}
+define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b32 m0, s0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:256 lds
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile:
+; GFX942: ; %bb.0: ; %main_body
+; GFX942-NEXT: s_mov_b32 m0, s0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:256
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 m0, s0
+; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v[0:1], off offset:256 lds
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile:
+; GFX942-GISEL: ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s0
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:256
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: buffer_load_lds_dword_volatile:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b32 m0, s5
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: buffer_load_lds_dword_volatile:
+; GFX942: ; %bb.0: ; %main_body
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT: s_mov_b32 m0, s5
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_load_lds_dword_volatile:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_mov_b32 m0, s5
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: buffer_load_lds_dword_volatile:
+; GFX942-GISEL: ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+main_body:
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_nontemporal:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v0, s[0:1] glc slc lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_nontemporal:
+; GFX942: ; %bb.0: ; %main_body
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v0, s[0:1] nt
+; GFX942-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_nontemporal:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v0, s[0:1] slc lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: global_load_lds_dword v0, s[0:1] nt
+; GFX942-GISEL-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_nontemporal(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b32 m0, s5
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX942: ; %bb.0: ; %main_body
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT: s_mov_b32 m0, s5
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_mov_b32 m0, s5
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT: s_endpgm
+main_body:
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+ ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
index f0204bd..1dcd032 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
@@ -110,3 +110,43 @@ main_body:
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
ret void
}
+
+define amdgpu_ps float @buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_volatile:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 glc lds
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:256 lds
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:512 lds
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+main_body:
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 256, i32 0)
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 512, i32 0)
+ %res = load float, ptr addrspace(3) %lds
+ ret float %res
+}
+
+define amdgpu_ps float @buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_nontemporal:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 glc slc lds
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+main_body:
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0), !nontemporal !0
+ %res = load float, ptr addrspace(3) %lds
+ ret float %res
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index b5d741b..a979999 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -80,25 +80,29 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(p
; PREGFX10-LABEL: buffer_load_volatile:
; PREGFX10: ; %bb.0: ; %main_body
; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; PREGFX10-NEXT: s_waitcnt vmcnt(0)
; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
+; PREGFX10-NEXT: s_waitcnt vmcnt(0)
; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
; PREGFX10-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_volatile:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_volatile:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index e8b8d05..e8eccb0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -147,14 +147,13 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0
; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-OPT-NEXT: s_barrier
-; GFX8-OPT-NEXT: v_add_u32_e32 v1, vcc, v1, v1
-; GFX8-OPT-NEXT: s_nop 1
-; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-OPT-NEXT: v_add_u32_e32 v4, vcc, v1, v1
+; GFX8-OPT-NEXT: v_mov_b32_e32 v3, s1
; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v4 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-OPT-NEXT: s_barrier
; GFX8-OPT-NEXT: flat_store_dword v[0:1], v2
; GFX8-OPT-NEXT: s_endpgm
;
@@ -194,14 +193,14 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_barrier
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v1
-; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v1, v1
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX10-NEXT: s_barrier
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
@@ -213,15 +212,15 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_barrier
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v1, v1
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-NEXT: s_barrier
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
index 97db15b..1d1d3e4 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
@@ -107,6 +107,7 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
@@ -120,7 +121,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 9dac239..1e4b633 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -354,6 +354,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-SDAG-NEXT: s_mov_b32 s5, s10
@@ -365,8 +366,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-SDAG-NEXT: s_mov_b32 s3, s10
; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -384,6 +385,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
@@ -395,8 +397,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX942-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -414,6 +416,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-SDAG-NEXT: s_mov_b32 s5, s12
@@ -425,8 +428,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -444,6 +447,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
@@ -455,8 +459,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -475,6 +479,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
@@ -488,8 +493,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-SDAG-NEXT: s_mov_b32 s2, s1
; GFX10-SDAG-NEXT: s_mov_b32 s3, s10
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -508,6 +513,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5]
; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
@@ -519,8 +525,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -541,6 +547,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
@@ -554,8 +561,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -576,6 +583,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
@@ -588,8 +596,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -610,6 +618,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
@@ -623,8 +632,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -645,6 +654,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
@@ -657,8 +667,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr addrspace(7) %in, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
index 516c3946..282a7ae 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
@@ -15,7 +15,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX10-CU-LABEL: test_s_barrier:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
@@ -26,7 +25,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX11-CU-LABEL: test_s_barrier:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
@@ -38,7 +36,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX12-CU-LABEL: test_s_barrier:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
@@ -63,8 +60,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX10-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
@@ -77,8 +74,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX11-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
@@ -94,8 +91,10 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX12-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: s_wait_alu 0xffe3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
@@ -125,7 +124,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
@@ -140,7 +138,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
@@ -160,7 +157,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 6a76f43..7efbff9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -107,6 +107,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX10-CU-LABEL: workgroup_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
@@ -139,6 +141,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX11-CU-LABEL: workgroup_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_release_fence:
@@ -151,6 +155,10 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX12-CU-LABEL: workgroup_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_release_fence:
@@ -181,6 +189,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
@@ -216,6 +226,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
@@ -229,6 +241,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_acq_rel_fence:
@@ -259,6 +275,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
@@ -294,6 +312,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
@@ -307,6 +327,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_seq_cst_fence:
@@ -412,6 +436,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
@@ -444,6 +470,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
@@ -456,6 +484,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_release_fence:
@@ -486,6 +518,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
@@ -521,6 +555,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
@@ -534,6 +570,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
@@ -564,6 +604,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
@@ -599,6 +641,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
@@ -612,6 +656,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index d288bfc..1cca64a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1093,7 +1093,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX10-CU-LABEL: workgroup_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
@@ -1129,7 +1130,8 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX11-CU-LABEL: workgroup_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_release_fence:
@@ -1142,7 +1144,10 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX12-CU-LABEL: workgroup_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_release_fence:
@@ -1175,7 +1180,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
@@ -1214,7 +1220,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
@@ -1228,7 +1235,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_acq_rel_fence:
@@ -1261,7 +1271,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
@@ -1300,7 +1311,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
@@ -1314,7 +1326,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_seq_cst_fence:
@@ -1420,6 +1435,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
@@ -1452,6 +1469,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
@@ -1464,6 +1483,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_release_fence:
@@ -1494,6 +1517,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1529,6 +1554,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1542,6 +1569,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
@@ -1572,6 +1603,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
@@ -1607,6 +1640,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
@@ -1620,6 +1655,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index d277441..2afa577 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -1072,7 +1072,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1109,7 +1110,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -1136,7 +1138,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 3826953..d384aec 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -656,12 +656,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -765,12 +765,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -800,12 +800,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -1193,7 +1195,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1278,7 +1281,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -1305,7 +1309,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -1372,7 +1379,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1457,7 +1465,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -1484,7 +1493,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -1891,7 +1903,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1976,7 +1989,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -2003,7 +2017,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -2074,9 +2091,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2170,9 +2189,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2200,9 +2221,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw:
@@ -2273,9 +2297,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2369,9 +2395,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2399,9 +2427,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw:
@@ -2697,12 +2728,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2813,12 +2844,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -2850,12 +2881,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -2935,12 +2968,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3051,12 +3084,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -3088,12 +3121,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -3731,7 +3766,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -3854,7 +3890,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3889,7 +3926,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -4007,9 +4047,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4141,9 +4183,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4179,9 +4223,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4299,9 +4346,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4433,9 +4482,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4471,9 +4522,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
@@ -5137,9 +5191,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5271,9 +5327,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5309,9 +5367,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg:
@@ -5429,9 +5490,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5563,9 +5626,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5601,9 +5666,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
@@ -5721,9 +5789,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -5855,9 +5925,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -5893,9 +5965,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
@@ -6013,9 +6088,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6147,9 +6224,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6185,9 +6264,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6923,7 +7005,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -7070,7 +7153,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -7113,7 +7197,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -7245,12 +7332,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -7399,12 +7486,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -7444,12 +7531,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -7577,12 +7666,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -7731,12 +7820,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -7776,12 +7865,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -8535,12 +8626,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -8689,12 +8780,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -8734,12 +8825,14 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -8867,12 +8960,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -9021,12 +9114,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -9066,12 +9159,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -9199,12 +9294,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -9353,12 +9448,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -9398,12 +9493,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -9531,7 +9628,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9685,7 +9783,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9730,7 +9829,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9863,7 +9965,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -10017,7 +10120,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -10062,7 +10166,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -10195,12 +10302,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -10349,12 +10456,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -10394,12 +10501,14 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -10527,12 +10636,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -10681,12 +10790,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -10726,12 +10835,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -10859,12 +10970,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -11013,12 +11124,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -11058,12 +11169,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -11732,10 +11845,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -11834,10 +11950,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -11868,10 +11987,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -12258,6 +12382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -12339,6 +12465,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -12365,6 +12493,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -12430,6 +12562,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -12511,6 +12645,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -12537,6 +12673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -12933,6 +13073,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -13014,6 +13156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -13040,6 +13184,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -13107,7 +13255,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13194,7 +13345,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13222,7 +13376,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
@@ -13290,7 +13449,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13377,7 +13539,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13405,7 +13570,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
@@ -13696,10 +13866,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -13805,10 +13978,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -13841,10 +14017,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -13923,10 +14104,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -14032,10 +14216,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -14068,10 +14255,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -14699,6 +14891,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -14818,6 +15012,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -14852,6 +15048,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -14966,7 +15166,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15091,7 +15294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15127,7 +15333,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15242,7 +15453,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15367,7 +15581,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15403,7 +15620,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -16046,7 +16268,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16171,7 +16396,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16207,7 +16435,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
@@ -16322,7 +16555,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16447,7 +16683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16483,7 +16722,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16598,7 +16842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16723,7 +16970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16759,7 +17009,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16874,6 +17129,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -16999,6 +17256,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17035,6 +17294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17150,6 +17413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -17275,6 +17540,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17311,6 +17578,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17426,7 +17697,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17551,7 +17825,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17587,7 +17864,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17702,7 +17984,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17827,7 +18112,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17863,7 +18151,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17978,7 +18271,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18103,7 +18399,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18139,7 +18438,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18870,6 +19174,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -19013,6 +19319,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -19055,6 +19363,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -19185,10 +19497,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -19332,10 +19647,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -19376,10 +19694,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -19506,10 +19829,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -19653,10 +19979,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -19697,10 +20026,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -20445,10 +20779,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -20592,10 +20929,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -20636,10 +20976,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -20766,10 +21111,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -20913,10 +21261,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -20957,10 +21308,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -21087,10 +21443,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -21234,10 +21593,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -21278,10 +21640,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -21408,6 +21775,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -21555,6 +21924,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -21599,6 +21970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -21729,6 +22104,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
@@ -21876,6 +22253,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -21920,6 +22299,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
@@ -22050,10 +22433,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -22197,10 +22583,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -22241,10 +22630,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -22371,10 +22765,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -22518,10 +22915,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -22562,10 +22962,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
@@ -22692,10 +23097,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -22839,10 +23247,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
@@ -22883,10 +23294,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 3bf5ed8..c326edf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -959,7 +959,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1001,7 +1002,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -1026,7 +1028,10 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index b755c5d..868b438 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -667,7 +667,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -763,7 +764,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -790,7 +792,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -1204,7 +1209,8 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1290,7 +1296,8 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -1315,7 +1322,10 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -1391,7 +1401,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1477,7 +1488,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -1502,7 +1514,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -1918,7 +1933,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -2003,7 +2019,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -2028,7 +2045,10 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -2105,8 +2125,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2196,8 +2218,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2223,8 +2247,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw:
@@ -2301,8 +2329,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2392,8 +2422,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2419,8 +2451,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw:
@@ -2705,7 +2741,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -2807,7 +2844,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -2837,7 +2875,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -2926,7 +2967,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -3028,7 +3070,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -3058,7 +3101,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -3644,7 +3690,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3758,7 +3805,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3791,7 +3839,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3900,8 +3951,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4020,8 +4073,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4055,8 +4110,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -4165,8 +4224,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4285,8 +4346,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4320,8 +4383,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4920,8 +4987,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5040,8 +5109,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5075,8 +5146,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -5185,8 +5260,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5305,8 +5382,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5340,8 +5419,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5450,8 +5533,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5570,8 +5655,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5605,8 +5692,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5715,7 +5806,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -5835,7 +5927,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -5870,7 +5963,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -5980,7 +6076,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -6100,7 +6197,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -6135,7 +6233,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -6245,8 +6346,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6365,8 +6468,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6400,8 +6505,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6510,8 +6619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6630,8 +6741,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6665,8 +6778,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6775,8 +6892,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6895,8 +7014,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6930,8 +7051,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -7588,7 +7713,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -7717,7 +7843,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -7754,7 +7881,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -7877,7 +8007,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -8009,7 +8140,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -8047,7 +8179,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -8170,7 +8305,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -8302,7 +8438,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -8340,7 +8477,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9009,7 +9149,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -9141,7 +9282,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9179,7 +9321,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9302,7 +9447,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -9434,7 +9580,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9472,7 +9619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9595,7 +9745,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -9727,7 +9878,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9765,7 +9917,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -9888,7 +10043,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10020,7 +10176,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10058,7 +10215,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10181,7 +10341,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10313,7 +10474,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10351,7 +10513,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10474,7 +10639,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10606,7 +10772,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10644,7 +10811,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10767,7 +10937,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -10899,7 +11070,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -10937,7 +11109,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -11060,7 +11235,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -11192,7 +11368,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -11230,7 +11407,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -11914,7 +12094,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -12009,7 +12190,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -12036,6 +12218,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -12447,6 +12633,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12529,6 +12717,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -12553,6 +12743,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -12626,6 +12820,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12708,6 +12904,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -12732,6 +12930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -13145,6 +13347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
@@ -13226,6 +13430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
@@ -13250,6 +13456,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
@@ -13324,7 +13534,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13411,7 +13624,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13437,7 +13653,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
@@ -13512,7 +13733,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13599,7 +13823,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13625,7 +13852,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
@@ -13908,6 +14140,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -14006,6 +14240,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14035,6 +14271,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14121,6 +14361,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -14219,6 +14461,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14248,6 +14492,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -14831,6 +15079,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -14941,6 +15191,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -14973,6 +15225,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -15079,7 +15335,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15195,7 +15454,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15229,7 +15491,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -15336,7 +15603,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15452,7 +15722,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -15486,7 +15759,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -16083,7 +16361,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16199,7 +16480,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16233,7 +16517,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -16340,7 +16629,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16456,7 +16748,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16490,7 +16785,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -16597,7 +16897,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16713,7 +17016,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16747,7 +17053,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16854,6 +17165,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -16970,6 +17283,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17004,6 +17319,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17111,6 +17430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -17227,6 +17548,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -17261,6 +17584,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -17368,7 +17695,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17484,7 +17814,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17518,7 +17851,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -17625,7 +17963,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17741,7 +18082,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17775,7 +18119,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17882,7 +18231,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -17998,7 +18350,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18032,7 +18387,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -18687,6 +19047,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -18812,6 +19174,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -18848,6 +19212,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -18968,6 +19336,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -19096,6 +19466,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -19133,6 +19505,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -19253,6 +19629,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -19381,6 +19759,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -19418,6 +19798,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20084,6 +20468,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -20212,6 +20598,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20249,6 +20637,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20369,6 +20761,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -20497,6 +20891,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20534,6 +20930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20654,6 +21054,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -20782,6 +21184,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20819,6 +21223,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -20939,6 +21347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21067,6 +21477,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21104,6 +21516,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21224,6 +21640,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21352,6 +21770,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21389,6 +21809,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21509,6 +21933,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21637,6 +22063,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21674,6 +22102,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21794,6 +22226,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -21922,6 +22356,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -21959,6 +22395,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -22079,6 +22519,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -22207,6 +22649,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -22244,6 +22688,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
new file mode 100644
index 0000000..0b40c81
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
@@ -0,0 +1,542 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -global-isel=0 < %s | FileCheck --check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=0 < %s | FileCheck --check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=1 < %s | FileCheck --check-prefixes=GFX942-GISEL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -global-isel=0 < %s | FileCheck --check-prefixes=GFX10 %s
+
+define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: global_load_dword v[0:1], off glc slc lds
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: global_load_lds_dword v[0:1], off nt
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off nt
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off slc lds
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_volatile:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_volatile:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_volatile:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: global_load_dword v[0:1], off glc slc lds
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: global_load_lds_dword v[0:1], off nt
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off nt
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off slc lds
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_volatile:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b32 m0, s5
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_volatile:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT: s_mov_b32 m0, s5
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_volatile:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_mov_b32 m0, s5
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b32 m0, s5
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT: s_mov_b32 m0, s5
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_mov_b32 m0, s5
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_volatile_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b32 m0, s5
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT: s_mov_b32 m0, s5
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_mov_b32 m0, s5
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b32 m0, s4
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_mov_b32 m0, s4
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 m0, s4
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648)
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b32 m0, s4
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_mov_b32 m0, s4
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 m0, s4
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 0), !nontemporal !0
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b32 m0, s4
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_mov_b32 m0, s4
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 m0, s4
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648), !nontemporal !0
+ call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+ ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 986b48b..712109d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
index 8926893..6d1e4e6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 81bbe0a..577d2ca 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_system_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_system_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_system_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 980141a..d686e7a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -819,7 +819,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -854,7 +855,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -879,7 +881,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 6a233a2..ab4d783 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_endpgm
;
@@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
@@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
@@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
@@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
@@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
@@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
new file mode 100644
index 0000000..93f7bcc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes=postmisched -o - %s | FileCheck %s
+
+# Ensure WMMA operations stay before the final atomic fence and barrier group.
+# This allows the latency of the WMMA operations to be hidden by barrier wait.
+---
+name: test
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32
+
+ ; CHECK-LABEL: name: test
+ ; CHECK: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: ATOMIC_FENCE 5, 2
+ ; CHECK-NEXT: S_BARRIER
+ ; CHECK-NEXT: ATOMIC_FENCE 4, 2
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def $vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def $vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def $vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit killed $vgpr36, implicit $exec, implicit killed $vgpr37, implicit killed $vgpr38 {
+ ; CHECK-NEXT: $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 killed $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 killed $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 killed $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc
+ ; CHECK-NEXT: $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec
+ ; CHECK-NEXT: $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec
+ ; CHECK-NEXT: S_CMP_LT_U32 killed $sgpr0, killed $sgpr12, implicit-def $scc
+ ; CHECK-NEXT: $sgpr0 = S_MOV_B32 killed $sgpr1
+ ; CHECK-NEXT: early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec
+ ; CHECK-NEXT: ATOMIC_FENCE 5, 2
+ ; CHECK-NEXT: S_BARRIER
+ ; CHECK-NEXT: ATOMIC_FENCE 4, 2
+ ATOMIC_FENCE 5, 2
+ S_BARRIER
+ ATOMIC_FENCE 4, 2
+ BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def $vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def $vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def $vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit $vgpr36, implicit $exec, implicit $vgpr37, implicit $vgpr38 {
+ $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3)
+ }
+ $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc
+ $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec
+ $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec
+ S_CMP_LT_U32 killed $sgpr0, $sgpr12, implicit-def $scc
+ early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec
+ early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec
+ early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec
+ early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec
+ early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec
+ early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec
+ early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec
+ early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 killed $sgpr1
+ ATOMIC_FENCE 5, 2
+ S_BARRIER
+ ATOMIC_FENCE 4, 2
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index f3cb5a7..30f5277 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -26,17 +26,17 @@ define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
; GFX9-LABEL: barrier_vmcnt_global:
; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v1, s[0:1]
-; GFX9-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: global_load_dword v3, v1, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_barrier
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -369,10 +369,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
; GFX8-NEXT: flat_load_dword v3, v[2:3]
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_barrier
; GFX8-NEXT: flat_store_dword v[0:1], v3
; GFX8-NEXT: s_endpgm
@@ -393,10 +392,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
; GFX9-NEXT: flat_load_dword v3, v[2:3]
; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_barrier
; GFX9-NEXT: flat_store_dword v[0:1], v3
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/BPF/BTF/variant-part.ll b/llvm/test/CodeGen/BPF/BTF/variant-part.ll
new file mode 100644
index 0000000..1071e61
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/variant-part.ll
@@ -0,0 +1,87 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; Source:
+; #![no_std]
+; #![no_main]
+;
+; pub enum MyEnum {
+; First { a: u32, b: i32 },
+; Second(u32),
+; }
+;
+; #[unsafe(no_mangle)]
+; pub static X: MyEnum = MyEnum::First { a: 54, b: -23 };
+;
+; #[cfg(not(test))]
+; #[panic_handler]
+; fn panic(_info: &core::panic::PanicInfo) -> ! {
+; loop {}
+; }
+; Compilation flag:
+; cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc
+; llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o variant-part.bc
+; llvm-dis variant-part.bc -o variant-part.ll
+
+; ModuleID = 'variant-part.bc'
+source_filename = "c0znihgkvro8hs0n88fgrtg6x"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel"
+
+@X = constant [12 x i8] c"\00\00\00\006\00\00\00\E9\FF\FF\FF", align 4, !dbg !0
+
+!llvm.module.flags = !{!22, !23, !24, !25}
+!llvm.ident = !{!26}
+!llvm.dbg.cu = !{!27}
+
+; CHECK-BTF: [1] STRUCT 'MyEnum' size=12 vlen=1
+; CHECK-BTF-NEXT: '(anon)' type_id=3 bits_offset=0
+; CHECK-BTF-NEXT: [2] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none)
+; CHECK-BTF-NEXT: [3] UNION '(anon)' size=12 vlen=3
+; CHECK-BTF-NEXT: '(anon)' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: 'First' type_id=4 bits_offset=0
+; CHECK-BTF-NEXT: 'Second' type_id=6 bits_offset=0
+; CHECK-BTF-NEXT: [4] STRUCT 'First' size=12 vlen=2
+; CHECK-BTF-NEXT: 'a' type_id=2 bits_offset=32
+; CHECK-BTF-NEXT: 'b' type_id=5 bits_offset=64
+; CHECK-BTF-NEXT: [5] INT 'i32' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [6] STRUCT 'Second' size=12 vlen=1
+; CHECK-BTF-NEXT: '__0' type_id=2 bits_offset=32
+; CHECK-BTF-NEXT: [7] VAR 'X' type_id=1, linkage=global
+; CHECK-BTF-NEXT: [8] DATASEC '.rodata' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=7 offset=0 size=12
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 10, type: !4, isLocal: false, isDefinition: true, align: 32)
+!2 = !DINamespace(name: "variant_part", scope: null)
+!3 = !DIFile(filename: "variant-part/src/main.rs", directory: "/tmp/variant-part", checksumkind: CSK_MD5, checksum: "b94cd53886ea8f14cbc116b36bc7dd36")
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyEnum", scope: !2, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !6, templateParams: !16, identifier: "faba668fd9f71e9b7cf3b9ac5e8b93cb")
+!5 = !DIFile(filename: "<unknown>", directory: "")
+!6 = !{!7}
+!7 = !DICompositeType(tag: DW_TAG_variant_part, scope: !4, file: !5, size: 96, align: 32, elements: !8, templateParams: !16, identifier: "e4aee046fc86d111657622fdcb8c42f7", discriminator: !21)
+!8 = !{!9, !17}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "First", scope: !7, file: !5, baseType: !10, size: 96, align: 32, extraData: i32 0)
+!10 = !DICompositeType(tag: DW_TAG_structure_type, name: "First", scope: !4, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !11, templateParams: !16, identifier: "cc7748c842e275452db4205b190c8ff7")
+!11 = !{!12, !14}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !10, file: !5, baseType: !13, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+!13 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !10, file: !5, baseType: !15, size: 32, align: 32, offset: 64, flags: DIFlagPublic)
+!15 = !DIBasicType(name: "i32", size: 32, encoding: DW_ATE_signed)
+!16 = !{}
+!17 = !DIDerivedType(tag: DW_TAG_member, name: "Second", scope: !7, file: !5, baseType: !18, size: 96, align: 32, extraData: i32 1)
+!18 = !DICompositeType(tag: DW_TAG_structure_type, name: "Second", scope: !4, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !19, templateParams: !16, identifier: "a2094b1381f3082d504fbd0903aa7c06")
+!19 = !{!20}
+!20 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !18, file: !5, baseType: !13, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+!21 = !DIDerivedType(tag: DW_TAG_member, scope: !4, file: !5, baseType: !13, size: 32, align: 32, flags: DIFlagArtificial)
+!22 = !{i32 8, !"PIC Level", i32 2}
+!23 = !{i32 7, !"PIE Level", i32 2}
+!24 = !{i32 7, !"Dwarf Version", i32 4}
+!25 = !{i32 2, !"Debug Info Version", i32 3}
+!26 = !{!"rustc version 1.91.0-nightly (160e7623e 2025-08-26)"}
+!27 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !28, producer: "clang LLVM (rustc version 1.91.0-nightly (160e7623e 2025-08-26))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !29, splitDebugInlining: false, nameTableKind: None)
+!28 = !DIFile(filename: "variant-part/src/main.rs/@/c0znihgkvro8hs0n88fgrtg6x", directory: "/tmp/variant-part")
+!29 = !{!0}
diff --git a/llvm/test/CodeGen/Hexagon/insert-big.ll b/llvm/test/CodeGen/Hexagon/insert-big.ll
new file mode 100644
index 0000000..8735a66
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/insert-big.ll
@@ -0,0 +1,47 @@
+; Check that llc does not abort, which happened due to incorrect MIR.
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=1 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=2 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=3 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=4 < %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=5 < %s
+
+; Look for this symptom, in case llc does not check invalid IR.
+; CHECK-NOT: insert(%14,%5,#5,#5)
+
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=1 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=2 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=3 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=4 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=5 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+define i32 @f(i32 %0, i32 %1, i32 %2) {
+entry:
+ switch i32 %0, label %common.ret1 [
+ i32 8907, label %3
+ i32 4115, label %6
+ ]
+
+common.ret1:
+ %common.ret1.op = phi i32 [ %5, %3 ], [ %526, %6 ], [ 0, %entry ]
+ ret i32 %common.ret1.op
+
+3:
+ %4 = shl i32 %2, 5
+ %5 = and i32 %4, 992
+ br label %common.ret1
+
+6:
+ %7 = shl i32 %0, 10
+ %8 = and i32 %7, 7168
+ %9 = shl i32 %0, 5
+ %10 = and i32 %9, 992
+ %11 = or i32 %10, %8
+ %12 = and i32 %0, 1
+ %13 = or i32 %11, %12
+ %14 = shl i32 %1, 1
+ %15 = and i32 %14, 2031616
+ %526 = or i32 %13, %15
+ br label %common.ret1
+}
diff --git a/llvm/test/CodeGen/Hexagon/qfp-conv.ll b/llvm/test/CodeGen/Hexagon/qfp-conv.ll
new file mode 100644
index 0000000..d2d393e
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-conv.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=hexagon -mattr=+hvxv68,+hvx,+hvx-length128b < %s | FileCheck %s
+
+; Test that the Qfloat optimization pass doesn't crash due to an invalid
+; instructions.
+
+; CHECK: v{{[0-9]+}}.hf = v{{[0-9]:[0-9]}}.qf32
+
+define void @test(
+ <32 x i32>* %optr,
+ <64 x i32> %in64,
+ <32 x i32> %va,
+ <32 x i32> %vb
+) local_unnamed_addr #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %optr.068 = phi <32 x i32>* [ %optr, %entry ], [ %incdec.ptr6, %for.body ]
+ %0 = tail call <32 x i32> @llvm.hexagon.V6.vconv.hf.qf32.128B(<64 x i32> %in64) #2
+ %1 = tail call <32 x i32> @llvm.hexagon.V6.vdealh.128B(<32 x i32> %0) #2
+ %2 = tail call <128 x i1> @llvm.hexagon.V6.vgth.128B(<32 x i32> %va, <32 x i32> %1) #2
+ %3 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<128 x i1> %2, <32 x i32> %va, <32 x i32> %vb) #2
+ %4 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %3, <32 x i32> %vb) #2
+ %5 = tail call <32 x i32> @llvm.hexagon.V6.vpackhub.sat.128B(<32 x i32> %va, <32 x i32> %4) #2
+ store <32 x i32> %5, <32 x i32>* %optr.068, align 1
+ %incdec.ptr6 = getelementptr inbounds <32 x i32>, <32 x i32>* %optr.068, i32 1
+ br label %for.body
+}
+
+declare <32 x i32> @llvm.hexagon.V6.vdealh.128B(<32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vconv.hf.qf32.128B(<64 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32>, <32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vpackhub.sat.128B(<32 x i32>, <32 x i32>) #1
+declare <128 x i1> @llvm.hexagon.V6.vgth.128B(<32 x i32>, <32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vmux.128B(<128 x i1>, <32 x i32>, <32 x i32>) #1
diff --git a/llvm/test/CodeGen/Hexagon/qfp-enabled.ll b/llvm/test/CodeGen/Hexagon/qfp-enabled.ll
new file mode 100644
index 0000000..a5cc5fa
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-enabled.ll
@@ -0,0 +1,19 @@
+; Tests if the flag to disable qfp optimizer pass works or not.
+
+; RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b \
+; RUN: < %s -o -| FileCheck %s --check-prefix=ENABLED
+; RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b \
+; RUN: -disable-qfp-opt < %s -o -| FileCheck %s --check-prefix=DISABLED
+
+define dso_local <32 x i32> @conv1_qf32(<32 x i32> noundef %input1, <32 x i32> noundef %input2) local_unnamed_addr {
+entry:
+; DISABLED: [[V2:v[0-9]+]].qf32 = vadd(v0.sf,v1.sf)
+; DISABLED: [[V3:v[0-9]+]].sf = [[V2]].qf32
+; DISABLED: qf32 = vadd(v0.sf,[[V3]].sf)
+; ENABLED: [[V4:v[0-9]+]].qf32 = vadd(v0.sf,v1.sf)
+; ENABLED: qf32 = vadd([[V4]].qf32,v0.sf)
+ %0 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.128B(<32 x i32> %input1, <32 x i32> %input2)
+ %1 = tail call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %0)
+ %2 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.128B(<32 x i32> %input1, <32 x i32> %1)
+ ret <32 x i32> %2
+}
diff --git a/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir b/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir
new file mode 100644
index 0000000..d8dde7d
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir
@@ -0,0 +1,95 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer -run-pass machineverifier %s -o - | FileCheck %s
+
+# Test that the killed RegState from DefMI operands are removed
+# killed RegState should be set for MI operands
+# CHECK-LABEL: name: qfpAdd
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]]
+# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32 killed %[[REG1]], killed %[[REG2]]
+# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG3:([0-9]+)]]
+# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG4:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32 killed %[[REG3]], killed %[[REG4]]
+
+---
+name: qfpAdd
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:intregs = COPY $r3
+ %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %6:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %7:hvxvr = V6_vL32Ub_ai %3:intregs, 0
+ %8:hvxvr = V6_vconv_sf_qf32 killed %4:hvxvr
+ %9:hvxvr = V6_vconv_sf_qf32 killed %5:hvxvr
+ %10:hvxvr = V6_vadd_sf %8:hvxvr, %9:hvxvr
+ %11:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr
+ %12:hvxvr = V6_vconv_sf_qf32 killed %7:hvxvr
+ %13:hvxvr = V6_vadd_sf killed %11:hvxvr, killed %12:hvxvr
+...
+
+
+# Test that the killed RegState from DefMI operands are removed
+# CHECK-LABEL: name: qfpAddMix
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG1]], %{{[0-9]+}}
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG2]], %{{[0-9]+}}
+
+---
+name: qfpAddMix
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %7:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr
+ %8:hvxvr = V6_vadd_sf %3:hvxvr, %7:hvxvr
+ %9:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %10:hvxvr = V6_vconv_sf_qf32 killed %9:hvxvr
+ %11:hvxvr = V6_vadd_sf %3:hvxvr, killed %10:hvxvr
+...
+
+
+# Test that we do generate V6_vsub_qf32_mix for the below test.
+# V6_vsub_qf32_mix only allowes qf32 as first operand. In the test qf32
+# is passed as first operand. So, V6_vsub_qf32_mix must be generated.
+# CHECK-LABEL: name: qfpAddSwapMix
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG1]], %{{[0-9]+}}
+# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]]
+# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG2]], %{{[0-9]+}}
+
+---
+name: qfpAddSwapMix
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %7:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr
+ %8:hvxvr = V6_vadd_sf %7:hvxvr, %3:hvxvr
+ %9:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %10:hvxvr = V6_vconv_sf_qf32 killed %9:hvxvr
+ %11:hvxvr = V6_vadd_sf killed %10:hvxvr, %3:hvxvr
+...
diff --git a/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir b/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir
new file mode 100644
index 0000000..1d78203
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir
@@ -0,0 +1,33 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+# CHECK: V6_vshuffvdd
+# CHECK: V6_vadd_sf
+# CHECK: V6_vadd_qf32_mix{{.*}}vsub_lo
+# CHECK: V6_vadd_qf32_mix{{.*}}vsub_hi
+
+---
+name: qfp_subreg_fix
+alignment: 16
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ %10:intregs = IMPLICIT_DEF
+ %9:hvxvr = V6_vL32Ub_ai %10, 0 :: (load (s1024) from `ptr undef`, align 4)
+ %11:intregs = A2_tfrsi 15360
+ %12:hvxvr = V6_lvsplath %11
+ %13:hvxwr = V6_vmpy_qf32_hf %9, %12
+ %15:hvxvr = V6_vconv_sf_qf32 %13.vsub_lo
+ %17:hvxvr = V6_vconv_sf_qf32 %13.vsub_hi
+ %18:intregslow8 = A2_tfrsi -4
+ %19:hvxwr = V6_vshuffvdd %17, %15, %18
+ %21:hvxvr = V6_vadd_sf %19.vsub_hi, %19.vsub_hi
+ %22:hvxvr = V6_vconv_sf_qf32 %21
+ %24:hvxvr = V6_vadd_sf %19.vsub_lo, %19.vsub_lo
+ %25:hvxvr = V6_vconv_sf_qf32 %24
+ %26:hvxvr = V6_vadd_sf %25, %19.vsub_lo
+ %27:hvxvr = V6_vconv_sf_qf32 %26
+ %28:hvxvr = V6_vadd_sf %22, %19.vsub_hi
+ %29:hvxvr = V6_vconv_sf_qf32 %28
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
new file mode 100644
index 0000000..c16370c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
@@ -0,0 +1,21 @@
+; Tests if generated vadd instruction takes in qf32
+; type as first parameter instead of a sf type without
+; any conversion instruction of type sf = qf32
+
+; RUN: llc -mtriple=hexagon < %s -o - | FileCheck %s
+
+; CHECK: [[V2:v[0-9]+]] = vxor([[V2]],[[V2]])
+; CHECK: [[V0:v[0-9]+]].qf32 = vmpy([[V0]].sf,[[V2]].sf)
+; CHECK: [[V1:v[0-9]+]].qf32 = vmpy([[V1]].sf,[[V2]].sf)
+; CHECK: [[V4:v[0-9]+]].qf32 = vadd([[V0]].qf32,[[V2]].sf)
+; CHECK: [[V5:v[0-9]+]].qf32 = vadd([[V1]].qf32,[[V2]].sf)
+
+define void @_Z19compute_ripple_geluIDF16_EviPT_PKS0_(ptr %out_ptr, <64 x float> %conv14.ripple.vectorized) #0 {
+entry:
+ %mul16.ripple.vectorized = fmul <64 x float> %conv14.ripple.vectorized, zeroinitializer
+ %conv17.ripple.vectorized = fptrunc <64 x float> %mul16.ripple.vectorized to <64 x half>
+ store <64 x half> %conv17.ripple.vectorized, ptr %out_ptr, align 2
+ ret void
+}
+
+attributes #0 = { "target-features"="+hvx-length128b,+hvxv75,+v75,-long-calls,-small-data" }
diff --git a/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir b/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir
new file mode 100644
index 0000000..9a9e938
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir
@@ -0,0 +1,79 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+
+# Test that the operands are swapped for Add if the second operand
+# is a qf32 to sf conversion. V6_vadd_qf32_mix supports first operand
+# as qf32.
+# CHECK-LABEL: name: qfpAddMix
+# CHECK: %[[REG:([0-9]+)]]:hvxvr = V6_vmpy_qf32_sf
+# CHECK: V6_vadd_qf32_mix %[[REG]]
+
+---
+name: qfpAddMix
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+ %8:hvxvr = V6_vadd_sf %3:hvxvr, %7:hvxvr
+...
+
+
+# Test that we do not generate V6_vsub_qf32_mix for the below test.
+# V6_vsub_qf32_mix only allowes qf32 as first operand. In the test qf32
+# is passed as second operand. As sub is not commutative, we should not
+# generate the mix instruction.
+# CHECK-LABEL: name: qfpSubNoMix
+# CHECK-NOT: V6_vsub_qf32_mix
+
+---
+name: qfpSubNoMix
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+ %8:hvxvr = V6_vsub_sf %3:hvxvr, %7:hvxvr
+...
+
+
+# Test that we do generate V6_vsub_qf32_mix for the below test.
+# V6_vsub_qf32_mix only allowes qf32 as first operand. In the test qf32
+# is passed as first operand. So, V6_vsub_qf32_mix must be generated.
+# CHECK-LABEL: name: qfpSubMix
+# CHECK: V6_vsub_qf32_mix
+
+---
+name: qfpSubMix
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %6:hvxvr = V6_vmpy_qf32_sf %4, %5
+ %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+ %8:hvxvr = V6_vsub_sf %7:hvxvr, %3:hvxvr
+...
diff --git a/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir b/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir
new file mode 100644
index 0000000..f0b1d3c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir
@@ -0,0 +1,23 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+# CHECK-LABEL: name: qfpAdd32
+# CHECK: V6_vd0
+# CHECK-NEXT: V6_vL32Ub_ai
+# CHECK-NEXT: V6_vadd_sf
+# CHECK-NEXT: V6_vconv_sf_qf32
+# CHECK-NEXT: V6_vS32Ub_ai
+---
+name: qfpAdd32
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %3:hvxvr = V6_vd0
+ %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %5:hvxvr = V6_vadd_sf %3:hvxvr, %4:hvxvr
+ %6:hvxvr = V6_vconv_sf_qf32 %5:hvxvr
+ V6_vS32Ub_ai %1:intregs, 0, %6:hvxvr
+...
diff --git a/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll b/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll
new file mode 100644
index 0000000..2d46da7
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: hexagon
+
+; This test was asserting because getVRegDef() was called on a register with
+; multiple defs.
+; Checks that the test does not assert and vsub is generated.
+; CHECK: vsub
+
+target triple = "hexagon"
+
+@v = common dso_local local_unnamed_addr global <32 x i32> zeroinitializer, align 128
+
+; Function Attrs: nounwind
+define dso_local void @hvx_twoSum(<32 x i32>* nocapture noundef writeonly %s2lo) local_unnamed_addr #0 {
+entry:
+ %0 = load <32 x i32>, <32 x i32>* @v, align 128
+ %call = tail call inreg <32 x i32> @MY_Vsf_equals_Vqf32(<32 x i32> noundef %0) #3
+ %1 = tail call <32 x i32> @llvm.hexagon.V6.vsub.sf.128B(<32 x i32> %call, <32 x i32> %call)
+ store <32 x i32> %1, <32 x i32>* @v, align 128
+ store <32 x i32> %1, <32 x i32>* %s2lo, align 128
+ ret void
+}
+
+declare dso_local inreg <32 x i32> @MY_Vsf_equals_Vqf32(<32 x i32> noundef) local_unnamed_addr #1
+
+; Function Attrs: nofree nosync nounwind readnone
+declare <32 x i32> @llvm.hexagon.V6.vsub.sf.128B(<32 x i32>, <32 x i32>) #2
+
+attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv73" "target-features"="+hvx-length128b,+hvxv73,+v73,-long-calls" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv73" "target-features"="+hvx-length128b,+hvxv73,+v73,-long-calls" }
+attributes #2 = { nofree nosync nounwind readnone }
+attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll
index dcb70f8..f9527ef 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll
@@ -45,6 +45,32 @@ define i32 @bclr_i32_no_mask(i32 %a, i32 %b) nounwind {
ret i32 %and1
}
+define i32 @bclr_i32_mask_multiple(i32 %a, i32 %b, i32 %shamt) nounwind {
+; RV32I-LABEL: bclr_i32_mask_multiple:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a3, 1
+; RV32I-NEXT: sll a2, a3, a2
+; RV32I-NEXT: not a3, a2
+; RV32I-NEXT: and a0, a3, a0
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBS-LABEL: bclr_i32_mask_multiple:
+; RV32ZBS: # %bb.0:
+; RV32ZBS-NEXT: bclr a0, a0, a2
+; RV32ZBS-NEXT: bset a1, a1, a2
+; RV32ZBS-NEXT: add a0, a0, a1
+; RV32ZBS-NEXT: ret
+ %shamt_masked = and i32 %shamt, 63
+ %shl = shl nuw i32 1, %shamt_masked
+ %neg = xor i32 %shl, -1
+ %and = and i32 %neg, %a
+ %or = or i32 %b, %shl
+ %c = add i32 %and, %or
+ ret i32 %c
+}
+
define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
; RV32I-LABEL: bclr_i64:
; RV32I: # %bb.0:
@@ -301,17 +327,17 @@ define i64 @bext_i64(i64 %a, i64 %b) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: andi a3, a2, 63
; CHECK-NEXT: addi a4, a3, -32
-; CHECK-NEXT: bltz a4, .LBB12_2
+; CHECK-NEXT: bltz a4, .LBB13_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: srl a0, a1, a3
-; CHECK-NEXT: j .LBB12_3
-; CHECK-NEXT: .LBB12_2:
+; CHECK-NEXT: j .LBB13_3
+; CHECK-NEXT: .LBB13_2:
; CHECK-NEXT: srl a0, a0, a2
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: not a2, a3
; CHECK-NEXT: sll a1, a1, a2
; CHECK-NEXT: or a0, a0, a1
-; CHECK-NEXT: .LBB12_3:
+; CHECK-NEXT: .LBB13_3:
; CHECK-NEXT: andi a0, a0, 1
; CHECK-NEXT: li a1, 0
; CHECK-NEXT: ret
@@ -789,17 +815,17 @@ define i64 @bset_trailing_ones_i64_mask(i64 %a) nounwind {
; CHECK-NEXT: li a3, -1
; CHECK-NEXT: addi a1, a2, -32
; CHECK-NEXT: sll a0, a3, a0
-; CHECK-NEXT: bltz a1, .LBB43_2
+; CHECK-NEXT: bltz a1, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: sll a2, a3, a2
-; CHECK-NEXT: j .LBB43_3
-; CHECK-NEXT: .LBB43_2:
+; CHECK-NEXT: j .LBB44_3
+; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: not a2, a2
; CHECK-NEXT: lui a3, 524288
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: srl a2, a3, a2
; CHECK-NEXT: or a2, a0, a2
-; CHECK-NEXT: .LBB43_3:
+; CHECK-NEXT: .LBB44_3:
; CHECK-NEXT: srai a1, a1, 31
; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: not a1, a2
@@ -817,17 +843,17 @@ define i64 @bset_trailing_ones_i64_no_mask(i64 %a) nounwind {
; CHECK-NEXT: li a1, -1
; CHECK-NEXT: addi a2, a0, -32
; CHECK-NEXT: sll a1, a1, a0
-; CHECK-NEXT: bltz a2, .LBB44_2
+; CHECK-NEXT: bltz a2, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: j .LBB44_3
-; CHECK-NEXT: .LBB44_2:
+; CHECK-NEXT: j .LBB45_3
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: not a0, a0
; CHECK-NEXT: lui a3, 524288
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: srl a0, a3, a0
; CHECK-NEXT: or a0, a1, a0
-; CHECK-NEXT: .LBB44_3:
+; CHECK-NEXT: .LBB45_3:
; CHECK-NEXT: srai a2, a2, 31
; CHECK-NEXT: and a2, a2, a1
; CHECK-NEXT: not a1, a0
diff --git a/llvm/test/CodeGen/RISCV/rv64zbs.ll b/llvm/test/CodeGen/RISCV/rv64zbs.ll
index b4edcf6c..d42bc8e 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbs.ll
@@ -110,6 +110,32 @@ define i64 @bclr_i64_no_mask(i64 %a, i64 %b) nounwind {
ret i64 %and1
}
+define i64 @bclr_i64_mask_multiple(i64 %a, i64 %b, i64 %shamt) nounwind {
+; RV64I-LABEL: bclr_i64_mask_multiple:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a3, 1
+; RV64I-NEXT: sll a2, a3, a2
+; RV64I-NEXT: not a3, a2
+; RV64I-NEXT: and a0, a3, a0
+; RV64I-NEXT: or a1, a1, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBS-LABEL: bclr_i64_mask_multiple:
+; RV64ZBS: # %bb.0:
+; RV64ZBS-NEXT: bclr a0, a0, a2
+; RV64ZBS-NEXT: bset a1, a1, a2
+; RV64ZBS-NEXT: add a0, a0, a1
+; RV64ZBS-NEXT: ret
+ %shamt_masked = and i64 %shamt, 63
+ %shl = shl nuw i64 1, %shamt_masked
+ %neg = xor i64 %shl, -1
+ %and = and i64 %neg, %a
+ %or = or i64 %b, %shl
+ %c = add i64 %and, %or
+ ret i64 %c
+}
+
define signext i32 @bset_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: bset_i32:
; RV64I: # %bb.0:
@@ -372,19 +398,19 @@ define void @bext_i32_trunc(i32 signext %0, i32 signext %1) {
; RV64I: # %bb.0:
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: andi a0, a0, 1
-; RV64I-NEXT: beqz a0, .LBB19_2
+; RV64I-NEXT: beqz a0, .LBB20_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: ret
-; RV64I-NEXT: .LBB19_2:
+; RV64I-NEXT: .LBB20_2:
; RV64I-NEXT: tail bar
;
; RV64ZBS-LABEL: bext_i32_trunc:
; RV64ZBS: # %bb.0:
; RV64ZBS-NEXT: bext a0, a0, a1
-; RV64ZBS-NEXT: beqz a0, .LBB19_2
+; RV64ZBS-NEXT: beqz a0, .LBB20_2
; RV64ZBS-NEXT: # %bb.1:
; RV64ZBS-NEXT: ret
-; RV64ZBS-NEXT: .LBB19_2:
+; RV64ZBS-NEXT: .LBB20_2:
; RV64ZBS-NEXT: tail bar
%3 = shl i32 1, %1
%4 = and i32 %3, %0
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-14.ll b/llvm/test/CodeGen/SystemZ/int-conv-14.ll
index 98dc88f..baab5ac 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-14.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-14.ll
@@ -55,14 +55,15 @@ define i128 @f4(ptr %ptr) {
}
; Truncation to i64.
-define i64 @f5(i128 %a) {
+define i64 @f5(i128 %a, i128 %b) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvg %r2, %v0, 1
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i64
ret i64 %res
}
@@ -134,15 +135,16 @@ define i128 @f10(ptr %ptr) {
}
; Truncation to i32.
-define i32 @f11(i128 %a) {
+define i32 @f11(i128 %a, i128 %b) {
; CHECK-LABEL: f11:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i32
ret i32 %res
}
@@ -215,15 +217,16 @@ define i128 @f16(ptr %ptr) {
}
; Truncation to i16.
-define i16 @f17(i128 %a) {
+define i16 @f17(i128 %a, i128 %b) {
; CHECK-LABEL: f17:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i16
ret i16 %res
}
@@ -296,15 +299,16 @@ define i128 @f22(ptr %ptr) {
}
; Truncation to i8.
-define i8 @f23(i128 %a) {
+define i8 @f23(i128 %a, i128 %b) {
; CHECK-LABEL: f23:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i8
ret i8 %res
}
@@ -385,15 +389,16 @@ define i128 @f28(ptr %ptr) {
}
; Truncation to i1.
-define i1 @f29(i128 %a) {
+define i1 @f29(i128 %a, i128 %b) {
; CHECK-LABEL: f29:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i1
ret i1 %res
}
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-15.ll b/llvm/test/CodeGen/SystemZ/int-conv-15.ll
index 0d8ee75..f2c9ee5 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-15.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-15.ll
@@ -55,14 +55,15 @@ define i128 @f4(ptr %ptr) {
}
; Truncation to i64.
-define i64 @f5(i128 %a) {
+define i64 @f5(i128 %a, i128 %b) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvg %r2, %v0, 1
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i64
ret i64 %res
}
@@ -134,15 +135,16 @@ define i128 @f10(ptr %ptr) {
}
; Truncation to i32.
-define i32 @f11(i128 %a) {
+define i32 @f11(i128 %a, i128 %b) {
; CHECK-LABEL: f11:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i32
ret i32 %res
}
@@ -215,15 +217,16 @@ define i128 @f16(ptr %ptr) {
}
; Truncation to i16.
-define i16 @f17(i128 %a) {
+define i16 @f17(i128 %a, i128 %b) {
; CHECK-LABEL: f17:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i16
ret i16 %res
}
@@ -296,15 +299,16 @@ define i128 @f22(ptr %ptr) {
}
; Truncation to i8.
-define i8 @f23(i128 %a) {
+define i8 @f23(i128 %a, i128 %b) {
; CHECK-LABEL: f23:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i8
ret i8 %res
}
@@ -383,15 +387,16 @@ define i128 @f28(ptr %ptr) {
}
; Truncation to i1.
-define i1 @f29(i128 %a) {
+define i1 @f29(i128 %a, i128 %b) {
; CHECK-LABEL: f29:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vaq %v0, %v0, %v0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
- %op = add i128 %a, %a
+ %op = add i128 %a, %b
%res = trunc i128 %op to i1
ret i1 %res
}
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
index 83d71cd..5293958 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
@@ -16,8 +16,7 @@
# CHECK-DAG: Symbols: { _foo }, Dependencies: { (main, { _external_func }) }
# CHECK-DAG: Symbols: { _baz }, Dependencies: { (main, { _foo }) }
# CHECK: Simplified dependencies:
-# CHECK-DAG: Symbols: { _foo }, Dependencies: { (main, { _external_func }) }
-# CHECK-DAG: Symbols: { _baz }, Dependencies: { (main, { _external_func }) }
+# CHECK-DAG: Defs: { (main, [ _baz _foo ]) }, Deps: { (main, [ _external_func ]) }
.section __TEXT,__text,regular,pure_instructions
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
new file mode 100644
index 0000000..1ddcd4b
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s
+
+; XFAIL: *
+
+; Forked from llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll
+; Manually minimized to show MSan leads to a compiler crash
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+define target("aarch64.svcount") @test_return_arg1(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1) nounwind {
+ ret target("aarch64.svcount") %arg1
+}
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
new file mode 100644
index 0000000..9caa89d
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s
+
+; XFAIL: *
+
+; Forked from llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+;
+; Test simple loads, stores and return.
+;
+define target("aarch64.svcount") @test_load(ptr %ptr) nounwind {
+ %res = load target("aarch64.svcount"), ptr %ptr
+ ret target("aarch64.svcount") %res
+}
+
+define void @test_store(ptr %ptr, target("aarch64.svcount") %val) nounwind {
+ store target("aarch64.svcount") %val, ptr %ptr
+ ret void
+}
+
+define target("aarch64.svcount") @test_alloca_store_reload(target("aarch64.svcount") %val) nounwind {
+ %ptr = alloca target("aarch64.svcount"), align 1
+ store target("aarch64.svcount") %val, ptr %ptr
+ %res = load target("aarch64.svcount"), ptr %ptr
+ ret target("aarch64.svcount") %res
+}
+
+;
+; Test passing as arguments (from perspective of callee)
+;
+
+define target("aarch64.svcount") @test_return_arg1(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1) nounwind {
+ ret target("aarch64.svcount") %arg1
+}
+
+define target("aarch64.svcount") @test_return_arg4(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1, target("aarch64.svcount") %arg2, target("aarch64.svcount") %arg3, target("aarch64.svcount") %arg4) nounwind {
+ ret target("aarch64.svcount") %arg4
+}
+
+;
+; Test passing as arguments (from perspective of caller)
+;
+
+declare void @take_svcount_1(target("aarch64.svcount") %arg)
+define void @test_pass_1arg(target("aarch64.svcount") %arg) nounwind {
+ call void @take_svcount_1(target("aarch64.svcount") %arg)
+ ret void
+}
+
+declare void @take_svcount_5(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1, target("aarch64.svcount") %arg2, target("aarch64.svcount") %arg3, target("aarch64.svcount") %arg4)
+define void @test_pass_5args(target("aarch64.svcount") %arg) nounwind {
+ call void @take_svcount_5(target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg)
+ ret void
+}
+
+define target("aarch64.svcount") @test_sel(target("aarch64.svcount") %x, target("aarch64.svcount") %y, i1 %cmp) sanitize_memory {
+ %x.y = select i1 %cmp, target("aarch64.svcount") %x, target("aarch64.svcount") %y
+ ret target("aarch64.svcount") %x.y
+}
+
+define target("aarch64.svcount") @test_sel_cc(target("aarch64.svcount") %x, target("aarch64.svcount") %y, i32 %k) sanitize_memory {
+ %cmp = icmp sgt i32 %k, 42
+ %x.y = select i1 %cmp, target("aarch64.svcount") %x, target("aarch64.svcount") %y
+ ret target("aarch64.svcount") %x.y
+}
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index 0c23e3b..7aaf3a0 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -73,6 +73,8 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
// All entries should be emitted in Libcall enum.
// CHECK: #ifdef GET_RUNTIME_LIBCALL_ENUM
+// CHECK-NEXT: #undef GET_RUNTIME_LIBCALL_ENUM
+// CHECK-EMPTY:
// CHECK-NEXT: namespace llvm {
// CHECK-NEXT: namespace RTLIB {
// CHECK-NEXT: enum Libcall : unsigned short {
@@ -101,9 +103,12 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
// CHECK-NEXT: constexpr size_t NumLibcallImpls = 9;
// CHECK-NEXT: } // End namespace RTLIB
// CHECK-NEXT: } // End namespace llvm
-// CHECK-NEXT: #endif
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_RUNTIME_LIBCALL_ENUM
// CHECK: #ifdef GET_INIT_RUNTIME_LIBCALL_NAMES
+// CHECK-NEXT: #undef GET_INIT_RUNTIME_LIBCALL_NAMES
+// CHECK-EMPTY:
// CHECK-EMPTY:
// CHECK-NEXT: #ifdef __GNUC__
// CHECK-NEXT: #pragma GCC diagnostic push
@@ -163,13 +168,18 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
// CHECK-NEXT: };
// CHECK: #ifdef GET_LOOKUP_LIBCALL_IMPL_NAME_BODY
+// CHECK-NEXT: #undef GET_LOOKUP_LIBCALL_IMPL_NAME_BODY
+// CHECK-EMPTY:
// CHECK-NEXT: size_t Size = Name.size();
// CHECK-NEXT: if (Size == 0 || Size > 9)
// CHECK-NEXT: return enum_seq(RTLIB::Unsupported, RTLIB::Unsupported);
// CHECK-NEXT: return lookupLibcallImplNameImpl(Name);
-// CHECK-NEXT: #endif
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_LOOKUP_LIBCALL_IMPL_NAME_BODY
// CHECK: #ifdef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
+// CHECK-NEXT: #undef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
+// CHECK-EMPTY:
// CHECK-NEXT: static inline uint64_t hash(StringRef Str) {
// CHECK-NEXT: return static_cast<uint32_t>(xxh3_64bits(Str));
// CHECK-NEXT: }
diff --git a/llvm/test/Transforms/GVNSink/ptrtoaddr.ll b/llvm/test/Transforms/GVNSink/ptrtoaddr.ll
new file mode 100644
index 0000000..3a9d16e
--- /dev/null
+++ b/llvm/test/Transforms/GVNSink/ptrtoaddr.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=gvn-sink < %s | FileCheck %s
+
+define i64 @test(i1 %c, ptr %p, ptr %p2) {
+; CHECK-LABEL: define i64 @test(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: br label %[[JOIN:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[P2_SINK:%.*]] = phi ptr [ [[P2]], %[[ELSE]] ], [ [[P]], %[[IF]] ]
+; CHECK-NEXT: [[PHI:%.*]] = ptrtoaddr ptr [[P2_SINK]] to i64
+; CHECK-NEXT: ret i64 [[PHI]]
+;
+ br i1 %c, label %if, label %else
+
+if:
+ %p.addr = ptrtoaddr ptr %p to i64
+ br label %join
+
+else:
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ br label %join
+
+join:
+ %phi = phi i64 [ %p.addr, %if ], [ %p2.addr, %else ]
+ ret i64 %phi
+}
diff --git a/llvm/test/Transforms/Inline/ML/recursive.ll b/llvm/test/Transforms/Inline/ML/recursive.ll
new file mode 100644
index 0000000..2d9240a
--- /dev/null
+++ b/llvm/test/Transforms/Inline/ML/recursive.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6
+; REQUIRES: llvm_inliner_model_autogenerated
+; RUN: opt -S %s -o - -passes='inliner-ml-advisor-release' -ml-inliner-skip-policy=if-caller-not-cold | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-android29"
+
+define i32 @a_func(ptr %this, i32 %color_id, i1 %dark_mode) local_unnamed_addr {
+; CHECK-LABEL: define i32 @a_func(
+; CHECK-SAME: ptr [[THIS:%.*]], i32 [[COLOR_ID:%.*]], i1 [[DARK_MODE:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[DARK_MODE]], label %[[SW_BB97:.*]], label %[[COMMON_RET:.*]]
+; CHECK: [[COMMON_RET]]:
+; CHECK-NEXT: ret i32 0
+; CHECK: [[SW_BB97]]:
+; CHECK-NEXT: br label %[[COMMON_RET]]
+;
+entry:
+ br i1 %dark_mode, label %sw.bb97, label %common.ret
+
+common.ret: ; preds = %sw.bb97, %entry
+ ret i32 0
+
+sw.bb97: ; preds = %entry
+ %call.i = tail call i32 @a_func(ptr null, i32 0, i1 false)
+ br label %common.ret
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
+declare ptr @llvm.load.relative.i32(ptr %0, i32 %1) #0
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+;.
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
deleted file mode 100644
index ffaa8b1..0000000
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ /dev/null
@@ -1,163 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-; The ptrtoaddr folds are also valid for pointers that have external state.
-target datalayout = "pe1:64:64:64:32"
-
-@g = external global i8
-@g2 = external global i8
-
-@g.as1 = external addrspace(1) global i8
-@g2.as1 = external addrspace(1) global i8
-
-define i64 @ptrtoaddr_inttoptr_arg(i64 %a) {
-; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_arg(
-; CHECK-SAME: i64 [[A:%.*]]) {
-; CHECK-NEXT: ret i64 [[A]]
-;
- %toptr = inttoptr i64 %a to ptr
- %toaddr = ptrtoaddr ptr %toptr to i64
- ret i64 %toaddr
-}
-
-define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) {
-; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize(
-; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT: ret i32 [[A]]
-;
- %toptr = inttoptr i32 %a to ptr addrspace(1)
- %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
- ret i32 %toaddr
-}
-
-define i32 @ptrtoaddr_inttoptr() {
-; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr() {
-; CHECK-NEXT: ret i32 -1
-;
- ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i32 -1 to ptr addrspace(1)) to i32)
-}
-
-define i32 @ptrtoaddr_inttoptr_diff_size1() {
-; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() {
-; CHECK-NEXT: ret i32 -1
-;
- ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32)
-}
-
-define i32 @ptrtoaddr_inttoptr_diff_size2() {
-; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() {
-; CHECK-NEXT: ret i32 65535
-;
- ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32)
-}
-
-define i64 @ptrtoaddr_inttoptr_noas1() {
-; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas1() {
-; CHECK-NEXT: ret i64 1
-;
- ret i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 1) to i64)
-}
-
-define i64 @ptr2addr2_inttoptr_noas2() {
-; CHECK-LABEL: define i64 @ptr2addr2_inttoptr_noas2() {
-; CHECK-NEXT: ret i64 123
-;
- ret i64 ptrtoaddr (ptr inttoptr (i64 123 to ptr) to i64)
-}
-
-define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
-; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
-; CHECK-NEXT: ret i64 4294967295
-;
- ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64)
-}
-
-define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
-; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
-; CHECK-NEXT: ret i64 -1
-;
- ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64)
-}
-
-define i64 @ptrtoaddr_gep_null() {
-; CHECK-LABEL: define i64 @ptrtoaddr_gep_null() {
-; CHECK-NEXT: ret i64 42
-;
- ret i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 42) to i64)
-}
-
-define i32 @ptrtoaddr_gep_null_addrsize() {
-; CHECK-LABEL: define i32 @ptrtoaddr_gep_null_addrsize() {
-; CHECK-NEXT: ret i32 42
-;
- ret i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i32 42) to i32)
-}
-
-define i64 @ptrtoaddr_gep_sub() {
-; CHECK-LABEL: define i64 @ptrtoaddr_gep_sub() {
-; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
-;
- ret i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 sub (i64 0, i64 ptrtoaddr (ptr @g2 to i64))) to i64)
-}
-
-define i32 @ptrtoaddr_gep_sub_addrsize() {
-; CHECK-LABEL: define i32 @ptrtoaddr_gep_sub_addrsize() {
-; CHECK-NEXT: ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))
-;
- ret i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 sub (i32 0, i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))) to i32)
-}
-
-; Don't fold inttoptr of ptrtoaddr away. inttoptr will pick a previously
-; exposed provenance, which is not necessarily that of @g (especially as
-; ptrtoaddr does not expose the provenance.)
-define ptr @inttoptr_of_ptrtoaddr() {
-; CHECK-LABEL: define ptr @inttoptr_of_ptrtoaddr() {
-; CHECK-NEXT: ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr)
-;
- ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr)
-}
-
-define i64 @ptrtoaddr_sub_consts_unrelated() {
-; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_unrelated() {
-; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
-;
- ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
-}
-
-define i64 @ptrtoaddr_sub_consts_offset() {
-; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_offset() {
-; CHECK-NEXT: ret i64 42
-;
- ret i64 sub (i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 42) to i64), i64 ptrtoaddr (ptr @g to i64))
-}
-
-define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
-; CHECK-LABEL: define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
-; CHECK-NEXT: ret i32 42
-;
- ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 42) to i32), i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32))
-}
-
-define i64 @ptrtoaddr_sub_known_offset(ptr %p) {
-; CHECK-LABEL: define i64 @ptrtoaddr_sub_known_offset(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: ret i64 42
-;
- %p2 = getelementptr inbounds i8, ptr %p, i64 42
- %p.addr = ptrtoaddr ptr %p to i64
- %p2.addr = ptrtoaddr ptr %p2 to i64
- %sub = sub i64 %p2.addr, %p.addr
- ret i64 %sub
-}
-
-define i32 @ptrtoaddr_sub_known_offset_addrsize(ptr addrspace(1) %p) {
-; CHECK-LABEL: define i32 @ptrtoaddr_sub_known_offset_addrsize(
-; CHECK-SAME: ptr addrspace(1) [[P:%.*]]) {
-; CHECK-NEXT: ret i32 42
-;
- %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i32 42
- %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
- %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
- %sub = sub i32 %p2.addr, %p.addr
- ret i32 %sub
-}
diff --git a/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll b/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll
new file mode 100644
index 0000000..d06b520
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll
@@ -0,0 +1,318 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+
+; The ptrtoaddr folds are also valid for pointers that have external state.
+target datalayout = "pe1:64:64:64:32"
+
+@g = external global i8
+@g2 = external global i8
+
+@g.as1 = external addrspace(1) global i8
+@g2.as1 = external addrspace(1) global i8
+
+define i64 @ptrtoaddr_inttoptr_arg(i64 %a) {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_arg(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT: ret i64 [[A]]
+;
+ %toptr = inttoptr i64 %a to ptr
+ %toaddr = ptrtoaddr ptr %toptr to i64
+ ret i64 %toaddr
+}
+
+define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: ret i32 [[A]]
+;
+ %toptr = inttoptr i32 %a to ptr addrspace(1)
+ %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
+ ret i32 %toaddr
+}
+
+define i32 @ptrtoaddr_inttoptr() {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr() {
+; CHECK-NEXT: ret i32 -1
+;
+ %toptr = inttoptr i32 -1 to ptr addrspace(1)
+ %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
+ ret i32 %toaddr
+}
+
+define i32 @ptrtoaddr_inttoptr_diff_size1() {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() {
+; CHECK-NEXT: ret i32 -1
+;
+ %toptr = inttoptr i64 -1 to ptr addrspace(1)
+ %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
+ ret i32 %toaddr
+}
+
+define i32 @ptrtoaddr_inttoptr_diff_size2() {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() {
+; CHECK-NEXT: ret i32 65535
+;
+ %toptr = inttoptr i16 -1 to ptr addrspace(1)
+ %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
+ ret i32 %toaddr
+}
+
+define i64 @ptrtoaddr_inttoptr_noas1() {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas1() {
+; CHECK-NEXT: ret i64 1
+;
+ %toptr = getelementptr i8, ptr null, i64 1
+ %toaddr = ptrtoaddr ptr %toptr to i64
+ ret i64 %toaddr
+}
+
+define i64 @ptr2addr2_inttoptr_noas2() {
+; CHECK-LABEL: define i64 @ptr2addr2_inttoptr_noas2() {
+; CHECK-NEXT: ret i64 123
+;
+ %toptr = inttoptr i64 123 to ptr
+ %toaddr = ptrtoaddr ptr %toptr to i64
+ ret i64 %toaddr
+}
+
+define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
+; CHECK-NEXT: ret i64 4294967295
+;
+ %toptr = inttoptr i32 -1 to ptr
+ %toaddr = ptrtoaddr ptr %toptr to i64
+ ret i64 %toaddr
+}
+
+define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
+; CHECK-NEXT: ret i64 -1
+;
+ %toptr = inttoptr i128 -1 to ptr
+ %toaddr = ptrtoaddr ptr %toptr to i64
+ ret i64 %toaddr
+}
+
+define i64 @ptrtoaddr_gep_null() {
+; CHECK-LABEL: define i64 @ptrtoaddr_gep_null() {
+; CHECK-NEXT: ret i64 42
+;
+ %toaddr = ptrtoaddr ptr getelementptr (i8, ptr null, i64 42) to i64
+ ret i64 %toaddr
+}
+
+define i32 @ptrtoaddr_gep_null_addrsize() {
+; CHECK-LABEL: define i32 @ptrtoaddr_gep_null_addrsize() {
+; CHECK-NEXT: ret i32 42
+;
+ %toaddr = ptrtoaddr ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i32 42) to i32
+ ret i32 %toaddr
+}
+
+define i64 @ptrtoaddr_gep_sub() {
+; CHECK-LABEL: define i64 @ptrtoaddr_gep_sub() {
+; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
+;
+ %toaddr = ptrtoaddr ptr getelementptr (i8, ptr @g, i64 sub (i64 0, i64 ptrtoaddr (ptr @g2 to i64))) to i64
+ ret i64 %toaddr
+}
+
+define i32 @ptrtoaddr_gep_sub_addrsize() {
+; CHECK-LABEL: define i32 @ptrtoaddr_gep_sub_addrsize() {
+; CHECK-NEXT: ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))
+;
+ %toaddr = ptrtoaddr ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 sub (i32 0, i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))) to i32
+ ret i32 %toaddr
+}
+
+; Don't fold inttoptr of ptrtoaddr away. inttoptr will pick a previously
+; exposed provenance, which is not necessarily that of @g (especially as
+; ptrtoaddr does not expose the provenance.)
+define ptr @inttoptr_of_ptrtoaddr() {
+; CHECK-LABEL: define ptr @inttoptr_of_ptrtoaddr() {
+; CHECK-NEXT: ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr)
+;
+ %toptr = inttoptr i64 ptrtoaddr (ptr @g to i64) to ptr
+ ret ptr %toptr
+}
+
+define i64 @ptrtoaddr_sub_consts_unrelated() {
+; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_unrelated() {
+; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
+;
+ %sub = sub i64 ptrtoaddr (ptr @g to i64), ptrtoaddr (ptr @g2 to i64)
+ ret i64 %sub
+}
+
+define i64 @ptrtoaddr_sub_consts_offset() {
+; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_offset() {
+; CHECK-NEXT: ret i64 42
+;
+ %sub = sub i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 42) to i64), ptrtoaddr (ptr @g to i64)
+ ret i64 %sub
+}
+
+define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
+; CHECK-LABEL: define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
+; CHECK-NEXT: ret i32 42
+;
+ %sub = sub i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 42) to i32), ptrtoaddr (ptr addrspace(1) @g.as1 to i32)
+ ret i32 %sub
+}
+
+define i64 @ptrtoaddr_sub_known_offset(ptr %p) {
+; CHECK-LABEL: define i64 @ptrtoaddr_sub_known_offset(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: ret i64 42
+;
+ %p2 = getelementptr inbounds i8, ptr %p, i64 42
+ %p.addr = ptrtoaddr ptr %p to i64
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %sub = sub i64 %p2.addr, %p.addr
+ ret i64 %sub
+}
+
+define i32 @ptrtoaddr_sub_known_offset_addrsize(ptr addrspace(1) %p) {
+; CHECK-LABEL: define i32 @ptrtoaddr_sub_known_offset_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]]) {
+; CHECK-NEXT: ret i32 42
+;
+ %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i32 42
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %sub = sub i32 %p2.addr, %p.addr
+ ret i32 %sub
+}
+
+define i64 @ptrtoaddr_of_ptradd_of_sub(i64 %x, ptr %p) {
+; CHECK-LABEL: define i64 @ptrtoaddr_of_ptradd_of_sub(
+; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: ret i64 [[X]]
+;
+ %p.addr = ptrtoaddr ptr %p to i64
+ %sub = sub i64 %x, %p.addr
+ %ptradd = getelementptr i8, ptr %p, i64 %sub
+ %ptradd.addr = ptrtoaddr ptr %ptradd to i64
+ ret i64 %ptradd.addr
+}
+
+define i32 @ptrtoaddr_of_ptradd_of_sub_addrsize(i32 %x, ptr addrspace(1) %p) {
+; CHECK-LABEL: define i32 @ptrtoaddr_of_ptradd_of_sub_addrsize(
+; CHECK-SAME: i32 [[X:%.*]], ptr addrspace(1) [[P:%.*]]) {
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %sub = sub i32 %x, %p.addr
+ %ptradd = getelementptr i8, ptr addrspace(1) %p, i32 %sub
+ %ptradd.addr = ptrtoaddr ptr addrspace(1) %ptradd to i32
+ ret i32 %ptradd.addr
+}
+
+define ptr @gep_of_sub_ptrtoaddr_unrelated_pointers(ptr %p, ptr %p2, i64 %x) {
+; CHECK-LABEL: define ptr @gep_of_sub_ptrtoaddr_unrelated_pointers(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT: [[P2_ADDR:%.*]] = ptrtoaddr ptr [[P2]] to i64
+; CHECK-NEXT: [[P_ADDR:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[P2_ADDR]], [[P_ADDR]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[P]], i64 [[SUB]]
+; CHECK-NEXT: ret ptr [[GEP2]]
+;
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %p.addr = ptrtoaddr ptr %p to i64
+ %sub = sub i64 %p2.addr, %p.addr
+ %gep2 = getelementptr i8, ptr %p, i64 %sub
+ ret ptr %gep2
+}
+
+define ptr @gep_of_sub_ptrtoaddr(ptr %p, i64 %x) {
+; CHECK-LABEL: define ptr @gep_of_sub_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]]
+; CHECK-NEXT: ret ptr [[GEP1]]
+;
+ %gep1 = getelementptr i8, ptr %p, i64 %x
+ %gep1.addr = ptrtoaddr ptr %gep1 to i64
+ %p.addr = ptrtoaddr ptr %p to i64
+ %sub = sub i64 %gep1.addr, %p.addr
+ %gep2 = getelementptr i8, ptr %p, i64 %sub
+ ret ptr %gep2
+}
+
+define ptr addrspace(1) @gep_of_sub_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %x) {
+; CHECK-LABEL: define ptr addrspace(1) @gep_of_sub_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(1) [[P]], i32 [[X]]
+; CHECK-NEXT: ret ptr addrspace(1) [[GEP1]]
+;
+ %gep1 = getelementptr i8, ptr addrspace(1) %p, i32 %x
+ %gep1.addr = ptrtoaddr ptr addrspace(1) %gep1 to i32
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %sub = sub i32 %gep1.addr, %p.addr
+ %gep2 = getelementptr i8, ptr addrspace(1) %p, i32 %sub
+ ret ptr addrspace(1) %gep2
+}
+
+define ptr @gep_of_sub_ptrtoaddr_ashr(ptr %p, i64 %x) {
+; CHECK-LABEL: define ptr @gep_of_sub_ptrtoaddr_ashr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]]
+; CHECK-NEXT: ret ptr [[GEP1]]
+;
+ %gep1 = getelementptr i8, ptr %p, i64 %x
+ %gep1.addr = ptrtoaddr ptr %gep1 to i64
+ %p.addr = ptrtoaddr ptr %p to i64
+ %sub = sub i64 %gep1.addr, %p.addr
+ %ashr = ashr i64 %sub, 1
+ %gep2 = getelementptr i16, ptr %p, i64 %ashr
+ ret ptr %gep2
+}
+
+define ptr addrspace(1) @gep_of_sub_ptrtoaddr_ashr_addrsize(ptr addrspace(1) %p, i32 %x) {
+; CHECK-LABEL: define ptr addrspace(1) @gep_of_sub_ptrtoaddr_ashr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(1) [[P]], i32 [[X]]
+; CHECK-NEXT: ret ptr addrspace(1) [[GEP1]]
+;
+ %gep1 = getelementptr i8, ptr addrspace(1) %p, i32 %x
+ %gep1.addr = ptrtoaddr ptr addrspace(1) %gep1 to i32
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %sub = sub i32 %gep1.addr, %p.addr
+ %sdiv = sdiv i32 %sub, 3
+ %gep2 = getelementptr [3 x i8], ptr addrspace(1) %p, i32 %sdiv
+ ret ptr addrspace(1) %gep2
+}
+
+; Not folding this to inttoptr(123), as this may have different provenance from
+; %p, and the use of ptrtoaddr implies that the provenance of %p may not be
+; exposed, such that inttoptr cannot recover it.
+define ptr @gep_gep_neg_ptrtoaddr(ptr %p) {
+; CHECK-LABEL: define ptr @gep_gep_neg_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 123
+; CHECK-NEXT: [[P_ADDR:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT: [[P_ADDR_NEG:%.*]] = sub i64 0, [[P_ADDR]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 [[P_ADDR_NEG]]
+; CHECK-NEXT: ret ptr [[GEP2]]
+;
+ %gep1 = getelementptr inbounds i8, ptr %p, i64 123
+ %p.addr = ptrtoaddr ptr %p to i64
+ %p.addr.neg = sub i64 0, %p.addr
+ %gep2 = getelementptr i8, ptr %gep1, i64 %p.addr.neg
+ ret ptr %gep2
+}
+
+define ptr @gep_gep_inv_ptrtoaddr(ptr %p) {
+; CHECK-LABEL: define ptr @gep_gep_inv_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 123
+; CHECK-NEXT: [[P_ADDR:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT: [[P_ADDR_INV:%.*]] = xor i64 [[P_ADDR]], -1
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 [[P_ADDR_INV]]
+; CHECK-NEXT: ret ptr [[GEP2]]
+;
+ %gep1 = getelementptr inbounds i8, ptr %p, i64 123
+ %p.addr = ptrtoaddr ptr %p to i64
+ %p.addr.inv = xor i64 %p.addr, -1
+ %gep2 = getelementptr i8, ptr %gep1, i64 %p.addr.inv
+ ret ptr %gep2
+}
diff --git a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
index 2b5960e..90d0db2 100644
--- a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
@@ -56,7 +56,7 @@ if.then46: ; preds = %for.body40
br label %for.inc50
for.inc50: ; preds = %if.then46, %for.body40
- %k.1 = phi i32 [ undef, %for.body40 ], [ %inc47, %if.then46 ]
+ %k.1 = phi i32 [ 0, %for.body40 ], [ %inc47, %if.then46 ]
%step.1 = phi i32 [ %step.0121, %for.body40 ], [ %inc47, %if.then46 ]
%indvars.iv.next124 = add i64 %indvars.iv123, 1
%lftr.wideiv = trunc i64 %indvars.iv.next124 to i32
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 1c2840f..70532ad 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -1117,24 +1117,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -1143,42 +1143,42 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 16
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_LOAD5]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <2 x i32> [[WIDE_LOAD6]] to <2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI2]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <2 x i64> [[VEC_PHI3]], [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP15]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI1]], <4 x i64> [[TMP5]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI2]], <4 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI3]], <4 x i64> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <2 x i64> [[TMP10]], [[BIN_RDX]]
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <2 x i64> [[TMP11]], [[BIN_RDX7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX8]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX10:%.*]] = add <2 x i64> [[PARTIAL_REDUCE8]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX11:%.*]] = add <2 x i64> [[PARTIAL_REDUCE9]], [[BIN_RDX10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX11]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -1187,24 +1187,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
-; CHECK-MAXBW-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
index 20b5364..ebf4a4f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
@@ -9,9 +9,7 @@ define i32 @fn1() local_unnamed_addr #0 {
; We expect the backend to expand all reductions.
; CHECK: @llvm.vector.reduce
entry:
- %0 = load i32, ptr @b, align 4, !tbaa !1
- %cmp40 = icmp sgt i32 %0, 0
- br i1 %cmp40, label %for.body.lr.ph, label %for.end
+ br label %for.body.lr.ph
for.body.lr.ph: ; preds = %entry
%1 = load ptr, ptr @a, align 8, !tbaa !5
@@ -21,8 +19,8 @@ for.body.lr.ph: ; preds = %entry
for.body: ; preds = %for.body.lr.ph, %for.body
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
- %d.043 = phi i16 [ undef, %for.body.lr.ph ], [ %.sink28, %for.body ]
- %c.042 = phi i16 [ undef, %for.body.lr.ph ], [ %c.0., %for.body ]
+ %d.043 = phi i16 [ 0, %for.body.lr.ph ], [ %.sink28, %for.body ]
+ %c.042 = phi i16 [ 0, %for.body.lr.ph ], [ %c.0., %for.body ]
%arrayidx = getelementptr inbounds i16, ptr %1, i64 %indvars.iv
%4 = load i16, ptr %arrayidx, align 2, !tbaa !7
%cmp2 = icmp sgt i16 %c.042, %4
@@ -33,10 +31,8 @@ for.body: ; preds = %for.body.lr.ph, %fo
%cmp = icmp slt i64 %indvars.iv.next, %3
br i1 %cmp, label %for.body, label %for.end
-for.end: ; preds = %for.body, %entry
- %c.0.lcssa = phi i16 [ undef, %entry ], [ %c.0., %for.body ]
- %d.0.lcssa = phi i16 [ undef, %entry ], [ %.sink28, %for.body ]
- %cmp26 = icmp sgt i16 %c.0.lcssa, %d.0.lcssa
+for.end: ; preds = %for.body
+ %cmp26 = icmp sgt i16 %c.0., %.sink28
%conv27 = zext i1 %cmp26 to i32
ret i32 %conv27
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
index 44820e0..33ce300 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
@@ -18,7 +18,7 @@ define void @_Z1dv() local_unnamed_addr #0 {
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: [[F_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD5:%.*]], [[FOR_COND_CLEANUP:%.*]] ]
-; CHECK-NEXT: [[G_0:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ]
+; CHECK-NEXT: [[G_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ]
; CHECK-NEXT: [[CMP12:%.*]] = icmp ult i32 [[G_0]], 4
; CHECK-NEXT: [[CONV:%.*]] = and i32 [[F_0]], 65535
; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP]]
@@ -50,7 +50,7 @@ entry:
for.cond: ; preds = %for.cond.cleanup, %entry
%f.0 = phi i32 [ 0, %entry ], [ %add5, %for.cond.cleanup ]
- %g.0 = phi i32 [ undef, %entry ], [ %g.1.lcssa, %for.cond.cleanup ]
+ %g.0 = phi i32 [ 0, %entry ], [ %g.1.lcssa, %for.cond.cleanup ]
%cmp12 = icmp ult i32 %g.0, 4
%conv = and i32 %f.0, 65535
br i1 %cmp12, label %for.body.lr.ph, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index de804600..786a2aa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -728,8 +728,8 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
@@ -755,8 +755,8 @@ for.cond.cleanup: ; preds = %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
- %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
+ %SumB.014 = phi float [ 0.0e+00, %entry ], [ %add3, %for.body ]
+ %SumA.013 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%a = getelementptr inbounds %struct.IntFloat, ptr %p, i64 %indvars.iv, i32 0
%load1 = load i32, ptr %a, align 4
%add = add nsw i32 %load1, %SumA.013
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
index 44a48a9..0f398a6 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -84,15 +84,14 @@ for.end: ; preds = %for.end.loopexit, %
; CHECK: We can vectorize this loop!
define i32 @redi(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
entry:
- %cmp5 = icmp eq i32 %N, 0
- br i1 %cmp5, label %for.end, label %for.body.preheader
+ br label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %Red.06 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.07
%0 = load i32, ptr %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.07
@@ -107,9 +106,8 @@ for.end.loopexit: ; preds = %for.body
%add.lcssa = phi i32 [ %add, %for.body ]
br label %for.end
-for.end: ; preds = %for.end.loopexit, %entry
- %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
- ret i32 %Red.0.lcssa
+for.end: ; preds = %for.end.loopexit
+ ret i32 %add.lcssa
}
; Floating-point loops need fast-math to be vectorizeable
@@ -121,15 +119,14 @@ for.end: ; preds = %for.end.loopexit, %
; DARWIN: We can vectorize this loop!
define float @redf(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
entry:
- %cmp5 = icmp eq i32 %N, 0
- br i1 %cmp5, label %for.end, label %for.body.preheader
+ br label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %Red.06 = phi float [ %add, %for.body ], [ 0.0e+00, %for.body.preheader ]
%arrayidx = getelementptr inbounds float, ptr %a, i32 %i.07
%0 = load float, ptr %arrayidx, align 4
%arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.07
@@ -144,9 +141,8 @@ for.end.loopexit: ; preds = %for.body
%add.lcssa = phi float [ %add, %for.body ]
br label %for.end
-for.end: ; preds = %for.end.loopexit, %entry
- %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
- ret float %Red.0.lcssa
+for.end: ; preds = %for.end.loopexit
+ ret float %add.lcssa
}
; Make sure calls that turn into builtins are also covered
@@ -252,7 +248,7 @@ for.body.preheader: ; preds = %entry
for.body: ; preds = %for.body.preheader, %for.body
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %Red.06 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.07
%0 = load i32, ptr %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.07
@@ -268,7 +264,7 @@ for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
- %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ %Red.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ]
ret i32 %Red.0.lcssa
}
@@ -277,15 +273,14 @@ for.end: ; preds = %for.end.loopexit, %
; CHECK: We can vectorize this loop!
define float @redf_fast(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
entry:
- %cmp5 = icmp eq i32 %N, 0
- br i1 %cmp5, label %for.end, label %for.body.preheader
+ br label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %Red.06 = phi float [ %add, %for.body ], [ 0.0e+00, %for.body.preheader ]
%arrayidx = getelementptr inbounds float, ptr %a, i32 %i.07
%0 = load float, ptr %arrayidx, align 4
%arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.07
@@ -300,9 +295,8 @@ for.end.loopexit: ; preds = %for.body
%add.lcssa = phi float [ %add, %for.body ]
br label %for.end
-for.end: ; preds = %for.end.loopexit, %entry
- %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
- ret float %Red.0.lcssa
+for.end: ; preds = %for.end.loopexit
+ ret float %add.lcssa
}
; Make sure calls that turn into builtins are also covered
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
index fe3504b..23609b1 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
@@ -249,7 +249,7 @@ define dso_local i32 @predicated_test(i32 noundef %0, ptr %glob) #0 {
br label %7
7: ; preds = %5, %155
- %8 = phi i32 [ %10, %155 ], [ undef, %5 ]
+ %8 = phi i32 [ %10, %155 ], [ 0, %5 ]
%9 = phi i32 [ %156, %155 ], [ 0, %5 ]
%10 = shl i32 %8, 4
store i32 %10, ptr %6, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index c5319c6..f4c7c6f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -246,6 +246,57 @@ exit:
ret void
}
+; Test for https://github.com/llvm/llvm-project/issues/162688.
+define void @test_minbws_for_trunc(i32 %n, ptr noalias %p1, ptr noalias %p2) {
+; CHECK-LABEL: define void @test_minbws_for_trunc(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_EXT:%.*]] = sext i16 [[IV]] to i64
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[P1]], i64 [[IV_EXT]]
+; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[V1_TRUNC:%.*]] = trunc i32 [[V1]] to i16
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr [1 x [1 x i16]], ptr [[P2]], i64 [[IV_EXT]]
+; CHECK-NEXT: store i16 [[V1_TRUNC]], ptr [[GEP2]], align 2
+; CHECK-NEXT: [[V1_TRUNC_I8:%.*]] = trunc i32 [[V1]] to i8
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[P2]], i64 [[IV_EXT]]
+; CHECK-NEXT: store i8 [[V1_TRUNC_I8]], ptr [[GEP3]], align 1
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr [1 x i64], ptr [[P2]], i64 [[IV_EXT]]
+; CHECK-NEXT: store i64 0, ptr [[GEP4]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 4
+; CHECK-NEXT: [[IV_NEXT_EXT:%.*]] = sext i16 [[IV_NEXT]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[IV_NEXT_EXT]], 1024
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv.ext = sext i16 %iv to i64
+ %gep1 = getelementptr i32, ptr %p1, i64 %iv.ext
+ %v1 = load i32, ptr %gep1, align 4
+ %v1.trunc = trunc i32 %v1 to i16
+ %gep2 = getelementptr [1 x [1 x i16]], ptr %p2, i64 %iv.ext
+ store i16 %v1.trunc, ptr %gep2, align 2
+ %v1.trunc.i8 = trunc i32 %v1 to i8
+ %gep3 = getelementptr i8, ptr %p2, i64 %iv.ext
+ store i8 %v1.trunc.i8, ptr %gep3, align 1
+ %gep4 = getelementptr [1 x i64], ptr %p2, i64 %iv.ext
+ store i64 0, ptr %gep4, align 8
+ %iv.next = add i16 %iv, 4
+ %iv.next.ext = sext i16 %iv.next to i32
+ %cmp = icmp ne i32 %iv.next.ext, 1024
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
+
attributes #0 = { "target-features"="+64bit,+v,+zvl256b" }
attributes #1 = { "target-features"="+64bit,+v" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll
index 2ef9d4b..3718ad2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll
@@ -14,7 +14,7 @@ define x86_fp80 @test() {
; CHECK-NEXT: br label [[FOR_BODY3_I_3:%.*]]
; CHECK: for.body3.i.3:
; CHECK-NEXT: [[N_ADDR_112_I_3:%.*]] = phi i64 [ [[DEC_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 24, [[FOO_EXIT:%.*]] ]
-; CHECK-NEXT: [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ undef, [[FOO_EXIT]] ]
+; CHECK-NEXT: [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 0xK00000000000000000000, [[FOO_EXIT]] ]
; CHECK-NEXT: [[MUL_I_3]] = fmul x86_fp80 [[X_ADDR_111_I_3]], 0xK40008000000000000000
; CHECK-NEXT: [[DEC_I_3]] = add nsw i64 [[N_ADDR_112_I_3]], -1
; CHECK-NEXT: [[CMP2_I_3:%.*]] = icmp sgt i64 [[N_ADDR_112_I_3]], 1
@@ -28,7 +28,7 @@ foo.exit:
for.body3.i.3: ; preds = %for.body3.i.3, %foo.exit
%n.addr.112.i.3 = phi i64 [ %dec.i.3, %for.body3.i.3 ], [ 24, %foo.exit ]
- %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ undef, %foo.exit ]
+ %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ zeroinitializer, %foo.exit ]
%mul.i.3 = fmul x86_fp80 %x.addr.111.i.3, 0xK40008000000000000000
%dec.i.3 = add nsw i64 %n.addr.112.i.3, -1
%cmp2.i.3 = icmp sgt i64 %n.addr.112.i.3, 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll
index df1c4f9..5321d69 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll
@@ -21,10 +21,10 @@ while.cond63.preheader.while.end76_crit_edge:
ret void
while.body:
- %d2_fx.015 = phi double [ %sub52, %while.body ], [ undef, %entry ]
- %d2_fy.014 = phi double [ %sub58, %while.body ], [ undef, %entry ]
- %d3_fy.013 = phi double [ %div56, %while.body ], [ undef, %entry ]
- %d3_fx.012 = phi double [ %div50, %while.body ], [ undef, %entry ]
+ %d2_fx.015 = phi double [ %sub52, %while.body ], [ 0.0e+00, %entry ]
+ %d2_fy.014 = phi double [ %sub58, %while.body ], [ 0.0e+00, %entry ]
+ %d3_fy.013 = phi double [ %div56, %while.body ], [ 0.0e+00, %entry ]
+ %d3_fx.012 = phi double [ %div50, %while.body ], [ 0.0e+00, %entry ]
%div50 = fmul double %d3_fx.012, 1.250000e-01
%sub52 = fsub double 0.000000e+00, %div50
%div56 = fmul double %d3_fy.013, 1.250000e-01
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll
index 368361f..0f55d79 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll
@@ -15,7 +15,7 @@ for.cond.cleanup:
for.body:
%i.09 = phi i16 [ 0, %entry ], [ %add3, %for.body ]
- %res.08 = phi x86_fp80 [ undef, %entry ], [ %3, %for.body ]
+ %res.08 = phi x86_fp80 [ zeroinitializer, %entry ], [ %3, %for.body ]
%arrayidx = getelementptr inbounds x86_fp80, ptr %a, i16 %i.09
%0 = load x86_fp80, ptr %arrayidx, align 1
%add = or i16 %i.09, 1
diff --git a/llvm/test/Transforms/LoopVectorize/i8-induction.ll b/llvm/test/Transforms/LoopVectorize/i8-induction.ll
index 220fd64..712c75d 100644
--- a/llvm/test/Transforms/LoopVectorize/i8-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/i8-induction.ll
@@ -20,7 +20,7 @@ scalar.ph:
for.body:
%mul16 = phi i8 [ 0, %scalar.ph ], [ %mul, %for.body ] ; <------- i8 induction var.
- %c.015 = phi i8 [ undef, %scalar.ph ], [ %conv8, %for.body ]
+ %c.015 = phi i8 [ 0, %scalar.ph ], [ %conv8, %for.body ]
%conv2 = sext i8 %c.015 to i32
%tobool = icmp ne i8 %c.015, 0
%.sink = select i1 %tobool, i8 %c.015, i8 %0
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index f7376a0..c164c4a 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -277,7 +277,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_2_LCSSA:%.*]] = phi i32 [ [[INEWCHUNKS_2]], [[FOR_INC23]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC26]]
; UNROLL-NOSIMPLIFY: for.inc26:
-; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_1_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY9]] ], [ [[INEWCHUNKS_2_LCSSA]], [[FOR_INC26_LOOPEXIT]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_1_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY9]] ], [ [[INEWCHUNKS_2_LCSSA]], [[FOR_INC26_LOOPEXIT]] ]
; UNROLL-NOSIMPLIFY-NEXT: unreachable
;
; VEC-LABEL: @bug18724(
@@ -376,7 +376,7 @@ for.inc23:
br i1 %cmp13, label %for.body14, label %for.inc26
for.inc26:
- %iNewChunks.1.lcssa = phi i32 [ undef, %for.body9 ], [ %iNewChunks.2, %for.inc23 ]
+ %iNewChunks.1.lcssa = phi i32 [ 0, %for.body9 ], [ %iNewChunks.2, %for.inc23 ]
unreachable
}
diff --git a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
index 9e75002..5cf99b8 100644
--- a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
@@ -53,8 +53,8 @@ thread-pre-split.loopexit: ; preds = %11, %.thread-pre-sp
br i1 false, label %thread-pre-split._crit_edge, label %.lr.ph21
.lr.ph21: ; preds = %26, %thread-pre-split.loopexit, %thread-pre-split.preheader
- %d.020 = phi ptr [ undef, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ]
- %10 = phi i64 [ %28, %26 ], [ undef, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ]
+ %d.020 = phi ptr [ zeroinitializer, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ zeroinitializer, %thread-pre-split.preheader ]
+ %10 = phi i64 [ %28, %26 ], [ zeroinitializer, %thread-pre-split.loopexit ], [ zeroinitializer, %thread-pre-split.preheader ]
br i1 %arg, label %11, label %22
; <label>:11 ; preds = %.lr.ph21
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 16357b3..8efe29a 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -791,8 +791,8 @@ define void @int_float_struct(ptr nocapture readonly %A) #0 {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -822,8 +822,8 @@ for.cond.cleanup: ; preds = %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
- %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
+ %SumB.014 = phi float [ 0.0e+00, %entry ], [ %add3, %for.body ]
+ %SumA.013 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%a = getelementptr inbounds %struct.IntFloat, ptr %A, i64 %indvars.iv, i32 0
%tmp = load i32, ptr %a, align 4
%add = add nsw i32 %tmp, %SumA.013
diff --git a/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll b/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll
index 741ca55..1675a59 100644
--- a/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll
+++ b/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll
@@ -48,7 +48,7 @@ for.cond.cleanup.loopexit:
br label %for.cond.cleanup, !dbg !33
for.cond.cleanup:
- %2 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ undef, %entry ], !dbg !33
+ %2 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ 0, %entry ], !dbg !33
%sub = add nsw i32 %0, -5, !dbg !33
%idxprom3 = sext i32 %sub to i64, !dbg !33
%arrayidx4 = getelementptr inbounds i32, ptr %vla, i64 %idxprom3, !dbg !33
diff --git a/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll b/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
index 659dc62..2038633 100644
--- a/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
+++ b/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
@@ -22,8 +22,8 @@ entry:
br label %for.body
for.body:
- %inc107 = phi i32 [ undef, %entry ], [ %inc10, %for.body ]
- %inc6 = phi i32 [ %nf.promoted, %entry ], [ undef, %for.body ]
+ %inc107 = phi i32 [ 0, %entry ], [ %inc10, %for.body ]
+ %inc6 = phi i32 [ %nf.promoted, %entry ], [ 3, %for.body ]
%add55 = phi i32 [ %n.promoted, %entry ], [ %add5, %for.body ]
%.neg2 = sub i32 0, %inc6
%add.neg = add i32 0, %add55
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index ec7fde8..964a257 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -1207,11 +1207,11 @@ for.end:
define i32 @reduction_sum_multiuse(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-LABEL: define i32 @reduction_sum_multiuse(
; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
-; CHECK-NEXT: [[_LR_PH1:.*]]:
+; CHECK-NEXT: [[_LR_PH:.*]]:
; CHECK-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK: [[_LR_PH:.*:]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
+; CHECK: [[_LR_PH1:.*:]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ]
+; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ]
; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
@@ -1231,11 +1231,11 @@ define i32 @reduction_sum_multiuse(ptr noalias nocapture %A, ptr noalias nocaptu
;
; CHECK-INTERLEAVED-LABEL: define i32 @reduction_sum_multiuse(
; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
-; CHECK-INTERLEAVED-NEXT: [[_LR_PH1:.*]]:
+; CHECK-INTERLEAVED-NEXT: [[_LR_PH:.*]]:
; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK-INTERLEAVED: [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
+; CHECK-INTERLEAVED: [[_LR_PH1:.*:]]
+; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ]
; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
; CHECK-INTERLEAVED-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
@@ -1947,7 +1947,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], zeroinitializer
@@ -1966,7 +1966,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY2:.*]]
; CHECK: [[FOR_BODY2]]:
; CHECK-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -2002,7 +2002,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h
; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-INTERLEAVED: [[VECTOR_BODY]]:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
@@ -2033,7 +2033,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]]
; CHECK-INTERLEAVED: [[SCALAR_PH]]:
; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY2:.*]]
; CHECK-INTERLEAVED: [[FOR_BODY2]]:
; CHECK-INTERLEAVED-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -2063,7 +2063,7 @@ entry:
for.body2: ; preds = %entry, %for.inc5
%a.117 = phi i32 [ %inc6, %for.inc5 ], [ 0, %entry ]
- %g.016 = phi i32 [ %g.1, %for.inc5 ], [ undef, %entry ]
+ %g.016 = phi i32 [ %g.1, %for.inc5 ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, ptr %h, i32 %a.117
%0 = load i8, ptr %arrayidx, align 1
%tobool3.not = icmp eq i8 %0, 0
@@ -2100,7 +2100,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], zeroinitializer
@@ -2122,7 +2122,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY2:.*]]
; CHECK: [[FOR_BODY2]]:
; CHECK-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -2159,7 +2159,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-INTERLEAVED: [[VECTOR_BODY]]:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
@@ -2196,7 +2196,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]]
; CHECK-INTERLEAVED: [[SCALAR_PH]]:
; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY2:.*]]
; CHECK-INTERLEAVED: [[FOR_BODY2]]:
; CHECK-INTERLEAVED-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -2227,7 +2227,7 @@ entry:
for.body2: ; preds = %entry, %for.inc5
%a.117 = phi i32 [ %inc6, %for.inc5 ], [ 0, %entry ]
- %g.016 = phi i32 [ %g.1, %for.inc5 ], [ undef, %entry ]
+ %g.016 = phi i32 [ %g.1, %for.inc5 ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, ptr %h, i32 %a.117
%0 = load i8, ptr %arrayidx, align 1
%tobool3.not = icmp eq i8 %0, 0
@@ -2263,7 +2263,7 @@ define i32 @predicated_or_dominates_reduction(ptr %b) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6:.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
@@ -2340,7 +2340,7 @@ define i32 @predicated_or_dominates_reduction(ptr %b) {
; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-INTERLEAVED: [[VECTOR_BODY]]:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE15:.*]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP94:%.*]], %[[PRED_LOAD_CONTINUE15]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP94:%.*]], %[[PRED_LOAD_CONTINUE15]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP98:%.*]], %[[PRED_LOAD_CONTINUE15]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
@@ -2480,7 +2480,7 @@ for.cond.cleanup: ; preds = %for.inc
for.body: ; preds = %entry, %for.inc
%g.09 = phi i32 [ 0, %entry ], [ %inc3, %for.inc ]
- %a.08 = phi i32 [ undef, %entry ], [ %a.1, %for.inc ]
+ %a.08 = phi i32 [ 0, %entry ], [ %a.1, %for.inc ]
%d = getelementptr inbounds [0 x %struct.e], ptr %b, i32 0, i32 %g.09, i32 1
%0 = load i32, ptr %d, align 4
%tobool.not = icmp eq i32 %0, 0
diff --git a/llvm/test/Transforms/LoopVectorize/reverse_iter.ll b/llvm/test/Transforms/LoopVectorize/reverse_iter.ll
index 43e916b..6576675 100644
--- a/llvm/test/Transforms/LoopVectorize/reverse_iter.ll
+++ b/llvm/test/Transforms/LoopVectorize/reverse_iter.ll
@@ -17,8 +17,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
;CHECK: <i32 0, i32 -1, i32 -2, i32 -3>
;CHECK: ret
define i32 @foo(i32 %n, ptr nocapture %A) {
- %1 = icmp sgt i32 %n, 0
- br i1 %1, label %.lr.ph, label %._crit_edge
+ br label %.lr.ph
.lr.ph: ; preds = %0
%2 = sext i32 %n to i64
@@ -26,7 +25,7 @@ define i32 @foo(i32 %n, ptr nocapture %A) {
; <label>:3 ; preds = %.lr.ph, %3
%indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ]
- %sum.01 = phi i32 [ undef, %.lr.ph ], [ %9, %3 ]
+ %sum.01 = phi i32 [ 0, %.lr.ph ], [ %9, %3 ]
%4 = trunc i64 %indvars.iv to i32
%5 = shl nsw i32 %4, 1
%6 = sext i32 %5 to i64
@@ -38,8 +37,7 @@ define i32 @foo(i32 %n, ptr nocapture %A) {
%11 = icmp sgt i32 %10, 0
br i1 %11, label %3, label %._crit_edge
-._crit_edge: ; preds = %3, %0
- %sum.0.lcssa = phi i32 [ undef, %0 ], [ %9, %3 ]
- ret i32 %sum.0.lcssa
+._crit_edge: ; preds = %3
+ ret i32 %9
}
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index 60da336..1216bc1 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -178,13 +178,14 @@ for.exit:
define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF1-LABEL: define i32 @recurrence_2(
; CHECK-VF4UF1-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) {
-; CHECK-VF4UF1-NEXT: [[ENTRY:.*]]:
-; CHECK-VF4UF1-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-VF4UF1-NEXT: br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK-VF4UF1-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4UF1-NEXT: br label %[[FOR_PREHEADER:.*]]
; CHECK-VF4UF1: [[FOR_PREHEADER]]:
; CHECK-VF4UF1-NEXT: [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1
; CHECK-VF4UF1-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
-; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4UF1-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1
; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
@@ -202,7 +203,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF1: [[VECTOR_BODY]]:
; CHECK-VF4UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
; CHECK-VF4UF1-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
@@ -225,25 +226,25 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF1: [[SCALAR_PH]]:
; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ]
; CHECK-VF4UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
-; CHECK-VF4UF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
; CHECK-VF4UF1-NEXT: br label %[[SCALAR_BODY:.*]]
; CHECK-VF4UF1: [[FOR_COND_CLEANUP_LOOPEXIT]]:
-; CHECK-VF4UF1-NEXT: [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF1-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK-VF4UF1-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4UF1-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; CHECK-VF4UF1: [[FOR_COND_CLEANUP]]:
-; CHECK-VF4UF1-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
; CHECK-VF4UF1-NEXT: ret i32 [[MINMAX_0_LCSSA]]
; CHECK-VF4UF1: [[SCALAR_BODY]]:
;
; CHECK-VF4UF2-LABEL: define i32 @recurrence_2(
; CHECK-VF4UF2-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) {
-; CHECK-VF4UF2-NEXT: [[ENTRY:.*]]:
-; CHECK-VF4UF2-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-VF4UF2-NEXT: br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK-VF4UF2-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4UF2-NEXT: br label %[[FOR_PREHEADER:.*]]
; CHECK-VF4UF2: [[FOR_PREHEADER]]:
; CHECK-VF4UF2-NEXT: [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1
; CHECK-VF4UF2-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
-; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4UF2-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF2-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1
; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3
; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
@@ -261,8 +262,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF2: [[VECTOR_BODY]]:
; CHECK-VF4UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD2:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
; CHECK-VF4UF2-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 2
@@ -296,19 +297,17 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF2: [[SCALAR_PH]]:
; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ]
; CHECK-VF4UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
-; CHECK-VF4UF2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
; CHECK-VF4UF2-NEXT: br label %[[SCALAR_BODY:.*]]
; CHECK-VF4UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]:
-; CHECK-VF4UF2-NEXT: [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF2-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK-VF4UF2-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4UF2-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; CHECK-VF4UF2: [[FOR_COND_CLEANUP]]:
-; CHECK-VF4UF2-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
; CHECK-VF4UF2-NEXT: ret i32 [[MINMAX_0_LCSSA]]
; CHECK-VF4UF2: [[SCALAR_BODY]]:
;
entry:
- %cmp27 = icmp sgt i32 %n, 0
- br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
+ br label %for.preheader
for.preheader:
%arrayidx2.phi.trans.insert = getelementptr inbounds i32, ptr %a, i64 -1
@@ -320,13 +319,12 @@ for.cond.cleanup.loopexit:
br label %for.cond.cleanup
for.cond.cleanup:
- %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
- ret i32 %minmax.0.lcssa
+ ret i32 %minmax.0.cond.lcssa
scalar.body:
%0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ]
%indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
- %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
+ %minmax.028 = phi i32 [ 0, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
%1 = load i32, ptr %arrayidx, align 4
%sub3 = sub nsw i32 %1, %0
diff --git a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
index 58ad64d..8224d6b 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
@@ -16,7 +16,7 @@ entry:
br label %for.cond
for.cond: ; preds = %for.cond, %entry
- %i.0 = phi i32 [ undef, %entry ], [ %inc, %for.cond ]
+ %i.0 = phi i32 [ poison, %entry ], [ %inc, %for.cond ]
%cmp = icmp slt i32 %i.0, 0
%fsub = fsub double undef, undef
%fadd = fadd double %fsub, 1.000000e+00
@@ -36,7 +36,7 @@ for.cond7.preheader.lr.ph: ; preds = %for.cond4.preheader
for.cond7.preheader: ; preds = %for.cond7.preheader.lr.ph, %for.inc23
%y.017 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %inc24, %for.inc23 ]
%i.116 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %i.2.lcssa, %for.inc23 ]
- %n.015 = phi i32 [ undef, %for.cond7.preheader.lr.ph ], [ %inc25, %for.inc23 ]
+ %n.015 = phi i32 [ poison, %for.cond7.preheader.lr.ph ], [ %inc25, %for.inc23 ]
%1 = load i32, ptr @b, align 4, !tbaa !5
%tobool11 = icmp eq i32 %1, 0
br i1 %tobool11, label %for.inc23, label %for.body8.lr.ph
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
index f4d5a84..cd9eb72 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
@@ -23,7 +23,7 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) {
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP2]])
; CHECK-NEXT: [[TMP5:%.*]] = freeze i1 [[TMP4]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 undef
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 poison
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret i64 [[RDX_SELECT]]
@@ -33,7 +33,7 @@ entry:
loop:
%iv = phi i32 [ 1, %entry ], [ %add, %loop ]
- %red = phi i64 [ undef, %entry ], [ %select, %loop ]
+ %red = phi i64 [ poison, %entry ], [ %select, %loop ]
%gep = getelementptr inbounds i32, ptr %src, i32 %iv
%l = load i32, ptr %gep
%c = icmp eq i32 %l, 1
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 358f1b0..71311db 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -102,7 +102,7 @@ define void @blend_chain_iv(i1 %c) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[PREDPHI1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> undef
+; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> poison
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 2
@@ -139,11 +139,11 @@ loop.next.2:
br label %loop.next.3
loop.next.3:
- %blend.1 = phi i64 [ undef, %loop.next ], [ %iv, %loop.next.2 ]
+ %blend.1 = phi i64 [ poison, %loop.next ], [ %iv, %loop.next.2 ]
br label %loop.latch
loop.latch: ; preds = %loop.next, %loop.header
- %blend = phi i64 [ undef, %loop.header ], [ %blend.1, %loop.next.3 ]
+ %blend = phi i64 [ poison, %loop.header ], [ %blend.1, %loop.next.3 ]
%dst.ptr = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 %blend
store i16 0, ptr %dst.ptr
%iv.next = add nuw nsw i64 %iv, 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
index 3ee9cf8..ffdff21 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
@@ -124,18 +124,14 @@ define void @test_known_trip_count() {
; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[WIDE_LOAD3_13]], [[WIDE_LOAD5_13]]
; CHECK-NEXT: store <2 x double> [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 416), align 16
; CHECK-NEXT: store <2 x double> [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 432), align 16
-; CHECK-NEXT: [[WIDE_LOAD_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @b, i64 448), align 16
-; CHECK-NEXT: [[WIDE_LOAD3_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @b, i64 464), align 16
-; CHECK-NEXT: [[WIDE_LOAD4_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 448), align 16
-; CHECK-NEXT: [[WIDE_LOAD5_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 464), align 16
-; CHECK-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[WIDE_LOAD_14]], [[WIDE_LOAD4_14]]
-; CHECK-NEXT: [[TMP29:%.*]] = fadd <2 x double> [[WIDE_LOAD3_14]], [[WIDE_LOAD5_14]]
-; CHECK-NEXT: store <2 x double> [[TMP28]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 448), align 16
-; CHECK-NEXT: store <2 x double> [[TMP29]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 464), align 16
-; CHECK-NEXT: [[TMP30:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @b, i64 480), align 16
-; CHECK-NEXT: [[TMP31:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @c, i64 480), align 16
+; CHECK-NEXT: [[TMP30:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @b, i64 448), align 16
+; CHECK-NEXT: [[TMP31:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @c, i64 448), align 16
; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[TMP31]]
-; CHECK-NEXT: store double [[ADD]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 480), align 16
+; CHECK-NEXT: store double [[ADD]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 448), align 16
+; CHECK-NEXT: [[TMP32:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @b, i64 456), align 8
+; CHECK-NEXT: [[TMP33:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @c, i64 456), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd double [[TMP32]], [[TMP33]]
+; CHECK-NEXT: store double [[ADD_1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 456), align 8
; CHECK-NEXT: ret void
;
entry:
@@ -143,7 +139,7 @@ entry:
for.cond:
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
- %cmp = icmp slt i32 %i.0, 61
+ %cmp = icmp slt i32 %i.0, 58
br i1 %cmp, label %for.body, label %exit
for.body:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll
new file mode 100644
index 0000000..d16843c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define ptr @test(ptr %d) {
+; CHECK-LABEL: define ptr @test(
+; CHECK-SAME: ptr [[D:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[CMP4_2:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, 0
+; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 1, 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <6 x i64> poison, i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <6 x i64> [[TMP5]], i64 [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x i64> [[TMP6]], i64 [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x i64> [[TMP7]], <6 x i64> poison, <6 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4>
+; CHECK-NEXT: [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 1, i64 1, i64 1, i64 0>
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <6 x i64> [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <6 x i64> [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <6 x i64> [[TMP9]], i32 2
+; CHECK-NEXT: [[SCEVGEP42:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <6 x i64> [[TMP9]], i32 3
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <6 x i64> [[TMP9]], i32 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <6 x i64> [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP19]]
+; CHECK-NEXT: ret ptr [[TMP20]]
+;
+entry:
+ %0 = load i8, ptr null, align 1
+ %cmp4.2 = icmp eq i8 %0, 0
+ %1 = select i1 %cmp4.2, i64 0, i64 0
+ %2 = shl i64 %1, 1
+ %3 = getelementptr i8, ptr %d, i64 %2
+ %4 = xor i64 0, 0
+ %5 = udiv i64 %4, 0
+ %6 = mul i64 %5, 6
+ %7 = getelementptr i8, ptr %d, i64 %6
+ %8 = shl i64 %1, 0
+ %scevgep42 = getelementptr i8, ptr %d, i64 %8
+ %9 = mul i64 %5, 1
+ %10 = getelementptr i8, ptr %d, i64 %9
+ %11 = udiv i64 1, 0
+ %12 = mul i64 %11, 1
+ %13 = getelementptr i8, ptr %d, i64 %12
+ %14 = mul i64 %11, 0
+ %15 = getelementptr i8, ptr %d, i64 %14
+ ret ptr %15
+}
diff --git a/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll b/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll
index c966bed..c7500cf 100644
--- a/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll
+++ b/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll
@@ -1,147 +1,234 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt < %s -S -passes=speculative-execution \
; RUN: -spec-exec-max-speculation-cost 4 -spec-exec-max-not-hoisted 3 \
; RUN: | FileCheck %s
-; CHECK-LABEL: @ifThen_bitcast(
-; CHECK: bitcast
-; CHECK: br i1 true
-define void @ifThen_bitcast() {
+define void @ifThen_bitcast(i32 %arg) {
+; CHECK-LABEL: define void @ifThen_bitcast(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = bitcast i32 [[ARG]] to float
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = bitcast i32 undef to float
+ %x = bitcast i32 %arg to float
br label %b
b:
ret void
}
-; CHECK-LABEL: @ifThen_ptrtoint(
-; CHECK: ptrtoint
-; CHECK: br i1 true
-define void @ifThen_ptrtoint() {
+define void @ifThen_ptrtoint(ptr %arg) {
+; CHECK-LABEL: define void @ifThen_ptrtoint(
+; CHECK-SAME: ptr [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = ptrtoint ptr [[ARG]] to i64
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = ptrtoint ptr undef to i64
+ %x = ptrtoint ptr %arg to i64
br label %b
b:
ret void
}
-; CHECK-LABEL: @ifThen_inttoptr(
-; CHECK: inttoptr
-; CHECK: br i1 true
-define void @ifThen_inttoptr() {
+define void @ifThen_ptrtoaddr(ptr %arg) {
+; CHECK-LABEL: define void @ifThen_ptrtoaddr(
+; CHECK-SAME: ptr [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = ptrtoaddr ptr [[ARG]] to i64
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = inttoptr i64 undef to ptr
+ %x = ptrtoaddr ptr %arg to i64
br label %b
b:
ret void
}
-; CHECK-LABEL: @ifThen_addrspacecast(
-; CHECK: addrspacecast
-; CHECK: br i1 true
-define void @ifThen_addrspacecast() {
+define void @ifThen_inttoptr(i64 %arg) {
+; CHECK-LABEL: define void @ifThen_inttoptr(
+; CHECK-SAME: i64 [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = inttoptr i64 [[ARG]] to ptr
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
+ br i1 true, label %a, label %b
+
+a:
+ %x = inttoptr i64 %arg to ptr
+ br label %b
+
+b:
+ ret void
+}
+
+define void @ifThen_addrspacecast(ptr %arg) {
+; CHECK-LABEL: define void @ifThen_addrspacecast(
+; CHECK-SAME: ptr [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(1)
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = addrspacecast ptr undef to ptr addrspace(1)
+ %x = addrspacecast ptr %arg to ptr addrspace(1)
br label %b
b:
ret void
}
-; CHECK-LABEL: @ifThen_fptoui(
-; CHECK: fptoui
-; CHECK: br i1 true
-define void @ifThen_fptoui() {
+define void @ifThen_fptoui(float %arg) {
+; CHECK-LABEL: define void @ifThen_fptoui(
+; CHECK-SAME: float [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = fptoui float [[ARG]] to i32
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = fptoui float undef to i32
+ %x = fptoui float %arg to i32
br label %b
b:
ret void
}
-; CHECK-LABEL: @ifThen_fptosi(
-; CHECK: fptosi
-; CHECK: br i1 true
-define void @ifThen_fptosi() {
+define void @ifThen_fptosi(float %arg) {
+; CHECK-LABEL: define void @ifThen_fptosi(
+; CHECK-SAME: float [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = fptosi float [[ARG]] to i32
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = fptosi float undef to i32
+ %x = fptosi float %arg to i32
br label %b
b:
ret void
}
-; CHECK-LABEL: @ifThen_uitofp(
-; CHECK: uitofp
-; CHECK: br i1 true
-define void @ifThen_uitofp() {
+define void @ifThen_uitofp(i32 %arg) {
+; CHECK-LABEL: define void @ifThen_uitofp(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = uitofp i32 [[ARG]] to float
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = uitofp i32 undef to float
+ %x = uitofp i32 %arg to float
br label %b
b:
ret void
}
-; CHECK-LABEL: @ifThen_sitofp(
-; CHECK: sitofp
-; CHECK: br i1 true
-define void @ifThen_sitofp() {
+define void @ifThen_sitofp(i32 %arg) {
+; CHECK-LABEL: define void @ifThen_sitofp(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = sitofp i32 [[ARG]] to float
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = sitofp i32 undef to float
+ %x = sitofp i32 %arg to float
br label %b
b:
ret void
}
-; CHECK-LABEL: @ifThen_fpext(
-; CHECK: fpext
-; CHECK: br i1 true
-define void @ifThen_fpext() {
+define void @ifThen_fpext(float %arg) {
+; CHECK-LABEL: define void @ifThen_fpext(
+; CHECK-SAME: float [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = fpext float [[ARG]] to double
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = fpext float undef to double
+ %x = fpext float %arg to double
br label %b
b:
ret void
}
-; CHECK-LABEL: @ifThen_fptrunc(
-; CHECK: fptrunc
-; CHECK: br i1 true
-define void @ifThen_fptrunc() {
+define void @ifThen_fptrunc(double %arg) {
+; CHECK-LABEL: define void @ifThen_fptrunc(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = fptrunc double [[ARG]] to float
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = fptrunc double undef to float
+ %x = fptrunc double %arg to float
br label %b
b:
ret void
}
-; CHECK-LABEL: @ifThen_trunc(
-; CHECK: trunc
-; CHECK: br i1 true
-define void @ifThen_trunc() {
+define void @ifThen_trunc(i32 %arg) {
+; CHECK-LABEL: define void @ifThen_trunc(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = trunc i32 [[ARG]] to i16
+; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]]
+; CHECK: [[A]]:
+; CHECK-NEXT: br label %[[B]]
+; CHECK: [[B]]:
+; CHECK-NEXT: ret void
+;
br i1 true, label %a, label %b
a:
- %x = trunc i32 undef to i16
+ %x = trunc i32 %arg to i16
br label %b
b:
diff --git a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml
index 42ef7f3..5f19a29 100644
--- a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml
+++ b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml
@@ -4,6 +4,7 @@
# RUN: split-file %s %t
# RUN: yaml2obj %t/merged_callsites.dSYM.yaml -o %t/merged_callsites.dSYM
+# The object file is manually tampered with such that the LLVM_stmt_seq of function_bad_stmt_seq is 0xFFFFFFFF.
# RUN: llvm-gsymutil --num-threads=1 --convert=%t/merged_callsites.dSYM --merged-functions --callsites-yaml-file=%t/callsites.yaml -o %t/call_sites_dSYM.gsym
# RUN: llvm-gsymutil --num-threads=1 --convert=%t/merged_callsites.dSYM --merged-functions --dwarf-callsites -o %t/dwarf_call_sites_dSYM.gsym
@@ -36,7 +37,13 @@
# CHECK-MERGED-CALLSITES: CallSites (by relative return offset):
# CHECK-MERGED-CALLSITES-NEXT: 0x[[#%.4x,]] Flags[None] MatchRegex[function2_copy1]
-# CHECK-MERGED-CALLSITES: FunctionInfo @ 0x[[#%x,MAIN:]]: [0x[[#%x,MAIN_START:]] - 0x[[#%x,MAIN_END:]]) "main"
+# If we don't do anything, a function with bad LLVM_stmt_seq won't have any call site filter
+# More importantly, the line number will be at the function definition.
+# CHECK-MERGED-CALLSITES: FunctionInfo @ 0x[[#%x,BAD_STMT_SEQ:]]: [0x[[#%x,BAD_STMT_SEQ_START:]] - 0x[[#%x,BAD_STMT_SEQ_END:]]) "function_bad_stmt_seq"
+# CHECK-MERGED-CALLSITES-NEXT: LineTable:
+# CHECK-MERGED-CALLSITES-NEXT: 0x00000001000003b0 ./merged_funcs_test.cpp:65
+
+# CHECK-MERGED-CALLSITES-NEXT: FunctionInfo @ 0x[[#%x,MAIN:]]: [0x[[#%x,MAIN_START:]] - 0x[[#%x,MAIN_END:]]) "main"
# CHECK-MERGED-CALLSITES: CallSites (by relative return offset):
# CHECK-MERGED-CALLSITES-NEXT: 0x[[#%.4x,]] Flags[None] MatchRegex[function1]
# CHECK-MERGED-CALLSITES-NEXT: 0x[[#%.4x,]] Flags[None] MatchRegex[function2_copy2]
@@ -44,16 +51,17 @@
# CHECK-MERGED-CALLSITES-NEXT: 0x[[#%.4x,]] Flags[None] MatchRegex[function3_copy2]
# CHECK-MERGED-CALLSITES-NEXT: 0x[[#%.4x,]] Flags[None] MatchRegex[function2_copy1]
-
### Check that we can correctly resove merged functions using callstacks:
### Resolve two callstacks containing merged functions.
### We use the value obtained from `CallSites:[FILTER]` to pass to the next call to `llvm-gsymutil` via `--merged-functions-filter`.
### The callstacks resolve differently based on the merged functions filter.
-### 0x00000001000003d0 => 0x000000010000037c => 0x000000010000035c => 0x0000000100000340
-### 0x00000001000003e8 =========================> 0x000000010000035c => 0x0000000100000340
+### 0x00000001000003d8 => 0x000000010000037c => 0x000000010000035c => 0x0000000100000340
+### 0x00000001000003f0 =========================> 0x000000010000035c => 0x0000000100000340
+###
+### We added a new function, for main function, +8 for line numbers, +0x8 for addresses.
-# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x00000001000003d0 | FileCheck --check-prefix=CHECK-C1 %s
-# CHECK-C1: 0x00000001000003d0: main + 32 @ ./merged_funcs_test.cpp:63
+# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x00000001000003d8 | FileCheck --check-prefix=CHECK-C1 %s
+# CHECK-C1: 0x00000001000003d8: main + 32 @ ./merged_funcs_test.cpp:71
# CHECK-C1-NEXT: CallSites: function2_copy2
# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x000000010000037c --merged-functions-filter="function2_copy2" | FileCheck --check-prefix=CHECK-C2 %s
@@ -73,9 +81,9 @@
### ----------------------------------------------------------------------------------------------------------------------------------
### Resolve the 2nd call stack - the 2nd and 3rd addresses are the same but they resolve to a different function because of the filter
-# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --address=0x00000001000003e8 --merged-functions | FileCheck --check-prefix=CHECK-C5 %s
-# CHECK-C5: Found 1 function at address 0x00000001000003e8:
-# CHECK-C5-NEXT: 0x00000001000003e8: main + 56 @ ./merged_funcs_test.cpp:64
+# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --address=0x00000001000003f0 --merged-functions | FileCheck --check-prefix=CHECK-C5 %s
+# CHECK-C5: Found 1 function at address 0x00000001000003f0:
+# CHECK-C5-NEXT: 0x00000001000003f0: main + 56 @ ./merged_funcs_test.cpp:72
# CHECK-C5-NEXT: CallSites: function3_copy2
# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x000000010000035c --merged-functions-filter="function3_copy2" | FileCheck --check-prefix=CHECK-C6 %s
@@ -87,6 +95,9 @@
# CHECK-C7: Found 1 function at address 0x0000000100000340:
# CHECK-C7-NEXT: 0x0000000100000340: function4_copy2 + 8 @ ./merged_funcs_test.cpp:14
+# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --address=0x00000001000003b4 --merged-functions | FileCheck --check-prefix=CHECK-BAD %s
+# CHECK-BAD: Found 1 function at address 0x00000001000003b4:
+# CHECK-BAD-NEXT: 0x00000001000003b4: function_bad_stmt_seq + 4 @ ./merged_funcs_test.cpp:65
#--- merged_funcs_test.cpp
#define ATTRIB extern "C" __attribute__((noinline))
@@ -148,6 +159,14 @@ ATTRIB int function1(int a) {
return result;
}
+// Intentional multi-line function definition
+ATTRIB
+int function_bad_stmt_seq(
+ int a
+) {
+ return a + 1;
+}
+
int main() {
int sum = 0;
sum += function1(1);
@@ -155,6 +174,7 @@ int main() {
sum += function4_copy2(4);
sum += function3_copy2(41);
sum += function2_copy1(11);
+ sum += function_bad_stmt_seq(sum);
return sum;
}
@@ -229,7 +249,7 @@ FileHeader:
LoadCommands:
- cmd: LC_UUID
cmdsize: 24
- uuid: 4C4C441A-5555-3144-A124-E395C0E7AA96
+ uuid: 4C4C44C9-5555-3144-A16C-82A90B2087B3
- cmd: LC_BUILD_VERSION
cmdsize: 24
platform: 1
@@ -239,9 +259,9 @@ LoadCommands:
- cmd: LC_SYMTAB
cmdsize: 24
symoff: 4096
- nsyms: 10
- stroff: 4256
- strsize: 156
+ nsyms: 11
+ stroff: 4272
+ strsize: 179
- cmd: LC_SEGMENT_64
cmdsize: 72
segname: __PAGEZERO
@@ -268,7 +288,7 @@ LoadCommands:
- sectname: __text
segname: __TEXT
addr: 0x100000338
- size: 208
+ size: 228
offset: 0x0
align: 2
reloff: 0x0
@@ -277,7 +297,7 @@ LoadCommands:
reserved1: 0x0
reserved2: 0x0
reserved3: 0x0
- content: CFFAEDFE0C000001000000000A00000008000000C005000000000000000000001B000000180000004C4C441A55553144A124E395C0E7AA9632000000180000000100000000000B0000000B00000000000200000018000000001000000A000000A01000009C00000019000000480000005F5F504147455A45524F00000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000019000000980000005F5F54455854000000000000000000000000000001000000
+ content: CFFAEDFE0C000001000000000A00000008000000C005000000000000000000001B000000180000004C4C44C955553144A16C82A90B2087B332000000180000000100000000000B0000000B00000000000200000018000000001000000B000000B0100000B300000019000000480000005F5F504147455A45524F00000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000019000000980000005F5F544558540000000000000000000000000000010000000040000000000000000000000000000000000000
- cmd: LC_SEGMENT_64
cmdsize: 152
segname: __DATA
@@ -308,7 +328,7 @@ LoadCommands:
vmaddr: 4295000064
vmsize: 4096
fileoff: 4096
- filesize: 316
+ filesize: 355
maxprot: 1
initprot: 1
nsects: 0
@@ -319,7 +339,7 @@ LoadCommands:
vmaddr: 4295004160
vmsize: 4096
fileoff: 8192
- filesize: 3507
+ filesize: 3884
maxprot: 7
initprot: 3
nsects: 11
@@ -328,7 +348,7 @@ LoadCommands:
- sectname: __debug_line
segname: __DWARF
addr: 0x100009000
- size: 323
+ size: 357
offset: 0x2000
align: 0
reloff: 0x0
@@ -339,9 +359,9 @@ LoadCommands:
reserved3: 0x0
- sectname: __debug_aranges
segname: __DWARF
- addr: 0x100009143
+ addr: 0x100009165
size: 48
- offset: 0x2143
+ offset: 0x2165
align: 0
reloff: 0x0
nreloc: 0
@@ -351,9 +371,9 @@ LoadCommands:
reserved3: 0x0
- sectname: __debug_loc
segname: __DWARF
- addr: 0x100009173
- size: 1026
- offset: 0x2173
+ addr: 0x100009195
+ size: 1083
+ offset: 0x2195
align: 0
reloff: 0x0
nreloc: 0
@@ -361,12 +381,12 @@ LoadCommands:
reserved1: 0x0
reserved2: 0x0
reserved3: 0x0
- content: 00000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000000000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000058000000000000006400000000000000010050640000000000000078000000000000000400A301509F00000000000000000000000000000000680000000000000078000000000000000100500000000000000000000000000000000084000000000000009000000000000000030011009F90000000000000009C000000000000000100639C00000000000000A800000000000000010064B800000000000000C00000000000000001006300000000000000000000000000000000
+ content: 00000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000000000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000058000000000000006400000000000000010050640000000000000078000000000000000400A301509F00000000000000000000000000000000680000000000000078000000000000000100500000000000000000000000000000000078000000000000007C000000000000000100507C0000000000000080000000000000000400A301509F000000000000000000000000000000008C000000000000009800000000000000030011009F9800000000000000A400000000000000010063A400000000000000B000000000000000010064C000000000000000D40000000000000001006300000000000000000000000000000000
- sectname: __debug_info
segname: __DWARF
- addr: 0x100009575
- size: 923
- offset: 0x2575
+ addr: 0x1000095D0
+ size: 988
+ offset: 0x25D0
align: 0
reloff: 0x0
nreloc: 0
@@ -376,9 +396,9 @@ LoadCommands:
reserved3: 0x0
- sectname: __debug_frame
segname: __DWARF
- addr: 0x100009910
- size: 272
- offset: 0x2910
+ addr: 0x1000099AC
+ size: 296
+ offset: 0x29AC
align: 0
reloff: 0x0
nreloc: 0
@@ -386,12 +406,12 @@ LoadCommands:
reserved1: 0x0
reserved2: 0x0
reserved3: 0x0
- content: 14000000FFFFFFFF0400080001781E0C1F000000000000001400000000000000380300000100000014000000000000001400000000000000380300000100000014000000000000001C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C0000000000000090030000010000002000000000000000480C1D109E019D022400000000000000B00300000100000058000000000000004C0C1D109E019D029303940400000000
+ content: 14000000FFFFFFFF0400080001781E0C1F000000000000001400000000000000380300000100000014000000000000001400000000000000380300000100000014000000000000001C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C0000000000000090030000010000002000000000000000480C1D109E019D021400000000000000B00300000100000008000000000000002400000000000000B80300000100000064000000000000004C0C1D109E019D029303940400000000
- sectname: __debug_abbrev
segname: __DWARF
- addr: 0x100009A20
+ addr: 0x100009AD4
size: 260
- offset: 0x2A20
+ offset: 0x2AD4
align: 0
reloff: 0x0
nreloc: 0
@@ -401,9 +421,9 @@ LoadCommands:
reserved3: 0x0
- sectname: __debug_str
segname: __DWARF
- addr: 0x100009B24
- size: 188
- offset: 0x2B24
+ addr: 0x100009BD8
+ size: 357
+ offset: 0x2BD8
align: 0
reloff: 0x0
nreloc: 0
@@ -413,9 +433,9 @@ LoadCommands:
reserved3: 0x0
- sectname: __apple_namespac
segname: __DWARF
- addr: 0x100009BE0
+ addr: 0x100009D3D
size: 36
- offset: 0x2BE0
+ offset: 0x2D3D
align: 0
reloff: 0x0
nreloc: 0
@@ -426,9 +446,9 @@ LoadCommands:
content: 485341480100000001000000000000000C000000000000000100000001000600FFFFFFFF
- sectname: __apple_names
segname: __DWARF
- addr: 0x100009C04
- size: 316
- offset: 0x2C04
+ addr: 0x100009D61
+ size: 344
+ offset: 0x2D61
align: 0
reloff: 0x0
nreloc: 0
@@ -436,12 +456,12 @@ LoadCommands:
reserved1: 0x0
reserved2: 0x0
reserved3: 0x0
- content: 48534148010000000A0000000A0000000C0000000000000001000000010006000000000002000000030000000400000006000000FFFFFFFF0800000009000000FFFFFFFFFFFFFFFF88CB36CFF4B03BD389CB36CF0A452B694908311C0B452B694A08311CDC41AB586A7F9A7CAD7ED75898000000A8000000B8000000C8000000D8000000E8000000F80000000801000018010000280100008900000001000000BB010000000000001B000000010000002E0000000000000099000000010000003A020000000000002D000000010000004F000000000000005800000001000000E50000000000000048000000010000009A0000000000000068000000010000003901000000000000A900000001000000B902000000000000B3000000010000000D030000000000007800000002000000050200008402000000000000
+ content: 48534148010000000B0000000B0000000C0000000000000001000000010006000000000002000000FFFFFFFFFFFFFFFF03000000FFFFFFFFFFFFFFFF04000000080000000A000000FFFFFFFFAD7ED75888CB36CF89CB36CFF4B03BD34908311CDC41AB580A452B696A7F9A7C4A08311C0B452B69404C8F68A4000000B8000000C8000000D8000000E8000000F800000008010000180100002801000038010000480100000B010000020000000502000084020000000000001C01000001000000BB010000000000002C010000010000003A02000000000000AE000000010000002E00000000000000EB00000001000000E5000000000000003C01000001000000B902000000000000C0000000010000004F000000000000005C010000010000003A03000000000000FB000000010000003901000000000000DB000000010000009A0000000000000046010000010000000D03000000000000
- sectname: __apple_types
segname: __DWARF
- addr: 0x100009D40
+ addr: 0x100009EB9
size: 79
- offset: 0x2D40
+ offset: 0x2EB9
align: 0
reloff: 0x0
nreloc: 0
@@ -449,12 +469,12 @@ LoadCommands:
reserved1: 0x0
reserved2: 0x0
reserved3: 0x0
- content: 48534148010000000100000001000000180000000000000004000000010006000300050005000B0006000600000000003080880B38000000290000000100000048000000240000A4283A0C00000000
+ content: 48534148010000000100000001000000180000000000000004000000010006000300050005000B0006000600000000003080880B38000000BC0000000100000048000000240000A4283A0C00000000
- sectname: __apple_objc
segname: __DWARF
- addr: 0x100009D8F
+ addr: 0x100009F08
size: 36
- offset: 0x2D8F
+ offset: 0x2F08
align: 0
reloff: 0x0
nreloc: 0
@@ -469,7 +489,7 @@ LinkEditData:
n_type: 0xF
n_sect: 1
n_desc: 0
- n_value: 4294968240
+ n_value: 4294968248
- n_strx: 8
n_type: 0xF
n_sect: 1
@@ -507,10 +527,15 @@ LinkEditData:
n_value: 4294968208
- n_strx: 121
n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 4294968240
+ - n_strx: 144
+ n_type: 0xF
n_sect: 2
n_desc: 0
n_value: 4294983680
- - n_strx: 136
+ - n_strx: 159
n_type: 0xF
n_sect: 1
n_desc: 16
@@ -526,11 +551,13 @@ LinkEditData:
- _function2_copy1
- _function2_copy2
- _function1
+ - _function_bad_stmt_seq
- _global_result
- __mh_execute_header
DWARF:
debug_str:
- ''
+ - 'Facebook clang version 15.82.1 (https://git.internal.tfbnw.net/repos/git/ro/osmeta/external/llvm-project 3c2ff7d3c6ef63fb9a83574cee1b376c969bd5ec)'
- merged_funcs_test.cpp
- '/'
- .
@@ -547,6 +574,7 @@ DWARF:
- function2_copy1
- function2_copy2
- function1
+ - function_bad_stmt_seq
- main
- sum
debug_abbrev:
@@ -793,9 +821,9 @@ DWARF:
AddressSize: 0x8
Descriptors:
- Address: 0x100000338
- Length: 0xD0
+ Length: 0xE4
debug_info:
- - Length: 0x397
+ - Length: 0x3D8
Version: 4
AbbrevTableID: 0
AbbrOffset: 0x0
@@ -803,31 +831,31 @@ DWARF:
Entries:
- AbbrCode: 0x1
Values:
- - Value: 0x0
- - Value: 0x21
- Value: 0x1
- - Value: 0x17
+ - Value: 0x21
+ - Value: 0x94
+ - Value: 0xAA
- Value: 0x0
- - Value: 0x19
+ - Value: 0xAC
- Value: 0x1
- Value: 0x100000338
- - Value: 0xD0
+ - Value: 0xE4
- AbbrCode: 0x2
Values:
- - Value: 0x1B
+ - Value: 0xAE
- Value: 0x43
- Value: 0x1
- Value: 0x1
- Value: 0x2
- Value: 0x9
- BlockData: [ 0x3, 0x0, 0x40, 0x0, 0x0, 0x1, 0x0, 0x0,
+ BlockData: [ 0x3, 0x0, 0x40, 0x0, 0x0, 0x1, 0x0, 0x0,
0x0 ]
- AbbrCode: 0x3
Values:
- Value: 0x48
- AbbrCode: 0x4
Values:
- - Value: 0x29
+ - Value: 0xBC
- Value: 0x5
- Value: 0x4
- AbbrCode: 0x5
@@ -839,7 +867,7 @@ DWARF:
- Value: 0x1
BlockData: [ 0x6F ]
- Value: 0x1
- - Value: 0x2D
+ - Value: 0xC0
- Value: 0x1
- Value: 0x4
- Value: 0x48
@@ -848,21 +876,21 @@ DWARF:
- AbbrCode: 0x6
Values:
- Value: 0x0
- - Value: 0x3D
+ - Value: 0xD0
- Value: 0x1
- Value: 0x4
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0x39
- - Value: 0x3F
+ - Value: 0xD2
- Value: 0x1
- Value: 0x5
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0x5C
- - Value: 0x41
+ - Value: 0xD4
- Value: 0x1
- Value: 0x6
- Value: 0x48
@@ -876,7 +904,7 @@ DWARF:
- Value: 0x1
BlockData: [ 0x6F ]
- Value: 0x1
- - Value: 0x48
+ - Value: 0xDB
- Value: 0x1
- Value: 0xB
- Value: 0x48
@@ -885,21 +913,21 @@ DWARF:
- AbbrCode: 0x6
Values:
- Value: 0x7F
- - Value: 0x3D
+ - Value: 0xD0
- Value: 0x1
- Value: 0xB
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0xB8
- - Value: 0x3F
+ - Value: 0xD2
- Value: 0x1
- Value: 0xC
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0xDB
- - Value: 0x41
+ - Value: 0xD4
- Value: 0x1
- Value: 0xD
- Value: 0x48
@@ -912,7 +940,7 @@ DWARF:
- Value: 0x1
BlockData: [ 0x6D ]
- Value: 0x1
- - Value: 0x58
+ - Value: 0xEB
- Value: 0x1
- Value: 0x12
- Value: 0x48
@@ -921,20 +949,20 @@ DWARF:
- AbbrCode: 0x6
Values:
- Value: 0xFE
- - Value: 0x3D
+ - Value: 0xD0
- Value: 0x1
- Value: 0x12
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0x137
- - Value: 0x41
+ - Value: 0xD4
- Value: 0x1
- Value: 0x14
- Value: 0x48
- AbbrCode: 0x9
Values:
- - Value: 0x3F
+ - Value: 0xD2
- Value: 0x1
- Value: 0x13
- Value: 0x48
@@ -951,7 +979,7 @@ DWARF:
- Value: 0x1
BlockData: [ 0x6D ]
- Value: 0x1
- - Value: 0x68
+ - Value: 0xFB
- Value: 0x1
- Value: 0x19
- Value: 0x48
@@ -960,20 +988,20 @@ DWARF:
- AbbrCode: 0x6
Values:
- Value: 0x15A
- - Value: 0x3D
+ - Value: 0xD0
- Value: 0x1
- Value: 0x19
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0x193
- - Value: 0x41
+ - Value: 0xD4
- Value: 0x1
- Value: 0x1B
- Value: 0x48
- AbbrCode: 0x9
Values:
- - Value: 0x3F
+ - Value: 0xD2
- Value: 0x1
- Value: 0x1A
- Value: 0x48
@@ -984,7 +1012,7 @@ DWARF:
- AbbrCode: 0x0
- AbbrCode: 0xB
Values:
- - Value: 0x78
+ - Value: 0x10B
- Value: 0x1
- Value: 0x20
- Value: 0x48
@@ -993,19 +1021,19 @@ DWARF:
- Value: 0x1
- AbbrCode: 0xC
Values:
- - Value: 0x3D
+ - Value: 0xD0
- Value: 0x1
- Value: 0x20
- Value: 0x48
- AbbrCode: 0x9
Values:
- - Value: 0x41
+ - Value: 0xD4
- Value: 0x1
- Value: 0x22
- Value: 0x48
- AbbrCode: 0x9
Values:
- - Value: 0x3F
+ - Value: 0xD2
- Value: 0x1
- Value: 0x21
- Value: 0x48
@@ -1018,7 +1046,7 @@ DWARF:
- Value: 0x1
BlockData: [ 0x6D ]
- Value: 0x1
- - Value: 0x89
+ - Value: 0x11C
- Value: 0x1
- Value: 0x27
- Value: 0x48
@@ -1027,21 +1055,21 @@ DWARF:
- AbbrCode: 0x6
Values:
- Value: 0x1B6
- - Value: 0x3D
+ - Value: 0xD0
- Value: 0x1
- Value: 0x27
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0x1EF
- - Value: 0x3F
+ - Value: 0xD2
- Value: 0x1
- Value: 0x28
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0x214
- - Value: 0x41
+ - Value: 0xD4
- Value: 0x1
- Value: 0x29
- Value: 0x48
@@ -1075,7 +1103,7 @@ DWARF:
- Value: 0x1
BlockData: [ 0x6D ]
- Value: 0x1
- - Value: 0x99
+ - Value: 0x12C
- Value: 0x1
- Value: 0x2E
- Value: 0x48
@@ -1084,21 +1112,21 @@ DWARF:
- AbbrCode: 0x6
Values:
- Value: 0x27F
- - Value: 0x3D
+ - Value: 0xD0
- Value: 0x1
- Value: 0x2E
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0x2B8
- - Value: 0x3F
+ - Value: 0xD2
- Value: 0x1
- Value: 0x2F
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0x2DD
- - Value: 0x41
+ - Value: 0xD4
- Value: 0x1
- Value: 0x30
- Value: 0x48
@@ -1132,7 +1160,7 @@ DWARF:
- Value: 0x1
BlockData: [ 0x6D ]
- Value: 0x1
- - Value: 0xA9
+ - Value: 0x13C
- Value: 0x1
- Value: 0x35
- Value: 0x48
@@ -1141,20 +1169,20 @@ DWARF:
- AbbrCode: 0x6
Values:
- Value: 0x348
- - Value: 0x3D
+ - Value: 0xD0
- Value: 0x1
- Value: 0x35
- Value: 0x48
- AbbrCode: 0x7
Values:
- Value: 0x381
- - Value: 0x41
+ - Value: 0xD4
- Value: 0x1
- Value: 0x37
- Value: 0x48
- AbbrCode: 0x9
Values:
- - Value: 0x3F
+ - Value: 0xD2
- Value: 0x1
- Value: 0x36
- Value: 0x48
@@ -1163,31 +1191,54 @@ DWARF:
- Value: 0x1BB
- Value: 0x1000003A0
- AbbrCode: 0x0
- - AbbrCode: 0x8
+ - AbbrCode: 0x5
Values:
- Value: 0x1000003B0
- - Value: 0x58
- - Value: 0x112
+ - Value: 0x8
+ - Value: 0x1
+ - Value: 0xFFFFFFFF
+ - Value: 0x1
+ BlockData: [ 0x6F ]
+ - Value: 0x1
+ - Value: 0x146
+ - Value: 0x1
+ - Value: 0x3E
+ - Value: 0x48
+ - Value: 0x1
+ - Value: 0x1
+ - AbbrCode: 0x6
+ Values:
+ - Value: 0x3A4
+ - Value: 0xD0
+ - Value: 0x1
+ - Value: 0x3F
+ - Value: 0x48
+ - AbbrCode: 0x0
+ - AbbrCode: 0x8
+ Values:
+ - Value: 0x1000003B8
+ - Value: 0x64
+ - Value: 0x12B
- Value: 0x1
BlockData: [ 0x6D ]
- Value: 0x1
- - Value: 0xB3
+ - Value: 0x15C
- Value: 0x1
- - Value: 0x3C
+ - Value: 0x44
- Value: 0x48
- Value: 0x1
- Value: 0x1
- AbbrCode: 0x7
Values:
- - Value: 0x3A4
- - Value: 0xB8
+ - Value: 0x3DD
+ - Value: 0x161
- Value: 0x1
- - Value: 0x3D
+ - Value: 0x45
- Value: 0x48
- AbbrCode: 0x10
Values:
- Value: 0x2B9
- - Value: 0x1000003C4
+ - Value: 0x1000003CC
- AbbrCode: 0x11
Values:
- Value: 0x1
@@ -1198,7 +1249,7 @@ DWARF:
- AbbrCode: 0x10
Values:
- Value: 0x23A
- - Value: 0x1000003D0
+ - Value: 0x1000003D8
- AbbrCode: 0x11
Values:
- Value: 0x1
@@ -1209,7 +1260,7 @@ DWARF:
- AbbrCode: 0x10
Values:
- Value: 0x9A
- - Value: 0x1000003DC
+ - Value: 0x1000003E4
- AbbrCode: 0x11
Values:
- Value: 0x1
@@ -1220,7 +1271,7 @@ DWARF:
- AbbrCode: 0x10
Values:
- Value: 0x139
- - Value: 0x1000003E8
+ - Value: 0x1000003F0
- AbbrCode: 0x11
Values:
- Value: 0x1
@@ -1231,7 +1282,7 @@ DWARF:
- AbbrCode: 0x10
Values:
- Value: 0x1BB
- - Value: 0x1000003F8
+ - Value: 0x100000400
- AbbrCode: 0x11
Values:
- Value: 0x1
@@ -1239,10 +1290,21 @@ DWARF:
- Value: 0x1
BlockData: [ 0x3B ]
- AbbrCode: 0x0
+ - AbbrCode: 0x10
+ Values:
+ - Value: 0x30D
+ - Value: 0x10000040C
+ - AbbrCode: 0x11
+ Values:
+ - Value: 0x1
+ BlockData: [ 0x50 ]
+ - Value: 0x2
+ BlockData: [ 0x83, 0x0 ]
+ - AbbrCode: 0x0
- AbbrCode: 0x0
- AbbrCode: 0x0
debug_line:
- - Length: 319
+ - Length: 353
Version: 4
PrologueLength: 45
MinInstLength: 1
@@ -1495,8 +1557,31 @@ DWARF:
ExtLen: 9
SubOpcode: DW_LNE_set_address
Data: 4294968240
+ - Opcode: DW_LNS_set_column
+ Data: 14
+ - Opcode: DW_LNS_set_prologue_end
+ Data: 0
- Opcode: DW_LNS_advance_line
- SData: 59
+ SData: 64
+ Data: 0
+ - Opcode: DW_LNS_copy
+ Data: 0
+ - Opcode: DW_LNS_set_column
+ Data: 5
+ - Opcode: DW_LNS_negate_stmt
+ Data: 0
+ - Opcode: 0x4A
+ Data: 0
+ - Opcode: DW_LNS_extended_op
+ ExtLen: 1
+ SubOpcode: DW_LNE_end_sequence
+ Data: 0
+ - Opcode: DW_LNS_extended_op
+ ExtLen: 9
+ SubOpcode: DW_LNE_set_address
+ Data: 4294968248
+ - Opcode: DW_LNS_advance_line
+ SData: 67
Data: 0
- Opcode: DW_LNS_copy
Data: 0
@@ -1539,6 +1624,18 @@ DWARF:
- Opcode: 0x82
Data: 0
- Opcode: DW_LNS_set_column
+ Data: 12
+ - Opcode: DW_LNS_negate_stmt
+ Data: 0
+ - Opcode: 0x4B
+ Data: 0
+ - Opcode: DW_LNS_set_column
+ Data: 9
+ - Opcode: DW_LNS_negate_stmt
+ Data: 0
+ - Opcode: 0x82
+ Data: 0
+ - Opcode: DW_LNS_set_column
Data: 5
- Opcode: DW_LNS_negate_stmt
Data: 0